.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
utils.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Utility functions for the engines
3
4"""
5
6from __future__ import annotations
7
8import re
9import importlib
10import importlib.util
11import json
12import types
13
14from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
15from numbers import Number
16from os.path import splitext, join
17from random import choice
18from html.parser import HTMLParser
19from html import escape
20from urllib.parse import urljoin, urlparse, parse_qs, urlencode
21from markdown_it import MarkdownIt
22
23from lxml import html
24from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError
25
26from searx import settings
27from searx.data import USER_AGENTS, data_dir
28from searx.version import VERSION_TAG
29from searx.sxng_locales import sxng_locales
30from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
31from searx import logger
32
33
# Module logger: a child of the package-level ``searx`` logger imported above.
logger = logger.getChild('utils')

# An XPath can be given either as its source string or as a compiled object.
XPathSpecType = Union[str, XPath]

# Tags whose text content is ignored by the HTML text extractor.
_BLOCKED_TAGS = ('script', 'style')

# ECMAScript ``unescape`` sequences: ``%uXXXX`` (four hex digits) and ``%XX``
# (two hex digits).
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
42
# Matches an unquoted object key (``{ a:`` / ``, a:``) so it can be wrapped in
# double quotes.
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
# Matches ``void 0`` / ``void(0)`` (any integer literal).  The parentheses are
# escaped with a plain backslash; no invisible characters may follow it,
# otherwise ``void(0)`` is never matched.
_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
# Matches a value that starts with a bare decimal point (``: .5``).
_JS_DECIMAL_RE = re.compile(r":\s*\.")
46
# Compiled XPath objects keyed by their source string (filled by get_xpath).
_XPATH_CACHE: Dict[str, XPath] = {}
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}

_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None  # type: ignore
"""fasttext model to predict language of a search term"""

SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
55
56
class _NotSetClass:  # pylint: disable=too-few-public-methods
    """Internal class for this module, do not create instance of this class.

    Its single instance stands in for "no value given", which makes it
    possible to pass ``None`` explicitly as a function argument.
    """


# The one sentinel instance used as the "argument not given" default.
_NOTSET = _NotSetClass()
63
64
def searx_useragent() -> str:
    """Return the searx User Agent

    The string is built from ``VERSION_TAG`` and the
    ``outgoing.useragent_suffix`` setting; surrounding whitespace (e.g. when
    the suffix is empty) is stripped.
    """
    return 'searx/{searx_version} {suffix}'.format(
        searx_version=VERSION_TAG, suffix=settings['outgoing']['useragent_suffix']
    ).strip()
70
71
def gen_useragent(os_string: Optional[str] = None) -> str:
    """Return a random browser User Agent

    Args:
        * os_string (str, optional): operating-system part of the User Agent.
          When empty or ``None`` a random entry of ``USER_AGENTS['os']`` is used.

    See searx/data/useragents.json
    """
    return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
78
79
class _HTMLTextExtractorException(Exception):
    """Internal exception raised when the HTML is invalid"""
82
83
class _HTMLTextExtractor(HTMLParser):
    """Internal class to extract text from HTML

    Text fragments are collected in ``self.result``; ``self.tags`` is the stack
    of currently open tags, used to drop text inside ``_BLOCKED_TAGS``.
    """

    def __init__(self):
        super().__init__()
        self.result = []
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)
        if tag == 'br':
            # a line break separates words: keep a space in the extracted text
            self.result.append(' ')

    def handle_endtag(self, tag):
        if not self.tags:
            return

        if tag != self.tags[-1]:
            # the closing tag does not match the innermost open tag
            raise _HTMLTextExtractorException()

        self.tags.pop()

    def is_valid_tag(self):
        """Return True when text at the current position should be kept."""
        return not self.tags or self.tags[-1] not in _BLOCKED_TAGS

    def handle_data(self, data):
        if not self.is_valid_tag():
            return
        self.result.append(data)

    def handle_charref(self, name):
        if not self.is_valid_tag():
            return
        # ``&#xHH;`` is hexadecimal, ``&#DD;`` is decimal
        if name[0] in ('x', 'X'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        self.result.append(chr(codepoint))

    def handle_entityref(self, name):
        if not self.is_valid_tag():
            return
        # NOTE: the entity *name* is appended as is, it is not resolved to
        # its character (htmlentitydefs.name2codepoint is not used).
        self.result.append(name)

    def get_text(self):
        """Return the collected text, stripped of surrounding whitespace."""
        return ''.join(self.result).strip()

    def error(self, message):
        # error handle is needed in <py3.10
        # https://github.com/python/cpython/pull/8562/files
        raise AssertionError(message)
137
138
def html_to_text(html_str: str) -> str:
    """Extract text from a HTML string

    Args:
        * html_str (str): string HTML

    Returns:
        * str: extracted text

    Examples:
        >>> html_to_text('Example <span id="42">#2</span>')
        'Example #2'

        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
        'Example'

        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
        'regexp: (?<![a-zA-Z]'
    """
    # collapse line breaks and runs of whitespace into single spaces
    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
    html_str = ' '.join(html_str.split())
    s = _HTMLTextExtractor()
    try:
        s.feed(html_str)
    except AssertionError:
        # the parser rejected the markup: start over with a fresh extractor
        # and treat the whole input as plain text
        s = _HTMLTextExtractor()
        s.feed(escape(html_str, quote=True))
    except _HTMLTextExtractorException:
        # mismatched tags: keep whatever text was collected so far
        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
    return s.get_text()
169
170
def markdown_to_text(markdown_str: str) -> str:
    """Extract text from a Markdown string

    The Markdown is rendered to HTML first, then the text is extracted from
    that HTML with :py:func:`html_to_text`.

    Args:
        * markdown_str (str): string Markdown

    Returns:
        * str: extracted text

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """

    html_str = (
        MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(markdown_str)
    )
    return html_to_text(html_str)
192
193
def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
    """Extract text from a lxml result

    * if xpath_results is list, extract the text from each result and concat the list
    * if xpath_results is a xml element, extract all the text node from it
      ( text_content() method from lxml )
    * if xpath_results is a string element, then it's already done

    Args:
        * xpath_results: list, lxml element, str, number, bool or None
        * allow_none (bool): when True, ``None`` is returned for a ``None`` input
          instead of raising.

    Raises:
        * ValueError: for ``None`` (unless ``allow_none``) and unsupported types.
    """
    if isinstance(xpath_results, list):
        # it's list of result : concat everything using recursive call
        return ''.join(extract_text(e) or '' for e in xpath_results).strip()
    if isinstance(xpath_results, ElementBase):
        # it's a element: serialize its text nodes, then collapse whitespace
        text: str = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
        text = text.strip().replace('\n', ' ')
        return ' '.join(text.split())
    if isinstance(xpath_results, (str, Number, bool)):
        return str(xpath_results)
    if xpath_results is None:
        if allow_none:
            return None
        raise ValueError('extract_text(None, allow_none=False)')
    raise ValueError('unsupported type')
220
221
def normalize_url(url: str, base_url: str) -> str:
    """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path

    Args:
        * url (str): Relative URL
        * base_url (str): Base URL, it must be an absolute URL.

    Example:
        >>> normalize_url('https://example.com', 'http://example.com/')
        'https://example.com/'
        >>> normalize_url('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> normalize_url('//example.com', 'https://example.com/')
        'https://example.com/'
        >>> normalize_url('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> normalize_url('', 'https://example.com')
        'https://example.com/'
        >>> normalize_url('https://example.com?a=1', 'https://example.com')
        'https://example.com/?a=1'
        >>> normalize_url('/test', '/path')
        raise ValueError

    Raises:
        * ValueError: the resulting URL has no network location.

    Returns:
        * str: normalized URL
    """
    if url.startswith('//'):
        # add http or https to this kind of url //example.com/
        parsed_search_url = urlparse(base_url)
        url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
    elif url.startswith('/'):
        # fix relative url to the search engine
        url = urljoin(base_url, url)

    # fix relative urls that fall through the crack
    if '://' not in url:
        url = urljoin(base_url, url)

    parsed_url = urlparse(url)

    if not parsed_url.netloc:
        raise ValueError('Cannot parse url')
    if not parsed_url.path:
        # No path: set it to "/".  The URL is rebuilt from its components so
        # the slash is placed before a query string or fragment, not after it.
        url = parsed_url._replace(path='/').geturl()

    return url
270
271
def extract_url(xpath_results, base_url) -> str:
    """Extract and normalize URL from lxml Element

    Args:
        * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
        * base_url (str): Base URL

    Example:
        >>> def f(s, search_url):
        >>>    return searx.utils.extract_url(html.fromstring(s), search_url)
        >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
        'https://example.com/'
        >>> f('https://example.com', 'http://example.com/')
        'https://example.com/'
        >>> f('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> f('//example.com', 'https://example.com/')
        'https://example.com/'
        >>> f('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> f('', 'https://example.com')
        raise lxml.etree.ParserError
        >>> searx.utils.extract_url([], 'https://example.com')
        raise ValueError

    Raises:
        * ValueError: empty result set, no text found, or the URL cannot be
          normalized.
        * lxml.etree.ParserError: raised by ``html.fromstring`` in the example
          above (not by this function itself).

    Returns:
        * str: normalized URL
    """
    if xpath_results == []:
        raise ValueError('Empty url resultset')

    url = extract_text(xpath_results)
    if not url:
        raise ValueError('URL not found')
    return normalize_url(url, base_url)
311
312
def dict_subset(dictionary: MutableMapping, properties: Set[str]) -> Dict:
    """Extract a subset of a dict

    Keys listed in ``properties`` that are missing from ``dictionary`` are
    silently skipped.

    Examples:
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
        {'A': 'a', 'C': 'c'}
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
        {'A': 'a'}
    """
    return {k: dictionary[k] for k in properties if k in dictionary}
323
324
def humanize_bytes(size, precision=2):
    """Determine the *human readable* value of bytes on 1024 base (1KB=1024B).

    Args:
        * size (int|float): number of bytes
        * precision (int): number of decimal places in the result

    Returns:
        * str: e.g. ``'2.00 KB'``.  Sizes beyond the largest unit are
          expressed in that unit (TB).
    """
    s = ['B ', 'KB', 'MB', 'GB', 'TB']

    # index of the last unit: never step past it, otherwise s[p] raises
    # IndexError for very large sizes
    last = len(s) - 1
    p = 0
    while size > 1024 and p < last:
        p += 1
        size = size / 1024.0
    return "%.*f %s" % (precision, size, s[p])
335
336
def humanize_number(size, precision=0):
    """Determine the *human readable* value of a decimal number.

    Args:
        * size (int|float): the number to format
        * precision (int): number of decimal places in the result

    Returns:
        * str: e.g. ``'3K'``.  Numbers beyond the largest suffix are expressed
          with that suffix (T).
    """
    s = ['', 'K', 'M', 'B', 'T']

    # index of the last suffix: never step past it, otherwise s[p] raises
    # IndexError for very large numbers
    last = len(s) - 1
    p = 0
    while size > 1000 and p < last:
        p += 1
        size = size / 1000.0
    return "%.*f%s" % (precision, size, s[p])
347
348
def convert_str_to_int(number_str: str) -> int:
    """Convert number_str to int or 0 if number_str is not a number.

    Only strings made exclusively of decimal digits are converted; anything
    else (empty string, sign, whitespace, letters) gives 0.
    """
    # str.isdecimal() rather than str.isdigit(): isdigit() also accepts
    # characters such as '²' that int() cannot parse and would raise on.
    if number_str.isdecimal():
        return int(number_str)
    return 0
354
355
def extr(txt: str, begin: str, end: str, default: str = ""):
    """Extract the string between ``begin`` and ``end`` from ``txt``

    :param txt:     String to search in
    :param begin:   First string to be searched for
    :param end:     Second string to be searched for after ``begin``
    :param default: Default value if one of ``begin`` or ``end`` is not
                    found.  Defaults to an empty string.
    :return: The string between the two search-strings ``begin`` and ``end``.
             If at least one of ``begin`` or ``end`` is not found, the value of
             ``default`` is returned.

    Examples:
      >>> extr("abcde", "a", "e")
      "bcd"
      >>> extr("abcde", "a", "z", default="nothing")
      "nothing"

    """

    # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129

    try:
        # str.index raises ValueError when the substring is missing
        first = txt.index(begin) + len(begin)
        return txt[first : txt.index(end, first)]
    except ValueError:
        return default
383
384
def int_or_zero(num: Union[List[str], str]) -> int:
    """Convert num to int or 0. num can be either a str or a list.

    If num is a list, the first element is converted to int (or return 0 if the list is empty).
    If num is a str, see convert_str_to_int
    """
    if isinstance(num, list):
        if not num:
            return 0
        num = num[0]
    return convert_str_to_int(num)
395
396
def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
    """Return language code and name if lang describe a language.

    A two-character ``lang`` is compared with the language code of each entry
    in ``sxng_locales``; anything else is compared (case-insensitively) with
    the entry's name fields (items 1 and 3).  ``bytes`` input is decoded first.

    Examples:
        >>> is_valid_lang('zz')
        None
        >>> is_valid_lang('uk')
        (True, 'uk', 'ukrainian')
        >>> is_valid_lang(b'uk')
        (True, 'uk', 'ukrainian')
        >>> is_valid_lang('en')
        (True, 'en', 'english')
        >>> searx.utils.is_valid_lang('Español')
        (True, 'es', 'spanish')
        >>> searx.utils.is_valid_lang('Spanish')
        (True, 'es', 'spanish')
    """
    if isinstance(lang, bytes):
        lang = lang.decode()
    is_abbr = len(lang) == 2
    lang = lang.lower()
    if is_abbr:
        for l in sxng_locales:
            if l[0][:2] == lang:
                return (True, l[0][:2], l[3].lower())
        return None
    for l in sxng_locales:
        if l[1].lower() == lang or l[3].lower() == lang:
            return (True, l[0][:2], l[3].lower())
    return None
427
428
def load_module(filename: str, module_dir: str) -> types.ModuleType:
    """Load and execute the Python source file ``filename`` from ``module_dir``.

    Args:
        * filename (str): file name of the module, e.g. ``'example.py'``.  The
          module name is the file name without its extension.
        * module_dir (str): directory that contains the file

    Returns:
        * types.ModuleType: the executed module

    Raises:
        * ValueError: no import spec or loader could be created for the file
    """
    modname = splitext(filename)[0]
    modpath = join(module_dir, filename)
    # see https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
    spec = importlib.util.spec_from_file_location(modname, modpath)
    if not spec:
        raise ValueError(f"Error loading '{modpath}' module")
    module = importlib.util.module_from_spec(spec)
    if not spec.loader:
        raise ValueError(f"Error loading '{modpath}' module")
    spec.loader.exec_module(module)
    return module
441
442
def to_string(obj: Any) -> str:
    """Convert obj to its string representation.

    A ``str`` is returned unchanged, everything else goes through ``str()``.
    """
    # Every Python object has __str__ (inherited from ``object``), so there
    # is no case in which a repr() fallback would be reached.
    if isinstance(obj, str):
        return obj
    return str(obj)
450
451
def ecma_unescape(string: str) -> str:
    """Python implementation of the unescape javascript function

    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
    https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape

    The ``%uXXXX`` sequences are replaced first, then the ``%XX`` sequences
    (two passes over the string).

    Examples:
        >>> ecma_unescape('%u5409')
        '吉'
        >>> ecma_unescape('%20')
        ' '
        >>> ecma_unescape('%F3')
        'ó'
    """
    # "%u5409" becomes "吉"
    string = _ECMA_UNESCAPE4_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
    # "%20" becomes " ", "%F3" becomes "ó"
    string = _ECMA_UNESCAPE2_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
    return string
471
472
def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
    """Build a function that applies all ``replaces`` in a single pass.

    Args:
        * replaces (Dict[str, str]): maps each literal substring to its
          replacement.

    Returns:
        * Callable[[str], str]: function returning its argument with every
          occurrence of a key replaced by the matching value.  With an empty
          mapping the text is returned unchanged.
    """
    # snapshot, so later changes to the caller's dict do not affect the function
    mapping = dict(replaces)
    if not mapping:
        # An empty alternation would match the empty string at every position
        # and the lookup of '' would raise KeyError.
        return lambda text: text

    # keys are literal text, not regular expressions
    pattern = re.compile("|".join(re.escape(k) for k in mapping))

    def func(text):
        return pattern.sub(lambda m: mapping[m.group(0)], text)

    return func
481
482
def get_engine_from_settings(name: str) -> Dict:
    """Return engine configuration from settings.yml of a given engine name

    An empty dict is returned when the settings have no ``engines`` section or
    no engine is called ``name``.
    """

    if 'engines' not in settings:
        return {}

    for engine in settings['engines']:
        # entries without a name can never match
        if 'name' not in engine:
            continue
        if name == engine['name']:
            return engine

    return {}
496
497
def get_xpath(xpath_spec: XPathSpecType) -> XPath:
    """Return cached compiled XPath

    There is no thread lock.
    Worst case scenario, xpath_str is compiled more than one time.

    Args:
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

    Returns:
        * lxml.etree.XPath: the compiled XPath.  A ``str`` is compiled once and
          cached; an ``XPath`` object is returned as is.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
    """
    if isinstance(xpath_spec, str):
        result = _XPATH_CACHE.get(xpath_spec, None)
        if result is None:
            try:
                result = XPath(xpath_spec)
            except XPathSyntaxError as e:
                raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
            _XPATH_CACHE[xpath_spec] = result
        return result

    if isinstance(xpath_spec, XPath):
        return xpath_spec

    raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')
528
529
def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType):
    """Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
    See https://lxml.de/xpathxslt.html#xpath-return-values

    Args:
        * element (ElementBase): lxml element the XPath is evaluated on
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

    Returns:
        * result (bool, float, list, str): Results.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: Raise when the XPath can't be evaluated.
    """
    xpath = get_xpath(xpath_spec)
    try:
        return xpath(element)
    except XPathError as e:
        # keep the XPath in the exception so the failing expression is recorded
        arg = ' '.join([str(i) for i in e.args])
        raise SearxEngineXPathException(xpath_spec, arg) from e
552
553
def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: Optional[int] = None):
    """Same as eval_xpath, check if the result is a list

    Args:
        * element (ElementBase): lxml element the XPath is evaluated on
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
        * min_len (int, optional): minimum number of items the result must
          have. Defaults to None (no check).

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: raise if the result is not a list, or if
          it has fewer than ``min_len`` items

    Returns:
        * result (list): Results.
    """
    result = eval_xpath(element, xpath_spec)
    if not isinstance(result, list):
        raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
    if min_len is not None and min_len > len(result):
        raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
    return result
576
577
def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index: int, default=_NOTSET):
    """Call eval_xpath_list then get one element using the index parameter.
    If the index does not exist, either raise an exception if default is not set,
    otherwise return the default value (can be None).

    Args:
        * elements (ElementBase): lxml element to apply the xpath.
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
        * index (int): index to get (negative indexes count from the end)
        * default (Object, optional): Defaults if index doesn't exist.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: if the index is not found. Also see eval_xpath.

    Returns:
        * result (bool, float, list, str): Results.
    """
    result = eval_xpath_list(elements, xpath_spec)
    if -len(result) <= index < len(result):
        return result[index]
    # Identity test: ``==`` would call the ``__eq__`` of whatever object the
    # caller passed as default.
    if default is _NOTSET:
        # raise an SearxEngineXPathException instead of IndexError
        # to record xpath_spec
        raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
    return default
605
606
def _get_fasttext_model() -> "fasttext.FastText._FastText":  # type: ignore
    """Return the fasttext language model, loading it on first use.

    The model is read from ``lid.176.ftz`` in ``data_dir`` and kept in the
    module-level ``_FASTTEXT_MODEL``.
    """
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is None:
        # imported here so that fasttext is only needed when a model is requested
        import fasttext  # pylint: disable=import-outside-toplevel

        # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
        fasttext.FastText.eprint = lambda x: None
        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
    return _FASTTEXT_MODEL
616
617
def get_embeded_stream_url(url):
    """
    Converts a standard video URL into its embed format. Supported services include Youtube,
    Facebook, Instagram, TikTok, and Dailymotion.

    Returns the URL to use as ``src`` of an iframe, or ``None`` when the URL
    is not recognized.
    """
    parsed_url = urlparse(url)
    iframe_src = None

    # YouTube
    if parsed_url.netloc in ['www.youtube.com', 'youtube.com'] and parsed_url.path == '/watch' and parsed_url.query:
        video_id = parse_qs(parsed_url.query).get('v', [])
        if video_id:
            iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]

    # Facebook
    elif parsed_url.netloc in ['www.facebook.com', 'facebook.com']:
        # the complete original URL is passed, URL-encoded, as "href" parameter
        encoded_href = urlencode({'href': url})
        iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href

    # Instagram
    elif parsed_url.netloc in ['www.instagram.com', 'instagram.com'] and parsed_url.path.startswith('/p/'):
        if parsed_url.path.endswith('/'):
            iframe_src = url + 'embed'
        else:
            iframe_src = url + '/embed'

    # TikTok
    elif (
        parsed_url.netloc in ['www.tiktok.com', 'tiktok.com']
        and parsed_url.path.startswith('/@')
        and '/video/' in parsed_url.path
    ):
        # the video id is whatever follows "/video/" in the path
        path_parts = parsed_url.path.split('/video/')
        video_id = path_parts[1]
        iframe_src = 'https://www.tiktok.com/embed/' + video_id

    # Dailymotion
    elif parsed_url.netloc in ['www.dailymotion.com', 'dailymotion.com'] and parsed_url.path.startswith('/video/'):
        # only "/video/<id>" is accepted: ['', 'video', '<id>']
        path_parts = parsed_url.path.split('/')
        if len(path_parts) == 3:
            video_id = path_parts[2]
            iframe_src = 'https://www.dailymotion.com/embed/video/' + video_id

    return iframe_src
662
663
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
    """Detect the language of the ``text`` parameter.

    :param str text: The string whose language is to be detected.

    :param float threshold: Threshold filters the returned labels by a threshold
        on probability.  A choice of 0.3 will return labels with at least 0.3
        probability.

    :param bool only_search_languages: If ``True``, returns only supported
        SearXNG search languages.  see :py:obj:`searx.languages`

    :rtype: str, None
    :returns:
        The detected language code or ``None``. See below.

    :raises ValueError: If ``text`` is not a string.

    The language detection is done by using `a fork`_ of the fastText_ library
    (`python fasttext`_). fastText_ distributes the `language identification
    model`_, for reference:

    - `FastText.zip: Compressing text classification models`_
    - `Bag of Tricks for Efficient Text Classification`_

    The `language identification model`_ support the language codes
    (ISO-639-3)::

        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
        bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
        et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
        id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
        lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
        nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
        rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
        tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh

    By using ``only_search_languages=True`` the `language identification model`_
    is harmonized with the SearXNG's language (locale) model.  General
    conditions of SearXNG's locale model are:

    a. SearXNG's locale of a query is passed to the
       :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
       code that is used by an engine.

    b. Most of SearXNG's engines do not support all the languages from `language
       identification model`_ and there is also a discrepancy in the ISO-639-3
       (fasttext) and ISO-639-2 (SearXNG) handling.  Furthermore, in SearXNG the
       locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
       (``zh_Hans``) while the `language identification model`_ reduce both to
       ``zh``.

    .. _a fork: https://github.com/searxng/fasttext-predict
    .. _fastText: https://fasttext.cc/
    .. _python fasttext: https://pypi.org/project/fasttext/
    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651

    """
    if not isinstance(text, str):
        raise ValueError('text must a str')
    # line breaks are replaced by spaces before the text is passed to the model
    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
    # expected result: a (labels, probabilities) pair, both non-empty
    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
        # labels look like "__label__en": keep the part after the prefix
        language = r[0][0].split('__label__')[1]
        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
            return None
        return language
    return None
733
734
def js_variable_to_python(js_variable):
    """Convert a javascript variable into JSON and then load the value

    It does not deal with all cases, but it is good enough for now.
    chompjs has a better implementation.

    Args:
        * js_variable (str): source of a JavaScript literal, e.g.
          ``{ a: 'text', b: void 0, c: .5 }``

    Returns:
        * the value returned by ``json.loads`` for the converted text

    Raises:
        * json.JSONDecodeError: the converted text is still not valid JSON
    """
    # when in_string is not None, it contains the character that has opened the string
    # either simple quote or double quote
    in_string = None
    # cut the string:
    # r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = re.split(r'(["\'])', js_variable)
    # previous part (to check the escape character antislash)
    previous_p = ""
    for i, p in enumerate(parts):
        # parse characters inside a ECMA string
        if in_string:
            # we are in a JS string: replace the colon by a temporary character
            # so quote_keys_regex doesn't have to deal with colon inside the JS strings
            parts[i] = parts[i].replace(':', chr(1))
            if in_string == "'":
                # the JS string is delimited by simple quote.
                # This is not supported by JSON.
                # simple quote delimited string are converted to double quote delimited string
                # here, inside a JS string, we escape the double quote
                parts[i] = parts[i].replace('"', r'\"')

        # deal with delimiters and escape character
        if not in_string and p in ('"', "'"):
            # we are not in string
            # but p is double or simple quote
            # that's the start of a new string
            # replace simple quote by double quote
            # (JSON doesn't support simple quote)
            parts[i] = '"'
            in_string = p
            continue
        if p == in_string:
            # we are in a string and the current part MAY close the string
            if len(previous_p) > 0 and previous_p[-1] == '\\':
                # there is an antislash just before: the ECMA string continue
                continue
            # the current p close the string
            # replace simple quote by double quote
            parts[i] = '"'
            in_string = None

        if not in_string:
            # replace void 0 by null
            # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
            # we are sure there is no string in p
            # The substitution works on parts[i], not on p: when a string has
            # just been closed, parts[i] already holds the double quote and
            # must not be overwritten by the original (maybe simple) quote.
            parts[i] = _JS_VOID_RE.sub("null", parts[i])
        # update previous_p
        previous_p = p
    # join the string
    s = ''.join(parts)
    # add quote around the key
    # { a: 12 }
    # becomes
    # { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # ": .5" becomes ":0.5" (JSON needs a digit before the decimal point)
    s = _JS_DECIMAL_RE.sub(":0.", s)
    # replace the surrogate character by colon
    s = s.replace(chr(1), ':')
    # load the JSON and return the result
    return json.loads(s)
handle_starttag(self, tag, attrs)
Definition utils.py:92
extr(str txt, str begin, str end, str default="")
Definition utils.py:356
Callable[[str], str] get_string_replaces_function(Dict[str, str] replaces)
Definition utils.py:473
js_variable_to_python(js_variable)
Definition utils.py:735
Optional[str] detect_language(str text, float threshold=0.3, bool only_search_languages=False)
Definition utils.py:664
XPath get_xpath(XPathSpecType xpath_spec)
Definition utils.py:498
str to_string(Any obj)
Definition utils.py:443
humanize_bytes(size, precision=2)
Definition utils.py:325
Optional[str] extract_text(xpath_results, bool allow_none=False)
Definition utils.py:194
str ecma_unescape(str string)
Definition utils.py:452
eval_xpath_getindex(ElementBase elements, XPathSpecType xpath_spec, int index, default=_NOTSET)
Definition utils.py:578
int convert_str_to_int(str number_str)
Definition utils.py:349
eval_xpath(ElementBase element, XPathSpecType xpath_spec)
Definition utils.py:530
str gen_useragent(Optional[str] os_string=None)
Definition utils.py:72
int int_or_zero(Union[List[str], str] num)
Definition utils.py:385
Optional[Tuple[bool, str, str]] is_valid_lang(lang)
Definition utils.py:397
"fasttext.FastText._FastText" _get_fasttext_model()
Definition utils.py:607
str extract_url(xpath_results, base_url)
Definition utils.py:272
eval_xpath_list(ElementBase element, XPathSpecType xpath_spec, Optional[int] min_len=None)
Definition utils.py:554
types.ModuleType load_module(str filename, str module_dir)
Definition utils.py:429
str searx_useragent()
Definition utils.py:65
str normalize_url(str url, str base_url)
Definition utils.py:222
get_embeded_stream_url(url)
Definition utils.py:618
str html_to_text(str html_str)
Definition utils.py:139
humanize_number(size, precision=0)
Definition utils.py:337
Dict dict_subset(MutableMapping dictionary, Set[str] properties)
Definition utils.py:313
Dict get_engine_from_settings(str name)
Definition utils.py:483
str markdown_to_text(str markdown_str)
Definition utils.py:171