.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
searx.utils Namespace Reference

Classes

class  _HTMLTextExtractor
 
class  _HTMLTextExtractorException
 
class  _NotSetClass
 

Functions

str searx_useragent ()
 
str gen_useragent (Optional[str] os_string=None)
 
str html_to_text (str html_str)
 
str markdown_to_text (str markdown_str)
 
Optional[str] extract_text (xpath_results, bool allow_none=False)
 
str normalize_url (str url, str base_url)
 
str extract_url (xpath_results, base_url)
 
Dict dict_subset (MutableMapping dictionary, Set[str] properties)
 
 humanize_bytes (size, precision=2)
 
 humanize_number (size, precision=0)
 
int convert_str_to_int (str number_str)
 
 extr (str txt, str begin, str end, str default="")
 
int int_or_zero (Union[List[str], str] num)
 
Optional[Tuple[bool, str, str]] is_valid_lang (lang)
 
types.ModuleType load_module (str filename, str module_dir)
 
str to_string (Any obj)
 
str ecma_unescape (str string)
 
 remove_pua_from_str (string)
 
Callable[[str], str] get_string_replaces_function (Dict[str, str] replaces)
 
Dict get_engine_from_settings (str name)
 
XPath get_xpath (XPathSpecType xpath_spec)
 
 eval_xpath (ElementBase element, XPathSpecType xpath_spec)
 
 eval_xpath_list (ElementBase element, XPathSpecType xpath_spec, Optional[int] min_len=None)
 
 eval_xpath_getindex (ElementBase elements, XPathSpecType xpath_spec, int index, default=_NOTSET)
 
"fasttext.FastText._FastText" _get_fasttext_model ()
 
 get_embeded_stream_url (url)
 
Optional[str] detect_language (str text, float threshold=0.3, bool only_search_languages=False)
 
 js_variable_to_python (js_variable)
 

Variables

 logger = logger.getChild('utils')
 
 XPathSpecType = Union[str, XPath]
 
tuple _BLOCKED_TAGS = ('script', 'style')
 
 _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
 
 _ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
 
 _JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
 
 _JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\‍([0-9]+\‍)')
 
 _JS_DECIMAL_RE = re.compile(r":\s*\.")
 
dict _XPATH_CACHE = {}
 
dict _LANG_TO_LC_CACHE = {}
 
Optional _FASTTEXT_MODEL = None
 
 SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
 
 _NOTSET = _NotSetClass()
 

Detailed Description

Utility functions for the engines

Function Documentation

◆ _get_fasttext_model()

"fasttext.FastText._FastText" searx.utils._get_fasttext_model ( )
protected

Definition at line 622 of file utils.py.

622def _get_fasttext_model() -> "fasttext.FastText._FastText": # type: ignore
623 global _FASTTEXT_MODEL # pylint: disable=global-statement
624 if _FASTTEXT_MODEL is None:
625 import fasttext # pylint: disable=import-outside-toplevel
626
627 # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
628 fasttext.FastText.eprint = lambda x: None
629 _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
630 return _FASTTEXT_MODEL
631
632

Referenced by detect_language().

+ Here is the caller graph for this function:

◆ convert_str_to_int()

int searx.utils.convert_str_to_int ( str number_str)
Convert number_str to int or 0 if number_str is not a number.

Definition at line 349 of file utils.py.

349def convert_str_to_int(number_str: str) -> int:
350 """Convert number_str to int or 0 if number_str is not a number."""
351 if number_str.isdigit():
352 return int(number_str)
353 return 0
354
355

Referenced by int_or_zero().

+ Here is the caller graph for this function:

◆ detect_language()

Optional[str] searx.utils.detect_language ( str text,
float threshold = 0.3,
bool only_search_languages = False )
Detect the language of the ``text`` parameter.

:param str text: The string whose language is to be detected.

:param float threshold: Threshold filters the returned labels by a threshold
    on probability.  A choice of 0.3 will return labels with at least 0.3
    probability.

:param bool only_search_languages: If ``True``, returns only supported
    SearXNG search languages.  see :py:obj:`searx.languages`

:rtype: str, None
:returns:
    The detected language code or ``None``. See below.

:raises ValueError: If ``text`` is not a string.

The language detection is done by using `a fork`_ of the fastText_ library
(`python fasttext`_). fastText_ distributes the `language identification
model`_, for reference:

- `FastText.zip: Compressing text classification models`_
- `Bag of Tricks for Efficient Text Classification`_

The `language identification model`_ supports the language codes
(ISO-639-3)::

    af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
    bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
    et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
    id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
    lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
    nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
    rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
    tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh

By using ``only_search_languages=True`` the `language identification model`_
is harmonized with SearXNG's language (locale) model.  General
conditions of SearXNG's locale model are:

a. SearXNG's locale of a query is passed to the
   :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
   code that is used by an engine.

b. Most of SearXNG's engines do not support all the languages from `language
   identification model`_ and there is also a discrepancy in the ISO-639-3
   (fasttext) and ISO-639-2 (SearXNG) handling.  Furthermore, in SearXNG the
   locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
   (``zh_Hans``) while the `language identification model`_ reduce both to
   ``zh``.

.. _a fork: https://github.com/searxng/fasttext-predict
.. _fastText: https://fasttext.cc/
.. _python fasttext: https://pypi.org/project/fasttext/
.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651

Definition at line 695 of file utils.py.

695def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
696 """Detect the language of the ``text`` parameter.
697
698 :param str text: The string whose language is to be detected.
699
700 :param float threshold: Threshold filters the returned labels by a threshold
701 on probability. A choice of 0.3 will return labels with at least 0.3
702 probability.
703
704 :param bool only_search_languages: If ``True``, returns only supported
705 SearXNG search languages. see :py:obj:`searx.languages`
706
707 :rtype: str, None
708 :returns:
709 The detected language code or ``None``. See below.
710
711 :raises ValueError: If ``text`` is not a string.
712
713 The language detection is done by using `a fork`_ of the fastText_ library
714 (`python fasttext`_). fastText_ distributes the `language identification
715 model`_, for reference:
716
717 - `FastText.zip: Compressing text classification models`_
718 - `Bag of Tricks for Efficient Text Classification`_
719
720 The `language identification model`_ supports the language codes
721 (ISO-639-3)::
722
723 af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
724 bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
725 et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
726 id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
727 lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
728 nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
729 rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
730 tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
731
732 By using ``only_search_languages=True`` the `language identification model`_
733 is harmonized with SearXNG's language (locale) model. General
734 conditions of SearXNG's locale model are:
735
736 a. SearXNG's locale of a query is passed to the
737 :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
738 code that is used by an engine.
739
740 b. Most of SearXNG's engines do not support all the languages from `language
741 identification model`_ and there is also a discrepancy in the ISO-639-3
742 (fasttext) and ISO-639-2 (SearXNG) handling. Furthermore, in SearXNG the
743 locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
744 (``zh_Hans``) while the `language identification model`_ reduce both to
745 ``zh``.
746
747 .. _a fork: https://github.com/searxng/fasttext-predict
748 .. _fastText: https://fasttext.cc/
749 .. _python fasttext: https://pypi.org/project/fasttext/
750 .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
751 .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
752 .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
753
754 """
755 if not isinstance(text, str):
756 raise ValueError('text must a str')
757 r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
758 if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
759 language = r[0][0].split('__label__')[1]
760 if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
761 return None
762 return language
763 return None
764
765

References _get_fasttext_model().

+ Here is the call graph for this function:

◆ dict_subset()

Dict searx.utils.dict_subset ( MutableMapping dictionary,
Set[str] properties )
Extract a subset of a dict

Examples:
    >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
    {'A': 'a', 'C': 'c'}
    >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
    {'A': 'a'}

Definition at line 313 of file utils.py.

313def dict_subset(dictionary: MutableMapping, properties: Set[str]) -> Dict:
314 """Extract a subset of a dict
315
316 Examples:
317 >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
318 {'A': 'a', 'C': 'c'}
319 >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
320 {'A': 'a'}
321 """
322 return {k: dictionary[k] for k in properties if k in dictionary}
323
324

◆ ecma_unescape()

str searx.utils.ecma_unescape ( str string)
Python implementation of the unescape javascript function

https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape

Examples:
    >>> ecma_unescape('%u5409')
    '吉'
    >>> ecma_unescape('%20')
    ' '
    >>> ecma_unescape('%F3')
    'ó'

Definition at line 452 of file utils.py.

452def ecma_unescape(string: str) -> str:
453 """Python implementation of the unescape javascript function
454
455 https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
456 https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
457
458 Examples:
459 >>> ecma_unescape('%u5409')
460 '吉'
461 >>> ecma_unescape('%20')
462 ' '
463 >>> ecma_unescape('%F3')
464 'ó'
465 """
466 # "%u5409" becomes "吉"
467 string = _ECMA_UNESCAPE4_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
468 # "%20" becomes " ", "%F3" becomes "ó"
469 string = _ECMA_UNESCAPE2_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
470 return string
471
472

◆ eval_xpath()

searx.utils.eval_xpath ( ElementBase element,
XPathSpecType xpath_spec )
Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
See https://lxml.de/xpathxslt.html#xpath-return-values

Args:
    * element (ElementBase): [description]
    * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

Returns:
    * result (bool, float, list, str): Results.

Raises:
    * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
    * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
    * SearxEngineXPathException: Raise when the XPath can't be evaluated.

Definition at line 545 of file utils.py.

545def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType):
546 """Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
547 See https://lxml.de/xpathxslt.html#xpath-return-values
548
549 Args:
550 * element (ElementBase): [description]
551 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
552
553 Returns:
554 * result (bool, float, list, str): Results.
555
556 Raises:
557 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
558 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
559 * SearxEngineXPathException: Raise when the XPath can't be evaluated.
560 """
561 xpath = get_xpath(xpath_spec)
562 try:
563 return xpath(element)
564 except XPathError as e:
565 arg = ' '.join([str(i) for i in e.args])
566 raise SearxEngineXPathException(xpath_spec, arg) from e
567
568

References get_xpath().

Referenced by eval_xpath_list().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ eval_xpath_getindex()

searx.utils.eval_xpath_getindex ( ElementBase elements,
XPathSpecType xpath_spec,
int index,
default = _NOTSET )
Call eval_xpath_list then get one element using the index parameter.
If the index does not exist, either raise an exception if default is not set,
otherwise return the default value (can be None).

Args:
    * elements (ElementBase): lxml element to apply the xpath.
    * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
    * index (int): index to get
    * default (Object, optional): Defaults if index doesn't exist.

Raises:
    * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
    * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
    * SearxEngineXPathException: if the index is not found. Also see eval_xpath.

Returns:
    * result (bool, float, list, str): Results.

Definition at line 593 of file utils.py.

593def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index: int, default=_NOTSET):
594 """Call eval_xpath_list then get one element using the index parameter.
595 If the index does not exist, either raise an exception if default is not set,
596 otherwise return the default value (can be None).
597
598 Args:
599 * elements (ElementBase): lxml element to apply the xpath.
600 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
601 * index (int): index to get
602 * default (Object, optional): Defaults if index doesn't exist.
603
604 Raises:
605 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
606 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
607 * SearxEngineXPathException: if the index is not found. Also see eval_xpath.
608
609 Returns:
610 * result (bool, float, list, str): Results.
611 """
612 result = eval_xpath_list(elements, xpath_spec)
613 if -len(result) <= index < len(result):
614 return result[index]
615 if default == _NOTSET:
616 # raise an SearxEngineXPathException instead of IndexError
617 # to record xpath_spec
618 raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
619 return default
620
621

References eval_xpath_list().

+ Here is the call graph for this function:

◆ eval_xpath_list()

searx.utils.eval_xpath_list ( ElementBase element,
XPathSpecType xpath_spec,
Optional[int] min_len = None )
Same as eval_xpath, check if the result is a list

Args:
    * element (ElementBase): [description]
    * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
    * min_len (int, optional): [description]. Defaults to None.

Raises:
    * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
    * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
    * SearxEngineXPathException: raise if the result is not a list

Returns:
    * result (bool, float, list, str): Results.

Definition at line 569 of file utils.py.

569def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: Optional[int] = None):
570 """Same as eval_xpath, check if the result is a list
571
572 Args:
573 * element (ElementBase): [description]
574 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
575 * min_len (int, optional): [description]. Defaults to None.
576
577 Raises:
578 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
579 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
580 * SearxEngineXPathException: raise if the result is not a list
581
582 Returns:
583 * result (bool, float, list, str): Results.
584 """
585 result = eval_xpath(element, xpath_spec)
586 if not isinstance(result, list):
587 raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
588 if min_len is not None and min_len > len(result):
589 raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
590 return result
591
592

References eval_xpath().

Referenced by eval_xpath_getindex().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ extr()

searx.utils.extr ( str txt,
str begin,
str end,
str default = "" )
Extract the string between ``begin`` and ``end`` from ``txt``

:param txt:     String to search in
:param begin:   First string to be searched for
:param end:     Second string to be searched for after ``begin``
:param default: Default value if one of ``begin`` or ``end`` is not
                found.  Defaults to an empty string.
:return: The string between the two search-strings ``begin`` and ``end``.
         If at least one of ``begin`` or ``end`` is not found, the value of
         ``default`` is returned.

Examples:
  >>> extr("abcde", "a", "e")
  "bcd"
  >>> extr("abcde", "a", "z", default="nothing")
  "nothing"

Definition at line 356 of file utils.py.

356def extr(txt: str, begin: str, end: str, default: str = ""):
357 """Extract the string between ``begin`` and ``end`` from ``txt``
358
359 :param txt: String to search in
360 :param begin: First string to be searched for
361 :param end: Second string to be searched for after ``begin``
362 :param default: Default value if one of ``begin`` or ``end`` is not
363 found. Defaults to an empty string.
364 :return: The string between the two search-strings ``begin`` and ``end``.
365 If at least one of ``begin`` or ``end`` is not found, the value of
366 ``default`` is returned.
367
368 Examples:
369 >>> extr("abcde", "a", "e")
370 "bcd"
371 >>> extr("abcde", "a", "z", default="nothing")
372 "nothing"
373
374 """
375
376 # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129
377
378 try:
379 first = txt.index(begin) + len(begin)
380 return txt[first : txt.index(end, first)]
381 except ValueError:
382 return default
383
384

◆ extract_text()

Optional[str] searx.utils.extract_text ( xpath_results,
bool allow_none = False )
Extract text from a lxml result

* if xpath_results is list, extract the text from each result and concat the list
* if xpath_results is a xml element, extract all the text node from it
  ( text_content() method from lxml )
* if xpath_results is a string element, then it's already done

Definition at line 194 of file utils.py.

194def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
195 """Extract text from a lxml result
196
197 * if xpath_results is list, extract the text from each result and concat the list
198 * if xpath_results is a xml element, extract all the text node from it
199 ( text_content() method from lxml )
200 * if xpath_results is a string element, then it's already done
201 """
202 if isinstance(xpath_results, list):
203 # it's list of result : concat everything using recursive call
204 result = ''
205 for e in xpath_results:
206 result = result + (extract_text(e) or '')
207 return result.strip()
208 if isinstance(xpath_results, ElementBase):
209 # it's a element
210 text: str = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
211 text = text.strip().replace('\n', ' ')
212 return ' '.join(text.split())
213 if isinstance(xpath_results, (str, Number, bool)):
214 return str(xpath_results)
215 if xpath_results is None and allow_none:
216 return None
217 if xpath_results is None and not allow_none:
218 raise ValueError('extract_text(None, allow_none=False)')
219 raise ValueError('unsupported type')
220
221

References extract_text().

Referenced by extract_text(), and extract_url().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ extract_url()

str searx.utils.extract_url ( xpath_results,
base_url )
Extract and normalize URL from lxml Element

Args:
    * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
    * base_url (str): Base URL

Example:
    >>> def f(s, search_url):
    >>>    return searx.utils.extract_url(html.fromstring(s), search_url)
    >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
    'https://example.com/'
    >>> f('https://example.com', 'http://example.com/')
    'https://example.com/'
    >>> f('//example.com', 'http://example.com/')
    'http://example.com/'
    >>> f('//example.com', 'https://example.com/')
    'https://example.com/'
    >>> f('/path?a=1', 'https://example.com')
    'https://example.com/path?a=1'
    >>> f('', 'https://example.com')
    raise lxml.etree.ParserError
    >>> searx.utils.extract_url([], 'https://example.com')
    raise ValueError

Raises:
    * ValueError
    * lxml.etree.ParserError

Returns:
    * str: normalized URL

Definition at line 272 of file utils.py.

272def extract_url(xpath_results, base_url) -> str:
273 """Extract and normalize URL from lxml Element
274
275 Args:
276 * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
277 * base_url (str): Base URL
278
279 Example:
280 >>> def f(s, search_url):
281 >>> return searx.utils.extract_url(html.fromstring(s), search_url)
282 >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
283 'https://example.com/'
284 >>> f('https://example.com', 'http://example.com/')
285 'https://example.com/'
286 >>> f('//example.com', 'http://example.com/')
287 'http://example.com/'
288 >>> f('//example.com', 'https://example.com/')
289 'https://example.com/'
290 >>> f('/path?a=1', 'https://example.com')
291 'https://example.com/path?a=1'
292 >>> f('', 'https://example.com')
293 raise lxml.etree.ParserError
294 >>> searx.utils.extract_url([], 'https://example.com')
295 raise ValueError
296
297 Raises:
298 * ValueError
299 * lxml.etree.ParserError
300
301 Returns:
302 * str: normalized URL
303 """
304 if xpath_results == []:
305 raise ValueError('Empty url resultset')
306
307 url = extract_text(xpath_results)
308 if url:
309 return normalize_url(url, base_url)
310 raise ValueError('URL not found')
311
312

References extract_text(), and normalize_url().

+ Here is the call graph for this function:

◆ gen_useragent()

str searx.utils.gen_useragent ( Optional[str] os_string = None)
Return a random browser User Agent

See searx/data/useragents.json

Definition at line 72 of file utils.py.

72def gen_useragent(os_string: Optional[str] = None) -> str:
73 """Return a random browser User Agent
74
75 See searx/data/useragents.json
76 """
77 return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
78
79

◆ get_embeded_stream_url()

searx.utils.get_embeded_stream_url ( url)
Converts a standard video URL into its embed format. Supported services include YouTube,
Facebook, Instagram, TikTok, Dailymotion, and Bilibili.

Definition at line 633 of file utils.py.

633def get_embeded_stream_url(url):
634 """
635 Converts a standard video URL into its embed format. Supported services include YouTube,
636 Facebook, Instagram, TikTok, Dailymotion, and Bilibili.
637 """
638 parsed_url = urlparse(url)
639 iframe_src = None
640
641 # YouTube
642 if parsed_url.netloc in ['www.youtube.com', 'youtube.com'] and parsed_url.path == '/watch' and parsed_url.query:
643 video_id = parse_qs(parsed_url.query).get('v', [])
644 if video_id:
645 iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]
646
647 # Facebook
648 elif parsed_url.netloc in ['www.facebook.com', 'facebook.com']:
649 encoded_href = urlencode({'href': url})
650 iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href
651
652 # Instagram
653 elif parsed_url.netloc in ['www.instagram.com', 'instagram.com'] and parsed_url.path.startswith('/p/'):
654 if parsed_url.path.endswith('/'):
655 iframe_src = url + 'embed'
656 else:
657 iframe_src = url + '/embed'
658
659 # TikTok
660 elif (
661 parsed_url.netloc in ['www.tiktok.com', 'tiktok.com']
662 and parsed_url.path.startswith('/@')
663 and '/video/' in parsed_url.path
664 ):
665 path_parts = parsed_url.path.split('/video/')
666 video_id = path_parts[1]
667 iframe_src = 'https://www.tiktok.com/embed/' + video_id
668
669 # Dailymotion
670 elif parsed_url.netloc in ['www.dailymotion.com', 'dailymotion.com'] and parsed_url.path.startswith('/video/'):
671 path_parts = parsed_url.path.split('/')
672 if len(path_parts) == 3:
673 video_id = path_parts[2]
674 iframe_src = 'https://www.dailymotion.com/embed/video/' + video_id
675
676 # Bilibili
677 elif parsed_url.netloc in ['www.bilibili.com', 'bilibili.com'] and parsed_url.path.startswith('/video/'):
678 path_parts = parsed_url.path.split('/')
679
680 video_id = path_parts[2]
681 param_key = None
682 if video_id.startswith('av'):
683 video_id = video_id[2:]
684 param_key = 'aid'
685 elif video_id.startswith('BV'):
686 param_key = 'bvid'
687
688 iframe_src = (
689 f'https://player.bilibili.com/player.html?{param_key}={video_id}&high_quality=1&autoplay=false&danmaku=0'
690 )
691
692 return iframe_src
693
694

◆ get_engine_from_settings()

Dict searx.utils.get_engine_from_settings ( str name)
Return engine configuration from settings.yml of a given engine name

Definition at line 498 of file utils.py.

498def get_engine_from_settings(name: str) -> Dict:
499 """Return engine configuration from settings.yml of a given engine name"""
500
501 if 'engines' not in settings:
502 return {}
503
504 for engine in settings['engines']:
505 if 'name' not in engine:
506 continue
507 if name == engine['name']:
508 return engine
509
510 return {}
511
512

◆ get_string_replaces_function()

Callable[[str], str] searx.utils.get_string_replaces_function ( Dict[str, str] replaces)

Definition at line 488 of file utils.py.

488def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
489 rep = {re.escape(k): v for k, v in replaces.items()}
490 pattern = re.compile("|".join(rep.keys()))
491
492 def func(text):
493 return pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
494
495 return func
496
497

◆ get_xpath()

XPath searx.utils.get_xpath ( XPathSpecType xpath_spec)
Return cached compiled XPath

There is no thread lock.
Worst case scenario, xpath_str is compiled more than one time.

Args:
    * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

Returns:
    * result (bool, float, list, str): Results.

Raises:
    * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
    * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath

Definition at line 513 of file utils.py.

513def get_xpath(xpath_spec: XPathSpecType) -> XPath:
514 """Return cached compiled XPath
515
516 There is no thread lock.
517 Worst case scenario, xpath_str is compiled more than one time.
518
519 Args:
520 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
521
522 Returns:
523 * result (bool, float, list, str): Results.
524
525 Raises:
526 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
527 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
528 """
529 if isinstance(xpath_spec, str):
530 result = _XPATH_CACHE.get(xpath_spec, None)
531 if result is None:
532 try:
533 result = XPath(xpath_spec)
534 except XPathSyntaxError as e:
535 raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
536 _XPATH_CACHE[xpath_spec] = result
537 return result
538
539 if isinstance(xpath_spec, XPath):
540 return xpath_spec
541
542 raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')
543
544

Referenced by eval_xpath().

+ Here is the caller graph for this function:

◆ html_to_text()

str searx.utils.html_to_text ( str html_str)
Extract text from a HTML string

Args:
    * html_str (str): string HTML

Returns:
    * str: extracted text

Examples:
    >>> html_to_text('Example <span id="42">#2</span>')
    'Example #2'

    >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
    'Example'

    >>> html_to_text(r'regexp: (?<![a-zA-Z]')
    'regexp: (?<![a-zA-Z]'

Definition at line 139 of file utils.py.

139def html_to_text(html_str: str) -> str:
140 """Extract text from a HTML string
141
142 Args:
143 * html_str (str): string HTML
144
145 Returns:
146 * str: extracted text
147
148 Examples:
149 >>> html_to_text('Example <span id="42">#2</span>')
150 'Example #2'
151
152 >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
153 'Example'
154
155 >>> html_to_text(r'regexp: (?<![a-zA-Z]')
156 'regexp: (?<![a-zA-Z]'
157 """
158 html_str = html_str.replace('\n', ' ').replace('\r', ' ')
159 html_str = ' '.join(html_str.split())
160 s = _HTMLTextExtractor()
161 try:
162 s.feed(html_str)
163 except AssertionError:
164 s = _HTMLTextExtractor()
165 s.feed(escape(html_str, quote=True))
166 except _HTMLTextExtractorException:
167 logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
168 return s.get_text()
169
170

Referenced by markdown_to_text().

+ Here is the caller graph for this function:

◆ humanize_bytes()

searx.utils.humanize_bytes ( size,
precision = 2 )
Determine the *human readable* value of bytes on 1024 base (1KB=1024B).

Definition at line 325 of file utils.py.

325def humanize_bytes(size, precision=2):
326 """Determine the *human readable* value of bytes on 1024 base (1KB=1024B)."""
327 s = ['B ', 'KB', 'MB', 'GB', 'TB']
328
329 x = len(s)
330 p = 0
331 while size > 1024 and p < x:
332 p += 1
333 size = size / 1024.0
334 return "%.*f %s" % (precision, size, s[p])
335
336

◆ humanize_number()

searx.utils.humanize_number ( size,
precision = 0 )
Determine the *human readable* value of a decimal number.

Definition at line 337 of file utils.py.

337def humanize_number(size, precision=0):
338 """Determine the *human readable* value of a decimal number."""
339 s = ['', 'K', 'M', 'B', 'T']
340
341 x = len(s)
342 p = 0
343 while size > 1000 and p < x:
344 p += 1
345 size = size / 1000.0
346 return "%.*f%s" % (precision, size, s[p])
347
348

◆ int_or_zero()

int searx.utils.int_or_zero ( Union[List[str], str] num)
Convert num to int or 0. num can be either a str or a list.
If num is a list, the first element is converted to int (or return 0 if the list is empty).
If num is a str, see convert_str_to_int

Definition at line 385 of file utils.py.

def int_or_zero(num: Union[List[str], str]) -> int:
    """Convert num to int or 0. num can be either a str or a list.
    If num is a list, the first element is converted to int (or return 0 if the list is empty).
    If num is a str, see convert_str_to_int
    """
    if not isinstance(num, list):
        return convert_str_to_int(num)
    # list input: use the first element, empty lists map to 0
    return convert_str_to_int(num[0]) if num else 0

References convert_str_to_int().

+ Here is the call graph for this function:

◆ is_valid_lang()

Optional[Tuple[bool, str, str]] searx.utils.is_valid_lang ( lang)
Return language code and name if lang describe a language.

Examples:
    >>> is_valid_lang('zz')
    None
    >>> is_valid_lang('uk')
    (True, 'uk', 'ukrainian')
    >>> is_valid_lang(b'uk')
    (True, 'uk', 'ukrainian')
    >>> is_valid_lang('en')
    (True, 'en', 'english')
    >>> searx.utils.is_valid_lang('Español')
    (True, 'es', 'spanish')
    >>> searx.utils.is_valid_lang('Spanish')
    (True, 'es', 'spanish')

Definition at line 397 of file utils.py.

def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
    """Return language code and name if lang describe a language.

    Examples:
        >>> is_valid_lang('zz')
        None
        >>> is_valid_lang('uk')
        (True, 'uk', 'ukrainian')
        >>> is_valid_lang(b'uk')
        (True, 'uk', 'ukrainian')
        >>> is_valid_lang('en')
        (True, 'en', 'english')
        >>> searx.utils.is_valid_lang('Español')
        (True, 'es', 'spanish')
        >>> searx.utils.is_valid_lang('Spanish')
        (True, 'es', 'spanish')
    """
    if isinstance(lang, bytes):
        lang = lang.decode()
    # two characters -> treat the input as an ISO-639 style abbreviation
    is_abbreviation = len(lang) == 2
    lang = lang.lower()
    for locale in sxng_locales:
        code = locale[0][:2]
        if is_abbreviation:
            if code == lang:
                return (True, code, locale[3].lower())
        elif lang in (locale[1].lower(), locale[3].lower()):
            # match against the native name or the English name
            return (True, code, locale[3].lower())
    return None

◆ js_variable_to_python()

searx.utils.js_variable_to_python ( js_variable)
Convert a javascript variable into JSON and then load the value

It does not deal with all cases, but it is good enough for now.
chompjs has a better implementation.

Definition at line 766 of file utils.py.

def js_variable_to_python(js_variable):
    """Convert a javascript variable into JSON and then load the value

    It does not deal with all cases, but it is good enough for now.
    chompjs has a better implementation.
    """
    # when in_string is not None, it contains the character that has opened the string
    # either simple quote or double quote
    in_string = None
    # cut the string:
    # r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = re.split(r'(["\'])', js_variable)
    # previous part (to check the escape character, a backslash)
    previous_p = ""
    for i, p in enumerate(parts):
        # parse characters inside a ECMA string
        if in_string:
            # we are in a JS string: replace the colon by a temporary character
            # so quote_keys_regex doesn't have to deal with colon inside the JS strings
            parts[i] = parts[i].replace(':', chr(1))
            if in_string == "'":
                # the JS string is delimited by simple quote.
                # This is not supported by JSON.
                # simple quote delimited string are converted to double quote delimited string
                # here, inside a JS string, we escape the double quote
                parts[i] = parts[i].replace('"', r'\"')

        # deal with delimiters and escape character
        if not in_string and p in ('"', "'"):
            # we are not in string
            # but p is double or simple quote
            # that's the start of a new string
            # replace simple quote by double quote
            # (JSON doesn't support simple quote)
            parts[i] = '"'
            in_string = p
            continue
        if p == in_string:
            # we are in a string and the current part MAY close the string
            if len(previous_p) > 0 and previous_p[-1] == '\\':
                # there is a backslash just before: the ECMA string continues
                continue
            # the current p close the string
            # replace simple quote by double quote
            parts[i] = '"'
            in_string = None

        if not in_string:
            # replace void 0 by null
            # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
            # we are sure there is no string in p
            parts[i] = _JS_VOID_RE.sub("null", p)
        # update previous_p
        previous_p = p
    # join the string
    s = ''.join(parts)
    # add quote around the key
    # { a: 12 }
    # becomes
    # { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # turn JS decimals like `:.5` into JSON-valid `:0.`-prefixed form
    s = _JS_DECIMAL_RE.sub(":0.", s)
    # replace the placeholder character (chr(1), set above) back to a colon
    s = s.replace(chr(1), ':')
    # load the JSON and return the result
    return json.loads(s)

◆ load_module()

types.ModuleType searx.utils.load_module ( str filename,
str module_dir )

Definition at line 429 of file utils.py.

def load_module(filename: str, module_dir: str) -> types.ModuleType:
    """Load the Python file *filename* found in *module_dir* and return it as a module.

    Raises ``ValueError`` when no import spec or loader can be built for the file.
    """
    module_name = splitext(filename)[0]
    module_path = join(module_dir, filename)
    # see https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Error loading '{module_path}' module")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

◆ markdown_to_text()

str searx.utils.markdown_to_text ( str markdown_str)
Extract text from a Markdown string

Args:
    * markdown_str (str): string Markdown

Returns:
    * str: extracted text

Examples:
    >>> markdown_to_text('[example](https://example.com)')
    'example'

    >>> markdown_to_text('## Headline')
    'Headline'

Definition at line 171 of file utils.py.

def markdown_to_text(markdown_str: str) -> str:
    """Extract text from a Markdown string

    Args:
        * markdown_str (str): string Markdown

    Returns:
        * str: extracted text

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """
    # render the Markdown to HTML first, then strip the markup
    parser = MarkdownIt("commonmark", {"typographer": True})
    parser = parser.enable(["replacements", "smartquotes"])
    return html_to_text(parser.render(markdown_str))

References html_to_text().

+ Here is the call graph for this function:

◆ normalize_url()

str searx.utils.normalize_url ( str url,
str base_url )
Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path

Args:
    * url (str): Relative URL
    * base_url (str): Base URL, it must be an absolute URL.

Example:
    >>> normalize_url('https://example.com', 'http://example.com/')
    'https://example.com/'
    >>> normalize_url('//example.com', 'http://example.com/')
    'http://example.com/'
    >>> normalize_url('//example.com', 'https://example.com/')
    'https://example.com/'
    >>> normalize_url('/path?a=1', 'https://example.com')
    'https://example.com/path?a=1'
    >>> normalize_url('', 'https://example.com')
    'https://example.com/'
    >>> normalize_url('/test', '/path')
    raise ValueError

Raises:
    * ValueError: when the resulting URL has no network location (the implementation raises ``ValueError('Cannot parse url')``, not ``lxml.etree.ParserError``)

Returns:
    * str: normalized URL

Definition at line 222 of file utils.py.

def normalize_url(url: str, base_url: str) -> str:
    """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path

    Args:
        * url (str): Relative URL
        * base_url (str): Base URL, it must be an absolute URL.

    Example:
        >>> normalize_url('https://example.com', 'http://example.com/')
        'https://example.com/'
        >>> normalize_url('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> normalize_url('//example.com', 'https://example.com/')
        'https://example.com/'
        >>> normalize_url('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> normalize_url('', 'https://example.com')
        'https://example.com/'
        >>> normalize_url('/test', '/path')
        raise ValueError

    Raises:
        * ValueError: when the resulting URL has no network location
          (e.g. *base_url* is not an absolute URL)

    Returns:
        * str: normalized URL
    """
    if url.startswith('//'):
        # protocol-relative URL //example.com/: inherit the scheme of base_url
        # (default to http when base_url has none)
        parsed_search_url = urlparse(base_url)
        url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
    elif url.startswith('/'):
        # fix relative url to the search engine
        url = urljoin(base_url, url)

    # fix relative urls that fall through the crack
    if '://' not in url:
        url = urljoin(base_url, url)

    parsed_url = urlparse(url)

    # a URL without a network location cannot be normalized any further
    if not parsed_url.netloc:
        raise ValueError('Cannot parse url')
    # add a / at the end of the url if there is no path
    if not parsed_url.path:
        url += '/'

    return url

Referenced by extract_url().

+ Here is the caller graph for this function:

◆ remove_pua_from_str()

searx.utils.remove_pua_from_str ( string)
Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.

.. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas

Definition at line 473 of file utils.py.

def remove_pua_from_str(string):
    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.

    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
    """
    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))

    def _is_pua(ch):
        # is the code point inside any of the three PUA blocks?
        return any(lo <= ord(ch) <= hi for lo, hi in pua_ranges)

    return "".join(ch for ch in string if not _is_pua(ch))

◆ searx_useragent()

str searx.utils.searx_useragent ( )
Return the searx User Agent

Definition at line 65 of file utils.py.

def searx_useragent() -> str:
    """Return the searx User Agent"""
    # strip() removes the trailing space when the configured suffix is empty
    suffix = settings['outgoing']['useragent_suffix']
    return f'searx/{VERSION_TAG} {suffix}'.strip()

◆ to_string()

str searx.utils.to_string ( Any obj)
Convert obj to its string representation.

Definition at line 443 of file utils.py.

def to_string(obj: Any) -> str:
    """Convert obj to its string representation."""
    if isinstance(obj, str):
        return obj
    # every object inherits __str__ from `object`, so str() is the common
    # path; repr() remains as a defensive fallback
    return str(obj) if hasattr(obj, '__str__') else repr(obj)

Variable Documentation

◆ _BLOCKED_TAGS

tuple searx.utils._BLOCKED_TAGS = ('script', 'style')
protected

Definition at line 38 of file utils.py.

◆ _ECMA_UNESCAPE2_RE

searx.utils._ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
protected

Definition at line 41 of file utils.py.

◆ _ECMA_UNESCAPE4_RE

searx.utils._ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
protected

Definition at line 40 of file utils.py.

◆ _FASTTEXT_MODEL

Optional searx.utils._FASTTEXT_MODEL = None
protected

Definition at line 50 of file utils.py.

◆ _JS_DECIMAL_RE

searx.utils._JS_DECIMAL_RE = re.compile(r":\s*\.")
protected

Definition at line 45 of file utils.py.

◆ _JS_QUOTE_KEYS_RE

searx.utils._JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
protected

Definition at line 43 of file utils.py.

◆ _JS_VOID_RE

searx.utils._JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\‍([0-9]+\‍)')
protected

Definition at line 44 of file utils.py.

◆ _LANG_TO_LC_CACHE

dict searx.utils._LANG_TO_LC_CACHE = {}
protected

Definition at line 48 of file utils.py.

◆ _NOTSET

searx.utils._NOTSET = _NotSetClass()
protected

Definition at line 62 of file utils.py.

◆ _XPATH_CACHE

dict searx.utils._XPATH_CACHE = {}
protected

Definition at line 47 of file utils.py.

◆ logger

searx.utils.logger = logger.getChild('utils')

Definition at line 34 of file utils.py.

◆ SEARCH_LANGUAGE_CODES

searx.utils.SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])

Definition at line 53 of file utils.py.

◆ XPathSpecType

searx.utils.XPathSpecType = Union[str, XPath]

Definition at line 36 of file utils.py.