.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
searx.utils Namespace Reference

Classes

class  _HTMLTextExtractor
 
class  _HTMLTextExtractorException
 
class  _NotSetClass
 

Functions

str searx_useragent ()
 
str gen_useragent (Optional[str] os_string=None)
 
str html_to_text (str html_str)
 
str markdown_to_text (str markdown_str)
 
Optional[str] extract_text (xpath_results, bool allow_none=False)
 
str normalize_url (str url, str base_url)
 
str extract_url (xpath_results, base_url)
 
Dict dict_subset (MutableMapping dictionary, Set[str] properties)
 
 humanize_bytes (size, precision=2)
 
 humanize_number (size, precision=0)
 
int convert_str_to_int (str number_str)
 
 extr (str txt, str begin, str end, str default="")
 
int int_or_zero (Union[List[str], str] num)
 
Optional[Tuple[bool, str, str]] is_valid_lang (lang)
 
types.ModuleType load_module (str filename, str module_dir)
 
str to_string (Any obj)
 
str ecma_unescape (str string)
 
 remove_pua_from_str (string)
 
Callable[[str], str] get_string_replaces_function (Dict[str, str] replaces)
 
Dict get_engine_from_settings (str name)
 
XPath get_xpath (XPathSpecType xpath_spec)
 
 eval_xpath (ElementBase element, XPathSpecType xpath_spec)
 
 eval_xpath_list (ElementBase element, XPathSpecType xpath_spec, Optional[int] min_len=None)
 
 eval_xpath_getindex (ElementBase elements, XPathSpecType xpath_spec, int index, default=_NOTSET)
 
"fasttext.FastText._FastText" _get_fasttext_model ()
 
 get_embeded_stream_url (url)
 
Optional[str] detect_language (str text, float threshold=0.3, bool only_search_languages=False)
 
 js_variable_to_python (js_variable)
 
timedelta|None parse_duration_string (str duration_str)
 

Variables

 logger = logger.getChild('utils')
 
 XPathSpecType = Union[str, XPath]
 
tuple _BLOCKED_TAGS = ('script', 'style')
 
 _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
 
 _ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
 
 _JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
 
 _JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\‍([0-9]+\‍)')
 
 _JS_DECIMAL_RE = re.compile(r":\s*\.")
 
dict _XPATH_CACHE = {}
 
dict _LANG_TO_LC_CACHE = {}
 
Optional _FASTTEXT_MODEL = None
 
 SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
 
 _NOTSET = _NotSetClass()
 

Detailed Description

Utility functions for the engines

Function Documentation

◆ _get_fasttext_model()

"fasttext.FastText._FastText" searx.utils._get_fasttext_model ( )
protected

Definition at line 623 of file utils.py.

623def _get_fasttext_model() -> "fasttext.FastText._FastText": # type: ignore
624 global _FASTTEXT_MODEL # pylint: disable=global-statement
625 if _FASTTEXT_MODEL is None:
626 import fasttext # pylint: disable=import-outside-toplevel
627
628 # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
629 fasttext.FastText.eprint = lambda x: None
630 _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
631 return _FASTTEXT_MODEL
632
633

Referenced by detect_language().

+ Here is the caller graph for this function:

◆ convert_str_to_int()

int searx.utils.convert_str_to_int ( str number_str)
Convert number_str to int or 0 if number_str is not a number.

Definition at line 350 of file utils.py.

350def convert_str_to_int(number_str: str) -> int:
351 """Convert number_str to int or 0 if number_str is not a number."""
352 if number_str.isdigit():
353 return int(number_str)
354 return 0
355
356

Referenced by int_or_zero().

+ Here is the caller graph for this function:

◆ detect_language()

Optional[str] searx.utils.detect_language ( str text,
float threshold = 0.3,
bool only_search_languages = False )
Detect the language of the ``text`` parameter.

:param str text: The string whose language is to be detected.

:param float threshold: Threshold filters the returned labels by a threshold
    on probability.  A choice of 0.3 will return labels with at least 0.3
    probability.

:param bool only_search_languages: If ``True``, returns only supported
    SearXNG search languages.  see :py:obj:`searx.languages`

:rtype: str, None
:returns:
    The detected language code or ``None``. See below.

:raises ValueError: If ``text`` is not a string.

The language detection is done by using `a fork`_ of the fastText_ library
(`python fasttext`_). fastText_ distributes the `language identification
model`_, for reference:

- `FastText.zip: Compressing text classification models`_
- `Bag of Tricks for Efficient Text Classification`_

The `language identification model`_ supports the language codes
(ISO-639-3)::

    af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
    bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
    et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
    id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
    lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
    nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
    rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
    tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh

By using ``only_search_languages=True`` the `language identification model`_
is harmonized with the SearXNG's language (locale) model.  General
conditions of SearXNG's locale model are:

a. SearXNG's locale of a query is passed to the
   :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
   code that is used by an engine.

b. Most of SearXNG's engines do not support all the languages from `language
   identification model`_ and there is also a discrepancy in the ISO-639-3
   (fasttext) and ISO-639-2 (SearXNG) handling.  Furthermore, in SearXNG the
   locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
   (``zh_Hans``) while the `language identification model`_ reduce both to
   ``zh``.

.. _a fork: https://github.com/searxng/fasttext-predict
.. _fastText: https://fasttext.cc/
.. _python fasttext: https://pypi.org/project/fasttext/
.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651

Definition at line 696 of file utils.py.

696def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
697 """Detect the language of the ``text`` parameter.
698
699 :param str text: The string whose language is to be detected.
700
701 :param float threshold: Threshold filters the returned labels by a threshold
702 on probability. A choice of 0.3 will return labels with at least 0.3
703 probability.
704
705 :param bool only_search_languages: If ``True``, returns only supported
706 SearXNG search languages. see :py:obj:`searx.languages`
707
708 :rtype: str, None
709 :returns:
710 The detected language code or ``None``. See below.
711
712 :raises ValueError: If ``text`` is not a string.
713
714 The language detection is done by using `a fork`_ of the fastText_ library
715 (`python fasttext`_). fastText_ distributes the `language identification
716 model`_, for reference:
717
718 - `FastText.zip: Compressing text classification models`_
719 - `Bag of Tricks for Efficient Text Classification`_
720
 721 The `language identification model`_ supports the language codes
722 (ISO-639-3)::
723
724 af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
725 bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
726 et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
727 id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
728 lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
729 nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
730 rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
731 tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
732
733 By using ``only_search_languages=True`` the `language identification model`_
734 is harmonized with the SearXNG's language (locale) model. General
735 conditions of SearXNG's locale model are:
736
737 a. SearXNG's locale of a query is passed to the
738 :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
739 code that is used by an engine.
740
741 b. Most of SearXNG's engines do not support all the languages from `language
742 identification model`_ and there is also a discrepancy in the ISO-639-3
 743 (fasttext) and ISO-639-2 (SearXNG) handling. Furthermore, in SearXNG the
 744 locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
745 (``zh_Hans``) while the `language identification model`_ reduce both to
746 ``zh``.
747
748 .. _a fork: https://github.com/searxng/fasttext-predict
749 .. _fastText: https://fasttext.cc/
750 .. _python fasttext: https://pypi.org/project/fasttext/
751 .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
752 .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
753 .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
754
755 """
756 if not isinstance(text, str):
757 raise ValueError('text must a str')
758 r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
759 if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
760 language = r[0][0].split('__label__')[1]
761 if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
762 return None
763 return language
764 return None
765
766

References _get_fasttext_model().

+ Here is the call graph for this function:

◆ dict_subset()

Dict searx.utils.dict_subset ( MutableMapping dictionary,
Set[str] properties )
Extract a subset of a dict

Examples:
    >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
    {'A': 'a', 'C': 'c'}
    >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
    {'A': 'a'}

Definition at line 314 of file utils.py.

314def dict_subset(dictionary: MutableMapping, properties: Set[str]) -> Dict:
315 """Extract a subset of a dict
316
317 Examples:
318 >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
319 {'A': 'a', 'C': 'c'}
 320 >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
321 {'A': 'a'}
322 """
323 return {k: dictionary[k] for k in properties if k in dictionary}
324
325

◆ ecma_unescape()

str searx.utils.ecma_unescape ( str string)
Python implementation of the unescape javascript function

https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape

Examples:
    >>> ecma_unescape('%u5409')
    '吉'
    >>> ecma_unescape('%20')
    ' '
    >>> ecma_unescape('%F3')
    'ó'

Definition at line 453 of file utils.py.

453def ecma_unescape(string: str) -> str:
454 """Python implementation of the unescape javascript function
455
456 https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
457 https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
458
459 Examples:
460 >>> ecma_unescape('%u5409')
461 '吉'
462 >>> ecma_unescape('%20')
463 ' '
464 >>> ecma_unescape('%F3')
465 'ó'
466 """
467 # "%u5409" becomes "吉"
468 string = _ECMA_UNESCAPE4_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
469 # "%20" becomes " ", "%F3" becomes "ó"
470 string = _ECMA_UNESCAPE2_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
471 return string
472
473

◆ eval_xpath()

searx.utils.eval_xpath ( ElementBase element,
XPathSpecType xpath_spec )
Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
See https://lxml.de/xpathxslt.html#xpath-return-values

Args:
    * element (ElementBase): [description]
    * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

Returns:
    * result (bool, float, list, str): Results.

Raises:
    * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
    * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
    * SearxEngineXPathException: Raise when the XPath can't be evaluated.

Definition at line 546 of file utils.py.

546def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType):
547 """Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
548 See https://lxml.de/xpathxslt.html#xpath-return-values
549
550 Args:
551 * element (ElementBase): [description]
552 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
553
554 Returns:
555 * result (bool, float, list, str): Results.
556
557 Raises:
558 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
559 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
560 * SearxEngineXPathException: Raise when the XPath can't be evaluated.
561 """
562 xpath = get_xpath(xpath_spec)
563 try:
564 return xpath(element)
565 except XPathError as e:
566 arg = ' '.join([str(i) for i in e.args])
567 raise SearxEngineXPathException(xpath_spec, arg) from e
568
569

References get_xpath().

Referenced by eval_xpath_list().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ eval_xpath_getindex()

searx.utils.eval_xpath_getindex ( ElementBase elements,
XPathSpecType xpath_spec,
int index,
default = _NOTSET )
Call eval_xpath_list then get one element using the index parameter.
If the index does not exist, either raise an exception if default is not set,
otherwise return the default value (can be None).

Args:
    * elements (ElementBase): lxml element to apply the xpath.
    * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
    * index (int): index to get
    * default (Object, optional): Defaults if index doesn't exist.

Raises:
    * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
    * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
    * SearxEngineXPathException: if the index is not found. Also see eval_xpath.

Returns:
    * result (bool, float, list, str): Results.

Definition at line 594 of file utils.py.

594def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index: int, default=_NOTSET):
595 """Call eval_xpath_list then get one element using the index parameter.
 596 If the index does not exist, either raise an exception if default is not set,
 597 otherwise return the default value (can be None).
598
599 Args:
600 * elements (ElementBase): lxml element to apply the xpath.
601 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
602 * index (int): index to get
603 * default (Object, optional): Defaults if index doesn't exist.
604
605 Raises:
606 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
607 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
608 * SearxEngineXPathException: if the index is not found. Also see eval_xpath.
609
610 Returns:
611 * result (bool, float, list, str): Results.
612 """
613 result = eval_xpath_list(elements, xpath_spec)
614 if -len(result) <= index < len(result):
615 return result[index]
616 if default == _NOTSET:
617 # raise an SearxEngineXPathException instead of IndexError
618 # to record xpath_spec
619 raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
620 return default
621
622

References eval_xpath_list().

+ Here is the call graph for this function:

◆ eval_xpath_list()

searx.utils.eval_xpath_list ( ElementBase element,
XPathSpecType xpath_spec,
Optional[int] min_len = None )
Same as eval_xpath, check if the result is a list

Args:
    * element (ElementBase): [description]
    * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
    * min_len (int, optional): [description]. Defaults to None.

Raises:
    * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
    * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
    * SearxEngineXPathException: raise if the result is not a list

Returns:
    * result (bool, float, list, str): Results.

Definition at line 570 of file utils.py.

570def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: Optional[int] = None):
571 """Same as eval_xpath, check if the result is a list
572
573 Args:
574 * element (ElementBase): [description]
575 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
576 * min_len (int, optional): [description]. Defaults to None.
577
578 Raises:
579 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
580 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
581 * SearxEngineXPathException: raise if the result is not a list
582
583 Returns:
584 * result (bool, float, list, str): Results.
585 """
586 result = eval_xpath(element, xpath_spec)
587 if not isinstance(result, list):
588 raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
589 if min_len is not None and min_len > len(result):
590 raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
591 return result
592
593

References eval_xpath().

Referenced by eval_xpath_getindex().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ extr()

searx.utils.extr ( str txt,
str begin,
str end,
str default = "" )
Extract the string between ``begin`` and ``end`` from ``txt``

:param txt:     String to search in
:param begin:   First string to be searched for
:param end:     Second string to be searched for after ``begin``
:param default: Default value if one of ``begin`` or ``end`` is not
                found.  Defaults to an empty string.
:return: The string between the two search-strings ``begin`` and ``end``.
         If at least one of ``begin`` or ``end`` is not found, the value of
         ``default`` is returned.

Examples:
  >>> extr("abcde", "a", "e")
  "bcd"
  >>> extr("abcde", "a", "z", default="nothing")
  "nothing"

Definition at line 357 of file utils.py.

357def extr(txt: str, begin: str, end: str, default: str = ""):
358 """Extract the string between ``begin`` and ``end`` from ``txt``
359
360 :param txt: String to search in
361 :param begin: First string to be searched for
362 :param end: Second string to be searched for after ``begin``
363 :param default: Default value if one of ``begin`` or ``end`` is not
364 found. Defaults to an empty string.
365 :return: The string between the two search-strings ``begin`` and ``end``.
366 If at least one of ``begin`` or ``end`` is not found, the value of
367 ``default`` is returned.
368
369 Examples:
370 >>> extr("abcde", "a", "e")
371 "bcd"
 372 >>> extr("abcde", "a", "z", default="nothing")
373 "nothing"
374
375 """
376
377 # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129
378
379 try:
380 first = txt.index(begin) + len(begin)
381 return txt[first : txt.index(end, first)]
382 except ValueError:
383 return default
384
385

◆ extract_text()

Optional[str] searx.utils.extract_text ( xpath_results,
bool allow_none = False )
Extract text from a lxml result

* if xpath_results is list, extract the text from each result and concat the list
* if xpath_results is a xml element, extract all the text node from it
  ( text_content() method from lxml )
* if xpath_results is a string element, then it's already done

Definition at line 195 of file utils.py.

195def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
196 """Extract text from a lxml result
197
198 * if xpath_results is list, extract the text from each result and concat the list
199 * if xpath_results is a xml element, extract all the text node from it
200 ( text_content() method from lxml )
201 * if xpath_results is a string element, then it's already done
202 """
203 if isinstance(xpath_results, list):
204 # it's list of result : concat everything using recursive call
205 result = ''
206 for e in xpath_results:
207 result = result + (extract_text(e) or '')
208 return result.strip()
209 if isinstance(xpath_results, ElementBase):
210 # it's a element
211 text: str = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
212 text = text.strip().replace('\n', ' ')
213 return ' '.join(text.split())
214 if isinstance(xpath_results, (str, Number, bool)):
215 return str(xpath_results)
216 if xpath_results is None and allow_none:
217 return None
218 if xpath_results is None and not allow_none:
219 raise ValueError('extract_text(None, allow_none=False)')
220 raise ValueError('unsupported type')
221
222

References extract_text().

Referenced by extract_text(), and extract_url().

+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ extract_url()

str searx.utils.extract_url ( xpath_results,
base_url )
Extract and normalize URL from lxml Element

Args:
    * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
    * base_url (str): Base URL

Example:
    >>> def f(s, search_url):
    >>>    return searx.utils.extract_url(html.fromstring(s), search_url)
    >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
    'https://example.com/'
    >>> f('https://example.com', 'http://example.com/')
    'https://example.com/'
    >>> f('//example.com', 'http://example.com/')
    'http://example.com/'
    >>> f('//example.com', 'https://example.com/')
    'https://example.com/'
    >>> f('/path?a=1', 'https://example.com')
    'https://example.com/path?a=1'
    >>> f('', 'https://example.com')
    raise lxml.etree.ParserError
    >>> searx.utils.extract_url([], 'https://example.com')
    raise ValueError

Raises:
    * ValueError
    * lxml.etree.ParserError

Returns:
    * str: normalized URL

Definition at line 273 of file utils.py.

273def extract_url(xpath_results, base_url) -> str:
274 """Extract and normalize URL from lxml Element
275
276 Args:
277 * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
278 * base_url (str): Base URL
279
280 Example:
281 >>> def f(s, search_url):
282 >>> return searx.utils.extract_url(html.fromstring(s), search_url)
283 >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
284 'https://example.com/'
285 >>> f('https://example.com', 'http://example.com/')
286 'https://example.com/'
287 >>> f('//example.com', 'http://example.com/')
288 'http://example.com/'
289 >>> f('//example.com', 'https://example.com/')
290 'https://example.com/'
291 >>> f('/path?a=1', 'https://example.com')
292 'https://example.com/path?a=1'
293 >>> f('', 'https://example.com')
294 raise lxml.etree.ParserError
295 >>> searx.utils.extract_url([], 'https://example.com')
296 raise ValueError
297
298 Raises:
299 * ValueError
300 * lxml.etree.ParserError
301
302 Returns:
303 * str: normalized URL
304 """
305 if xpath_results == []:
306 raise ValueError('Empty url resultset')
307
308 url = extract_text(xpath_results)
309 if url:
310 return normalize_url(url, base_url)
311 raise ValueError('URL not found')
312
313

References extract_text(), and normalize_url().

+ Here is the call graph for this function:

◆ gen_useragent()

str searx.utils.gen_useragent ( Optional[str] os_string = None)
Return a random browser User Agent

See searx/data/useragents.json

Definition at line 71 of file utils.py.

71def gen_useragent(os_string: Optional[str] = None) -> str:
72 """Return a random browser User Agent
73
74 See searx/data/useragents.json
75 """
76 return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
77
78

◆ get_embeded_stream_url()

searx.utils.get_embeded_stream_url ( url)
Converts a standard video URL into its embed format. Supported services include Youtube,
Facebook, Instagram, TikTok, Dailymotion, and Bilibili.

Definition at line 634 of file utils.py.

634def get_embeded_stream_url(url):
635 """
636 Converts a standard video URL into its embed format. Supported services include Youtube,
637 Facebook, Instagram, TikTok, Dailymotion, and Bilibili.
638 """
639 parsed_url = urlparse(url)
640 iframe_src = None
641
642 # YouTube
643 if parsed_url.netloc in ['www.youtube.com', 'youtube.com'] and parsed_url.path == '/watch' and parsed_url.query:
644 video_id = parse_qs(parsed_url.query).get('v', [])
645 if video_id:
646 iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]
647
648 # Facebook
649 elif parsed_url.netloc in ['www.facebook.com', 'facebook.com']:
650 encoded_href = urlencode({'href': url})
651 iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href
652
653 # Instagram
654 elif parsed_url.netloc in ['www.instagram.com', 'instagram.com'] and parsed_url.path.startswith('/p/'):
655 if parsed_url.path.endswith('/'):
656 iframe_src = url + 'embed'
657 else:
658 iframe_src = url + '/embed'
659
660 # TikTok
661 elif (
662 parsed_url.netloc in ['www.tiktok.com', 'tiktok.com']
663 and parsed_url.path.startswith('/@')
664 and '/video/' in parsed_url.path
665 ):
666 path_parts = parsed_url.path.split('/video/')
667 video_id = path_parts[1]
668 iframe_src = 'https://www.tiktok.com/embed/' + video_id
669
670 # Dailymotion
671 elif parsed_url.netloc in ['www.dailymotion.com', 'dailymotion.com'] and parsed_url.path.startswith('/video/'):
672 path_parts = parsed_url.path.split('/')
673 if len(path_parts) == 3:
674 video_id = path_parts[2]
675 iframe_src = 'https://www.dailymotion.com/embed/video/' + video_id
676
677 # Bilibili
678 elif parsed_url.netloc in ['www.bilibili.com', 'bilibili.com'] and parsed_url.path.startswith('/video/'):
679 path_parts = parsed_url.path.split('/')
680
681 video_id = path_parts[2]
682 param_key = None
683 if video_id.startswith('av'):
684 video_id = video_id[2:]
685 param_key = 'aid'
686 elif video_id.startswith('BV'):
687 param_key = 'bvid'
688
689 iframe_src = (
690 f'https://player.bilibili.com/player.html?{param_key}={video_id}&high_quality=1&autoplay=false&danmaku=0'
691 )
692
693 return iframe_src
694
695

◆ get_engine_from_settings()

Dict searx.utils.get_engine_from_settings ( str name)
Return engine configuration from settings.yml of a given engine name

Definition at line 499 of file utils.py.

499def get_engine_from_settings(name: str) -> Dict:
500 """Return engine configuration from settings.yml of a given engine name"""
501
502 if 'engines' not in settings:
503 return {}
504
505 for engine in settings['engines']:
506 if 'name' not in engine:
507 continue
508 if name == engine['name']:
509 return engine
510
511 return {}
512
513

◆ get_string_replaces_function()

Callable[[str], str] searx.utils.get_string_replaces_function ( Dict[str, str] replaces)

Definition at line 489 of file utils.py.

489def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
490 rep = {re.escape(k): v for k, v in replaces.items()}
491 pattern = re.compile("|".join(rep.keys()))
492
493 def func(text):
494 return pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
495
496 return func
497
498

◆ get_xpath()

XPath searx.utils.get_xpath ( XPathSpecType xpath_spec)
Return cached compiled XPath

There is no thread lock.
Worst case scenario, xpath_str is compiled more than one time.

Args:
    * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

Returns:
    * result (bool, float, list, str): Results.

Raises:
    * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
    * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath

Definition at line 514 of file utils.py.

514def get_xpath(xpath_spec: XPathSpecType) -> XPath:
515 """Return cached compiled XPath
516
517 There is no thread lock.
518 Worst case scenario, xpath_str is compiled more than one time.
519
520 Args:
521 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
522
523 Returns:
524 * result (bool, float, list, str): Results.
525
526 Raises:
527 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
528 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
529 """
530 if isinstance(xpath_spec, str):
531 result = _XPATH_CACHE.get(xpath_spec, None)
532 if result is None:
533 try:
534 result = XPath(xpath_spec)
535 except XPathSyntaxError as e:
536 raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
537 _XPATH_CACHE[xpath_spec] = result
538 return result
539
540 if isinstance(xpath_spec, XPath):
541 return xpath_spec
542
543 raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')
544
545

Referenced by eval_xpath().

+ Here is the caller graph for this function:

◆ html_to_text()

str searx.utils.html_to_text ( str html_str)
Extract text from a HTML string

Args:
    * html_str (str): string HTML

Returns:
    * str: extracted text

Examples:
    >>> html_to_text('Example <span id="42">#2</span>')
    'Example #2'

    >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
    'Example'

    >>> html_to_text(r'regexp: (?<![a-zA-Z]')
    'regexp: (?<![a-zA-Z]'

Definition at line 138 of file utils.py.

138def html_to_text(html_str: str) -> str:
139 """Extract text from a HTML string
140
141 Args:
142 * html_str (str): string HTML
143
144 Returns:
145 * str: extracted text
146
147 Examples:
148 >>> html_to_text('Example <span id="42">#2</span>')
149 'Example #2'
150
151 >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
152 'Example'
153
154 >>> html_to_text(r'regexp: (?<![a-zA-Z]')
155 'regexp: (?<![a-zA-Z]'
156 """
157 if not html_str:
158 return ""
159 html_str = html_str.replace('\n', ' ').replace('\r', ' ')
160 html_str = ' '.join(html_str.split())
161 s = _HTMLTextExtractor()
162 try:
163 s.feed(html_str)
164 except AssertionError:
165 s = _HTMLTextExtractor()
166 s.feed(escape(html_str, quote=True))
167 except _HTMLTextExtractorException:
168 logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
169 return s.get_text()
170
171

Referenced by markdown_to_text().

+ Here is the caller graph for this function:

◆ humanize_bytes()

searx.utils.humanize_bytes ( size,
precision = 2 )
Determine the *human readable* value of bytes on 1024 base (1KB=1024B).

Definition at line 326 of file utils.py.

326def humanize_bytes(size, precision=2):
327 """Determine the *human readable* value of bytes on 1024 base (1KB=1024B)."""
328 s = ['B ', 'KB', 'MB', 'GB', 'TB']
329
330 x = len(s)
331 p = 0
332 while size > 1024 and p < x:
333 p += 1
334 size = size / 1024.0
335 return "%.*f %s" % (precision, size, s[p])
336
337

◆ humanize_number()

searx.utils.humanize_number ( size,
precision = 0 )
Determine the *human readable* value of a decimal number.

Definition at line 338 of file utils.py.

338def humanize_number(size, precision=0):
339 """Determine the *human readable* value of a decimal number."""
340 s = ['', 'K', 'M', 'B', 'T']
341
342 x = len(s)
343 p = 0
344 while size > 1000 and p < x:
345 p += 1
346 size = size / 1000.0
347 return "%.*f%s" % (precision, size, s[p])
348
349

◆ int_or_zero()

int searx.utils.int_or_zero ( Union[List[str], str] num)
Convert num to int or 0. num can be either a str or a list.
If num is a list, the first element is converted to int (or return 0 if the list is empty).
If num is a str, see convert_str_to_int

Definition at line 386 of file utils.py.

386def int_or_zero(num: Union[List[str], str]) -> int:
387 """Convert num to int or 0. num can be either a str or a list.
388 If num is a list, the first element is converted to int (or return 0 if the list is empty).
389 If num is a str, see convert_str_to_int
390 """
391 if isinstance(num, list):
392 if len(num) < 1:
393 return 0
394 num = num[0]
395 return convert_str_to_int(num)
396
397

References convert_str_to_int().

+ Here is the call graph for this function:

◆ is_valid_lang()

Optional[Tuple[bool, str, str]] searx.utils.is_valid_lang ( lang)
Return language code and name if lang describe a language.

Examples:
    >>> is_valid_lang('zz')
    None
    >>> is_valid_lang('uk')
    (True, 'uk', 'ukrainian')
    >>> is_valid_lang(b'uk')
    (True, 'uk', 'ukrainian')
    >>> is_valid_lang('en')
    (True, 'en', 'english')
    >>> searx.utils.is_valid_lang('Español')
    (True, 'es', 'spanish')
    >>> searx.utils.is_valid_lang('Spanish')
    (True, 'es', 'spanish')

Definition at line 398 of file utils.py.

398def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
399 """Return language code and name if lang describe a language.
400
401 Examples:
402 >>> is_valid_lang('zz')
403 None
404 >>> is_valid_lang('uk')
405 (True, 'uk', 'ukrainian')
406 >>> is_valid_lang(b'uk')
407 (True, 'uk', 'ukrainian')
408 >>> is_valid_lang('en')
409 (True, 'en', 'english')
410 >>> searx.utils.is_valid_lang('Español')
411 (True, 'es', 'spanish')
412 >>> searx.utils.is_valid_lang('Spanish')
413 (True, 'es', 'spanish')
414 """
415 if isinstance(lang, bytes):
416 lang = lang.decode()
417 is_abbr = len(lang) == 2
418 lang = lang.lower()
419 if is_abbr:
420 for l in sxng_locales:
421 if l[0][:2] == lang:
422 return (True, l[0][:2], l[3].lower())
423 return None
424 for l in sxng_locales:
425 if l[1].lower() == lang or l[3].lower() == lang:
426 return (True, l[0][:2], l[3].lower())
427 return None
428
429

◆ js_variable_to_python()

searx.utils.js_variable_to_python ( js_variable)
Convert a javascript variable into JSON and then load the value

It does not deal with all cases, but it is good enough for now.
chompjs has a better implementation.

Definition at line 767 of file utils.py.

767def js_variable_to_python(js_variable):
768 """Convert a javascript variable into JSON and then load the value
769
770 It does not deal with all cases, but it is good enough for now.
771 chompjs has a better implementation.
772 """
773 # when in_string is not None, it contains the character that has opened the string
774 # either simple quote or double quote
775 in_string = None
776 # cut the string:
777 # r"""{ a:"f\"irst", c:'sec"ond'}"""
778 # becomes
779 # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
780 parts = re.split(r'(["\'])', js_variable)
781 # previous part (to check the escape character antislash)
782 previous_p = ""
783 for i, p in enumerate(parts):
784 # parse characters inside a ECMA string
785 if in_string:
786 # we are in a JS string: replace the colon by a temporary character
787 # so quote_keys_regex doesn't have to deal with colon inside the JS strings
788 parts[i] = parts[i].replace(':', chr(1))
789 if in_string == "'":
790 # the JS string is delimited by simple quote.
791 # This is not supported by JSON.
792 # simple quote delimited string are converted to double quote delimited string
793 # here, inside a JS string, we escape the double quote
794 parts[i] = parts[i].replace('"', r'\"')
795
796 # deal with delimiters and escape character
797 if not in_string and p in ('"', "'"):
798 # we are not in string
799 # but p is double or simple quote
800 # that's the start of a new string
801 # replace simple quote by double quote
802 # (JSON doesn't support simple quote)
803 parts[i] = '"'
804 in_string = p
805 continue
806 if p == in_string:
807 # we are in a string and the current part MAY close the string
808 if len(previous_p) > 0 and previous_p[-1] == '\\':
809 # there is an antislash just before: the ECMA string continue
810 continue
811 # the current p close the string
812 # replace simple quote by double quote
813 parts[i] = '"'
814 in_string = None
815
816 if not in_string:
817 # replace void 0 by null
818 # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
819 # we are sure there is no string in p
820 parts[i] = _JS_VOID_RE.sub("null", p)
821 # update previous_p
822 previous_p = p
823 # join the string
824 s = ''.join(parts)
825 # add quote around the key
826 # { a: 12 }
827 # becomes
828 # { "a": 12 }
829 s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
830 s = _JS_DECIMAL_RE.sub(":0.", s)
831 # replace the surogate character by colon
832 s = s.replace(chr(1), ':')
833 # load the JSON and return the result
834 return json.loads(s)
835
836

◆ load_module()

types.ModuleType searx.utils.load_module ( str filename,
str module_dir )

Definition at line 430 of file utils.py.

430def load_module(filename: str, module_dir: str) -> types.ModuleType:
431 modname = splitext(filename)[0]
432 modpath = join(module_dir, filename)
433 # and https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
434 spec = importlib.util.spec_from_file_location(modname, modpath)
435 if not spec:
436 raise ValueError(f"Error loading '{modpath}' module")
437 module = importlib.util.module_from_spec(spec)
438 if not spec.loader:
439 raise ValueError(f"Error loading '{modpath}' module")
440 spec.loader.exec_module(module)
441 return module
442
443

◆ markdown_to_text()

str searx.utils.markdown_to_text ( str markdown_str)
Extract text from a Markdown string

Args:
    * markdown_str (str): string Markdown

Returns:
    * str: extracted text

Examples:
    >>> markdown_to_text('[example](https://example.com)')
    'example'

    >>> markdown_to_text('## Headline')
    'Headline'

Definition at line 172 of file utils.py.

172def markdown_to_text(markdown_str: str) -> str:
173 """Extract text from a Markdown string
174
175 Args:
176 * markdown_str (str): string Markdown
177
178 Returns:
179 * str: extracted text
180
181 Examples:
182 >>> markdown_to_text('[example](https://example.com)')
183 'example'
184
185 >>> markdown_to_text('## Headline')
186 'Headline'
187 """
188
189 html_str = (
190 MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(markdown_str)
191 )
192 return html_to_text(html_str)
193
194

References html_to_text().

+ Here is the call graph for this function:

◆ normalize_url()

str searx.utils.normalize_url ( str url,
str base_url )
Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path

Args:
    * url (str): Relative URL
    * base_url (str): Base URL, it must be an absolute URL.

Example:
    >>> normalize_url('https://example.com', 'http://example.com/')
    'https://example.com/'
    >>> normalize_url('//example.com', 'http://example.com/')
    'http://example.com/'
    >>> normalize_url('//example.com', 'https://example.com/')
    'https://example.com/'
    >>> normalize_url('/path?a=1', 'https://example.com')
    'https://example.com/path?a=1'
    >>> normalize_url('', 'https://example.com')
    'https://example.com/'
    >>> normalize_url('/test', '/path')
    raise ValueError

Raises:
    * lxml.etree.ParserError

Returns:
    * str: normalized URL

Definition at line 223 of file utils.py.

223def normalize_url(url: str, base_url: str) -> str:
224 """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path
225
226 Args:
227 * url (str): Relative URL
228 * base_url (str): Base URL, it must be an absolute URL.
229
230 Example:
231 >>> normalize_url('https://example.com', 'http://example.com/')
232 'https://example.com/'
233 >>> normalize_url('//example.com', 'http://example.com/')
234 'http://example.com/'
235 >>> normalize_url('//example.com', 'https://example.com/')
236 'https://example.com/'
237 >>> normalize_url('/path?a=1', 'https://example.com')
238 'https://example.com/path?a=1'
239 >>> normalize_url('', 'https://example.com')
240 'https://example.com/'
241 >>> normalize_url('/test', '/path')
242 raise ValueError
243
244 Raises:
245 * lxml.etree.ParserError
246
247 Returns:
248 * str: normalized URL
249 """
250 if url.startswith('//'):
251 # add http or https to this kind of url //example.com/
252 parsed_search_url = urlparse(base_url)
253 url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
254 elif url.startswith('/'):
255 # fix relative url to the search engine
256 url = urljoin(base_url, url)
257
258 # fix relative urls that fall through the crack
259 if '://' not in url:
260 url = urljoin(base_url, url)
261
262 parsed_url = urlparse(url)
263
264 # add a / at this end of the url if there is no path
265 if not parsed_url.netloc:
266 raise ValueError('Cannot parse url')
267 if not parsed_url.path:
268 url += '/'
269
270 return url
271
272

Referenced by extract_url().

+ Here is the caller graph for this function:

◆ parse_duration_string()

timedelta | None searx.utils.parse_duration_string ( str duration_str)
Parse a time string in format MM:SS or HH:MM:SS and convert it to a `timedelta` object.

Returns None if the provided string doesn't match any of the formats.

Definition at line 837 of file utils.py.

837def parse_duration_string(duration_str: str) -> timedelta | None:
838 """Parse a time string in format MM:SS or HH:MM:SS and convert it to a `timedelta` object.
839
840 Returns None if the provided string doesn't match any of the formats.
841 """
842 duration_str = duration_str.strip()
843
844 if not duration_str:
845 return None
846
847 try:
848 # prepending ["00"] here inits hours to 0 if they are not provided
849 time_parts = (["00"] + duration_str.split(":"))[:3]
850 hours, minutes, seconds = map(int, time_parts)
851 return timedelta(hours=hours, minutes=minutes, seconds=seconds)
852
853 except (ValueError, TypeError):
854 pass
855
856 return None

◆ remove_pua_from_str()

searx.utils.remove_pua_from_str ( string)
Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.

.. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas

Definition at line 474 of file utils.py.

474def remove_pua_from_str(string):
475 """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
476
477 .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
478 """
479 pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
480 s = []
481 for c in string:
482 i = ord(c)
483 if any(a <= i <= b for (a, b) in pua_ranges):
484 continue
485 s.append(c)
486 return "".join(s)
487
488

◆ searx_useragent()

str searx.utils.searx_useragent ( )
Return the searx User Agent

Definition at line 64 of file utils.py.

64def searx_useragent() -> str:
65 """Return the searx User Agent"""
66 return 'searx/{searx_version} {suffix}'.format(
67 searx_version=VERSION_TAG, suffix=settings['outgoing']['useragent_suffix']
68 ).strip()
69
70

◆ to_string()

str searx.utils.to_string ( Any obj)
Convert obj to its string representation.

Definition at line 444 of file utils.py.

444def to_string(obj: Any) -> str:
445 """Convert obj to its string representation."""
446 if isinstance(obj, str):
447 return obj
448 if hasattr(obj, '__str__'):
449 return str(obj)
450 return repr(obj)
451
452

Variable Documentation

◆ _BLOCKED_TAGS

tuple searx.utils._BLOCKED_TAGS = ('script', 'style')
protected

Definition at line 37 of file utils.py.

◆ _ECMA_UNESCAPE2_RE

searx.utils._ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
protected

Definition at line 40 of file utils.py.

◆ _ECMA_UNESCAPE4_RE

searx.utils._ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
protected

Definition at line 39 of file utils.py.

◆ _FASTTEXT_MODEL

Optional searx.utils._FASTTEXT_MODEL = None
protected

Definition at line 49 of file utils.py.

◆ _JS_DECIMAL_RE

searx.utils._JS_DECIMAL_RE = re.compile(r":\s*\.")
protected

Definition at line 44 of file utils.py.

◆ _JS_QUOTE_KEYS_RE

searx.utils._JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
protected

Definition at line 42 of file utils.py.

◆ _JS_VOID_RE

searx.utils._JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
protected

Definition at line 43 of file utils.py.

◆ _LANG_TO_LC_CACHE

dict searx.utils._LANG_TO_LC_CACHE = {}
protected

Definition at line 47 of file utils.py.

◆ _NOTSET

searx.utils._NOTSET = _NotSetClass()
protected

Definition at line 61 of file utils.py.

◆ _XPATH_CACHE

dict searx.utils._XPATH_CACHE = {}
protected

Definition at line 46 of file utils.py.

◆ logger

searx.utils.logger = logger.getChild('utils')

Definition at line 33 of file utils.py.

◆ SEARCH_LANGUAGE_CODES

searx.utils.SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])

Definition at line 52 of file utils.py.

◆ XPathSpecType

searx.utils.XPathSpecType = Union[str, XPath]

Definition at line 35 of file utils.py.