2"""Utility functions for the engines
11from typing
import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
12from numbers
import Number
13from os.path
import splitext, join
14from random
import choice
15from html.parser
import HTMLParser
16from html
import escape
17from urllib.parse
import urljoin, urlparse
18from markdown_it
import MarkdownIt
21from lxml.etree
import ElementBase, XPath, XPathError, XPathSyntaxError
23from searx
import settings
27from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
28from searx
import logger
logger = logger.getChild('utils')

# A compiled XPath may be passed anywhere a str XPath is accepted.
XPathSpecType = Union[str, XPath]

# Tags whose text content must never appear in extracted text.
_BLOCKED_TAGS = ('script', 'style')

_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)

_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
_JS_DECIMAL_RE = re.compile(r":\s*\.")

# NOTE(review): the 1024/1000 bases look swapped relative to SI (TB=10^12,
# TiB=2^40), but get_torrent_size's documented example ('3.14', 'MiB' ->
# 3140000) depends on exactly this mapping — keep as-is, confirm upstream.
_STORAGE_UNIT_VALUE: Dict[str, int] = {
    'TB': 1024 * 1024 * 1024 * 1024,
    'GB': 1024 * 1024 * 1024,
    'MB': 1024 * 1024,
    'TiB': 1000 * 1000 * 1000 * 1000,
    'GiB': 1000 * 1000 * 1000,
    'MiB': 1000 * 1000,
    'KiB': 1000,
}

# Caches shared by the xpath helpers below; filled lazily, never evicted.
_XPATH_CACHE: Dict[str, XPath] = {}
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}

_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None
"""fasttext model to predict language of a search term"""

SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""


class _NotSetClass:  # pylint: disable=too-few-public-methods
    """Internal class for this module, do not create instance of this class.
    Replace the None value, allow explicitly pass None as a function argument"""


_NOTSET = _NotSetClass()
73 """Return the searx User Agent"""
74 return 'searx/{searx_version} {suffix}'.
format(
75 searx_version=VERSION_TAG, suffix=settings[
'outgoing'][
'useragent_suffix']
80 """Return a random browser User Agent
82 See searx/data/useragents.json
84 return USER_AGENTS[
'ua'].
format(os=os_string
or choice(USER_AGENTS[
'os']), version=choice(USER_AGENTS[
'versions']))
88 """Internal exception raised when the HTML is invalid"""
class _HTMLTextExtractor(HTMLParser):
    """Internal class to extract text from HTML"""

    def __init__(self):
        HTMLParser.__init__(self)
        self.result = []  # text fragments collected so far
        self.tags = []  # stack of currently open tags

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)
        if tag == 'br':
            self.result.append(' ')

    def handle_endtag(self, tag):
        if not self.tags:
            return
        # mismatched close tag: the HTML is invalid
        if tag != self.tags[-1]:
            raise _HTMLTextExtractorException()
        self.tags.pop()

    def is_valid_tag(self):
        # text is kept unless the innermost open tag is script/style
        return not self.tags or self.tags[-1] not in _BLOCKED_TAGS

    def handle_data(self, data):
        if not self.is_valid_tag():
            return
        self.result.append(data)

    def handle_charref(self, name):
        if not self.is_valid_tag():
            return
        # numeric character reference: &#xHH; / &#XHH; is hex, &#NN; is decimal
        if name[0] in ('x', 'X'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        self.result.append(chr(codepoint))

    def handle_entityref(self, name):
        if not self.is_valid_tag():
            return
        # named entities are kept verbatim (not resolved to a codepoint)
        self.result.append(name)

    def get_text(self):
        return ''.join(self.result).strip()

    def error(self, message):
        # error handler is needed in < py3.10
        # https://github.com/python/cpython/pull/8562/files
        raise AssertionError(message)
147 """Extract text from a HTML string
150 * html_str (str): string HTML
153 * str: extracted text
156 >>> html_to_text('Example <span id="42">#2</span>')
159 >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
162 >>> html_to_text(r'regexp: (?<![a-zA-Z]')
163 'regexp: (?<![a-zA-Z]'
165 html_str = html_str.replace(
'\n',
' ').replace(
'\r',
' ')
166 html_str =
' '.join(html_str.split())
170 except AssertionError:
172 s.feed(escape(html_str, quote=
True))
173 except _HTMLTextExtractorException:
174 logger.debug(
"HTMLTextExtractor: invalid HTML\n%s", html_str)
179 """Extract text from a Markdown string
182 * markdown_str (str): string Markdown
185 * str: extracted text
188 >>> markdown_to_text('[example](https://example.com)')
191 >>> markdown_to_text('## Headline')
196 MarkdownIt(
"commonmark", {
"typographer":
True}).enable([
"replacements",
"smartquotes"]).render(markdown_str)
198 return html_to_text(html_str)
def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
    """Extract text from a lxml result

    * if xpath_results is list, extract the text from each result and concat the list
    * if xpath_results is a xml element, extract all the text node from it
      ( text_content() method from lxml )
    * if xpath_results is a string element, then it's already done

    Raises ValueError on None (unless allow_none) and on unsupported types.
    """
    if isinstance(xpath_results, list):
        # it's list of result : concat everything using recursive call
        result = ''
        for e in xpath_results:
            result = result + (extract_text(e) or '')
        return result.strip()
    # NOTE: the scalar / None checks are ordered before the ElementBase check;
    # this is behavior-identical (the predicates are mutually exclusive) and
    # keeps the common scalar path free of lxml lookups.
    if isinstance(xpath_results, (str, Number, bool)):
        # it's a scalar XPath result (string(), number(), boolean())
        return str(xpath_results)
    if xpath_results is None and allow_none:
        return None
    if xpath_results is None and not allow_none:
        raise ValueError('extract_text(None, allow_none=False)')
    if isinstance(xpath_results, ElementBase):
        # it's a element: serialize its text content and squeeze whitespace
        text: str = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
        text = text.strip().replace('\n', ' ')
        return ' '.join(text.split())
    raise ValueError('unsupported type')
230 """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path
233 * url (str): Relative URL
234 * base_url (str): Base URL, it must be an absolute URL.
237 >>> normalize_url('https://example.com', 'http://example.com/')
238 'https://example.com/'
239 >>> normalize_url('//example.com', 'http://example.com/')
240 'http://example.com/'
241 >>> normalize_url('//example.com', 'https://example.com/')
242 'https://example.com/'
243 >>> normalize_url('/path?a=1', 'https://example.com')
244 'https://example.com/path?a=1'
245 >>> normalize_url('', 'https://example.com')
246 'https://example.com/'
247 >>> normalize_url('/test', '/path')
251 * lxml.etree.ParserError
254 * str: normalized URL
256 if url.startswith(
'//'):
258 parsed_search_url = urlparse(base_url)
259 url =
'{0}:{1}'.
format(parsed_search_url.scheme
or 'http', url)
260 elif url.startswith(
'/'):
262 url = urljoin(base_url, url)
266 url = urljoin(base_url, url)
268 parsed_url = urlparse(url)
271 if not parsed_url.netloc:
272 raise ValueError(
'Cannot parse url')
273 if not parsed_url.path:
280 """Extract and normalize URL from lxml Element
283 * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
284 * base_url (str): Base URL
287 >>> def f(s, search_url):
288 >>> return searx.utils.extract_url(html.fromstring(s), search_url)
289 >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
290 'https://example.com/'
291 >>> f('https://example.com', 'http://example.com/')
292 'https://example.com/'
293 >>> f('//example.com', 'http://example.com/')
294 'http://example.com/'
295 >>> f('//example.com', 'https://example.com/')
296 'https://example.com/'
297 >>> f('/path?a=1', 'https://example.com')
298 'https://example.com/path?a=1'
299 >>> f('', 'https://example.com')
300 raise lxml.etree.ParserError
301 >>> searx.utils.extract_url([], 'https://example.com')
306 * lxml.etree.ParserError
309 * str: normalized URL
311 if xpath_results == []:
312 raise ValueError(
'Empty url resultset')
314 url = extract_text(xpath_results)
317 raise ValueError(
'URL not found')
def dict_subset(dictionary: MutableMapping, properties: Set[str]) -> Dict:
    """Return a new dict containing only the entries of ``dictionary`` whose
    key appears in ``properties``; keys absent from ``dictionary`` are ignored.

    Example:
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
        {'A': 'a', 'C': 'c'}
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
        {'A': 'a'}
    """
    return {key: value for key, value in dictionary.items() if key in properties}
def get_torrent_size(filesize: str, filesize_multiplier: str) -> Optional[int]:
    """Convert a (size, unit) pair as reported by torrent engines into bytes.

    Args:
        * filesize (str): size
        * filesize_multiplier (str): TB, GB, .... TiB, GiB...

    Returns:
        * int: number of bytes, or None if filesize is not a number

    Example:
        >>> get_torrent_size('5', 'GB')
        5368709120
        >>> get_torrent_size('3.14', 'MiB')
        3140000
    """
    try:
        # unknown units fall back to a multiplier of 1 (plain bytes)
        multiplier = _STORAGE_UNIT_VALUE.get(filesize_multiplier, 1)
        return int(float(filesize) * multiplier)
    except ValueError:
        return None
356 """Determine the *human readable* value of bytes on 1024 base (1KB=1024B)."""
357 s = [
'B ',
'KB',
'MB',
'GB',
'TB']
361 while size > 1024
and p < x:
364 return "%.*f %s" % (precision, size, s[p])
368 """Convert number_str to int or 0 if number_str is not a number."""
369 if number_str.isdigit():
370 return int(number_str)
375 """Convert num to int or 0. num can be either a str or a list.
376 If num is a list, the first element is converted to int (or return 0 if the list is empty).
377 If num is a str, see convert_str_to_int
379 if isinstance(num, list):
387 """Return language code and name if lang describe a language.
390 >>> is_valid_lang('zz')
392 >>> is_valid_lang('uk')
393 (True, 'uk', 'ukrainian')
394 >>> is_valid_lang(b'uk')
395 (True, 'uk', 'ukrainian')
396 >>> is_valid_lang('en')
397 (True, 'en', 'english')
398 >>> searx.utils.is_valid_lang('Español')
399 (True, 'es', 'spanish')
400 >>> searx.utils.is_valid_lang('Spanish')
401 (True, 'es', 'spanish')
403 if isinstance(lang, bytes):
405 is_abbr = len(lang) == 2
408 for l
in sxng_locales:
410 return (
True, l[0][:2], l[3].lower())
412 for l
in sxng_locales:
413 if l[1].lower() == lang
or l[3].lower() == lang:
414 return (
True, l[0][:2], l[3].lower())
def load_module(filename: str, module_dir: str) -> types.ModuleType:
    """Load the Python module ``filename`` located in ``module_dir`` and return it.

    Raises ValueError when the module spec or module object cannot be created.
    """
    modname = splitext(filename)[0]
    modpath = join(module_dir, filename)
    # and only this module
    spec = importlib.util.spec_from_file_location(modname, modpath)
    if not spec:
        raise ValueError(f"Error loading '{modpath}' module")
    module = importlib.util.module_from_spec(spec)
    if not module:
        raise ValueError(f"Error loading '{modpath}' module")
    spec.loader.exec_module(module)
    return module
433 """Convert obj to its string representation."""
434 if isinstance(obj, str):
436 if hasattr(obj,
'__str__'):
442 """Python implementation of the unescape javascript function
444 https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
445 https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
448 >>> ecma_unescape('%u5409')
450 >>> ecma_unescape('%20')
452 >>> ecma_unescape('%F3')
456 string = _ECMA_UNESCAPE4_RE.sub(
lambda e: chr(int(e.group(1), 16)), string)
458 string = _ECMA_UNESCAPE2_RE.sub(
lambda e: chr(int(e.group(1), 16)), string)
def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
    """Return a function that applies all key->value replacements of ``replaces``
    to its argument in a single regex pass."""
    rep = {re.escape(k): v for k, v in replaces.items()}
    pattern = re.compile("|".join(rep.keys()))

    def func(text):
        # look up the replacement by the escaped form of the matched text
        return pattern.sub(lambda m: rep[re.escape(m.group(0))], text)

    return func
473 """Return engine configuration from settings.yml of a given engine name"""
475 if 'engines' not in settings:
478 for engine
in settings[
'engines']:
479 if 'name' not in engine:
481 if name == engine[
'name']:
488 """Return cached compiled XPath
490 There is no thread lock.
491 Worst case scenario, xpath_str is compiled more than one time.
494 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
497 * result (bool, float, list, str): Results.
500 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
501 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
503 if isinstance(xpath_spec, str):
504 result = _XPATH_CACHE.get(xpath_spec,
None)
507 result = XPath(xpath_spec)
508 except XPathSyntaxError
as e:
510 _XPATH_CACHE[xpath_spec] = result
513 if isinstance(xpath_spec, XPath):
516 raise TypeError(
'xpath_spec must be either a str or a lxml.etree.XPath')
def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType):
    """Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
    See https://lxml.de/xpathxslt.html#xpath-return-values

    Args:
        * element (ElementBase): [description]
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

    Returns:
        * result (bool, float, list, str): Results.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: Raise when the XPath can't be evaluated.
    """
    xpath = get_xpath(xpath_spec)
    try:
        return xpath(element)
    except XPathError as e:
        # wrap the lxml error so the failing xpath_spec is recorded
        arg = ' '.join([str(i) for i in e.args])
        raise SearxEngineXPathException(xpath_spec, arg) from e
def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: Optional[int] = None):
    """Same as eval_xpath, check if the result is a list

    Args:
        * element (ElementBase): [description]
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
        * min_len (int, optional): [description]. Defaults to None.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: raise if the result is not a list

    Returns:
        * result (bool, float, list, str): Results.
    """
    result = eval_xpath(element, xpath_spec)
    if not isinstance(result, list):
        raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
    if min_len is not None and min_len > len(result):
        raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
    return result
568 """Call eval_xpath_list then get one element using the index parameter.
569 If the index does not exist, either raise an exception is default is not set,
570 other return the default value (can be None).
573 * elements (ElementBase): lxml element to apply the xpath.
574 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
575 * index (int): index to get
576 * default (Object, optional): Defaults if index doesn't exist.
579 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
580 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
581 * SearxEngineXPathException: if the index is not found. Also see eval_xpath.
584 * result (bool, float, list, str): Results.
586 result = eval_xpath_list(elements, xpath_spec)
587 if -len(result) <= index < len(result):
589 if default == _NOTSET:
def _get_fasttext_model() -> "fasttext.FastText._FastText":
    """Lazily load and cache the fasttext language-identification model."""
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is None:
        import fasttext  # pylint: disable=import-outside-toplevel
        # Monkey patch: prevent fasttext from printing a warning to stderr
        fasttext.FastText.eprint = lambda x: None
        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
    return _FASTTEXT_MODEL
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
    """Detect the language of the ``text`` parameter.

    :param str text: The string whose language is to be detected.

    :param float threshold: Threshold filters the returned labels by a threshold
        on probability.  A choice of 0.3 will return labels with at least 0.3
        probability.

    :param bool only_search_languages: If ``True``, returns only supported
        SearXNG search languages.  see :py:obj:`searx.languages`

    :rtype: str, None
    :returns:
        The detected language code or ``None``. See below.

    :raises ValueError: If ``text`` is not a string.

    The language detection is done by using `a fork`_ of the fastText_ library
    (`python fasttext`_).  fastText_ distributes the `language identification
    model`_.  By using ``only_search_languages=True`` the `language
    identification model`_ is harmonized with the SearXNG's language (locale)
    model: most of SearXNG's engines do not support all the languages from the
    `language identification model`_ and there is also a discrepancy in the
    ISO-639-3 (fasttext) and ISO-639-2 (SearXNG) handling.

    .. _a fork: https://github.com/searxng/fasttext-predict
    .. _fastText: https://fasttext.cc/
    .. _python fasttext: https://pypi.org/project/fasttext/
    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
    """
    if not isinstance(text, str):
        raise ValueError('text must a str')
    # fasttext cannot handle newlines inside the query string
    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
        # labels are returned as '__label__<code>'
        language = r[0][0].split('__label__')[1]
        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
            return None
        return language
    return None
679 """Convert a javascript variable into JSON and then load the value
681 It does not deal with all cases, but it is good enough for now.
682 chompjs has a better implementation.
691 parts = re.split(
r'(["\'])', js_variable)
694 for i, p
in enumerate(parts):
699 parts[i] = parts[i].replace(
':', chr(1))
705 parts[i] = parts[i].replace(
'"',
r'\"')
708 if not in_string
and p
in (
'"',
"'"):
719 if len(previous_p) > 0
and previous_p[-1] ==
'\\':
731 parts[i] = _JS_VOID_RE.sub(
"null", p)
740 s = _JS_QUOTE_KEYS_RE.sub(
r'\1"\2"\3', s)
741 s = _JS_DECIMAL_RE.sub(
":0.", s)
743 s = s.replace(chr(1),
':')
Callable[[str], str] get_string_replaces_function(Dict[str, str] replaces)
js_variable_to_python(js_variable)
Optional[str] detect_language(str text, float threshold=0.3, bool only_search_languages=False)
XPath get_xpath(XPathSpecType xpath_spec)
humanize_bytes(size, precision=2)
Optional[str] extract_text(xpath_results, bool allow_none=False)
str ecma_unescape(str string)
eval_xpath_getindex(ElementBase elements, XPathSpecType xpath_spec, int index, default=_NOTSET)
int convert_str_to_int(str number_str)
eval_xpath(ElementBase element, XPathSpecType xpath_spec)
str gen_useragent(Optional[str] os_string=None)
int int_or_zero(Union[List[str], str] num)
Optional[Tuple[bool, str, str]] is_valid_lang(lang)
"fasttext.FastText._FastText" _get_fasttext_model()
str extract_url(xpath_results, base_url)
eval_xpath_list(ElementBase element, XPathSpecType xpath_spec, Optional[int] min_len=None)
types.ModuleType load_module(str filename, str module_dir)
Optional[int] get_torrent_size(str filesize, str filesize_multiplier)
str normalize_url(str url, str base_url)
str html_to_text(str html_str)
Dict dict_subset(MutableMapping dictionary, Set[str] properties)
Dict get_engine_from_settings(str name)
str markdown_to_text(str markdown_str)