2"""Utility functions for the engines"""
12from collections.abc
import MutableMapping, Callable
14from numbers
import Number
15from os.path
import splitext, join
16from random
import choice
17from html.parser
import HTMLParser
18from html
import escape
19from urllib.parse
import urljoin, urlparse, parse_qs, urlencode
20from datetime
import timedelta
21from markdown_it
import MarkdownIt
24from lxml.etree
import XPath, XPathError, XPathSyntaxError
25from lxml.etree
import ElementBase, _Element
27from searx
import settings
31from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
32from searx
import logger
35 import fasttext.FastText
38logger = logger.getChild(
'utils')
40XPathSpecType: t.TypeAlias = str | XPath
41"""Type alias used by :py:obj:`searx.utils.get_xpath`,
42:py:obj:`searx.utils.eval_xpath` and other XPath selectors."""
44ElementType: t.TypeAlias = ElementBase | _Element
47_BLOCKED_TAGS = (
'script',
'style')
49_ECMA_UNESCAPE4_RE = re.compile(
r'%u([0-9a-fA-F]{4})', re.UNICODE)
50_ECMA_UNESCAPE2_RE = re.compile(
r'%([0-9a-fA-F]{2})', re.UNICODE)
52_JS_QUOTE_KEYS_RE = re.compile(
r'([\{\s,])(\w+)(:)')
53_JS_VOID_RE = re.compile(
r'void\s+[0-9]+|void\s*\([0-9]+\)')
54_JS_DECIMAL_RE = re.compile(
r":\s*\.")
56_XPATH_CACHE: dict[str, XPath] = {}
57_LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}
59_FASTTEXT_MODEL:
"fasttext.FastText._FastText | None" =
None
60"""fasttext model to predict language of a search term"""
62SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split(
'-')[0]
for searxng_locale
in sxng_locales])
63"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
67 """Internal class for this module, do not create instance of this class.
68 Replace the None value, allow explicitly pass None as a function argument"""
71_NOTSET = _NotSetClass()
75 """Return the SearXNG User Agent"""
76 return f
"SearXNG/{VERSION_TAG} {settings['outgoing']['useragent_suffix']}".strip()
80 """Return a random browser User Agent
82 See searx/data/useragents.json
84 return USER_AGENTS[
'ua'].format(
85 os=os_string
or choice(USER_AGENTS[
'os']),
86 version=choice(USER_AGENTS[
'versions']),
91 """Internal class to extract text from HTML"""
94 HTMLParser.__init__(self)
107 if tag != self.
tags[-1]:
108 self.
result.append(f
"</{tag}>")
114 return not self.
tags or self.
tags[-1]
not in _BLOCKED_TAGS
124 if name[0]
in (
'x',
'X'):
125 codepoint = int(name[1:], 16)
127 codepoint = int(name)
128 self.
result.append(chr(codepoint))
138 return ''.join(self.
result).strip()
140 def error(self, message: str) ->
None:
143 raise AssertionError(message)
147 """Extract text from a HTML string
150 * html_str (str): string HTML
153 * str: extracted text
156 >>> html_to_text('Example <span id="42">#2</span>')
159 >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
162 >>> html_to_text(r'regexp: (?<![a-zA-Z]')
163 'regexp: (?<![a-zA-Z]'
165 >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
166 'Lorem ipsum </i>dolor sit amet</p>'
168 >>> html_to_text(r'> < a')
174 html_str = html_str.replace(
'\n',
' ').replace(
'\r',
' ')
175 html_str =
' '.join(html_str.split())
180 except AssertionError:
182 s.feed(escape(html_str, quote=
True))
188 """Extract text from a Markdown string
191 * markdown_str (str): string Markdown
194 * str: extracted text
197 >>> markdown_to_text('[example](https://example.com)')
200 >>> markdown_to_text('## Headline')
205 MarkdownIt(
"commonmark", {
"typographer":
True}).enable([
"replacements",
"smartquotes"]).render(markdown_str)
211 xpath_results: list[ElementType] | ElementType | str | Number | bool |
None,
212 allow_none: bool =
False,
214 """Extract text from a lxml result
216 - If ``xpath_results`` is a list of :py:obj:`ElementType` objects, extract
217 the text from each result and concatenate the list in a string.
219 - If ``xpath_results`` is a :py:obj:`ElementType` object, extract all the
220 text node from it ( :py:obj:`lxml.html.tostring`, ``method="text"`` )
222 - If ``xpath_results`` is of type :py:obj:`str` or :py:obj:`Number`,
223 :py:obj:`bool` the string value is returned.
225 - If ``xpath_results`` is of type ``None`` a :py:obj:`ValueError` is raised,
226 except ``allow_none`` is ``True`` where ``None`` is returned.
229 if isinstance(xpath_results, list):
232 for e
in xpath_results:
234 return result.strip()
235 if isinstance(xpath_results, ElementType):
237 text: str = html.tostring(
243 text = text.strip().replace(
'\n',
' ')
244 return ' '.join(text.split())
245 if isinstance(xpath_results, (str, Number, bool)):
246 return str(xpath_results)
247 if xpath_results
is None and allow_none:
249 if xpath_results
is None and not allow_none:
250 raise ValueError(
'extract_text(None, allow_none=False)')
251 raise ValueError(
'unsupported type')
255 """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path
258 * url (str): Relative URL
259 * base_url (str): Base URL, it must be an absolute URL.
262 >>> normalize_url('https://example.com', 'http://example.com/')
263 'https://example.com/'
264 >>> normalize_url('//example.com', 'http://example.com/')
265 'http://example.com/'
266 >>> normalize_url('//example.com', 'https://example.com/')
267 'https://example.com/'
268 >>> normalize_url('/path?a=1', 'https://example.com')
269 'https://example.com/path?a=1'
270 >>> normalize_url('', 'https://example.com')
271 'https://example.com/'
272 >>> normalize_url('/test', '/path')
276 * lxml.etree.ParserError
279 * str: normalized URL
281 if url.startswith(
'//'):
283 parsed_search_url = urlparse(base_url)
284 url =
'{0}:{1}'.format(parsed_search_url.scheme
or 'http', url)
285 elif url.startswith(
'/'):
287 url = urljoin(base_url, url)
291 url = urljoin(base_url, url)
293 parsed_url = urlparse(url)
296 if not parsed_url.netloc:
297 raise ValueError(
'Cannot parse url')
298 if not parsed_url.path:
304def extract_url(xpath_results: list[ElementType] | ElementType | str | Number | bool |
None, base_url: str) -> str:
305 """Extract and normalize URL from lxml Element
308 >>> def f(s, search_url):
309 >>> return searx.utils.extract_url(html.fromstring(s), search_url)
310 >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
311 'https://example.com/'
312 >>> f('https://example.com', 'http://example.com/')
313 'https://example.com/'
314 >>> f('//example.com', 'http://example.com/')
315 'http://example.com/'
316 >>> f('//example.com', 'https://example.com/')
317 'https://example.com/'
318 >>> f('/path?a=1', 'https://example.com')
319 'https://example.com/path?a=1'
320 >>> f('', 'https://example.com')
321 raise lxml.etree.ParserError
322 >>> searx.utils.extract_url([], 'https://example.com')
327 * lxml.etree.ParserError
330 * str: normalized URL
332 if xpath_results == []:
333 raise ValueError(
'Empty url resultset')
338 raise ValueError(
'URL not found')
def dict_subset(dictionary: MutableMapping[t.Any, t.Any], properties: set[str]) -> MutableMapping[str, t.Any]:
    """Extract a subset of a dict

    Only the keys listed in ``properties`` that are actually present in
    ``dictionary`` are copied; keys missing from ``dictionary`` are silently
    ignored.  The input mapping is not modified.

    Examples:
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
        {'A': 'a', 'C': 'c'}
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
        {'A': 'a'}
    """
    return {key: dictionary[key] for key in properties if key in dictionary}
354 """Determine the *human readable* value of bytes on 1024 base (1KB=1024B)."""
355 s = [
'B ',
'KB',
'MB',
'GB',
'TB']
359 while size > 1024
and p < x:
362 return "%.*f %s" % (precision, size, s[p])
366 """Determine the *human readable* value of a decimal number."""
367 s = [
'',
'K',
'M',
'B',
'T']
371 while size > 1000
and p < x:
374 return "%.*f%s" % (precision, size, s[p])
def convert_str_to_int(number_str: str) -> int:
    """Convert number_str to int or 0 if number_str is not a number.

    Only strings made up entirely of digits are converted; signs, whitespace
    and separators make the string "not a number" and yield ``0``.
    """
    if not number_str.isdigit():
        return 0
    try:
        return int(number_str)
    except ValueError:
        # str.isdigit() also accepts characters like superscript digits
        # ('²') which int() can not parse.
        return 0
def extr(txt: str, begin: str, end: str, default: str = "") -> str:
    """Extract the string between ``begin`` and ``end`` from ``txt``

    :param txt:     String to search in
    :param begin:   First string to be searched for
    :param end:     Second string to be searched for after ``begin``
    :param default: Default value if one of ``begin`` or ``end`` is not
                    found.  Defaults to an empty string.
    :return: The string between the two search-strings ``begin`` and ``end``.
             If at least one of ``begin`` or ``end`` is not found, the value of
             ``default`` is returned.

    Examples:
      >>> extr("abcde", "a", "e")
      'bcd'
      >>> extr("abcde", "a", "z", default="nothing")
      'nothing'

    """
    try:
        # str.index raises ValueError when the needle is missing; ``end`` is
        # only searched for behind the first occurrence of ``begin``.
        first = txt.index(begin) + len(begin)
        return txt[first : txt.index(end, first)]
    except ValueError:
        return default
414 """Convert num to int or 0. num can be either a str or a list.
415 If num is a list, the first element is converted to int (or return 0 if the list is empty).
416 If num is a str, see convert_str_to_int
418 if isinstance(num, list):
def load_module(filename: str, module_dir: str) -> types.ModuleType:
    """Load and execute the Python file ``filename`` from ``module_dir``.

    :param filename: File name of the module; the name without its extension
        is used as the module name.
    :param module_dir: Directory that contains ``filename``.
    :return: The executed module object.
    :raises ValueError: If no import spec or no loader can be determined for
        the file.

    Exceptions raised while executing the module's code are propagated.
    """
    modname = splitext(filename)[0]
    modpath = join(module_dir, filename)

    spec = importlib.util.spec_from_file_location(modname, modpath)
    if spec is None:
        raise ValueError(f"Error loading '{modpath}' module")
    module = importlib.util.module_from_spec(spec)
    if spec.loader is None:
        raise ValueError(f"Error loading '{modpath}' module")
    spec.loader.exec_module(module)
    return module
440 """Convert obj to its string representation."""
441 if isinstance(obj, str):
443 if hasattr(obj,
'__str__'):
449 """Python implementation of the unescape javascript function
451 https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
452 https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
455 >>> ecma_unescape('%u5409')
457 >>> ecma_unescape('%20')
459 >>> ecma_unescape('%F3')
463 string = _ECMA_UNESCAPE4_RE.sub(
lambda e: chr(int(e.group(1), 16)), string)
465 string = _ECMA_UNESCAPE2_RE.sub(
lambda e: chr(int(e.group(1), 16)), string)
470 """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
472 .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
474 pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
478 if any(a <= i <= b
for (a, b)
in pua_ranges):
485 rep = {re.escape(k): v
for k, v
in replaces.items()}
486 pattern = re.compile(
"|".join(rep.keys()))
489 return pattern.sub(
lambda m: rep[re.escape(m.group(0))], text)
495 """Return engine configuration from settings.yml of a given engine name"""
497 if 'engines' not in settings:
500 for engine
in settings[
'engines']:
501 if 'name' not in engine:
503 if name == engine[
'name']:
510 """Return cached compiled :py:obj:`lxml.etree.XPath` object.
513 Raised when ``xpath_spec`` is neither a :py:obj:`str` nor a
514 :py:obj:`lxml.etree.XPath`.
516 ``SearxXPathSyntaxException``:
517 Raised when there is a syntax error in the *XPath* selector (``str``).
519 if isinstance(xpath_spec, str):
520 result = _XPATH_CACHE.get(xpath_spec,
None)
523 result = XPath(xpath_spec)
524 except XPathSyntaxError
as e:
526 _XPATH_CACHE[xpath_spec] = result
529 if isinstance(xpath_spec, XPath):
532 raise TypeError(
'xpath_spec must be either a str or a lxml.etree.XPath')
535def eval_xpath(element: ElementType, xpath_spec: XPathSpecType) -> t.Any:
536 """Equivalent of ``element.xpath(xpath_str)`` but compile ``xpath_str`` into
537 a :py:obj:`lxml.etree.XPath` object once for all. The return value of
538 ``xpath(..)`` is complex, read `XPath return values`_ for more details.
540 .. _XPath return values:
541 https://lxml.de/xpathxslt.html#xpath-return-values
544 Raised when ``xpath_spec`` is neither a :py:obj:`str` nor a
545 :py:obj:`lxml.etree.XPath`.
547 ``SearxXPathSyntaxException``:
548 Raised when there is a syntax error in the *XPath* selector (``str``).
550 ``SearxEngineXPathException:``
551 Raised when the XPath can't be evaluated (masked
552 :py:obj:`lxml.etree.XPathError`).
557 return xpath(element)
558 except XPathError
as e:
559 arg =
' '.join([str(i)
for i
in e.args])
563def eval_xpath_list(element: ElementType, xpath_spec: XPathSpecType, min_len: int |
None =
None) -> list[t.Any]:
564 """Same as :py:obj:`searx.utils.eval_xpath`, but additionally ensures the
565 return value is a :py:obj:`list`. The minimum length of the list is also
566 checked (if ``min_len`` is set)."""
568 result: list[t.Any] =
eval_xpath(element, xpath_spec)
569 if not isinstance(result, list):
571 if min_len
is not None and min_len > len(result):
577 element: ElementType,
578 xpath_spec: XPathSpecType,
580 default: t.Any = _NOTSET,
582 """Same as :py:obj:`searx.utils.eval_xpath_list`, but returns item on
583 position ``index`` from the list (index starts with ``0``).
585 The exceptions known from :py:obj:`searx.utils.eval_xpath` are thrown. If a
586 default is specified, this is returned if an element at position ``index``
587 could not be determined.
591 if -len(result) <= index < len(result):
593 if default == _NOTSET:
601 global _FASTTEXT_MODEL
602 if _FASTTEXT_MODEL
is None:
606 fasttext.FastText.eprint =
lambda x:
None
607 _FASTTEXT_MODEL = fasttext.load_model(str(data_dir /
'lid.176.ftz'))
608 return _FASTTEXT_MODEL
613 Converts a standard video URL into its embed format. Supported services include Youtube,
614 Facebook, Instagram, TikTok, Dailymotion, and Bilibili.
616 parsed_url = urlparse(url)
620 if parsed_url.netloc
in [
'www.youtube.com',
'youtube.com']
and parsed_url.path ==
'/watch' and parsed_url.query:
621 video_id = parse_qs(parsed_url.query).get(
'v', [])
623 iframe_src =
'https://www.youtube-nocookie.com/embed/' + video_id[0]
626 elif parsed_url.netloc
in [
'www.facebook.com',
'facebook.com']:
627 encoded_href = urlencode({
'href': url})
628 iframe_src =
'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href
631 elif parsed_url.netloc
in [
'www.instagram.com',
'instagram.com']
and parsed_url.path.startswith(
'/p/'):
632 if parsed_url.path.endswith(
'/'):
633 iframe_src = url +
'embed'
635 iframe_src = url +
'/embed'
639 parsed_url.netloc
in [
'www.tiktok.com',
'tiktok.com']
640 and parsed_url.path.startswith(
'/@')
641 and '/video/' in parsed_url.path
643 path_parts = parsed_url.path.split(
'/video/')
644 video_id = path_parts[1]
645 iframe_src =
'https://www.tiktok.com/embed/' + video_id
648 elif parsed_url.netloc
in [
'www.dailymotion.com',
'dailymotion.com']
and parsed_url.path.startswith(
'/video/'):
649 path_parts = parsed_url.path.split(
'/')
650 if len(path_parts) == 3:
651 video_id = path_parts[2]
652 iframe_src =
'https://www.dailymotion.com/embed/video/' + video_id
655 elif parsed_url.netloc
in [
'www.bilibili.com',
'bilibili.com']
and parsed_url.path.startswith(
'/video/'):
656 path_parts = parsed_url.path.split(
'/')
658 video_id = path_parts[2]
660 if video_id.startswith(
'av'):
661 video_id = video_id[2:]
663 elif video_id.startswith(
'BV'):
667 f
'https://player.bilibili.com/player.html?{param_key}={video_id}&high_quality=1&autoplay=false&danmaku=0'
673def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool =
False) -> str |
None:
674 """Detect the language of the ``text`` parameter.
676 :param str text: The string whose language is to be detected.
678 :param float threshold: Threshold filters the returned labels by a threshold
679 on probability. A choice of 0.3 will return labels with at least 0.3
682 :param bool only_search_languages: If ``True``, returns only supported
683 SearXNG search languages. see :py:obj:`searx.languages`
687 The detected language code or ``None``. See below.
689 :raises ValueError: If ``text`` is not a string.
691 The language detection is done by using `a fork`_ of the fastText_ library
692 (`python fasttext`_). fastText_ distributes the `language identification
693 model`_, for reference:
695 - `FastText.zip: Compressing text classification models`_
696 - `Bag of Tricks for Efficient Text Classification`_
698 The `language identification model`_ support the language codes
701 af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
702 bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
703 et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
704 id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
705 lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
706 nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
707 rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
708 tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
710 By using ``only_search_languages=True`` the `language identification model`_
711 is harmonized with the SearXNG's language (locale) model. General
712 conditions of SearXNG's locale model are:
714 a. SearXNG's locale of a query is passed to the
715 :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
716 code that is used by an engine.
718 b. Most of SearXNG's engines do not support all the languages from `language
719 identification model`_ and there is also a discrepancy in the ISO-639-3
720 (fasttext) and ISO-639-2 (SearXNG) handling. Furthermore, in SearXNG the
721 locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
722 (``zh_Hans``) while the `language identification model`_ reduces both to
725 .. _a fork: https://github.com/searxng/fasttext-predict
726 .. _fastText: https://fasttext.cc/
727 .. _python fasttext: https://pypi.org/project/fasttext/
728 .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
729 .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
730 .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
733 if not isinstance(text, str):
734 raise ValueError(
'text must a str')
736 if isinstance(r, tuple)
and len(r) == 2
and len(r[0]) > 0
and len(r[1]) > 0:
737 language = r[0][0].split(
'__label__')[1]
738 if only_search_languages
and language
not in SEARCH_LANGUAGE_CODES:
745 """Convert a javascript variable into JSON and then load the value
747 It does not deal with all cases, but it is good enough for now.
748 chompjs has a better implementation.
757 parts = re.split(
r'(["\'])', js_variable)
760 for i, p
in enumerate(parts):
765 parts[i] = parts[i].replace(
':', chr(1))
771 parts[i] = parts[i].replace(
'"',
r'\"')
774 if not in_string
and p
in (
'"',
"'"):
785 if len(previous_p) > 0
and previous_p[-1] ==
'\\':
797 parts[i] = _JS_VOID_RE.sub(
"null", p)
806 s = _JS_QUOTE_KEYS_RE.sub(
r'\1"\2"\3', s)
807 s = _JS_DECIMAL_RE.sub(
":0.", s)
809 s = s.replace(chr(1),
':')
814 s = s.replace(
"',",
"\",")
820 """Parse a time string in format MM:SS or HH:MM:SS and convert it to a `timedelta` object.
822 Returns None if the provided string doesn't match any of the formats.
824 duration_str = duration_str.strip()
831 time_parts = ([
"00"] + duration_str.split(
":"))[:3]
832 hours, minutes, seconds = map(int, time_parts)
833 return timedelta(hours=hours, minutes=minutes, seconds=seconds)
835 except (ValueError, TypeError):
str|None detect_language(str text, float threshold=0.3, bool only_search_languages=False)
list[t.Any] eval_xpath_list(ElementType element, XPathSpecType xpath_spec, int|None min_len=None)
t.Any eval_xpath_getindex(ElementType element, XPathSpecType xpath_spec, int index, t.Any default=_NOTSET)
extr(str txt, str begin, str end, str default="")
t.Any eval_xpath(ElementType element, XPathSpecType xpath_spec)
str js_variable_to_python(str js_variable)
XPath get_xpath(XPathSpecType xpath_spec)
humanize_bytes(int|float size, int precision=2)
str ecma_unescape(str string)
int convert_str_to_int(str number_str)
int int_or_zero(list[str]|str num)
"fasttext.FastText._FastText" _get_fasttext_model()
humanize_number(int|float size, int precision=0)
remove_pua_from_str(str string)
types.ModuleType load_module(str filename, str module_dir)
MutableMapping[str, t.Any] dict_subset(MutableMapping[t.Any, t.Any] dictionary, set[str] properties)
str gen_useragent(str|None os_string=None)
get_embeded_stream_url(str url)
str normalize_url(str url, str base_url)
str extract_url(list[ElementType]|ElementType|str|Number|bool|None xpath_results, str base_url)
str html_to_text(str html_str)
timedelta|None parse_duration_string(str duration_str)
dict[str, dict[str, str]] get_engine_from_settings(str name)
str|None extract_text(list[ElementType]|ElementType|str|Number|bool|None xpath_results, bool allow_none=False)
str markdown_to_text(str markdown_str)
Callable[[str], str] get_string_replaces_function(dict[str, str] replaces)