.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
utils.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Utility functions for the engines"""
3
4
5import re
6import importlib
7import importlib.util
8import json
9import types
10
11import typing as t
12from collections.abc import MutableMapping, Callable
13
14from numbers import Number
15from os.path import splitext, join
16from random import choice
17from html.parser import HTMLParser
18from html import escape
19from urllib.parse import urljoin, urlparse, parse_qs, urlencode
20from datetime import timedelta
21from markdown_it import MarkdownIt
22
23from lxml import html
24from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError
25
26from searx import settings
27from searx.data import USER_AGENTS, data_dir
28from searx.version import VERSION_TAG
29from searx.sxng_locales import sxng_locales
30from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
31from searx import logger
32
33if t.TYPE_CHECKING:
34 import fasttext.FastText # type: ignore
35
36
logger = logger.getChild('utils')

XPathSpecType: t.TypeAlias = str | XPath
"""Type alias used by :py:obj:`searx.utils.get_xpath`,
:py:obj:`searx.utils.eval_xpath` and other XPath selectors."""

# tags whose text content is dropped by HTMLTextExtractor / html_to_text()
_BLOCKED_TAGS = ('script', 'style')

# "%uXXXX" escape sequences (4 hex digits) -- see ecma_unescape()
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
# "%XX" escape sequences (2 hex digits) -- see ecma_unescape()
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
47
48_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
49_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\‍([0-9]+\‍)')
50_JS_DECIMAL_RE = re.compile(r":\s*\.")
51
# cache of compiled XPath objects keyed by the expression string,
# filled lazily by get_xpath()
_XPATH_CACHE: dict[str, XPath] = {}
# language-name -> code mapping cache
# NOTE(review): not referenced by the code visible in this module --
# presumably filled/read elsewhere; confirm before removing.
_LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}

# loaded lazily by _get_fasttext_model() and kept for the process lifetime
_FASTTEXT_MODEL: "fasttext.FastText._FastText | None" = None  # pyright: ignore[reportPrivateUsage]
"""fasttext model to predict language of a search term"""

# region-less language codes, e.g. "en" taken from "en-US"
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
60
61
class _NotSetClass:  # pylint: disable=too-few-public-methods
    """Internal class for this module, do not create instance of this class.
    Replace the None value, allow explicitly pass None as a function argument"""


# module-private sentinel: lets callers distinguish "argument not given"
# from an explicit ``None`` (see eval_xpath_getindex)
_NOTSET = _NotSetClass()
68
69
def searxng_useragent() -> str:
    """Build the User-Agent string SearXNG sends for its own HTTP requests."""
    suffix = settings['outgoing']['useragent_suffix']
    return f"SearXNG/{VERSION_TAG} {suffix}".strip()
73
74
def gen_useragent(os_string: str | None = None) -> str:
    """Return a random browser User Agent

    See searx/data/useragents.json
    """
    os_value = os_string if os_string else choice(USER_AGENTS['os'])
    browser_version = choice(USER_AGENTS['versions'])
    return USER_AGENTS['ua'].format(os=os_value, version=browser_version)
84
85
class HTMLTextExtractor(HTMLParser):
    """Internal helper that collects the plain-text content of an HTML
    fragment (used by :py:obj:`html_to_text`)."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.result: list[str] = []  # collected text fragments
        self.tags: list[str] = []  # stack of currently open tags

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        self.tags.append(tag)
        if tag == 'br':
            self.result.append(' ')

    def handle_endtag(self, tag: str) -> None:
        if not self.tags:
            return
        if tag == self.tags[-1]:
            self.tags.pop()
        else:
            # mismatched closing tag: keep it verbatim in the output
            self.result.append(f"</{tag}>")

    def is_valid_tag(self):
        # text inside _BLOCKED_TAGS (script/style) is skipped
        return not self.tags or self.tags[-1] not in _BLOCKED_TAGS

    def handle_data(self, data: str) -> None:
        if self.is_valid_tag():
            self.result.append(data)

    def handle_charref(self, name: str) -> None:
        if not self.is_valid_tag():
            return
        # numeric character reference: "xHH.." is hexadecimal, otherwise decimal
        if name[0] in ('x', 'X'):
            self.result.append(chr(int(name[1:], 16)))
        else:
            self.result.append(chr(int(name, 10)))

    def handle_entityref(self, name: str) -> None:
        if self.is_valid_tag():
            # keep the bare entity name (no name2codepoint lookup)
            self.result.append(name)

    def get_text(self):
        return ''.join(self.result).strip()

    def error(self, message: str) -> None:
        # error handler is needed in <py3.10
        # https://github.com/python/cpython/pull/8562/files
        raise AssertionError(message)
140
141
def html_to_text(html_str: str) -> str:
    """Extract text from a HTML string

    Args:
        * html_str (str): string HTML

    Returns:
        * str: extracted text

    Examples:
        >>> html_to_text('Example <span id="42">#2</span>')
        'Example #2'

        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
        'Example'

        >>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
        'regexp: (?<![a-zA-Z]'

        >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
        'Lorem ipsum </i>dolor sit amet</p>'

        >>> html_to_text(r'&#x3e &#x3c &#97')
        '> < a'

    """
    if not html_str:
        return ""
    # normalize all whitespace (newlines, runs of blanks) to single spaces
    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
    html_str = ' '.join(html_str.split())
    # NOTE: the extractor instantiations below were missing in the reviewed
    # text (``s`` was referenced but never created) -- restored here.
    s = HTMLTextExtractor()
    try:
        s.feed(html_str)
        s.close()
    except AssertionError:
        # the parser bailed out on malformed markup: retry with the markup
        # escaped so it is treated as plain text
        s = HTMLTextExtractor()
        s.feed(escape(html_str, quote=True))
        s.close()
    return s.get_text()
181
182
def markdown_to_text(markdown_str: str) -> str:
    """Extract text from a Markdown string

    Args:
        * markdown_str (str): string Markdown

    Returns:
        * str: extracted text

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """
    # render Markdown to HTML first, then strip the markup
    md = MarkdownIt("commonmark", {"typographer": True})
    md.enable(["replacements", "smartquotes"])
    rendered: str = md.render(markdown_str)
    return html_to_text(rendered)
204
205
def extract_text(
    xpath_results: list[ElementBase] | ElementBase | str | Number | bool | None,
    allow_none: bool = False,
) -> str | None:
    """Extract text from a lxml result

    * if xpath_results is list, extract the text from each result and concat the list
    * if xpath_results is a xml element, extract all the text node from it
      ( text_content() method from lxml )
    * if xpath_results is a string element, then it's already done

    :raises ValueError: when ``xpath_results`` is ``None`` and ``allow_none``
        is ``False``, or when its type is not supported.

    NOTE: the ``def`` header line was dropped from the reviewed text; it is
    restored here with the documented signature.
    """
    if isinstance(xpath_results, list):
        # it's list of result : concat everything using recursive call
        result = ''
        for e in xpath_results:
            result = result + (extract_text(e) or '')
        return result.strip()
    if isinstance(xpath_results, ElementBase):
        # it's a element: serialize its text nodes and normalize whitespace
        text: str = html.tostring(  # type: ignore
            xpath_results,  # pyright: ignore[reportArgumentType]
            encoding='unicode',
            method='text',
            with_tail=False,
        )
        text = text.strip().replace('\n', ' ')  # type: ignore
        return ' '.join(text.split())  # type: ignore
    if isinstance(xpath_results, (str, Number, bool)):
        return str(xpath_results)
    if xpath_results is None and allow_none:
        return None
    if xpath_results is None and not allow_none:
        raise ValueError('extract_text(None, allow_none=False)')
    raise ValueError('unsupported type')
240
241
def normalize_url(url: str, base_url: str) -> str:
    """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path

    Args:
        * url (str): Relative URL
        * base_url (str): Base URL, it must be an absolute URL.

    Example:
        >>> normalize_url('https://example.com', 'http://example.com/')
        'https://example.com/'
        >>> normalize_url('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> normalize_url('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> normalize_url('', 'https://example.com')
        'https://example.com/'
        >>> normalize_url('/test', '/path')
        raise ValueError

    Raises:
        * lxml.etree.ParserError

    Returns:
        * str: normalized URL
    """
    if url.startswith('//'):
        # protocol-relative URL: inherit the scheme of base_url
        scheme = urlparse(base_url).scheme or 'http'
        url = f'{scheme}:{url}'
    elif url.startswith('/'):
        # fix relative url to the search engine
        url = urljoin(base_url, url)

    # fix relative urls that fall through the crack
    if '://' not in url:
        url = urljoin(base_url, url)

    parsed = urlparse(url)
    if not parsed.netloc:
        raise ValueError('Cannot parse url')
    # add a / at the end of the url if there is no path
    if not parsed.path:
        url += '/'
    return url
290
291
def extract_url(xpath_results: list[ElementBase] | ElementBase | str | Number | bool | None, base_url: str) -> str:
    """Extract and normalize URL from lxml Element

    Example:
        >>> def f(s, search_url):
        >>>     return searx.utils.extract_url(html.fromstring(s), search_url)
        >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
        'https://example.com/'
        >>> f('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> f('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> searx.utils.extract_url([], 'https://example.com')
        raise ValueError

    Raises:
        * ValueError
        * lxml.etree.ParserError

    Returns:
        * str: normalized URL
    """
    if xpath_results == []:
        raise ValueError('Empty url resultset')

    url = extract_text(xpath_results)
    if not url:
        raise ValueError('URL not found')
    return normalize_url(url, base_url)
327
328
def dict_subset(dictionary: MutableMapping[t.Any, t.Any], properties: set[str]) -> MutableMapping[str, t.Any]:
    """Extract a subset of a dict

    Only the keys listed in ``properties`` that actually exist in
    ``dictionary`` are copied.

    Examples:
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
        {'A': 'a', 'C': 'c'}
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
        {'A': 'a'}
    """
    # (the second example above had a garbled doctest prompt ">>> >>" -- fixed)
    return {k: dictionary[k] for k in properties if k in dictionary}
339
340
341def humanize_bytes(size: int | float, precision: int = 2):
342 """Determine the *human readable* value of bytes on 1024 base (1KB=1024B)."""
343 s = ['B ', 'KB', 'MB', 'GB', 'TB']
344
345 x = len(s)
346 p = 0
347 while size > 1024 and p < x:
348 p += 1
349 size = size / 1024.0
350 return "%.*f %s" % (precision, size, s[p])
351
352
353def humanize_number(size: int | float, precision: int = 0):
354 """Determine the *human readable* value of a decimal number."""
355 s = ['', 'K', 'M', 'B', 'T']
356
357 x = len(s)
358 p = 0
359 while size > 1000 and p < x:
360 p += 1
361 size = size / 1000.0
362 return "%.*f%s" % (precision, size, s[p])
363
364
def convert_str_to_int(number_str: str) -> int:
    """Convert ``number_str`` to an int, or return 0 when it is not a plain
    decimal number (``str.isdigit``)."""
    return int(number_str) if number_str.isdigit() else 0
370
371
def extr(txt: str, begin: str, end: str, default: str = ""):
    """Extract the string between ``begin`` and ``end`` from ``txt``

    :param txt: String to search in
    :param begin: First string to be searched for
    :param end: Second string to be searched for after ``begin``
    :param default: Default value if one of ``begin`` or ``end`` is not
        found.  Defaults to an empty string.
    :return: The string between the two search-strings ``begin`` and ``end``.
        If at least one of ``begin`` or ``end`` is not found, the value of
        ``default`` is returned.

    Examples:
        >>> extr("abcde", "a", "e")
        "bcd"
        >>> extr("abcde", "a", "z", default="nothing")
        "nothing"

    """
    # (the example above spelled the keyword "deafult" -- fixed)

    # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129

    try:
        first = txt.index(begin) + len(begin)
        return txt[first : txt.index(end, first)]
    except ValueError:
        # str.index raises ValueError when begin/end is not found
        return default
399
400
def int_or_zero(num: list[str] | str) -> int:
    """Convert num to int or 0. num can be either a str or a list.
    If num is a list, the first element is converted to int (or return 0 if the list is empty).
    If num is a str, see convert_str_to_int
    """
    if isinstance(num, list):
        return convert_str_to_int(num[0]) if num else 0
    return convert_str_to_int(num)
411
412
def is_valid_lang(lang: str) -> tuple[bool, str, str] | None:
    """Return language code and name if lang describe a language.

    Examples:
        >>> is_valid_lang('zz')
        None
        >>> is_valid_lang('uk')
        (True, 'uk', 'ukrainian')
        >>> is_valid_lang(b'uk')
        (True, 'uk', 'ukrainian')
        >>> is_valid_lang('en')
        (True, 'en', 'english')
        >>> searx.utils.is_valid_lang('Español')
        (True, 'es', 'spanish')
        >>> searx.utils.is_valid_lang('Spanish')
        (True, 'es', 'spanish')
    """
    if isinstance(lang, bytes):
        lang = lang.decode()
    # a two-character value is treated as an ISO language code
    is_abbr = len(lang) == 2
    lang = lang.lower()
    if is_abbr:
        for locale in sxng_locales:
            if locale[0][:2] == lang:
                return (True, locale[0][:2], locale[3].lower())
        return None
    # otherwise match the native or the English language name
    for locale in sxng_locales:
        if lang in (locale[1].lower(), locale[3].lower()):
            return (True, locale[0][:2], locale[3].lower())
    return None
443
444
def load_module(filename: str, module_dir: str) -> types.ModuleType:
    """Load and execute the Python module ``filename`` located in
    ``module_dir`` and return the module object.

    See https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly

    :raises ValueError: when no import spec (or loader) can be created for the path.
    """
    modname = splitext(filename)[0]
    modpath = join(module_dir, filename)
    spec = importlib.util.spec_from_file_location(modname, modpath)
    if spec is None:
        raise ValueError(f"Error loading '{modpath}' module")
    module = importlib.util.module_from_spec(spec)
    if spec.loader is None:
        raise ValueError(f"Error loading '{modpath}' module")
    spec.loader.exec_module(module)
    return module
457
458
def to_string(obj: t.Any) -> str:
    """Convert obj to its string representation."""
    if isinstance(obj, str):
        return obj
    # every Python object has __str__, so repr() is a defensive fallback
    # that is not reached in practice
    return str(obj) if hasattr(obj, '__str__') else repr(obj)
466
467
def ecma_unescape(string: str) -> str:
    """Python implementation of the unescape javascript function

    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
    https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape

    Examples:
        >>> ecma_unescape('%u5409')
        '吉'
        >>> ecma_unescape('%20')
        ' '
        >>> ecma_unescape('%F3')
        'ó'
    """

    def _decode(match: "re.Match[str]") -> str:
        return chr(int(match.group(1), 16))

    # "%uXXXX" sequences first ("%u5409" becomes "吉") ...
    string = re.sub(r'%u([0-9a-fA-F]{4})', _decode, string)
    # ... then plain "%XX" sequences ("%20" becomes " ", "%F3" becomes "ó")
    return re.sub(r'%([0-9a-fA-F]{2})', _decode, string)
487
488
def remove_pua_from_str(string: str):
    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.

    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
    """
    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))

    def _is_pua(char: str) -> bool:
        codepoint = ord(char)
        return any(lo <= codepoint <= hi for lo, hi in pua_ranges)

    return ''.join(c for c in string if not _is_pua(c))
502
503
def get_string_replaces_function(replaces: dict[str, str]) -> Callable[[str], str]:
    """Return a function replacing every key of ``replaces`` found in its
    input string by the corresponding value, in a single regex pass."""
    escaped = {re.escape(old): new for old, new in replaces.items()}
    pattern = re.compile("|".join(escaped))

    def replace_all(text: str) -> str:
        return pattern.sub(lambda match: escaped[re.escape(match.group(0))], text)

    return replace_all
512
513
def get_engine_from_settings(name: str) -> dict[str, dict[str, str]]:
    """Return engine configuration from settings.yml of a given engine name"""

    if 'engines' not in settings:
        return {}

    # return the first engine entry whose 'name' matches
    for engine in settings['engines']:
        if engine.get('name') == name:
            return engine

    return {}
527
528
def get_xpath(xpath_spec: XPathSpecType) -> XPath:
    """Return cached compiled :py:obj:`lxml.etree.XPath` object.

    ``TypeError``:
        Raised when ``xpath_spec`` is neither a :py:obj:`str` nor a
        :py:obj:`lxml.etree.XPath`.

    ``SearxXPathSyntaxException``:
        Raised when there is a syntax error in the *XPath* selector (``str``).
    """
    # already compiled: nothing to do
    if isinstance(xpath_spec, XPath):
        return xpath_spec

    if isinstance(xpath_spec, str):
        compiled = _XPATH_CACHE.get(xpath_spec)
        if compiled is None:
            try:
                compiled = XPath(xpath_spec)
            except XPathSyntaxError as e:
                raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
            _XPATH_CACHE[xpath_spec] = compiled
        return compiled

    raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')  # pyright: ignore[reportUnreachable]
553
554
def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType) -> t.Any:
    """Equivalent of ``element.xpath(xpath_str)`` but compile ``xpath_str`` into
    a :py:obj:`lxml.etree.XPath` object once for all. The return value of
    ``xpath(..)`` is complex, read `XPath return values`_ for more details.

    .. _XPath return values:
        https://lxml.de/xpathxslt.html#xpath-return-values

    ``TypeError``:
        Raised when ``xpath_spec`` is neither a :py:obj:`str` nor a
        :py:obj:`lxml.etree.XPath`.

    ``SearxXPathSyntaxException``:
        Raised when there is a syntax error in the *XPath* selector (``str``).

    ``SearxEngineXPathException:``
        Raised when the XPath can't be evaluated (masked
        :py:obj:`lxml.etree..XPathError`).
    """
    compiled: XPath = get_xpath(xpath_spec)
    try:
        # https://lxml.de/xpathxslt.html#xpath-return-values
        return compiled(element)
    except XPathError as e:
        # re-raise with the XPath expression attached for diagnostics
        details = ' '.join(str(arg) for arg in e.args)
        raise SearxEngineXPathException(xpath_spec, details) from e
581
582
def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: int | None = None) -> list[t.Any]:
    """Same as :py:obj:`searx.utils.eval_xpath`, but additionally ensures the
    return value is a :py:obj:`list`. The minimum length of the list is also
    checked (if ``min_len`` is set)."""

    result = eval_xpath(element, xpath_spec)
    if not isinstance(result, list):
        raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
    if min_len is not None and len(result) < min_len:
        raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
    return result
594
595
def eval_xpath_getindex(
    element: ElementBase,
    xpath_spec: XPathSpecType,
    index: int,
    default: t.Any = _NOTSET,
) -> t.Any:
    """Same as :py:obj:`searx.utils.eval_xpath_list`, but returns item on
    position ``index`` from the list (index starts with ``0``).

    The exceptions known from :py:obj:`searx.utils.eval_xpath` are thrown. If a
    default is specified, this is returned if an element at position ``index``
    could not be determined.

    NOTE: the ``def`` header line was dropped from the reviewed text; it is
    restored here with the documented signature.
    """

    result = eval_xpath_list(element, xpath_spec)
    # negative indexes are accepted, like Python list indexing
    if -len(result) <= index < len(result):
        return result[index]
    if default == _NOTSET:
        # raise an SearxEngineXPathException instead of IndexError to record
        # xpath_spec
        raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
    return default
618
619
def _get_fasttext_model() -> "fasttext.FastText._FastText":  # pyright: ignore[reportPrivateUsage]
    """Lazily load and cache the fasttext language-identification model.

    The model file ``lid.176.ftz`` is read from ``data_dir`` on first use and
    kept in the module-global ``_FASTTEXT_MODEL`` for the process lifetime.
    """
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is None:
        # imported here so fasttext is only required when language detection
        # is actually used
        import fasttext  # pylint: disable=import-outside-toplevel

        # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
        fasttext.FastText.eprint = lambda x: None  # type: ignore
        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))  # type: ignore
    return _FASTTEXT_MODEL
629
630
632 """
633 Converts a standard video URL into its embed format. Supported services include Youtube,
634 Facebook, Instagram, TikTok, Dailymotion, and Bilibili.
635 """
636 parsed_url = urlparse(url)
637 iframe_src = None
638
639 # YouTube
640 if parsed_url.netloc in ['www.youtube.com', 'youtube.com'] and parsed_url.path == '/watch' and parsed_url.query:
641 video_id = parse_qs(parsed_url.query).get('v', [])
642 if video_id:
643 iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]
644
645 # Facebook
646 elif parsed_url.netloc in ['www.facebook.com', 'facebook.com']:
647 encoded_href = urlencode({'href': url})
648 iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href
649
650 # Instagram
651 elif parsed_url.netloc in ['www.instagram.com', 'instagram.com'] and parsed_url.path.startswith('/p/'):
652 if parsed_url.path.endswith('/'):
653 iframe_src = url + 'embed'
654 else:
655 iframe_src = url + '/embed'
656
657 # TikTok
658 elif (
659 parsed_url.netloc in ['www.tiktok.com', 'tiktok.com']
660 and parsed_url.path.startswith('/@')
661 and '/video/' in parsed_url.path
662 ):
663 path_parts = parsed_url.path.split('/video/')
664 video_id = path_parts[1]
665 iframe_src = 'https://www.tiktok.com/embed/' + video_id
666
667 # Dailymotion
668 elif parsed_url.netloc in ['www.dailymotion.com', 'dailymotion.com'] and parsed_url.path.startswith('/video/'):
669 path_parts = parsed_url.path.split('/')
670 if len(path_parts) == 3:
671 video_id = path_parts[2]
672 iframe_src = 'https://www.dailymotion.com/embed/video/' + video_id
673
674 # Bilibili
675 elif parsed_url.netloc in ['www.bilibili.com', 'bilibili.com'] and parsed_url.path.startswith('/video/'):
676 path_parts = parsed_url.path.split('/')
677
678 video_id = path_parts[2]
679 param_key = None
680 if video_id.startswith('av'):
681 video_id = video_id[2:]
682 param_key = 'aid'
683 elif video_id.startswith('BV'):
684 param_key = 'bvid'
685
686 iframe_src = (
687 f'https://player.bilibili.com/player.html?{param_key}={video_id}&high_quality=1&autoplay=false&danmaku=0'
688 )
689
690 return iframe_src
691
692
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> str | None:
    """Detect the language of the ``text`` parameter.

    :param str text: The string whose language is to be detected.

    :param float threshold: Threshold filters the returned labels by a threshold
        on probability.  A choice of 0.3 will return labels with at least 0.3
        probability.

    :param bool only_search_languages: If ``True``, returns only supported
        SearXNG search languages.  see :py:obj:`searx.languages`

    :rtype: str, None
    :returns:
        The detected language code or ``None``. See below.

    :raises ValueError: If ``text`` is not a string.

    The language detection is done by using `a fork`_ of the fastText_ library
    (`python fasttext`_). fastText_ distributes the `language identification
    model`_, for reference:

    - `FastText.zip: Compressing text classification models`_
    - `Bag of Tricks for Efficient Text Classification`_

    The `language identification model`_ support the language codes
    (ISO-639-3)::

        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
        bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
        et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
        id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
        lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
        nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
        rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
        tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh

    By using ``only_search_languages=True`` the `language identification model`_
    is harmonized with the SearXNG's language (locale) model.  General
    conditions of SearXNG's locale model are:

    a. SearXNG's locale of a query is passed to the
       :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
       code that is used by an engine.

    b. Most of SearXNG's engines do not support all the languages from `language
       identification model`_ and there is also a discrepancy in the ISO-639-3
       (fasttext) and ISO-639-2 (SearXNG) handling.  Further more, in SearXNG the
       locales like ``zh-TH`` (``zh-CN``) are mapped to ``zh_Hant``
       (``zh_Hans``) while the `language identification model`_ reduce both to
       ``zh``.

    .. _a fork: https://github.com/searxng/fasttext-predict
    .. _fastText: https://fasttext.cc/
    .. _python fasttext: https://pypi.org/project/fasttext/
    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651

    """
    if not isinstance(text, str):
        # grammar fix: the message previously read "text must a str"
        raise ValueError('text must be a str')  # pyright: ignore[reportUnreachable]
    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)  # type: ignore
    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:  # type: ignore
        # labels look like "__label__en"; keep only the code part
        language = r[0][0].split('__label__')[1]  # type: ignore
        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
            return None
        return language  # type: ignore
    return None
762
763
def js_variable_to_python(js_variable: str) -> t.Any:
    """Convert a javascript variable into JSON and then load the value

    It does not deal with all cases, but it is good enough for now.
    chompjs has a better implementation.

    :return: the value produced by :py:func:`json.loads` (dict, list, str,
        number, ...).  The previous ``-> str`` annotation was wrong -- the
        function never returns the JSON text itself.
    """
    # when in_string is not None, it contains the character that has opened the string
    # either simple quote or double quote
    in_string = None
    # cut the string:
    # r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = re.split(r'(["\'])', js_variable)
    # previous part (to check the escape character antislash)
    previous_p = ""
    for i, p in enumerate(parts):
        # parse characters inside a ECMA string
        if in_string:
            # we are in a JS string: replace the colon by a temporary character
            # so quote_keys_regex doesn't have to deal with colon inside the JS strings
            parts[i] = parts[i].replace(':', chr(1))
            if in_string == "'":
                # the JS string is delimited by simple quote.
                # This is not supported by JSON.
                # simple quote delimited string are converted to double quote delimited string
                # here, inside a JS string, we escape the double quote
                parts[i] = parts[i].replace('"', r'\"')

        # deal with delimiters and escape character
        if not in_string and p in ('"', "'"):
            # we are not in string
            # but p is double or simple quote
            # that's the start of a new string
            # replace simple quote by double quote
            # (JSON doesn't support simple quote)
            parts[i] = '"'
            in_string = p
            continue
        if p == in_string:
            # we are in a string and the current part MAY close the string
            if len(previous_p) > 0 and previous_p[-1] == '\\':
                # there is an antislash just before: the ECMA string continue
                continue
            # the current p close the string
            # replace simple quote by double quote
            parts[i] = '"'
            in_string = None

        if not in_string:
            # replace void 0 by null
            # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
            # we are sure there is no string in p
            parts[i] = _JS_VOID_RE.sub("null", p)
        # update previous_p
        previous_p = p
    # join the string
    s = ''.join(parts)
    # add quote around the key
    # { a: 12 }
    # becomes
    # { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    s = _JS_DECIMAL_RE.sub(":0.", s)
    # replace the surogate character by colon
    s = s.replace(chr(1), ':')
    # replace single-quote followed by comma with double-quote and comma
    # {"a": "\"12\"',"b": "13"}
    # becomes
    # {"a": "\"12\"","b": "13"}
    s = s.replace("',", "\",")
    # load the JSON and return the result
    return json.loads(s)  # pyright: ignore[reportAny]
837
838
839def parse_duration_string(duration_str: str) -> timedelta | None:
840 """Parse a time string in format MM:SS or HH:MM:SS and convert it to a `timedelta` object.
841
842 Returns None if the provided string doesn't match any of the formats.
843 """
844 duration_str = duration_str.strip()
845
846 if not duration_str:
847 return None
848
849 try:
850 # prepending ["00"] here inits hours to 0 if they are not provided
851 time_parts = (["00"] + duration_str.split(":"))[:3]
852 hours, minutes, seconds = map(int, time_parts)
853 return timedelta(hours=hours, minutes=minutes, seconds=seconds)
854
855 except (ValueError, TypeError):
856 pass
857
858 return None
None handle_entityref(self, str name)
Definition utils.py:126
None handle_endtag(self, str tag)
Definition utils.py:99
None error(self, str message)
Definition utils.py:136
None handle_starttag(self, str tag, list[tuple[str, str|None]] attrs)
Definition utils.py:94
None handle_data(self, str data)
Definition utils.py:112
None handle_charref(self, str name)
Definition utils.py:117
str|None detect_language(str text, float threshold=0.3, bool only_search_languages=False)
Definition utils.py:693
extr(str txt, str begin, str end, str default="")
Definition utils.py:372
t.Any eval_xpath(ElementBase element, XPathSpecType xpath_spec)
Definition utils.py:555
str js_variable_to_python(str js_variable)
Definition utils.py:764
XPath get_xpath(XPathSpecType xpath_spec)
Definition utils.py:529
tuple[bool, str, str]|None is_valid_lang(str lang)
Definition utils.py:413
list[t.Any] eval_xpath_list(ElementBase element, XPathSpecType xpath_spec, int|None min_len=None)
Definition utils.py:583
humanize_bytes(int|float size, int precision=2)
Definition utils.py:341
str ecma_unescape(str string)
Definition utils.py:468
int convert_str_to_int(str number_str)
Definition utils.py:365
int int_or_zero(list[str]|str num)
Definition utils.py:401
"fasttext.FastText._FastText" _get_fasttext_model()
Definition utils.py:620
humanize_number(int|float size, int precision=0)
Definition utils.py:353
remove_pua_from_str(str string)
Definition utils.py:489
str extract_url(list[ElementBase]|ElementBase|str|Number|bool|None xpath_results, str base_url)
Definition utils.py:292
str searxng_useragent()
Definition utils.py:70
types.ModuleType load_module(str filename, str module_dir)
Definition utils.py:445
MutableMapping[str, t.Any] dict_subset(MutableMapping[t.Any, t.Any] dictionary, set[str] properties)
Definition utils.py:329
str gen_useragent(str|None os_string=None)
Definition utils.py:75
str|None extract_text(list[ElementBase]|ElementBase|str|Number|bool|None xpath_results, bool allow_none=False)
Definition utils.py:209
get_embeded_stream_url(str url)
Definition utils.py:631
str normalize_url(str url, str base_url)
Definition utils.py:242
t.Any eval_xpath_getindex(ElementBase element, XPathSpecType xpath_spec, int index, t.Any default=_NOTSET)
Definition utils.py:601
str html_to_text(str html_str)
Definition utils.py:142
timedelta|None parse_duration_string(str duration_str)
Definition utils.py:839
str to_string(t.Any obj)
Definition utils.py:459
dict[str, dict[str, str]] get_engine_from_settings(str name)
Definition utils.py:514
str markdown_to_text(str markdown_str)
Definition utils.py:183
Callable[[str], str] get_string_replaces_function(dict[str, str] replaces)
Definition utils.py:504