.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
utils.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Utility functions for the engines"""
3
4
5import re
6import importlib
7import importlib.util
8import json
9import types
10
11import typing as t
12from collections.abc import MutableMapping, Callable
13
14from numbers import Number
15from os.path import splitext, join
16from random import choice
17from html.parser import HTMLParser
18from html import escape
19from urllib.parse import urljoin, urlparse, parse_qs, urlencode
20from datetime import timedelta
21from markdown_it import MarkdownIt
22
23from lxml import html
24from lxml.etree import XPath, XPathError, XPathSyntaxError
25from lxml.etree import ElementBase, _Element # pyright: ignore[reportPrivateUsage]
26
27from searx import settings
28from searx.data import USER_AGENTS, data_dir
29from searx.version import VERSION_TAG
30from searx.sxng_locales import sxng_locales
31from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
32from searx import logger
33
34if t.TYPE_CHECKING:
35 import fasttext.FastText # type: ignore
36
37
logger = logger.getChild('utils')

XPathSpecType: t.TypeAlias = str | XPath
"""Type alias used by :py:obj:`searx.utils.get_xpath`,
:py:obj:`searx.utils.eval_xpath` and other XPath selectors."""

ElementType: t.TypeAlias = ElementBase | _Element
"""Type alias for the lxml element types accepted by the XPath helpers."""

# tags whose text content is never extracted by HTMLTextExtractor
_BLOCKED_TAGS = ('script', 'style')

# ECMA escape sequences: "%uXXXX" (4 hex digits) and "%XX" (2 hex digits)
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)

# helpers for js_variable_to_python: quote bare object keys, normalize
# "void 0" / "void(0)" and ": .5" style decimals
# NOTE(review): the original line contained zero-width characters between the
# backslashes and the parentheses (extraction artifact); the escapes below are
# the intended plain ``\(`` / ``\)``.
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
_JS_DECIMAL_RE = re.compile(r":\s*\.")

# caches: compiled XPath expressions / language -> locale mappings
_XPATH_CACHE: dict[str, XPath] = {}
_LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}

_FASTTEXT_MODEL: "fasttext.FastText._FastText | None" = None  # pyright: ignore[reportPrivateUsage]
"""fasttext model to predict language of a search term"""

SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
64
65
class _NotSetClass:  # pylint: disable=too-few-public-methods
    """Internal class for this module, do not create instance of this class.
    Replace the None value, allow explicitly pass None as a function argument"""


# module-level sentinel: default values compare against _NOTSET so that an
# explicit ``None`` argument can be distinguished from "argument not given"
_NOTSET = _NotSetClass()
72
73
def searxng_useragent() -> str:
    """Return the SearXNG User-Agent string: the version tag followed by the
    configured ``outgoing.useragent_suffix`` (stripped when the suffix is
    empty)."""
    suffix = settings['outgoing']['useragent_suffix']
    return f"SearXNG/{VERSION_TAG} {suffix}".strip()
77
78
def gen_useragent(os_string: str | None = None) -> str:
    """Return a random browser User Agent

    See searx/data/useragents.json
    """
    ua_os = os_string or choice(USER_AGENTS['os'])
    ua_version = choice(USER_AGENTS['versions'])
    return USER_AGENTS['ua'].format(os=ua_os, version=ua_version)
88
89
90class HTMLTextExtractor(HTMLParser):
91 """Internal class to extract text from HTML"""
92
93 def __init__(self):
94 HTMLParser.__init__(self)
95 self.result: list[str] = []
96 self.tags: list[str] = []
97
98 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
99 self.tags.append(tag)
100 if tag == 'br':
101 self.result.append(' ')
102
103 def handle_endtag(self, tag: str) -> None:
104 if not self.tags:
105 return
106
107 if tag != self.tags[-1]:
108 self.result.append(f"</{tag}>")
109 return
110
111 self.tags.pop()
112
113 def is_valid_tag(self):
114 return not self.tags or self.tags[-1] not in _BLOCKED_TAGS
115
116 def handle_data(self, data: str) -> None:
117 if not self.is_valid_tag():
118 return
119 self.result.append(data)
120
121 def handle_charref(self, name: str) -> None:
122 if not self.is_valid_tag():
123 return
124 if name[0] in ('x', 'X'):
125 codepoint = int(name[1:], 16)
126 else:
127 codepoint = int(name)
128 self.result.append(chr(codepoint))
129
130 def handle_entityref(self, name: str) -> None:
131 if not self.is_valid_tag():
132 return
133 # codepoint = htmlentitydefs.name2codepoint[name]
134 # self.result.append(chr(codepoint))
135 self.result.append(name)
136
137 def get_text(self):
138 return ''.join(self.result).strip()
139
140 def error(self, message: str) -> None:
141 # error handle is needed in <py3.10
142 # https://github.com/python/cpython/pull/8562/files
143 raise AssertionError(message)
144
145
def html_to_text(html_str: str) -> str:
    """Extract text from a HTML string

    Args:
        * html_str (str): string HTML

    Returns:
        * str: extracted text

    Examples:
        >>> html_to_text('Example <span id="42">#2</span>')
        'Example #2'

        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
        'Example'

        >>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
        'regexp: (?<![a-zA-Z]'

        >>> html_to_text(r'<p><b>Lorem ipsum </i>dolor sit amet</p>')
        'Lorem ipsum </i>dolor sit amet</p>'

        >>> html_to_text(r'&#x3e &#x3c &#97')
        '> < a'

    """
    if not html_str:
        return ""
    # collapse all whitespace (newlines included) to single spaces
    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
    html_str = ' '.join(html_str.split())
    s = HTMLTextExtractor()
    try:
        s.feed(html_str)
        s.close()
    except AssertionError:
        # the parser bailed out on malformed HTML: retry on the escaped
        # input, which can no longer contain any markup
        s = HTMLTextExtractor()
        s.feed(escape(html_str, quote=True))
        s.close()
    return s.get_text()
185
186
def markdown_to_text(markdown_str: str) -> str:
    """Extract text from a Markdown string

    Args:
        * markdown_str (str): string Markdown

    Returns:
        * str: extracted text

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """
    # render Markdown to HTML first, then strip the markup
    md = MarkdownIt("commonmark", {"typographer": True})
    md.enable(["replacements", "smartquotes"])
    return html_to_text(md.render(markdown_str))
208
209
def extract_text(
    xpath_results: list[ElementType] | ElementType | str | Number | bool | None,
    allow_none: bool = False,
) -> str | None:
    """Extract text from a lxml result

    - If ``xpath_results`` is a list of :py:obj:`ElementType` objects, extract
      the text from each result and concatenate the list in a string.

    - If ``xpath_results`` is a :py:obj:`ElementType` object, extract all the
      text node from it ( :py:obj:`lxml.html.tostring`, ``method="text"`` )

    - If ``xpath_results`` is of type :py:obj:`str` or :py:obj:`Number`,
      :py:obj:`bool` the string value is returned.

    - If ``xpath_results`` is of type ``None`` a :py:obj:`ValueError` is raised,
      except ``allow_none`` is ``True`` where ``None`` is returned.

    """
    if isinstance(xpath_results, list):
        # it's list of result : concat everything using recursive call
        result = ''
        for e in xpath_results:
            result = result + (extract_text(e) or '')
        return result.strip()
    if isinstance(xpath_results, ElementType):
        # it's a element
        text: str = html.tostring(  # type: ignore
            xpath_results,  # pyright: ignore[reportArgumentType]
            encoding='unicode',
            method='text',
            with_tail=False,
        )
        # normalize all whitespace runs to single spaces
        text = text.strip().replace('\n', ' ')  # type: ignore
        return ' '.join(text.split())  # type: ignore
    if isinstance(xpath_results, (str, Number, bool)):
        return str(xpath_results)
    if xpath_results is None and allow_none:
        return None
    if xpath_results is None and not allow_none:
        raise ValueError('extract_text(None, allow_none=False)')
    raise ValueError('unsupported type')
252
253
def normalize_url(url: str, base_url: str) -> str:
    """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path

    Args:
        * url (str): Relative URL
        * base_url (str): Base URL, it must be an absolute URL.

    Example:
        >>> normalize_url('https://example.com', 'http://example.com/')
        'https://example.com/'
        >>> normalize_url('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> normalize_url('//example.com', 'https://example.com/')
        'https://example.com/'
        >>> normalize_url('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> normalize_url('', 'https://example.com')
        'https://example.com/'
        >>> normalize_url('/test', '/path')
        raise ValueError

    Raises:
        * lxml.etree.ParserError

    Returns:
        * str: normalized URL
    """
    if url.startswith('//'):
        # protocol-relative URL: borrow the scheme from base_url
        scheme = urlparse(base_url).scheme or 'http'
        url = f'{scheme}:{url}'
    elif url.startswith('/'):
        # path-relative URL: resolve against the search engine's base
        url = urljoin(base_url, url)

    # fix relative urls that fall through the crack
    if '://' not in url:
        url = urljoin(base_url, url)

    parsed = urlparse(url)
    if not parsed.netloc:
        raise ValueError('Cannot parse url')
    # add a / at the end of the url if there is no path
    if not parsed.path:
        url += '/'
    return url
302
303
def extract_url(xpath_results: list[ElementType] | ElementType | str | Number | bool | None, base_url: str) -> str:
    """Extract and normalize URL from lxml Element

    Example:
        >>> def f(s, search_url):
        >>>    return searx.utils.extract_url(html.fromstring(s), search_url)
        >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
        'https://example.com/'
        >>> f('https://example.com', 'http://example.com/')
        'https://example.com/'
        >>> f('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> f('//example.com', 'https://example.com/')
        'https://example.com/'
        >>> f('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> f('', 'https://example.com')
        raise lxml.etree.ParserError
        >>> searx.utils.extract_url([], 'https://example.com')
        raise ValueError

    Raises:
        * ValueError
        * lxml.etree.ParserError

    Returns:
        * str: normalized URL
    """
    if xpath_results == []:
        raise ValueError('Empty url resultset')

    url = extract_text(xpath_results)
    if not url:
        raise ValueError('URL not found')
    return normalize_url(url, base_url)
339
340
def dict_subset(dictionary: MutableMapping[t.Any, t.Any], properties: set[str]) -> MutableMapping[str, t.Any]:
    """Return a new dict with only the ``properties`` keys that actually
    exist in ``dictionary``.

    Examples:
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, {'A', 'C'})
        {'A': 'a', 'C': 'c'}
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, {'A', 'D'})
        {'A': 'a'}
    """
    subset: dict[str, t.Any] = {}
    for key in properties:
        if key in dictionary:
            subset[key] = dictionary[key]
    return subset
351
352
353def humanize_bytes(size: int | float, precision: int = 2):
354 """Determine the *human readable* value of bytes on 1024 base (1KB=1024B)."""
355 s = ['B ', 'KB', 'MB', 'GB', 'TB']
356
357 x = len(s)
358 p = 0
359 while size > 1024 and p < x:
360 p += 1
361 size = size / 1024.0
362 return "%.*f %s" % (precision, size, s[p])
363
364
365def humanize_number(size: int | float, precision: int = 0):
366 """Determine the *human readable* value of a decimal number."""
367 s = ['', 'K', 'M', 'B', 'T']
368
369 x = len(s)
370 p = 0
371 while size > 1000 and p < x:
372 p += 1
373 size = size / 1000.0
374 return "%.*f%s" % (precision, size, s[p])
375
376
def convert_str_to_int(number_str: str) -> int:
    """Convert ``number_str`` to an int, or return 0 when it is not a plain
    decimal number (``str.isdigit``)."""
    return int(number_str) if number_str.isdigit() else 0
382
383
def extr(txt: str, begin: str, end: str, default: str = ""):
    """Extract the string between ``begin`` and ``end`` from ``txt``

    :param txt: String to search in
    :param begin: First string to be searched for
    :param end: Second string to be searched for after ``begin``
    :param default: Default value if one of ``begin`` or ``end`` is not
        found.  Defaults to an empty string.
    :return: The string between the two search-strings ``begin`` and ``end``.
        If at least one of ``begin`` or ``end`` is not found, the value of
        ``default`` is returned.

    Examples:
        >>> extr("abcde", "a", "e")
        "bcd"
        >>> extr("abcde", "a", "z", default="nothing")
        "nothing"

    """

    # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129

    try:
        start = txt.index(begin) + len(begin)
        stop = txt.index(end, start)
    except ValueError:
        # begin or end not present in txt
        return default
    return txt[start:stop]
411
412
def int_or_zero(num: list[str] | str) -> int:
    """Convert ``num`` to int or 0.  ``num`` can be either a str or a list.
    If ``num`` is a list, its first element is converted (0 for an empty
    list).  If ``num`` is a str, see :py:obj:`convert_str_to_int`.
    """
    if isinstance(num, list):
        if not num:
            return 0
        num = num[0]
    return convert_str_to_int(num)
423
424
def load_module(filename: str, module_dir: str) -> types.ModuleType:
    """Load and execute the Python file ``module_dir``/``filename`` and
    return it as a module.

    https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly

    Raises ``ValueError`` when no import spec (or loader) can be created for
    the path.
    """
    modname = splitext(filename)[0]
    modpath = join(module_dir, filename)
    spec = importlib.util.spec_from_file_location(modname, modpath)
    if spec is None or spec.loader is None:
        raise ValueError(f"Error loading '{modpath}' module")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
437
438
def to_string(obj: t.Any) -> str:
    """Convert obj to its string representation."""
    if isinstance(obj, str):
        return obj
    # every object exposes __str__, so repr() is effectively a safety net
    return str(obj) if hasattr(obj, '__str__') else repr(obj)
446
447
def ecma_unescape(string: str) -> str:
    """Python implementation of the unescape javascript function

    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
    https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape

    Examples:
        >>> ecma_unescape('%u5409')
        '吉'
        >>> ecma_unescape('%20')
        ' '
        >>> ecma_unescape('%F3')
        'ó'
    """

    def _decode(match: "re.Match[str]") -> str:
        return chr(int(match.group(1), 16))

    # "%u5409" becomes "吉"
    string = re.sub(r'%u([0-9a-fA-F]{4})', _decode, string)
    # "%20" becomes " ", "%F3" becomes "ó"
    return re.sub(r'%([0-9a-fA-F]{2})', _decode, string)
467
468
def remove_pua_from_str(string: str):
    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.

    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
    """

    def _is_pua(codepoint: int) -> bool:
        # the three PUA ranges: BMP, plane 15 and plane 16
        return (
            0xE000 <= codepoint <= 0xF8FF
            or 0xF0000 <= codepoint <= 0xFFFFD
            or 0x100000 <= codepoint <= 0x10FFFD
        )

    return "".join(char for char in string if not _is_pua(ord(char)))
482
483
def get_string_replaces_function(replaces: dict[str, str]) -> Callable[[str], str]:
    """Build a function that substitutes every key of ``replaces`` found in a
    text by its associated value, in a single regex pass."""
    escaped = {re.escape(old): new for old, new in replaces.items()}
    pattern = re.compile("|".join(escaped))

    def replace_all(text: str):
        return pattern.sub(lambda match: escaped[re.escape(match.group(0))], text)

    return replace_all
492
493
def get_engine_from_settings(name: str) -> dict[str, dict[str, str]]:
    """Return engine configuration from settings.yml of a given engine name"""

    if 'engines' not in settings:
        return {}

    # linear scan; entries without a 'name' key are skipped implicitly
    for engine in settings['engines']:
        if engine.get('name') == name:
            return engine
    return {}
507
508
def get_xpath(xpath_spec: XPathSpecType) -> XPath:
    """Return cached compiled :py:obj:`lxml.etree.XPath` object.

    ``TypeError``:
        Raised when ``xpath_spec`` is neither a :py:obj:`str` nor a
        :py:obj:`lxml.etree.XPath`.

    ``SearxXPathSyntaxException``:
        Raised when there is a syntax error in the *XPath* selector (``str``).
    """
    # already compiled: pass it through unchanged
    if isinstance(xpath_spec, XPath):
        return xpath_spec

    if not isinstance(xpath_spec, str):
        raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')  # pyright: ignore[reportUnreachable]

    compiled = _XPATH_CACHE.get(xpath_spec, None)
    if compiled is None:
        try:
            compiled = XPath(xpath_spec)
        except XPathSyntaxError as e:
            raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
        _XPATH_CACHE[xpath_spec] = compiled
    return compiled
533
534
def eval_xpath(element: ElementType, xpath_spec: XPathSpecType) -> t.Any:
    """Equivalent of ``element.xpath(xpath_str)`` but compile ``xpath_str`` into
    a :py:obj:`lxml.etree.XPath` object once for all. The return value of
    ``xpath(..)`` is complex, read `XPath return values`_ for more details.

    .. _XPath return values:
       https://lxml.de/xpathxslt.html#xpath-return-values

    ``TypeError``:
        Raised when ``xpath_spec`` is neither a :py:obj:`str` nor a
        :py:obj:`lxml.etree.XPath`.

    ``SearxXPathSyntaxException``:
        Raised when there is a syntax error in the *XPath* selector (``str``).

    ``SearxEngineXPathException:``
        Raised when the XPath can't be evaluated (masked
        :py:obj:`lxml.etree..XPathError`).
    """
    compiled = get_xpath(xpath_spec)
    try:
        # https://lxml.de/xpathxslt.html#xpath-return-values
        return compiled(element)
    except XPathError as e:
        # wrap to record which selector failed
        raise SearxEngineXPathException(xpath_spec, ' '.join(str(i) for i in e.args)) from e
561
562
def eval_xpath_list(element: ElementType, xpath_spec: XPathSpecType, min_len: int | None = None) -> list[t.Any]:
    """Same as :py:obj:`searx.utils.eval_xpath`, but additionally ensures the
    return value is a :py:obj:`list`. The minimum length of the list is also
    checked (if ``min_len`` is set)."""

    results: list[t.Any] = eval_xpath(element, xpath_spec)
    if not isinstance(results, list):
        raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
    if min_len is not None and len(results) < min_len:
        raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
    return results
574
575
def eval_xpath_getindex(
    element: ElementType,
    xpath_spec: XPathSpecType,
    index: int,
    default: t.Any = _NOTSET,
) -> t.Any:
    """Same as :py:obj:`searx.utils.eval_xpath_list`, but returns item on
    position ``index`` from the list (index starts with ``0``).

    The exceptions known from :py:obj:`searx.utils.eval_xpath` are thrown. If a
    default is specified, this is returned if an element at position ``index``
    could not be determined.
    """

    result = eval_xpath_list(element, xpath_spec)
    # accept negative indexes the same way list indexing does
    if -len(result) <= index < len(result):
        return result[index]
    if default == _NOTSET:
        # raise an SearxEngineXPathException instead of IndexError to record
        # xpath_spec
        raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
    return default
598
599
def _get_fasttext_model() -> "fasttext.FastText._FastText":  # pyright: ignore[reportPrivateUsage]
    """Lazily load the fasttext language identification model into the
    module-level :py:obj:`_FASTTEXT_MODEL` cache and return it."""
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is None:
        import fasttext  # pylint: disable=import-outside-toplevel

        # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
        fasttext.FastText.eprint = lambda x: None  # type: ignore
        # lid.176.ftz: compressed language-identification model shipped in searx/data
        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))  # type: ignore
    return _FASTTEXT_MODEL
609
610
612 """
613 Converts a standard video URL into its embed format. Supported services include Youtube,
614 Facebook, Instagram, TikTok, Dailymotion, and Bilibili.
615 """
616 parsed_url = urlparse(url)
617 iframe_src = None
618
619 # YouTube
620 if parsed_url.netloc in ['www.youtube.com', 'youtube.com'] and parsed_url.path == '/watch' and parsed_url.query:
621 video_id = parse_qs(parsed_url.query).get('v', [])
622 if video_id:
623 iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]
624
625 # Facebook
626 elif parsed_url.netloc in ['www.facebook.com', 'facebook.com']:
627 encoded_href = urlencode({'href': url})
628 iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href
629
630 # Instagram
631 elif parsed_url.netloc in ['www.instagram.com', 'instagram.com'] and parsed_url.path.startswith('/p/'):
632 if parsed_url.path.endswith('/'):
633 iframe_src = url + 'embed'
634 else:
635 iframe_src = url + '/embed'
636
637 # TikTok
638 elif (
639 parsed_url.netloc in ['www.tiktok.com', 'tiktok.com']
640 and parsed_url.path.startswith('/@')
641 and '/video/' in parsed_url.path
642 ):
643 path_parts = parsed_url.path.split('/video/')
644 video_id = path_parts[1]
645 iframe_src = 'https://www.tiktok.com/embed/' + video_id
646
647 # Dailymotion
648 elif parsed_url.netloc in ['www.dailymotion.com', 'dailymotion.com'] and parsed_url.path.startswith('/video/'):
649 path_parts = parsed_url.path.split('/')
650 if len(path_parts) == 3:
651 video_id = path_parts[2]
652 iframe_src = 'https://www.dailymotion.com/embed/video/' + video_id
653
654 # Bilibili
655 elif parsed_url.netloc in ['www.bilibili.com', 'bilibili.com'] and parsed_url.path.startswith('/video/'):
656 path_parts = parsed_url.path.split('/')
657
658 video_id = path_parts[2]
659 param_key = None
660 if video_id.startswith('av'):
661 video_id = video_id[2:]
662 param_key = 'aid'
663 elif video_id.startswith('BV'):
664 param_key = 'bvid'
665
666 iframe_src = (
667 f'https://player.bilibili.com/player.html?{param_key}={video_id}&high_quality=1&autoplay=false&danmaku=0'
668 )
669
670 return iframe_src
671
672
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> str | None:
    """Detect the language of the ``text`` parameter.

    :param str text: The string whose language is to be detected.

    :param float threshold: Threshold filters the returned labels by a threshold
        on probability.  A choice of 0.3 will return labels with at least 0.3
        probability.

    :param bool only_search_languages: If ``True``, returns only supported
        SearXNG search languages.  see :py:obj:`searx.languages`

    :rtype: str, None
    :returns:
        The detected language code or ``None``. See below.

    :raises ValueError: If ``text`` is not a string.

    The language detection is done by using `a fork`_ of the fastText_ library
    (`python fasttext`_). fastText_ distributes the `language identification
    model`_, for reference:

    - `FastText.zip: Compressing text classification models`_
    - `Bag of Tricks for Efficient Text Classification`_

    The `language identification model`_ support the language codes
    (ISO-639-3)::

        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
        bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
        et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
        id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
        lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
        nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
        rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
        tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh

    By using ``only_search_languages=True`` the `language identification model`_
    is harmonized with the SearXNG's language (locale) model.  General
    conditions of SearXNG's locale model are:

    a. SearXNG's locale of a query is passed to the
       :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
       code that is used by an engine.

    b. Most of SearXNG's engines do not support all the languages from `language
       identification model`_ and there is also a discrepancy in the ISO-639-3
       (fasttext) and ISO-639-2 (SearXNG)handling.  Further more, in SearXNG the
       locales like ``zh-TH`` (``zh-CN``) are mapped to ``zh_Hant``
       (``zh_Hans``) while the `language identification model`_ reduce both to
       ``zh``.

    .. _a fork: https://github.com/searxng/fasttext-predict
    .. _fastText: https://fasttext.cc/
    .. _python fasttext: https://pypi.org/project/fasttext/
    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651

    """
    if not isinstance(text, str):
        raise ValueError('text must a str')  # pyright: ignore[reportUnreachable]
    # k=1: only the single most probable label is requested
    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)  # type: ignore
    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:  # type: ignore
        # labels carry a '__label__' prefix (split below strips it)
        language = r[0][0].split('__label__')[1]  # type: ignore
        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
            return None
        return language  # type: ignore
    return None
742
743
def js_variable_to_python(js_variable: str) -> t.Any:
    """Convert a javascript variable into JSON and then load the value

    It does not deal with all cases, but it is good enough for now.
    chompjs has a better implementation.

    Returns the value parsed by :py:obj:`json.loads` (the original ``-> str``
    annotation was wrong: ``json.loads`` may return dict, list, str, ...).
    """
    # when in_string is not None, it contains the character that has opened the string
    # either simple quote or double quote
    in_string = None
    # cut the string:
    # r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = re.split(r'(["\'])', js_variable)
    # previous part (to check the escape character antislash)
    previous_p = ""
    for i, p in enumerate(parts):
        # parse characters inside a ECMA string
        if in_string:
            # we are in a JS string: replace the colon by a temporary character
            # so quote_keys_regex doesn't have to deal with colon inside the JS strings
            parts[i] = parts[i].replace(':', chr(1))
            if in_string == "'":
                # the JS string is delimited by simple quote.
                # This is not supported by JSON.
                # simple quote delimited string are converted to double quote delimited string
                # here, inside a JS string, we escape the double quote
                parts[i] = parts[i].replace('"', r'\"')

        # deal with delimiters and escape character
        if not in_string and p in ('"', "'"):
            # we are not in string
            # but p is double or simple quote
            # that's the start of a new string
            # replace simple quote by double quote
            # (JSON doesn't support simple quote)
            parts[i] = '"'
            in_string = p
            continue
        if p == in_string:
            # we are in a string and the current part MAY close the string
            if len(previous_p) > 0 and previous_p[-1] == '\\':
                # there is an antislash just before: the ECMA string continue
                continue
            # the current p close the string
            # replace simple quote by double quote
            parts[i] = '"'
            in_string = None

        if not in_string:
            # replace void 0 by null
            # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
            # we are sure there is no string in p
            parts[i] = _JS_VOID_RE.sub("null", p)
        # update previous_p
        previous_p = p
    # join the string
    s = ''.join(parts)
    # add quote around the key
    # { a: 12 }
    # becomes
    # { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    s = _JS_DECIMAL_RE.sub(":0.", s)
    # replace the surogate character by colon
    s = s.replace(chr(1), ':')
    # replace single-quote followed by comma with double-quote and comma
    # {"a": "\"12\"',"b": "13"}
    # becomes
    # {"a": "\"12\"","b": "13"}
    s = s.replace("',", "\",")
    # load the JSON and return the result
    return json.loads(s)  # pyright: ignore[reportAny]
817
818
819def parse_duration_string(duration_str: str) -> timedelta | None:
820 """Parse a time string in format MM:SS or HH:MM:SS and convert it to a `timedelta` object.
821
822 Returns None if the provided string doesn't match any of the formats.
823 """
824 duration_str = duration_str.strip()
825
826 if not duration_str:
827 return None
828
829 try:
830 # prepending ["00"] here inits hours to 0 if they are not provided
831 time_parts = (["00"] + duration_str.split(":"))[:3]
832 hours, minutes, seconds = map(int, time_parts)
833 return timedelta(hours=hours, minutes=minutes, seconds=seconds)
834
835 except (ValueError, TypeError):
836 pass
837
838 return None
None handle_entityref(self, str name)
Definition utils.py:130
None handle_endtag(self, str tag)
Definition utils.py:103
None error(self, str message)
Definition utils.py:140
None handle_starttag(self, str tag, list[tuple[str, str|None]] attrs)
Definition utils.py:98
None handle_data(self, str data)
Definition utils.py:116
None handle_charref(self, str name)
Definition utils.py:121
str|None detect_language(str text, float threshold=0.3, bool only_search_languages=False)
Definition utils.py:673
list[t.Any] eval_xpath_list(ElementType element, XPathSpecType xpath_spec, int|None min_len=None)
Definition utils.py:563
t.Any eval_xpath_getindex(ElementType element, XPathSpecType xpath_spec, int index, t.Any default=_NOTSET)
Definition utils.py:581
extr(str txt, str begin, str end, str default="")
Definition utils.py:384
t.Any eval_xpath(ElementType element, XPathSpecType xpath_spec)
Definition utils.py:535
str js_variable_to_python(str js_variable)
Definition utils.py:744
XPath get_xpath(XPathSpecType xpath_spec)
Definition utils.py:509
humanize_bytes(int|float size, int precision=2)
Definition utils.py:353
str ecma_unescape(str string)
Definition utils.py:448
int convert_str_to_int(str number_str)
Definition utils.py:377
int int_or_zero(list[str]|str num)
Definition utils.py:413
"fasttext.FastText._FastText" _get_fasttext_model()
Definition utils.py:600
humanize_number(int|float size, int precision=0)
Definition utils.py:365
remove_pua_from_str(str string)
Definition utils.py:469
str searxng_useragent()
Definition utils.py:74
types.ModuleType load_module(str filename, str module_dir)
Definition utils.py:425
MutableMapping[str, t.Any] dict_subset(MutableMapping[t.Any, t.Any] dictionary, set[str] properties)
Definition utils.py:341
str gen_useragent(str|None os_string=None)
Definition utils.py:79
get_embeded_stream_url(str url)
Definition utils.py:611
str normalize_url(str url, str base_url)
Definition utils.py:254
str extract_url(list[ElementType]|ElementType|str|Number|bool|None xpath_results, str base_url)
Definition utils.py:304
str html_to_text(str html_str)
Definition utils.py:146
timedelta|None parse_duration_string(str duration_str)
Definition utils.py:819
str to_string(t.Any obj)
Definition utils.py:439
dict[str, dict[str, str]] get_engine_from_settings(str name)
Definition utils.py:494
str|None extract_text(list[ElementType]|ElementType|str|Number|bool|None xpath_results, bool allow_none=False)
Definition utils.py:213
str markdown_to_text(str markdown_str)
Definition utils.py:187
Callable[[str], str] get_string_replaces_function(dict[str, str] replaces)
Definition utils.py:484