.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
utils.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Utility functions for the engines"""
3
4from __future__ import annotations
5
6import re
7import importlib
8import importlib.util
9import json
10import types
11
12from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
13from numbers import Number
14from os.path import splitext, join
15from random import choice
16from html.parser import HTMLParser
17from html import escape
18from urllib.parse import urljoin, urlparse, parse_qs, urlencode
19from datetime import timedelta
20from markdown_it import MarkdownIt
21
22from lxml import html
23from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError
24
25from searx import settings
26from searx.data import USER_AGENTS, data_dir
27from searx.version import VERSION_TAG
28from searx.sxng_locales import sxng_locales
29from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
30from searx import logger
31
32
logger = logger.getChild('utils')

# An XPath argument may be given either as a raw expression string or as an
# already compiled lxml.etree.XPath object (see get_xpath()).
XPathSpecType = Union[str, XPath]

# Text inside these HTML tags is dropped by _HTMLTextExtractor.
_BLOCKED_TAGS = ('script', 'style')

# ECMA unescape() escape sequences: "%uXXXX" (4 hex digits) and "%XX" (2 hex digits).
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)

# Matches an unquoted object key in a JS literal so it can be double-quoted,
# e.g. `{ a: 12 }` -> `{ "a": 12 }` (used by js_variable_to_python).
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
43_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\‍([0-9]+\‍)')
44_JS_DECIMAL_RE = re.compile(r":\s*\.")
45
# Cache of compiled XPath objects, keyed by the expression string (see get_xpath()).
_XPATH_CACHE: Dict[str, XPath] = {}
# Language-name -> locale-code cache; NOTE(review): not referenced in this view — confirm callers.
_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}

# Module-level singleton, loaded lazily by _get_fasttext_model().
_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None  # type: ignore
"""fasttext model to predict language of a search term"""

# Two-letter prefixes ("en", "de", ...) of every locale supported by SearXNG.
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
54
55
class _NotSetClass:  # pylint: disable=too-few-public-methods
    """Internal sentinel type for this module; do not create instances elsewhere.

    ``_NOTSET`` stands in for "no default was given" so that ``None`` remains a
    legitimate value a caller can pass explicitly (see eval_xpath_getindex).
    """


# The single sentinel instance used as a default-argument marker.
_NOTSET = _NotSetClass()
62
63
def searx_useragent() -> str:
    """Build the User-Agent string SearXNG sends with its own outgoing requests.

    Combines the SearXNG version tag with the configurable
    ``outgoing.useragent_suffix`` setting; surrounding whitespace is stripped
    so an empty suffix leaves no trailing blank.
    """
    suffix = settings['outgoing']['useragent_suffix']
    return f'searx/{VERSION_TAG} {suffix}'.strip()
69
70
71def gen_useragent(os_string: Optional[str] = None) -> str:
72 """Return a random browser User Agent
73
74 See searx/data/useragents.json
75 """
76 return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
77
78
80 """Internal exception raised when the HTML is invalid"""
81
82
83class _HTMLTextExtractor(HTMLParser):
84 """Internal class to extract text from HTML"""
85
86 def __init__(self):
87 HTMLParser.__init__(self)
88 self.result = []
89 self.tags = []
90
91 def handle_starttag(self, tag, attrs):
92 self.tags.append(tag)
93 if tag == 'br':
94 self.result.append(' ')
95
96 def handle_endtag(self, tag):
97 if not self.tags:
98 return
99
100 if tag != self.tags[-1]:
102
103 self.tags.pop()
104
105 def is_valid_tag(self):
106 return not self.tags or self.tags[-1] not in _BLOCKED_TAGS
107
108 def handle_data(self, data):
109 if not self.is_valid_tag():
110 return
111 self.result.append(data)
112
113 def handle_charref(self, name):
114 if not self.is_valid_tag():
115 return
116 if name[0] in ('x', 'X'):
117 codepoint = int(name[1:], 16)
118 else:
119 codepoint = int(name)
120 self.result.append(chr(codepoint))
121
122 def handle_entityref(self, name):
123 if not self.is_valid_tag():
124 return
125 # codepoint = htmlentitydefs.name2codepoint[name]
126 # self.result.append(chr(codepoint))
127 self.result.append(name)
128
129 def get_text(self):
130 return ''.join(self.result).strip()
131
132 def error(self, message):
133 # error handle is needed in <py3.10
134 # https://github.com/python/cpython/pull/8562/files
135 raise AssertionError(message)
136
137
138def html_to_text(html_str: str) -> str:
139 """Extract text from a HTML string
140
141 Args:
142 * html_str (str): string HTML
143
144 Returns:
145 * str: extracted text
146
147 Examples:
148 >>> html_to_text('Example <span id="42">#2</span>')
149 'Example #2'
150
151 >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
152 'Example'
153
154 >>> html_to_text(r'regexp: (?<![a-zA-Z]')
155 'regexp: (?<![a-zA-Z]'
156 """
157 if not html_str:
158 return ""
159 html_str = html_str.replace('\n', ' ').replace('\r', ' ')
160 html_str = ' '.join(html_str.split())
162 try:
163 s.feed(html_str)
164 s.close()
165 except AssertionError:
167 s.feed(escape(html_str, quote=True))
168 s.close()
169 except _HTMLTextExtractorException:
170 logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
171 return s.get_text()
172
173
174def markdown_to_text(markdown_str: str) -> str:
175 """Extract text from a Markdown string
176
177 Args:
178 * markdown_str (str): string Markdown
179
180 Returns:
181 * str: extracted text
182
183 Examples:
184 >>> markdown_to_text('[example](https://example.com)')
185 'example'
186
187 >>> markdown_to_text('## Headline')
188 'Headline'
189 """
190
191 html_str = (
192 MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(markdown_str)
193 )
194 return html_to_text(html_str)
195
196
197def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
198 """Extract text from a lxml result
199
200 * if xpath_results is list, extract the text from each result and concat the list
201 * if xpath_results is a xml element, extract all the text node from it
202 ( text_content() method from lxml )
203 * if xpath_results is a string element, then it's already done
204 """
205 if isinstance(xpath_results, list):
206 # it's list of result : concat everything using recursive call
207 result = ''
208 for e in xpath_results:
209 result = result + (extract_text(e) or '')
210 return result.strip()
211 if isinstance(xpath_results, ElementBase):
212 # it's a element
213 text: str = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
214 text = text.strip().replace('\n', ' ')
215 return ' '.join(text.split())
216 if isinstance(xpath_results, (str, Number, bool)):
217 return str(xpath_results)
218 if xpath_results is None and allow_none:
219 return None
220 if xpath_results is None and not allow_none:
221 raise ValueError('extract_text(None, allow_none=False)')
222 raise ValueError('unsupported type')
223
224
225def normalize_url(url: str, base_url: str) -> str:
226 """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path
227
228 Args:
229 * url (str): Relative URL
230 * base_url (str): Base URL, it must be an absolute URL.
231
232 Example:
233 >>> normalize_url('https://example.com', 'http://example.com/')
234 'https://example.com/'
235 >>> normalize_url('//example.com', 'http://example.com/')
236 'http://example.com/'
237 >>> normalize_url('//example.com', 'https://example.com/')
238 'https://example.com/'
239 >>> normalize_url('/path?a=1', 'https://example.com')
240 'https://example.com/path?a=1'
241 >>> normalize_url('', 'https://example.com')
242 'https://example.com/'
243 >>> normalize_url('/test', '/path')
244 raise ValueError
245
246 Raises:
247 * lxml.etree.ParserError
248
249 Returns:
250 * str: normalized URL
251 """
252 if url.startswith('//'):
253 # add http or https to this kind of url //example.com/
254 parsed_search_url = urlparse(base_url)
255 url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
256 elif url.startswith('/'):
257 # fix relative url to the search engine
258 url = urljoin(base_url, url)
259
260 # fix relative urls that fall through the crack
261 if '://' not in url:
262 url = urljoin(base_url, url)
263
264 parsed_url = urlparse(url)
265
266 # add a / at this end of the url if there is no path
267 if not parsed_url.netloc:
268 raise ValueError('Cannot parse url')
269 if not parsed_url.path:
270 url += '/'
271
272 return url
273
274
def extract_url(xpath_results, base_url) -> str:
    """Extract an URL from an lxml result and normalize it against ``base_url``.

    Args:
        * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
        * base_url (str): Base URL

    Example:
        >>> def f(s, search_url):
        >>>     return searx.utils.extract_url(html.fromstring(s), search_url)
        >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
        'https://example.com/'
        >>> f('//example.com', 'https://example.com/')
        'https://example.com/'
        >>> searx.utils.extract_url([], 'https://example.com')
        raise ValueError

    Raises:
        * ValueError: empty result set, or no URL found in it

    Returns:
        * str: normalized URL
    """
    if xpath_results == []:
        raise ValueError('Empty url resultset')

    url = extract_text(xpath_results)
    if not url:
        raise ValueError('URL not found')
    return normalize_url(url, base_url)
314
315
316def dict_subset(dictionary: MutableMapping, properties: Set[str]) -> Dict:
317 """Extract a subset of a dict
318
319 Examples:
320 >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
321 {'A': 'a', 'C': 'c'}
322 >>> >> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
323 {'A': 'a'}
324 """
325 return {k: dictionary[k] for k in properties if k in dictionary}
326
327
def humanize_bytes(size, precision=2):
    """Determine the *human readable* value of bytes on 1024 base (1KB=1024B).

    :param size: number of bytes
    :param precision: number of decimal digits in the result
    :return: e.g. ``"1.50 KB"``; values beyond the TB range stay in TB.
    """
    units = ['B ', 'KB', 'MB', 'GB', 'TB']
    p = 0
    # BUGFIX: the loop bound must be len(units) - 1; the previous `p < len(s)`
    # let p reach len(units) and raised IndexError for sizes above 1024**5.
    while size > 1024 and p < len(units) - 1:
        p += 1
        size = size / 1024.0
    return "%.*f %s" % (precision, size, units[p])
338
339
def humanize_number(size, precision=0):
    """Determine the *human readable* value of a decimal number (1000 base).

    :param size: the number to format
    :param precision: number of decimal digits in the result
    :return: e.g. ``"2M"``; values beyond the T range stay in T.
    """
    units = ['', 'K', 'M', 'B', 'T']
    p = 0
    # BUGFIX: the loop bound must be len(units) - 1; the previous `p < len(s)`
    # let p reach len(units) and raised IndexError for sizes above 1000**5.
    while size > 1000 and p < len(units) - 1:
        p += 1
        size = size / 1000.0
    return "%.*f%s" % (precision, size, units[p])
350
351
352def convert_str_to_int(number_str: str) -> int:
353 """Convert number_str to int or 0 if number_str is not a number."""
354 if number_str.isdigit():
355 return int(number_str)
356 return 0
357
358
359def extr(txt: str, begin: str, end: str, default: str = ""):
360 """Extract the string between ``begin`` and ``end`` from ``txt``
361
362 :param txt: String to search in
363 :param begin: First string to be searched for
364 :param end: Second string to be searched for after ``begin``
365 :param default: Default value if one of ``begin`` or ``end`` is not
366 found. Defaults to an empty string.
367 :return: The string between the two search-strings ``begin`` and ``end``.
368 If at least one of ``begin`` or ``end`` is not found, the value of
369 ``default`` is returned.
370
371 Examples:
372 >>> extr("abcde", "a", "e")
373 "bcd"
374 >>> extr("abcde", "a", "z", deafult="nothing")
375 "nothing"
376
377 """
378
379 # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129
380
381 try:
382 first = txt.index(begin) + len(begin)
383 return txt[first : txt.index(end, first)]
384 except ValueError:
385 return default
386
387
388def int_or_zero(num: Union[List[str], str]) -> int:
389 """Convert num to int or 0. num can be either a str or a list.
390 If num is a list, the first element is converted to int (or return 0 if the list is empty).
391 If num is a str, see convert_str_to_int
392 """
393 if isinstance(num, list):
394 if len(num) < 1:
395 return 0
396 num = num[0]
397 return convert_str_to_int(num)
398
399
def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
    """Return ``(True, code, name)`` if ``lang`` describes a known language.

    ``lang`` may be a two-letter code (``'uk'``), an English name
    (``'ukrainian'``) or a native name (``'Español'``); ``bytes`` input is
    decoded first.  Returns ``None`` when the language is unknown.

    Examples:
        >>> is_valid_lang('zz')
        None
        >>> is_valid_lang('uk')
        (True, 'uk', 'ukrainian')
        >>> is_valid_lang('Spanish')
        (True, 'es', 'spanish')
    """
    if isinstance(lang, bytes):
        lang = lang.decode()
    is_abbr = len(lang) == 2
    lang = lang.lower()
    if is_abbr:
        # match against the two-letter prefix of each locale code
        for locale in sxng_locales:
            if locale[0][:2] == lang:
                return (True, locale[0][:2], locale[3].lower())
        return None
    # match against the native (index 1) or English (index 3) language name
    for locale in sxng_locales:
        if lang in (locale[1].lower(), locale[3].lower()):
            return (True, locale[0][:2], locale[3].lower())
    return None
430
431
432def load_module(filename: str, module_dir: str) -> types.ModuleType:
433 modname = splitext(filename)[0]
434 modpath = join(module_dir, filename)
435 # and https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
436 spec = importlib.util.spec_from_file_location(modname, modpath)
437 if not spec:
438 raise ValueError(f"Error loading '{modpath}' module")
439 module = importlib.util.module_from_spec(spec)
440 if not spec.loader:
441 raise ValueError(f"Error loading '{modpath}' module")
442 spec.loader.exec_module(module)
443 return module
444
445
446def to_string(obj: Any) -> str:
447 """Convert obj to its string representation."""
448 if isinstance(obj, str):
449 return obj
450 if hasattr(obj, '__str__'):
451 return str(obj)
452 return repr(obj)
453
454
455def ecma_unescape(string: str) -> str:
456 """Python implementation of the unescape javascript function
457
458 https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
459 https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
460
461 Examples:
462 >>> ecma_unescape('%u5409')
463 '吉'
464 >>> ecma_unescape('%20')
465 ' '
466 >>> ecma_unescape('%F3')
467 'ó'
468 """
469 # "%u5409" becomes "吉"
470 string = _ECMA_UNESCAPE4_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
471 # "%20" becomes " ", "%F3" becomes "ó"
472 string = _ECMA_UNESCAPE2_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
473 return string
474
475
477 """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
478
479 .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
480 """
481 pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
482 s = []
483 for c in string:
484 i = ord(c)
485 if any(a <= i <= b for (a, b) in pua_ranges):
486 continue
487 s.append(c)
488 return "".join(s)
489
490
491def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
492 rep = {re.escape(k): v for k, v in replaces.items()}
493 pattern = re.compile("|".join(rep.keys()))
494
495 def func(text):
496 return pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
497
498 return func
499
500
501def get_engine_from_settings(name: str) -> Dict:
502 """Return engine configuration from settings.yml of a given engine name"""
503
504 if 'engines' not in settings:
505 return {}
506
507 for engine in settings['engines']:
508 if 'name' not in engine:
509 continue
510 if name == engine['name']:
511 return engine
512
513 return {}
514
515
516def get_xpath(xpath_spec: XPathSpecType) -> XPath:
517 """Return cached compiled XPath
518
519 There is no thread lock.
520 Worst case scenario, xpath_str is compiled more than one time.
521
522 Args:
523 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
524
525 Returns:
526 * result (bool, float, list, str): Results.
527
528 Raises:
529 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
530 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
531 """
532 if isinstance(xpath_spec, str):
533 result = _XPATH_CACHE.get(xpath_spec, None)
534 if result is None:
535 try:
536 result = XPath(xpath_spec)
537 except XPathSyntaxError as e:
538 raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
539 _XPATH_CACHE[xpath_spec] = result
540 return result
541
542 if isinstance(xpath_spec, XPath):
543 return xpath_spec
544
545 raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')
546
547
548def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType):
549 """Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
550 See https://lxml.de/xpathxslt.html#xpath-return-values
551
552 Args:
553 * element (ElementBase): [description]
554 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
555
556 Returns:
557 * result (bool, float, list, str): Results.
558
559 Raises:
560 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
561 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
562 * SearxEngineXPathException: Raise when the XPath can't be evaluated.
563 """
564 xpath = get_xpath(xpath_spec)
565 try:
566 return xpath(element)
567 except XPathError as e:
568 arg = ' '.join([str(i) for i in e.args])
569 raise SearxEngineXPathException(xpath_spec, arg) from e
570
571
572def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: Optional[int] = None):
573 """Same as eval_xpath, check if the result is a list
574
575 Args:
576 * element (ElementBase): [description]
577 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
578 * min_len (int, optional): [description]. Defaults to None.
579
580 Raises:
581 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
582 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
583 * SearxEngineXPathException: raise if the result is not a list
584
585 Returns:
586 * result (bool, float, list, str): Results.
587 """
588 result = eval_xpath(element, xpath_spec)
589 if not isinstance(result, list):
590 raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
591 if min_len is not None and min_len > len(result):
592 raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
593 return result
594
595
596def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index: int, default=_NOTSET):
597 """Call eval_xpath_list then get one element using the index parameter.
598 If the index does not exist, either raise an exception is default is not set,
599 other return the default value (can be None).
600
601 Args:
602 * elements (ElementBase): lxml element to apply the xpath.
603 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
604 * index (int): index to get
605 * default (Object, optional): Defaults if index doesn't exist.
606
607 Raises:
608 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
609 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
610 * SearxEngineXPathException: if the index is not found. Also see eval_xpath.
611
612 Returns:
613 * result (bool, float, list, str): Results.
614 """
615 result = eval_xpath_list(elements, xpath_spec)
616 if -len(result) <= index < len(result):
617 return result[index]
618 if default == _NOTSET:
619 # raise an SearxEngineXPathException instead of IndexError
620 # to record xpath_spec
621 raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
622 return default
623
624
def _get_fasttext_model() -> "fasttext.FastText._FastText":  # type: ignore
    """Lazily load and return the fasttext language-identification model.

    The model file ``lid.176.ftz`` is read from :py:obj:`searx.data.data_dir`
    on first use and cached in the module global ``_FASTTEXT_MODEL``.
    """
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is None:
        # imported lazily so the dependency is only required when language
        # detection is actually used
        import fasttext  # pylint: disable=import-outside-toplevel

        # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
        fasttext.FastText.eprint = lambda x: None
        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
    return _FASTTEXT_MODEL
634
635
637 """
638 Converts a standard video URL into its embed format. Supported services include Youtube,
639 Facebook, Instagram, TikTok, Dailymotion, and Bilibili.
640 """
641 parsed_url = urlparse(url)
642 iframe_src = None
643
644 # YouTube
645 if parsed_url.netloc in ['www.youtube.com', 'youtube.com'] and parsed_url.path == '/watch' and parsed_url.query:
646 video_id = parse_qs(parsed_url.query).get('v', [])
647 if video_id:
648 iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]
649
650 # Facebook
651 elif parsed_url.netloc in ['www.facebook.com', 'facebook.com']:
652 encoded_href = urlencode({'href': url})
653 iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href
654
655 # Instagram
656 elif parsed_url.netloc in ['www.instagram.com', 'instagram.com'] and parsed_url.path.startswith('/p/'):
657 if parsed_url.path.endswith('/'):
658 iframe_src = url + 'embed'
659 else:
660 iframe_src = url + '/embed'
661
662 # TikTok
663 elif (
664 parsed_url.netloc in ['www.tiktok.com', 'tiktok.com']
665 and parsed_url.path.startswith('/@')
666 and '/video/' in parsed_url.path
667 ):
668 path_parts = parsed_url.path.split('/video/')
669 video_id = path_parts[1]
670 iframe_src = 'https://www.tiktok.com/embed/' + video_id
671
672 # Dailymotion
673 elif parsed_url.netloc in ['www.dailymotion.com', 'dailymotion.com'] and parsed_url.path.startswith('/video/'):
674 path_parts = parsed_url.path.split('/')
675 if len(path_parts) == 3:
676 video_id = path_parts[2]
677 iframe_src = 'https://www.dailymotion.com/embed/video/' + video_id
678
679 # Bilibili
680 elif parsed_url.netloc in ['www.bilibili.com', 'bilibili.com'] and parsed_url.path.startswith('/video/'):
681 path_parts = parsed_url.path.split('/')
682
683 video_id = path_parts[2]
684 param_key = None
685 if video_id.startswith('av'):
686 video_id = video_id[2:]
687 param_key = 'aid'
688 elif video_id.startswith('BV'):
689 param_key = 'bvid'
690
691 iframe_src = (
692 f'https://player.bilibili.com/player.html?{param_key}={video_id}&high_quality=1&autoplay=false&danmaku=0'
693 )
694
695 return iframe_src
696
697
698def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
699 """Detect the language of the ``text`` parameter.
700
701 :param str text: The string whose language is to be detected.
702
703 :param float threshold: Threshold filters the returned labels by a threshold
704 on probability. A choice of 0.3 will return labels with at least 0.3
705 probability.
706
707 :param bool only_search_languages: If ``True``, returns only supported
708 SearXNG search languages. see :py:obj:`searx.languages`
709
710 :rtype: str, None
711 :returns:
712 The detected language code or ``None``. See below.
713
714 :raises ValueError: If ``text`` is not a string.
715
716 The language detection is done by using `a fork`_ of the fastText_ library
717 (`python fasttext`_). fastText_ distributes the `language identification
718 model`_, for reference:
719
720 - `FastText.zip: Compressing text classification models`_
721 - `Bag of Tricks for Efficient Text Classification`_
722
723 The `language identification model`_ support the language codes
724 (ISO-639-3)::
725
726 af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
727 bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
728 et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
729 id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
730 lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
731 nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
732 rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
733 tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
734
735 By using ``only_search_languages=True`` the `language identification model`_
736 is harmonized with the SearXNG's language (locale) model. General
737 conditions of SearXNG's locale model are:
738
739 a. SearXNG's locale of a query is passed to the
740 :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
741 code that is used by an engine.
742
743 b. Most of SearXNG's engines do not support all the languages from `language
744 identification model`_ and there is also a discrepancy in the ISO-639-3
745 (fasttext) and ISO-639-2 (SearXNG)handling. Further more, in SearXNG the
746 locales like ``zh-TH`` (``zh-CN``) are mapped to ``zh_Hant``
747 (``zh_Hans``) while the `language identification model`_ reduce both to
748 ``zh``.
749
750 .. _a fork: https://github.com/searxng/fasttext-predict
751 .. _fastText: https://fasttext.cc/
752 .. _python fasttext: https://pypi.org/project/fasttext/
753 .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
754 .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
755 .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
756
757 """
758 if not isinstance(text, str):
759 raise ValueError('text must a str')
760 r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
761 if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
762 language = r[0][0].split('__label__')[1]
763 if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
764 return None
765 return language
766 return None
767
768
def js_variable_to_python(js_variable):
    """Convert a javascript variable into JSON and then load the value

    It does not deal with all cases, but it is good enough for now.
    chompjs has a better implementation.
    """
    # when in_string is not None, it contains the character that opened the
    # current string: either a single quote or a double quote
    in_string = None
    # cut the string on quote characters:
    # r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = re.split(r'(["\'])', js_variable)
    # previous part (used to detect an escaping backslash before a quote)
    previous_p = ""
    for i, p in enumerate(parts):
        # parse characters inside an ECMA string
        if in_string:
            # we are in a JS string: replace the colon by a temporary character
            # so the quote-keys regex doesn't have to deal with colons inside JS strings
            parts[i] = parts[i].replace(':', chr(1))
            if in_string == "'":
                # the JS string is delimited by single quotes, which JSON does
                # not support: it will be re-delimited with double quotes, so
                # any double quote inside it must be escaped here
                parts[i] = parts[i].replace('"', r'\"')

        # deal with delimiters and the escape character
        if not in_string and p in ('"', "'"):
            # not currently in a string and p is a quote:
            # this is the start of a new string — record which quote opened it
            # and emit a double quote (JSON doesn't support single quotes)
            parts[i] = '"'
            in_string = p
            continue
        if p == in_string:
            # we are in a string and the current part MAY close the string
            if len(previous_p) > 0 and previous_p[-1] == '\\':
                # the quote is escaped by a backslash: the ECMA string continues
                continue
            # the current p closes the string —
            # emit a double quote in place of whichever quote opened it
            parts[i] = '"'
            in_string = None

        if not in_string:
            # replace `void 0` / `void(0)` by null
            # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
            # (safe here: p is guaranteed to contain no string content)
            parts[i] = _JS_VOID_RE.sub("null", p)
        # update previous_p
        previous_p = p
    # join the parts back into one string
    s = ''.join(parts)
    # add quotes around bare object keys:
    # { a: 12 }
    # becomes
    # { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # turn bare decimals like `:.5` into `:0.5` (JSON requires a leading digit)
    s = _JS_DECIMAL_RE.sub(":0.", s)
    # restore the colons that were hidden behind the surrogate character
    s = s.replace(chr(1), ':')
    # replace single-quote followed by comma with double-quote and comma
    # {"a": "\"12\"',"b": "13"}
    # becomes
    # {"a": "\"12\"","b": "13"}
    s = s.replace("',", "\",")
    # load the JSON and return the result
    return json.loads(s)
842
843
844def parse_duration_string(duration_str: str) -> timedelta | None:
845 """Parse a time string in format MM:SS or HH:MM:SS and convert it to a `timedelta` object.
846
847 Returns None if the provided string doesn't match any of the formats.
848 """
849 duration_str = duration_str.strip()
850
851 if not duration_str:
852 return None
853
854 try:
855 # prepending ["00"] here inits hours to 0 if they are not provided
856 time_parts = (["00"] + duration_str.split(":"))[:3]
857 hours, minutes, seconds = map(int, time_parts)
858 return timedelta(hours=hours, minutes=minutes, seconds=seconds)
859
860 except (ValueError, TypeError):
861 pass
862
863 return None
handle_starttag(self, tag, attrs)
Definition utils.py:91
extr(str txt, str begin, str end, str default="")
Definition utils.py:359
Callable[[str], str] get_string_replaces_function(Dict[str, str] replaces)
Definition utils.py:491
js_variable_to_python(js_variable)
Definition utils.py:769
Optional[str] detect_language(str text, float threshold=0.3, bool only_search_languages=False)
Definition utils.py:698
XPath get_xpath(XPathSpecType xpath_spec)
Definition utils.py:516
str to_string(Any obj)
Definition utils.py:446
humanize_bytes(size, precision=2)
Definition utils.py:328
Optional[str] extract_text(xpath_results, bool allow_none=False)
Definition utils.py:197
str ecma_unescape(str string)
Definition utils.py:455
eval_xpath_getindex(ElementBase elements, XPathSpecType xpath_spec, int index, default=_NOTSET)
Definition utils.py:596
int convert_str_to_int(str number_str)
Definition utils.py:352
eval_xpath(ElementBase element, XPathSpecType xpath_spec)
Definition utils.py:548
str gen_useragent(Optional[str] os_string=None)
Definition utils.py:71
int int_or_zero(Union[List[str], str] num)
Definition utils.py:388
Optional[Tuple[bool, str, str]] is_valid_lang(lang)
Definition utils.py:400
"fasttext.FastText._FastText" _get_fasttext_model()
Definition utils.py:625
str extract_url(xpath_results, base_url)
Definition utils.py:275
eval_xpath_list(ElementBase element, XPathSpecType xpath_spec, Optional[int] min_len=None)
Definition utils.py:572
types.ModuleType load_module(str filename, str module_dir)
Definition utils.py:432
str searx_useragent()
Definition utils.py:64
str normalize_url(str url, str base_url)
Definition utils.py:225
get_embeded_stream_url(url)
Definition utils.py:636
str html_to_text(str html_str)
Definition utils.py:138
humanize_number(size, precision=0)
Definition utils.py:340
Dict dict_subset(MutableMapping dictionary, Set[str] properties)
Definition utils.py:316
timedelta|None parse_duration_string(str duration_str)
Definition utils.py:844
Dict get_engine_from_settings(str name)
Definition utils.py:501
str markdown_to_text(str markdown_str)
Definition utils.py:174
remove_pua_from_str(string)
Definition utils.py:476