.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
utils.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Utility functions for the engines"""
3
4from __future__ import annotations
5
6import re
7import importlib
8import importlib.util
9import json
10import types
11
12from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
13from numbers import Number
14from os.path import splitext, join
15from random import choice
16from html.parser import HTMLParser
17from html import escape
18from urllib.parse import urljoin, urlparse, parse_qs, urlencode
19from datetime import timedelta
20from markdown_it import MarkdownIt
21
22from lxml import html
23from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError
24
25from searx import settings
26from searx.data import USER_AGENTS, data_dir
27from searx.version import VERSION_TAG
28from searx.sxng_locales import sxng_locales
29from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
30from searx import logger
31
32
33logger = logger.getChild('utils')
34
35XPathSpecType = Union[str, XPath]
36
37_BLOCKED_TAGS = ('script', 'style')
38
39_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
40_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
41
42_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
43_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\‍([0-9]+\‍)')
44_JS_DECIMAL_RE = re.compile(r":\s*\.")
45
46_XPATH_CACHE: Dict[str, XPath] = {}
47_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
48
# Lazily initialised by _get_fasttext_model(); holds the loaded fasttext model.
_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None  # type: ignore
"""fasttext model to predict language of a search term"""

SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
54
55
class _NotSetClass:  # pylint: disable=too-few-public-methods
    """Internal sentinel type for this module, do not create instances of this
    class.  Replaces ``None`` as the "no value given" marker so callers can
    explicitly pass ``None`` as a regular argument (see
    :py:obj:`eval_xpath_getindex`)."""


# module-level singleton sentinel; compared by identity
_NOTSET = _NotSetClass()
62
63
def searx_useragent() -> str:
    """Return the SearXNG User-Agent string (version tag plus the configured
    ``outgoing.useragent_suffix``), stripped of surrounding whitespace."""
    suffix = settings['outgoing']['useragent_suffix']
    return f'searx/{VERSION_TAG} {suffix}'.strip()
69
70
def gen_useragent(os_string: Optional[str] = None) -> str:
    """Return a random browser User Agent string.

    See searx/data/useragents.json

    :param os_string: OS fragment to use; a random one from the data file is
        picked when not given.
    """
    os_part = os_string or choice(USER_AGENTS['os'])
    browser_version = choice(USER_AGENTS['versions'])
    return USER_AGENTS['ua'].format(os=os_part, version=browser_version)
77
78
80 """Internal exception raised when the HTML is invalid"""
81
82
class _HTMLTextExtractor(HTMLParser):
    """Internal class to extract text from HTML.

    Collects text fragments while tracking a stack of open tags; text inside
    ``script``/``style`` elements is skipped.  Raises
    :py:obj:`_HTMLTextExtractorException` on a mismatched closing tag.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # collected text fragments, joined by get_text()
        self.result = []
        # stack of currently open tag names
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)
        if tag == 'br':
            self.result.append(' ')

    def handle_endtag(self, tag):
        if not self.tags:
            return

        if tag != self.tags[-1]:
            # mismatched closing tag: the HTML is invalid
            # NOTE(review): this ``raise`` was lost in extraction and has been
            # restored — without it the branch was a no-op and html_to_text()
            # could never hit its _HTMLTextExtractorException handler.
            raise _HTMLTextExtractorException()

        self.tags.pop()

    def is_valid_tag(self):
        # text is extracted only when the innermost open tag is not blocked
        return not self.tags or self.tags[-1] not in _BLOCKED_TAGS

    def handle_data(self, data):
        if not self.is_valid_tag():
            return
        self.result.append(data)

    def handle_charref(self, name):
        if not self.is_valid_tag():
            return
        # numeric character reference: &#xhhhh; (hex) or &#nnnn; (decimal)
        if name[0] in ('x', 'X'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        self.result.append(chr(codepoint))

    def handle_entityref(self, name):
        if not self.is_valid_tag():
            return
        # codepoint = htmlentitydefs.name2codepoint[name]
        # self.result.append(chr(codepoint))
        self.result.append(name)

    def get_text(self):
        return ''.join(self.result).strip()

    def error(self, message):
        # error handle is needed in <py3.10
        # https://github.com/python/cpython/pull/8562/files
        raise AssertionError(message)
137
def html_to_text(html_str: str) -> str:
    """Extract text from a HTML string

    Args:
        * html_str (str): string HTML

    Returns:
        * str: extracted text

    Examples:
        >>> html_to_text('Example <span id="42">#2</span>')
        'Example #2'

        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
        'Example'

        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
        'regexp: (?<![a-zA-Z]'
    """
    # collapse all whitespace (newlines, repeated blanks) to single spaces
    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
    html_str = ' '.join(html_str.split())
    # NOTE(review): the two ``s = _HTMLTextExtractor()`` assignments below were
    # lost in extraction and have been restored — without them ``s`` was
    # undefined.
    s = _HTMLTextExtractor()
    try:
        s.feed(html_str)
    except AssertionError:
        # the parser itself choked (see _HTMLTextExtractor.error): retry on an
        # escaped copy so the raw text is still returned
        s = _HTMLTextExtractor()
        s.feed(escape(html_str, quote=True))
    except _HTMLTextExtractorException:
        # mismatched tags: best effort, keep whatever was collected so far
        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
    return s.get_text()
168
169
def markdown_to_text(markdown_str: str) -> str:
    """Extract plain text from a Markdown string.

    Args:
        * markdown_str (str): string Markdown

    Returns:
        * str: extracted text

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """
    # render Markdown to HTML first, then strip the markup
    renderer = MarkdownIt("commonmark", {"typographer": True})
    renderer.enable(["replacements", "smartquotes"])
    return html_to_text(renderer.render(markdown_str))
191
192
def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
    """Extract text from a lxml result

    * if xpath_results is list, extract the text from each result and concat the list
    * if xpath_results is a xml element, extract all the text node from it
      ( text_content() method from lxml )
    * if xpath_results is a string element, then it's already done
    """
    if isinstance(xpath_results, list):
        # list of results: extract each item recursively and concatenate
        joined = ''.join(extract_text(item) or '' for item in xpath_results)
        return joined.strip()
    if isinstance(xpath_results, ElementBase):
        # a single element: serialize its text nodes, normalize whitespace
        text: str = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
        text = text.strip().replace('\n', ' ')
        return ' '.join(text.split())
    if isinstance(xpath_results, (str, Number, bool)):
        return str(xpath_results)
    if xpath_results is None:
        if allow_none:
            return None
        raise ValueError('extract_text(None, allow_none=False)')
    raise ValueError('unsupported type')
219
220
def normalize_url(url: str, base_url: str) -> str:
    """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path

    Args:
        * url (str): Relative URL
        * base_url (str): Base URL, it must be an absolute URL.

    Example:
        >>> normalize_url('https://example.com', 'http://example.com/')
        'https://example.com/'
        >>> normalize_url('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> normalize_url('//example.com', 'https://example.com/')
        'https://example.com/'
        >>> normalize_url('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> normalize_url('', 'https://example.com')
        'https://example.com/'
        >>> normalize_url('/test', '/path')
        raise ValueError

    Raises:
        * lxml.etree.ParserError

    Returns:
        * str: normalized URL
    """
    if url.startswith('//'):
        # protocol-relative URL: inherit the scheme of base_url (http fallback)
        scheme = urlparse(base_url).scheme or 'http'
        url = f'{scheme}:{url}'
    elif url.startswith('/'):
        # root-relative URL: resolve against the search engine's base
        url = urljoin(base_url, url)

    # fix relative urls that fall through the crack
    if '://' not in url:
        url = urljoin(base_url, url)

    parsed = urlparse(url)
    if not parsed.netloc:
        raise ValueError('Cannot parse url')
    # append a trailing slash when the URL has no path component
    if not parsed.path:
        url += '/'

    return url
269
270
def extract_url(xpath_results, base_url) -> str:
    """Extract and normalize URL from lxml Element

    Args:
        * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
        * base_url (str): Base URL

    Example:
        >>> def f(s, search_url):
        >>>    return searx.utils.extract_url(html.fromstring(s), search_url)
        >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
        'https://example.com/'
        >>> f('https://example.com', 'http://example.com/')
        'https://example.com/'
        >>> f('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> f('//example.com', 'https://example.com/')
        'https://example.com/'
        >>> f('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> f('', 'https://example.com')
        raise lxml.etree.ParserError
        >>> searx.utils.extract_url([], 'https://example.com')
        raise ValueError

    Raises:
        * ValueError
        * lxml.etree.ParserError

    Returns:
        * str: normalized URL
    """
    if xpath_results == []:
        raise ValueError('Empty url resultset')

    url = extract_text(xpath_results)
    if not url:
        raise ValueError('URL not found')
    return normalize_url(url, base_url)
310
311
def dict_subset(dictionary: MutableMapping, properties: Set[str]) -> Dict:
    """Extract a subset of a dict: keys absent from ``dictionary`` are ignored.

    Examples:
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
        {'A': 'a', 'C': 'c'}
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
        {'A': 'a'}
    """
    # NOTE(review): the second docstring example previously read ">>> >>",
    # which is not a valid doctest prompt — fixed.
    return {k: dictionary[k] for k in properties if k in dictionary}
322
323
def humanize_bytes(size, precision=2):
    """Determine the *human readable* value of bytes on 1024 base (1KB=1024B).

    :param size: number of bytes
    :param precision: decimal places in the formatted value
    :return: formatted string, e.g. ``"2.00 KB"``
    """
    units = ['B ', 'KB', 'MB', 'GB', 'TB']

    p = 0
    # stop at the last unit (p < len-1) — the previous "p < len" bound let p
    # run past the table and raised IndexError for sizes >= 1024**5
    while size > 1024 and p < len(units) - 1:
        p += 1
        size = size / 1024.0
    return "%.*f %s" % (precision, size, units[p])
334
335
def humanize_number(size, precision=0):
    """Determine the *human readable* value of a decimal number (1000 base).

    :param size: the number to format
    :param precision: decimal places in the formatted value
    :return: formatted string, e.g. ``"2K"``
    """
    units = ['', 'K', 'M', 'B', 'T']

    p = 0
    # stop at the last unit (p < len-1) — the previous "p < len" bound let p
    # run past the table and raised IndexError for sizes >= 1000**5
    while size > 1000 and p < len(units) - 1:
        p += 1
        size = size / 1000.0
    return "%.*f%s" % (precision, size, units[p])
346
347
def convert_str_to_int(number_str: str) -> int:
    """Convert ``number_str`` to int, or 0 if ``number_str`` is not a number."""
    return int(number_str) if number_str.isdigit() else 0
353
354
def extr(txt: str, begin: str, end: str, default: str = ""):
    """Extract the string between ``begin`` and ``end`` from ``txt``

    :param txt: String to search in
    :param begin: First string to be searched for
    :param end: Second string to be searched for after ``begin``
    :param default: Default value if one of ``begin`` or ``end`` is not
        found.  Defaults to an empty string.
    :return: The string between the two search-strings ``begin`` and ``end``.
        If at least one of ``begin`` or ``end`` is not found, the value of
        ``default`` is returned.

    Examples:
        >>> extr("abcde", "a", "e")
        "bcd"
        >>> extr("abcde", "a", "z", default="nothing")
        "nothing"

    """
    # NOTE(review): the second docstring example previously spelled the
    # keyword as ``deafult=`` — copying it verbatim raised a TypeError.

    # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129

    try:
        # str.index raises ValueError when a marker is missing
        first = txt.index(begin) + len(begin)
        return txt[first : txt.index(end, first)]
    except ValueError:
        return default
382
383
def int_or_zero(num: Union[List[str], str]) -> int:
    """Convert ``num`` to int or 0.  ``num`` can be either a str or a list.

    If ``num`` is a list, its first element is converted (0 for an empty
    list).  Non-digit strings convert to 0 (same rule as
    :py:obj:`convert_str_to_int`).
    """
    if isinstance(num, list):
        if not num:
            return 0
        num = num[0]
    # inlined convert_str_to_int(): digits only, anything else maps to 0
    return int(num) if num.isdigit() else 0
394
395
def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
    """Return language code and name if lang describe a language.

    Examples:
        >>> is_valid_lang('zz')
        None
        >>> is_valid_lang('uk')
        (True, 'uk', 'ukrainian')
        >>> is_valid_lang(b'uk')
        (True, 'uk', 'ukrainian')
        >>> is_valid_lang('en')
        (True, 'en', 'english')
        >>> searx.utils.is_valid_lang('Español')
        (True, 'es', 'spanish')
        >>> searx.utils.is_valid_lang('Spanish')
        (True, 'es', 'spanish')
    """
    if isinstance(lang, bytes):
        lang = lang.decode()
    two_letter_code = len(lang) == 2
    lang = lang.lower()
    if two_letter_code:
        # match against the language part of the locale code
        for locale in sxng_locales:
            if locale[0][:2] == lang:
                return (True, locale[0][:2], locale[3].lower())
    else:
        # match against the native or the English language name
        for locale in sxng_locales:
            if lang in (locale[1].lower(), locale[3].lower()):
                return (True, locale[0][:2], locale[3].lower())
    return None
426
427
def load_module(filename: str, module_dir: str) -> types.ModuleType:
    """Load and execute the Python source file ``module_dir``/``filename``
    and return the resulting module object.

    :raises ValueError: when no import spec (or no loader) can be built for
        the file.
    """
    module_name = splitext(filename)[0]
    module_path = join(module_dir, filename)
    # and https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if not spec:
        raise ValueError(f"Error loading '{module_path}' module")
    module = importlib.util.module_from_spec(spec)
    if not spec.loader:
        raise ValueError(f"Error loading '{module_path}' module")
    spec.loader.exec_module(module)
    return module
440
441
def to_string(obj: Any) -> str:
    """Convert ``obj`` to its string representation."""
    if isinstance(obj, str):
        return obj
    # every object normally has __str__; repr() is a defensive fallback
    return str(obj) if hasattr(obj, '__str__') else repr(obj)
449
450
def ecma_unescape(string: str) -> str:
    """Python implementation of the unescape javascript function

    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
    https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape

    Examples:
        >>> ecma_unescape('%u5409')
        '吉'
        >>> ecma_unescape('%20')
        ' '
        >>> ecma_unescape('%F3')
        'ó'
    """
    def _from_hex(match):
        return chr(int(match.group(1), 16))

    # "%uXXXX" form first (e.g. "%u5409" becomes "吉") ...
    string = re.sub(r'%u([0-9a-fA-F]{4})', _from_hex, string)
    # ... then the two-digit "%XX" form ("%20" -> " ", "%F3" -> "ó")
    return re.sub(r'%([0-9a-fA-F]{2})', _from_hex, string)
470
471
473 """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
474
475 .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
476 """
477 pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
478 s = []
479 for c in string:
480 i = ord(c)
481 if any(a <= i <= b for (a, b) in pua_ranges):
482 continue
483 s.append(c)
484 return "".join(s)
485
486
def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
    """Build a function that applies every ``old -> new`` replacement from
    ``replaces`` to its argument in a single regex pass."""
    escaped = {re.escape(old): new for old, new in replaces.items()}
    pattern = re.compile("|".join(escaped))

    def replace_all(text):
        # look up the replacement for whichever alternative matched
        return pattern.sub(lambda match: escaped[re.escape(match.group(0))], text)

    return replace_all
495
496
def get_engine_from_settings(name: str) -> Dict:
    """Return engine configuration from settings.yml of a given engine name.

    Returns an empty dict when no engine with that name is configured.
    """
    if 'engines' not in settings:
        return {}

    for engine_conf in settings['engines']:
        if 'name' in engine_conf and engine_conf['name'] == name:
            return engine_conf

    return {}
510
511
def get_xpath(xpath_spec: XPathSpecType) -> XPath:
    """Return cached compiled XPath

    There is no thread lock.
    Worst case scenario, xpath_str is compiled more than one time.

    Args:
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

    Returns:
        * result (bool, float, list, str): Results.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
    """
    # already compiled — nothing to do
    if isinstance(xpath_spec, XPath):
        return xpath_spec

    if isinstance(xpath_spec, str):
        compiled = _XPATH_CACHE.get(xpath_spec)
        if compiled is None:
            try:
                compiled = XPath(xpath_spec)
            except XPathSyntaxError as err:
                raise SearxXPathSyntaxException(xpath_spec, str(err.msg)) from err
            _XPATH_CACHE[xpath_spec] = compiled
        return compiled

    raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')
542
543
def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType):
    """Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
    See https://lxml.de/xpathxslt.html#xpath-return-values

    Args:
        * element (ElementBase): [description]
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

    Returns:
        * result (bool, float, list, str): Results.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: Raise when the XPath can't be evaluated.
    """
    compiled = get_xpath(xpath_spec)
    try:
        return compiled(element)
    except XPathError as err:
        # wrap the lxml error so the offending xpath_spec is recorded
        message = ' '.join(str(a) for a in err.args)
        raise SearxEngineXPathException(xpath_spec, message) from err
566
567
def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: Optional[int] = None):
    """Same as eval_xpath, check if the result is a list

    Args:
        * element (ElementBase): [description]
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
        * min_len (int, optional): [description]. Defaults to None.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: raise if the result is not a list

    Returns:
        * result (bool, float, list, str): Results.
    """
    result = eval_xpath(element, xpath_spec)
    if not isinstance(result, list):
        raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
    # optionally enforce a minimum number of results
    if min_len is not None and len(result) < min_len:
        raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
    return result
590
591
def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index: int, default=_NOTSET):
    """Call eval_xpath_list then get one element using the index parameter.
    If the index does not exist, either raise an exception if default is not
    set, otherwise return the default value (which can be None).

    Args:
        * elements (ElementBase): lxml element to apply the xpath.
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
        * index (int): index to get
        * default (Object, optional): Defaults if index doesn't exist.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: if the index is not found. Also see eval_xpath.

    Returns:
        * result (bool, float, list, str): Results.
    """
    result = eval_xpath_list(elements, xpath_spec)
    # negative indices count from the end, same as Python list indexing
    if -len(result) <= index < len(result):
        return result[index]
    # compare the sentinel by identity ("is", not "=="): _NOTSET is a
    # module-level singleton, identity is the intended test
    if default is _NOTSET:
        # raise an SearxEngineXPathException instead of IndexError
        # to record xpath_spec
        raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
    return default
619
620
def _get_fasttext_model() -> "fasttext.FastText._FastText":  # type: ignore
    """Lazily load the fasttext language-identification model and cache it in
    the module-level ``_FASTTEXT_MODEL`` global (loaded at most once)."""
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is None:
        # imported lazily so fasttext is only required when detection is used
        import fasttext  # pylint: disable=import-outside-toplevel

        # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
        fasttext.FastText.eprint = lambda x: None
        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
    return _FASTTEXT_MODEL
630
631
633 """
634 Converts a standard video URL into its embed format. Supported services include Youtube,
635 Facebook, Instagram, TikTok, Dailymotion, and Bilibili.
636 """
637 parsed_url = urlparse(url)
638 iframe_src = None
639
640 # YouTube
641 if parsed_url.netloc in ['www.youtube.com', 'youtube.com'] and parsed_url.path == '/watch' and parsed_url.query:
642 video_id = parse_qs(parsed_url.query).get('v', [])
643 if video_id:
644 iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]
645
646 # Facebook
647 elif parsed_url.netloc in ['www.facebook.com', 'facebook.com']:
648 encoded_href = urlencode({'href': url})
649 iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href
650
651 # Instagram
652 elif parsed_url.netloc in ['www.instagram.com', 'instagram.com'] and parsed_url.path.startswith('/p/'):
653 if parsed_url.path.endswith('/'):
654 iframe_src = url + 'embed'
655 else:
656 iframe_src = url + '/embed'
657
658 # TikTok
659 elif (
660 parsed_url.netloc in ['www.tiktok.com', 'tiktok.com']
661 and parsed_url.path.startswith('/@')
662 and '/video/' in parsed_url.path
663 ):
664 path_parts = parsed_url.path.split('/video/')
665 video_id = path_parts[1]
666 iframe_src = 'https://www.tiktok.com/embed/' + video_id
667
668 # Dailymotion
669 elif parsed_url.netloc in ['www.dailymotion.com', 'dailymotion.com'] and parsed_url.path.startswith('/video/'):
670 path_parts = parsed_url.path.split('/')
671 if len(path_parts) == 3:
672 video_id = path_parts[2]
673 iframe_src = 'https://www.dailymotion.com/embed/video/' + video_id
674
675 # Bilibili
676 elif parsed_url.netloc in ['www.bilibili.com', 'bilibili.com'] and parsed_url.path.startswith('/video/'):
677 path_parts = parsed_url.path.split('/')
678
679 video_id = path_parts[2]
680 param_key = None
681 if video_id.startswith('av'):
682 video_id = video_id[2:]
683 param_key = 'aid'
684 elif video_id.startswith('BV'):
685 param_key = 'bvid'
686
687 iframe_src = (
688 f'https://player.bilibili.com/player.html?{param_key}={video_id}&high_quality=1&autoplay=false&danmaku=0'
689 )
690
691 return iframe_src
692
693
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
    """Detect the language of the ``text`` parameter.

    :param str text: The string whose language is to be detected.

    :param float threshold: Threshold filters the returned labels by a threshold
        on probability.  A choice of 0.3 will return labels with at least 0.3
        probability.

    :param bool only_search_languages: If ``True``, returns only supported
        SearXNG search languages (:py:obj:`SEARCH_LANGUAGE_CODES`).

    :rtype: str, None
    :returns: The detected (ISO-639-3) language code or ``None``.

    :raises ValueError: If ``text`` is not a string.

    The detection uses the fastText `language identification model
    <https://fasttext.cc/docs/en/language-identification.html>`_ via the
    `fasttext-predict <https://github.com/searxng/fasttext-predict>`_ fork.
    Note that the model's ISO-639-3 codes do not map one-to-one onto
    SearXNG's locale model (e.g. the model reduces ``zh_Hant``/``zh_Hans``
    to plain ``zh``); passing ``only_search_languages=True`` harmonizes the
    result with the languages SearXNG engines actually support.
    """
    if not isinstance(text, str):
        raise ValueError('text must a str')
    prediction = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
    # expected shape: (labels, probabilities) with at least one entry each
    if not isinstance(prediction, tuple) or len(prediction) != 2:
        return None
    labels, scores = prediction
    if len(labels) == 0 or len(scores) == 0:
        return None
    language = labels[0].split('__label__')[1]
    if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
        return None
    return language
763
764
def js_variable_to_python(js_variable):
    """Convert a javascript variable into JSON and then load the value

    It does not deal with all cases, but it is good enough for now.
    chompjs has a better implementation.

    The input is tokenized on quote characters, JS-only constructs are
    rewritten into JSON-compatible ones (single-quoted strings, unquoted
    keys, ``void`` expressions, bare decimals), and the result is parsed
    with :py:obj:`json.loads`.
    """
    # when in_string is not None, it contains the character that has opened the string
    # either simple quote or double quote
    in_string = None
    # cut the string:
    # r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = re.split(r'(["\'])', js_variable)
    # previous part (to check the escape character antislash)
    previous_p = ""
    for i, p in enumerate(parts):
        # parse characters inside a ECMA string
        if in_string:
            # we are in a JS string: replace the colon by a temporary character
            # (chr(1)) so quote_keys_regex doesn't have to deal with colon
            # inside the JS strings; it is restored after all substitutions
            parts[i] = parts[i].replace(':', chr(1))
            if in_string == "'":
                # the JS string is delimited by simple quote.
                # This is not supported by JSON.
                # simple quote delimited string are converted to double quote delimited string
                # here, inside a JS string, we escape the double quote
                parts[i] = parts[i].replace('"', r'\"')

        # deal with delimiters and escape character
        if not in_string and p in ('"', "'"):
            # we are not in string
            # but p is double or simple quote
            # that's the start of a new string
            # replace simple quote by double quote
            # (JSON doesn't support simple quote)
            parts[i] = '"'
            in_string = p
            continue
        if p == in_string:
            # we are in a string and the current part MAY close the string
            if len(previous_p) > 0 and previous_p[-1] == '\\':
                # there is an antislash just before: the ECMA string continue
                continue
            # the current p close the string
            # replace simple quote by double quote
            parts[i] = '"'
            in_string = None

        if not in_string:
            # replace void 0 by null
            # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
            # we are sure there is no string in p
            parts[i] = _JS_VOID_RE.sub("null", p)
        # update previous_p
        previous_p = p
    # join the string
    s = ''.join(parts)
    # add quote around the key
    # { a: 12 }
    # becomes
    # { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # ": .5" becomes ":0.5" — JSON requires a leading digit
    s = _JS_DECIMAL_RE.sub(":0.", s)
    # replace the surrogate character (chr(1), see above) by colon
    s = s.replace(chr(1), ':')
    # load the JSON and return the result
    return json.loads(s)
833
834
def parse_duration_string(duration_str: str) -> timedelta | None:
    """Parse a time string in format MM:SS or HH:MM:SS and convert it to a `timedelta` object.

    Returns None if the provided string doesn't match any of the formats.
    """
    duration_str = duration_str.strip()

    if not duration_str:
        return None

    try:
        # prepending ["00"] here inits hours to 0 if they are not provided;
        # keep the LAST three parts — the previous slice "[:3]" kept the
        # first three, silently dropping the seconds of HH:MM:SS input
        time_parts = (["00"] + duration_str.split(":"))[-3:]
        hours, minutes, seconds = map(int, time_parts)
        return timedelta(hours=hours, minutes=minutes, seconds=seconds)

    except (ValueError, TypeError):
        # non-numeric part or wrong number of fields
        pass

    return None
handle_starttag(self, tag, attrs)
Definition utils.py:91
extr(str txt, str begin, str end, str default="")
Definition utils.py:355
Callable[[str], str] get_string_replaces_function(Dict[str, str] replaces)
Definition utils.py:487
js_variable_to_python(js_variable)
Definition utils.py:765
Optional[str] detect_language(str text, float threshold=0.3, bool only_search_languages=False)
Definition utils.py:694
XPath get_xpath(XPathSpecType xpath_spec)
Definition utils.py:512
str to_string(Any obj)
Definition utils.py:442
humanize_bytes(size, precision=2)
Definition utils.py:324
Optional[str] extract_text(xpath_results, bool allow_none=False)
Definition utils.py:193
str ecma_unescape(str string)
Definition utils.py:451
eval_xpath_getindex(ElementBase elements, XPathSpecType xpath_spec, int index, default=_NOTSET)
Definition utils.py:592
int convert_str_to_int(str number_str)
Definition utils.py:348
eval_xpath(ElementBase element, XPathSpecType xpath_spec)
Definition utils.py:544
str gen_useragent(Optional[str] os_string=None)
Definition utils.py:71
int int_or_zero(Union[List[str], str] num)
Definition utils.py:384
Optional[Tuple[bool, str, str]] is_valid_lang(lang)
Definition utils.py:396
"fasttext.FastText._FastText" _get_fasttext_model()
Definition utils.py:621
str extract_url(xpath_results, base_url)
Definition utils.py:271
eval_xpath_list(ElementBase element, XPathSpecType xpath_spec, Optional[int] min_len=None)
Definition utils.py:568
types.ModuleType load_module(str filename, str module_dir)
Definition utils.py:428
str searx_useragent()
Definition utils.py:64
str normalize_url(str url, str base_url)
Definition utils.py:221
get_embeded_stream_url(url)
Definition utils.py:632
str html_to_text(str html_str)
Definition utils.py:138
humanize_number(size, precision=0)
Definition utils.py:336
Dict dict_subset(MutableMapping dictionary, Set[str] properties)
Definition utils.py:312
timedelta|None parse_duration_string(str duration_str)
Definition utils.py:835
Dict get_engine_from_settings(str name)
Definition utils.py:497
str markdown_to_text(str markdown_str)
Definition utils.py:170
remove_pua_from_str(string)
Definition utils.py:472