.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
utils.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Utility functions for the engines"""
3
4from __future__ import annotations
5
6import re
7import importlib
8import importlib.util
9import json
10import types
11
12from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
13from numbers import Number
14from os.path import splitext, join
15from random import choice
16from html.parser import HTMLParser
17from html import escape
18from urllib.parse import urljoin, urlparse, parse_qs, urlencode
19from datetime import timedelta
20from markdown_it import MarkdownIt
21
22from lxml import html
23from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError
24
25from searx import settings
26from searx.data import USER_AGENTS, data_dir
27from searx.version import VERSION_TAG
28from searx.sxng_locales import sxng_locales
29from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
30from searx import logger
31
32
# module-scoped child of the searx application logger
logger = logger.getChild('utils')

# an XPath expression may be passed either as its source string or pre-compiled
XPathSpecType = Union[str, XPath]
36
37_BLOCKED_TAGS = ('script', 'style')
38
39_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
40_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
41
42_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
43_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\‍([0-9]+\‍)')
44_JS_DECIMAL_RE = re.compile(r":\s*\.")
45
46_XPATH_CACHE: Dict[str, XPath] = {}
47_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
48
# lazily initialized in _get_fasttext_model(); stays None until first use
_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None # type: ignore
"""fasttext model to predict language of a search term"""

SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
54
55
class _NotSetClass: # pylint: disable=too-few-public-methods
    """Internal class for this module, do not create instance of this class.
    Replace the None value, allow explicitly pass None as a function argument"""


# sentinel value: distinguishes "argument omitted" from an explicit None
_NOTSET = _NotSetClass()
62
63
def searx_useragent() -> str:
    """Return the searx User Agent"""
    suffix = settings['outgoing']['useragent_suffix']
    # strip() removes the trailing space when no suffix is configured
    return f'searx/{VERSION_TAG} {suffix}'.strip()
69
70
def gen_useragent(os_string: Optional[str] = None) -> str:
    """Return a random browser User Agent

    See searx/data/useragents.json
    """
    platform = os_string or choice(USER_AGENTS['os'])
    browser_version = choice(USER_AGENTS['versions'])
    return USER_AGENTS['ua'].format(os=platform, version=browser_version)
77
78
80 """Internal exception raised when the HTML is invalid"""
81
82
83class _HTMLTextExtractor(HTMLParser):
84 """Internal class to extract text from HTML"""
85
86 def __init__(self):
87 HTMLParser.__init__(self)
88 self.result = []
89 self.tags = []
90
91 def handle_starttag(self, tag, attrs):
92 self.tags.append(tag)
93 if tag == 'br':
94 self.result.append(' ')
95
96 def handle_endtag(self, tag):
97 if not self.tags:
98 return
99
100 if tag != self.tags[-1]:
102
103 self.tags.pop()
104
105 def is_valid_tag(self):
106 return not self.tags or self.tags[-1] not in _BLOCKED_TAGS
107
108 def handle_data(self, data):
109 if not self.is_valid_tag():
110 return
111 self.result.append(data)
112
113 def handle_charref(self, name):
114 if not self.is_valid_tag():
115 return
116 if name[0] in ('x', 'X'):
117 codepoint = int(name[1:], 16)
118 else:
119 codepoint = int(name)
120 self.result.append(chr(codepoint))
121
122 def handle_entityref(self, name):
123 if not self.is_valid_tag():
124 return
125 # codepoint = htmlentitydefs.name2codepoint[name]
126 # self.result.append(chr(codepoint))
127 self.result.append(name)
128
129 def get_text(self):
130 return ''.join(self.result).strip()
131
132 def error(self, message):
133 # error handle is needed in <py3.10
134 # https://github.com/python/cpython/pull/8562/files
135 raise AssertionError(message)
136
137
def html_to_text(html_str: str) -> str:
    """Extract text from a HTML string

    Args:
        * html_str (str): string HTML

    Returns:
        * str: extracted text

    Examples:
        >>> html_to_text('Example <span id="42">#2</span>')
        'Example #2'

        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
        'Example'

        >>> html_to_text(r'regexp: (?<![a-zA-Z]')
        'regexp: (?<![a-zA-Z]'
    """
    if not html_str:
        return ""
    # normalize all whitespace to single spaces before parsing
    html_str = html_str.replace('\n', ' ').replace('\r', ' ')
    html_str = ' '.join(html_str.split())
    s = _HTMLTextExtractor()
    try:
        s.feed(html_str)
    except AssertionError:
        # the parser choked (see _HTMLTextExtractor.error): retry on an
        # escaped copy with a fresh parser so no partial state leaks in
        s = _HTMLTextExtractor()
        s.feed(escape(html_str, quote=True))
    except _HTMLTextExtractorException:
        # invalid HTML (mismatched tags): best effort, keep what was parsed
        logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
    return s.get_text()
170
171
def markdown_to_text(markdown_str: str) -> str:
    """Extract text from a Markdown string

    Args:
        * markdown_str (str): string Markdown

    Returns:
        * str: extracted text

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """
    # render the Markdown to HTML first, then strip the markup
    renderer = MarkdownIt("commonmark", {"typographer": True})
    renderer.enable(["replacements", "smartquotes"])
    return html_to_text(renderer.render(markdown_str))
193
194
def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
    """Extract text from a lxml result

    * if xpath_results is list, extract the text from each result and concat the list
    * if xpath_results is a xml element, extract all the text node from it
      ( text_content() method from lxml )
    * if xpath_results is a string element, then it's already done
    """
    if isinstance(xpath_results, list):
        # recursive call on every item, then concatenate
        return ''.join(extract_text(item) or '' for item in xpath_results).strip()
    if isinstance(xpath_results, ElementBase):
        # serialize the element to its text content and collapse whitespace
        serialized: str = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
        return ' '.join(serialized.strip().replace('\n', ' ').split())
    if isinstance(xpath_results, (str, Number, bool)):
        return str(xpath_results)
    if xpath_results is None:
        if allow_none:
            return None
        raise ValueError('extract_text(None, allow_none=False)')
    raise ValueError('unsupported type')
221
222
def normalize_url(url: str, base_url: str) -> str:
    """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path

    Args:
        * url (str): Relative URL
        * base_url (str): Base URL, it must be an absolute URL.

    Example:
        >>> normalize_url('https://example.com', 'http://example.com/')
        'https://example.com/'
        >>> normalize_url('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> normalize_url('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> normalize_url('/test', '/path')
        raise ValueError

    Raises:
        * ValueError: when the result has no network location

    Returns:
        * str: normalized URL
    """
    if url.startswith('//'):
        # protocol-relative URL: borrow the scheme from the base URL
        scheme = urlparse(base_url).scheme or 'http'
        url = '{0}:{1}'.format(scheme, url)
    elif url.startswith('/'):
        # path-relative to the search engine's site
        url = urljoin(base_url, url)

    # anything still lacking a scheme is resolved against the base URL
    if '://' not in url:
        url = urljoin(base_url, url)

    parsed = urlparse(url)
    if not parsed.netloc:
        raise ValueError('Cannot parse url')
    # ensure a non-empty path component
    if not parsed.path:
        url = url + '/'

    return url
271
272
def extract_url(xpath_results, base_url) -> str:
    """Extract and normalize URL from lxml Element

    Args:
        * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
        * base_url (str): Base URL

    Example:
        >>> def f(s, search_url):
        >>>    return searx.utils.extract_url(html.fromstring(s), search_url)
        >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
        'https://example.com/'
        >>> f('//example.com', 'https://example.com/')
        'https://example.com/'
        >>> searx.utils.extract_url([], 'https://example.com')
        raise ValueError

    Raises:
        * ValueError
        * lxml.etree.ParserError

    Returns:
        * str: normalized URL
    """
    # an empty resultset means the engine's XPath matched nothing
    if xpath_results == []:
        raise ValueError('Empty url resultset')

    extracted = extract_text(xpath_results)
    if not extracted:
        raise ValueError('URL not found')
    return normalize_url(extracted, base_url)
312
313
def dict_subset(dictionary: MutableMapping, properties: Set[str]) -> Dict:
    """Extract a subset of a dict.

    Keys listed in ``properties`` but missing from ``dictionary`` are ignored.

    Examples:
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
        {'A': 'a', 'C': 'c'}
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
        {'A': 'a'}
    """
    return {k: dictionary[k] for k in properties if k in dictionary}
324
325
def humanize_bytes(size, precision=2):
    """Determine the *human readable* value of bytes on 1024 base (1KB=1024B).

    :param size: number of bytes
    :param precision: number of decimal digits in the result
    :returns: formatted string, e.g. ``"2.00 KB"``; values beyond TB stay in TB
    """
    units = ['B ', 'KB', 'MB', 'GB', 'TB']

    p = 0
    # stop at the last unit: the previous bound (p < len) let p run one past
    # the table and raised IndexError for sizes >= 1024**5
    while size > 1024 and p < len(units) - 1:
        p += 1
        size = size / 1024.0
    return "%.*f %s" % (precision, size, units[p])
336
337
def humanize_number(size, precision=0):
    """Determine the *human readable* value of a decimal number.

    :param size: the number to format
    :param precision: number of decimal digits in the result
    :returns: formatted string, e.g. ``"2K"``; values beyond T stay in T
    """
    units = ['', 'K', 'M', 'B', 'T']

    p = 0
    # stop at the last unit: the previous bound (p < len) let p run one past
    # the table and raised IndexError for values >= 1000**5
    while size > 1000 and p < len(units) - 1:
        p += 1
        size = size / 1000.0
    return "%.*f%s" % (precision, size, units[p])
348
349
def convert_str_to_int(number_str: str) -> int:
    """Convert number_str to int or 0 if number_str is not a number."""
    # str.isdigit() rejects signs, decimal points and empty strings
    return int(number_str) if number_str.isdigit() else 0
355
356
357def extr(txt: str, begin: str, end: str, default: str = ""):
358 """Extract the string between ``begin`` and ``end`` from ``txt``
359
360 :param txt: String to search in
361 :param begin: First string to be searched for
362 :param end: Second string to be searched for after ``begin``
363 :param default: Default value if one of ``begin`` or ``end`` is not
364 found. Defaults to an empty string.
365 :return: The string between the two search-strings ``begin`` and ``end``.
366 If at least one of ``begin`` or ``end`` is not found, the value of
367 ``default`` is returned.
368
369 Examples:
370 >>> extr("abcde", "a", "e")
371 "bcd"
372 >>> extr("abcde", "a", "z", deafult="nothing")
373 "nothing"
374
375 """
376
377 # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129
378
379 try:
380 first = txt.index(begin) + len(begin)
381 return txt[first : txt.index(end, first)]
382 except ValueError:
383 return default
384
385
def int_or_zero(num: Union[List[str], str]) -> int:
    """Convert num to int or 0. num can be either a str or a list.
    If num is a list, the first element is converted to int (or return 0 if the list is empty).
    If num is a str, see convert_str_to_int
    """
    if isinstance(num, list):
        # empty list: nothing to convert
        return convert_str_to_int(num[0]) if num else 0
    return convert_str_to_int(num)
396
397
def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
    """Return language code and name if lang describe a language.

    Examples:
        >>> is_valid_lang('zz')
        None
        >>> is_valid_lang('uk')
        (True, 'uk', 'ukrainian')
        >>> is_valid_lang(b'uk')
        (True, 'uk', 'ukrainian')
        >>> searx.utils.is_valid_lang('Español')
        (True, 'es', 'spanish')
        >>> searx.utils.is_valid_lang('Spanish')
        (True, 'es', 'spanish')
    """
    if isinstance(lang, bytes):
        lang = lang.decode()
    # two characters: treat the input as an ISO-639-1 abbreviation
    lookup_by_code = len(lang) == 2
    lang = lang.lower()
    for locale in sxng_locales:
        code = locale[0][:2]
        english_name = locale[3].lower()
        if lookup_by_code:
            if code == lang:
                return (True, code, english_name)
        elif lang in (locale[1].lower(), english_name):
            return (True, code, english_name)
    return None
428
429
def load_module(filename: str, module_dir: str) -> types.ModuleType:
    """Load the Python source file ``module_dir``/``filename`` as a module.

    :raises ValueError: when no import spec or loader can be created for the file
    """
    module_name = splitext(filename)[0]
    module_path = join(module_dir, filename)
    # https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if spec is None:
        raise ValueError(f"Error loading '{module_path}' module")
    module = importlib.util.module_from_spec(spec)
    if spec.loader is None:
        raise ValueError(f"Error loading '{module_path}' module")
    spec.loader.exec_module(module)
    return module
442
443
def to_string(obj: Any) -> str:
    """Convert obj to its string representation."""
    if isinstance(obj, str):
        return obj
    # every object inherits __str__ from object, so repr() is a last resort
    return str(obj) if hasattr(obj, '__str__') else repr(obj)
451
452
def ecma_unescape(string: str) -> str:
    """Python implementation of the unescape javascript function

    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
    https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape

    Examples:
        >>> ecma_unescape('%u5409')
        '吉'
        >>> ecma_unescape('%20')
        ' '
        >>> ecma_unescape('%F3')
        'ó'
    """

    def _decode(match):
        # both escape forms carry the hex code point in group 1
        return chr(int(match.group(1), 16))

    # "%uXXXX" form first, then the two-digit "%XX" form
    for pattern in (_ECMA_UNESCAPE4_RE, _ECMA_UNESCAPE2_RE):
        string = pattern.sub(_decode, string)
    return string
472
473
475 """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.
476
477 .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
478 """
479 pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))
480 s = []
481 for c in string:
482 i = ord(c)
483 if any(a <= i <= b for (a, b) in pua_ranges):
484 continue
485 s.append(c)
486 return "".join(s)
487
488
def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
    """Build a function replacing every key of ``replaces`` by its value in one pass."""
    escaped = {re.escape(old): new for old, new in replaces.items()}
    # single alternation pattern covering all keys
    combined = re.compile("|".join(escaped))

    def _replace(text):
        return combined.sub(lambda match: escaped[re.escape(match.group(0))], text)

    return _replace
497
498
def get_engine_from_settings(name: str) -> Dict:
    """Return engine configuration from settings.yml of a given engine name"""

    if 'engines' not in settings:
        return {}

    # linear scan: the engine list is small and this is not a hot path
    for engine in settings['engines']:
        if 'name' in engine and engine['name'] == name:
            return engine

    return {}
512
513
def get_xpath(xpath_spec: XPathSpecType) -> XPath:
    """Return cached compiled XPath

    There is no thread lock.
    Worst case scenario, xpath_str is compiled more than one time.

    Args:
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

    Returns:
        * lxml.etree.XPath: the compiled XPath

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
    """
    # already compiled: nothing to do
    if isinstance(xpath_spec, XPath):
        return xpath_spec

    if not isinstance(xpath_spec, str):
        raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')

    compiled = _XPATH_CACHE.get(xpath_spec)
    if compiled is None:
        try:
            compiled = XPath(xpath_spec)
        except XPathSyntaxError as e:
            raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
        _XPATH_CACHE[xpath_spec] = compiled
    return compiled
544
545
def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType):
    """Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
    See https://lxml.de/xpathxslt.html#xpath-return-values

    Args:
        * element (ElementBase): element the XPath is applied to
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

    Returns:
        * result (bool, float, list, str): Results.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: Raise when the XPath can't be evaluated.
    """
    compiled = get_xpath(xpath_spec)
    try:
        return compiled(element)
    except XPathError as e:
        # wrap the lxml error so the failing xpath_spec is recorded
        details = ' '.join(str(arg) for arg in e.args)
        raise SearxEngineXPathException(xpath_spec, details) from e
568
569
def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: Optional[int] = None):
    """Same as eval_xpath, check if the result is a list

    Args:
        * element (ElementBase): element the XPath is applied to
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
        * min_len (int, optional): minimum length the result list must have. Defaults to None.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: raise if the result is not a list

    Returns:
        * list: Results.
    """
    result = eval_xpath(element, xpath_spec)
    if not isinstance(result, list):
        raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
    if min_len is not None and len(result) < min_len:
        raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
    return result
592
593
def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index: int, default=_NOTSET):
    """Call eval_xpath_list then get one element using the index parameter.
    If the index does not exist, either raise an exception if default is not set,
    otherwise return the default value (can be None).

    Args:
        * elements (ElementBase): lxml element to apply the xpath.
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
        * index (int): index to get
        * default (Object, optional): Defaults if index doesn't exist.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
        * SearxEngineXPathException: if the index is not found. Also see eval_xpath.

    Returns:
        * result (bool, float, list, str): Results.
    """
    result = eval_xpath_list(elements, xpath_spec)
    if -len(result) <= index < len(result):
        return result[index]
    # identity check: _NOTSET is a sentinel, so `is` (not `==`) is the correct
    # comparison and stays robust even for defaults defining a custom __eq__
    if default is _NOTSET:
        # raise an SearxEngineXPathException instead of IndexError
        # to record xpath_spec
        raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
    return default
621
622
def _get_fasttext_model() -> "fasttext.FastText._FastText": # type: ignore
    """Lazily load and cache the fasttext language-identification model.

    The model file ``lid.176.ftz`` is loaded from the data directory on first
    use and kept in the module-level ``_FASTTEXT_MODEL`` cache.
    """
    global _FASTTEXT_MODEL # pylint: disable=global-statement
    if _FASTTEXT_MODEL is None:
        # imported lazily so fasttext is only required when detection is used
        import fasttext # pylint: disable=import-outside-toplevel

        # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
        fasttext.FastText.eprint = lambda x: None
        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
    return _FASTTEXT_MODEL
632
633
635 """
636 Converts a standard video URL into its embed format. Supported services include Youtube,
637 Facebook, Instagram, TikTok, Dailymotion, and Bilibili.
638 """
639 parsed_url = urlparse(url)
640 iframe_src = None
641
642 # YouTube
643 if parsed_url.netloc in ['www.youtube.com', 'youtube.com'] and parsed_url.path == '/watch' and parsed_url.query:
644 video_id = parse_qs(parsed_url.query).get('v', [])
645 if video_id:
646 iframe_src = 'https://www.youtube-nocookie.com/embed/' + video_id[0]
647
648 # Facebook
649 elif parsed_url.netloc in ['www.facebook.com', 'facebook.com']:
650 encoded_href = urlencode({'href': url})
651 iframe_src = 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href
652
653 # Instagram
654 elif parsed_url.netloc in ['www.instagram.com', 'instagram.com'] and parsed_url.path.startswith('/p/'):
655 if parsed_url.path.endswith('/'):
656 iframe_src = url + 'embed'
657 else:
658 iframe_src = url + '/embed'
659
660 # TikTok
661 elif (
662 parsed_url.netloc in ['www.tiktok.com', 'tiktok.com']
663 and parsed_url.path.startswith('/@')
664 and '/video/' in parsed_url.path
665 ):
666 path_parts = parsed_url.path.split('/video/')
667 video_id = path_parts[1]
668 iframe_src = 'https://www.tiktok.com/embed/' + video_id
669
670 # Dailymotion
671 elif parsed_url.netloc in ['www.dailymotion.com', 'dailymotion.com'] and parsed_url.path.startswith('/video/'):
672 path_parts = parsed_url.path.split('/')
673 if len(path_parts) == 3:
674 video_id = path_parts[2]
675 iframe_src = 'https://www.dailymotion.com/embed/video/' + video_id
676
677 # Bilibili
678 elif parsed_url.netloc in ['www.bilibili.com', 'bilibili.com'] and parsed_url.path.startswith('/video/'):
679 path_parts = parsed_url.path.split('/')
680
681 video_id = path_parts[2]
682 param_key = None
683 if video_id.startswith('av'):
684 video_id = video_id[2:]
685 param_key = 'aid'
686 elif video_id.startswith('BV'):
687 param_key = 'bvid'
688
689 iframe_src = (
690 f'https://player.bilibili.com/player.html?{param_key}={video_id}&high_quality=1&autoplay=false&danmaku=0'
691 )
692
693 return iframe_src
694
695
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
    """Detect the language of the ``text`` parameter.

    :param str text: The string whose language is to be detected.

    :param float threshold: Threshold filters the returned labels by a threshold
        on probability.  A choice of 0.3 will return labels with at least 0.3
        probability.

    :param bool only_search_languages: If ``True``, returns only supported
        SearXNG search languages.  see :py:obj:`searx.languages`

    :rtype: str, None
    :returns:
        The detected language code or ``None``. See below.

    :raises ValueError: If ``text`` is not a string.

    The language detection is done by using `a fork`_ of the fastText_ library
    (`python fasttext`_). fastText_ distributes the `language identification
    model`_, for reference:

    - `FastText.zip: Compressing text classification models`_
    - `Bag of Tricks for Efficient Text Classification`_

    The `language identification model`_ supports the language codes
    (ISO-639-3)::

        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
        bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
        et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
        id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
        lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
        nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
        rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
        tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh

    By using ``only_search_languages=True`` the `language identification model`_
    is harmonized with the SearXNG's language (locale) model.  General
    conditions of SearXNG's locale model are:

    a. SearXNG's locale of a query is passed to the
       :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
       code that is used by an engine.

    b. Most of SearXNG's engines do not support all the languages from `language
       identification model`_ and there is also a discrepancy in the ISO-639-3
       (fasttext) and ISO-639-2 (SearXNG) handling.  Further more, in SearXNG
       the locales like ``zh-TH`` (``zh-CN``) are mapped to ``zh_Hant``
       (``zh_Hans``) while the `language identification model`_ reduce both to
       ``zh``.

    .. _a fork: https://github.com/searxng/fasttext-predict
    .. _fastText: https://fasttext.cc/
    .. _python fasttext: https://pypi.org/project/fasttext/
    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651

    """
    if not isinstance(text, str):
        # fix of the former (ungrammatical) message 'text must a str'
        raise ValueError('text must be a str')
    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
        # labels look like '__label__en'; keep only the code after the prefix
        language = r[0][0].split('__label__')[1]
        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
            return None
        return language
    return None
765
766
def js_variable_to_python(js_variable):
    """Convert a javascript variable into JSON and then load the value

    It does not deal with all cases, but it is good enough for now.
    chompjs has a better implementation.

    :param js_variable: text of a JS object/array literal (single- or
        double-quoted strings, unquoted keys, ``void 0`` for undefined)
    :returns: the value produced by :py:func:`json.loads` after rewriting
        the literal into valid JSON
    """
    # when in_string is not None, it contains the character that has opened the string
    # either simple quote or double quote
    in_string = None
    # cut the string:
    # r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = re.split(r'(["\'])', js_variable)
    # previous part (to check the escape character antislash)
    previous_p = ""
    for i, p in enumerate(parts):
        # parse characters inside a ECMA string
        if in_string:
            # we are in a JS string: replace the colon by a temporary character
            # so quote_keys_regex doesn't have to deal with colon inside the JS strings
            parts[i] = parts[i].replace(':', chr(1))
            if in_string == "'":
                # the JS string is delimited by simple quote.
                # This is not supported by JSON.
                # simple quote delimited string are converted to double quote delimited string
                # here, inside a JS string, we escape the double quote
                parts[i] = parts[i].replace('"', r'\"')

        # deal with delimiters and escape character
        if not in_string and p in ('"', "'"):
            # we are not in string
            # but p is double or simple quote
            # that's the start of a new string
            # replace simple quote by double quote
            # (JSON doesn't support simple quote)
            parts[i] = '"'
            in_string = p
            continue
        if p == in_string:
            # we are in a string and the current part MAY close the string
            if len(previous_p) > 0 and previous_p[-1] == '\\':
                # there is an antislash just before: the ECMA string continue
                continue
            # the current p close the string
            # replace simple quote by double quote
            parts[i] = '"'
            in_string = None

        if not in_string:
            # replace void 0 by null
            # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
            # we are sure there is no string in p
            parts[i] = _JS_VOID_RE.sub("null", p)
        # update previous_p
        previous_p = p
    # join the string
    s = ''.join(parts)
    # add quote around the key
    # { a: 12 }
    # becomes
    # { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # ": ." -> ":0." so JSON accepts leading-dot decimals
    s = _JS_DECIMAL_RE.sub(":0.", s)
    # replace the surogate character by colon
    s = s.replace(chr(1), ':')
    # replace single-quote followed by comma with double-quote and comma
    # {"a": "\"12\"',"b": "13"}
    # becomes
    # {"a": "\"12\"","b": "13"}
    s = s.replace("',", "\",")
    # load the JSON and return the result
    return json.loads(s)
840
841
842def parse_duration_string(duration_str: str) -> timedelta | None:
843 """Parse a time string in format MM:SS or HH:MM:SS and convert it to a `timedelta` object.
844
845 Returns None if the provided string doesn't match any of the formats.
846 """
847 duration_str = duration_str.strip()
848
849 if not duration_str:
850 return None
851
852 try:
853 # prepending ["00"] here inits hours to 0 if they are not provided
854 time_parts = (["00"] + duration_str.split(":"))[:3]
855 hours, minutes, seconds = map(int, time_parts)
856 return timedelta(hours=hours, minutes=minutes, seconds=seconds)
857
858 except (ValueError, TypeError):
859 pass
860
861 return None
handle_starttag(self, tag, attrs)
Definition utils.py:91
extr(str txt, str begin, str end, str default="")
Definition utils.py:357
Callable[[str], str] get_string_replaces_function(Dict[str, str] replaces)
Definition utils.py:489
js_variable_to_python(js_variable)
Definition utils.py:767
Optional[str] detect_language(str text, float threshold=0.3, bool only_search_languages=False)
Definition utils.py:696
XPath get_xpath(XPathSpecType xpath_spec)
Definition utils.py:514
str to_string(Any obj)
Definition utils.py:444
humanize_bytes(size, precision=2)
Definition utils.py:326
Optional[str] extract_text(xpath_results, bool allow_none=False)
Definition utils.py:195
str ecma_unescape(str string)
Definition utils.py:453
eval_xpath_getindex(ElementBase elements, XPathSpecType xpath_spec, int index, default=_NOTSET)
Definition utils.py:594
int convert_str_to_int(str number_str)
Definition utils.py:350
eval_xpath(ElementBase element, XPathSpecType xpath_spec)
Definition utils.py:546
str gen_useragent(Optional[str] os_string=None)
Definition utils.py:71
int int_or_zero(Union[List[str], str] num)
Definition utils.py:386
Optional[Tuple[bool, str, str]] is_valid_lang(lang)
Definition utils.py:398
"fasttext.FastText._FastText" _get_fasttext_model()
Definition utils.py:623
str extract_url(xpath_results, base_url)
Definition utils.py:273
eval_xpath_list(ElementBase element, XPathSpecType xpath_spec, Optional[int] min_len=None)
Definition utils.py:570
types.ModuleType load_module(str filename, str module_dir)
Definition utils.py:430
str searx_useragent()
Definition utils.py:64
str normalize_url(str url, str base_url)
Definition utils.py:223
get_embeded_stream_url(url)
Definition utils.py:634
str html_to_text(str html_str)
Definition utils.py:138
humanize_number(size, precision=0)
Definition utils.py:338
Dict dict_subset(MutableMapping dictionary, Set[str] properties)
Definition utils.py:314
timedelta|None parse_duration_string(str duration_str)
Definition utils.py:842
Dict get_engine_from_settings(str name)
Definition utils.py:499
str markdown_to_text(str markdown_str)
Definition utils.py:172
remove_pua_from_str(string)
Definition utils.py:474