.oO SearXNG Developer Documentation Oo.
utils.py
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Utility functions for the engines
3
4"""
5import re
6import importlib
7import importlib.util
8import json
9import types
10
11from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
12from numbers import Number
13from os.path import splitext, join
14from random import choice
15from html.parser import HTMLParser
16from html import escape
17from urllib.parse import urljoin, urlparse
18from markdown_it import MarkdownIt
19
20from lxml import html
21from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError
22
23from searx import settings
24from searx.data import USER_AGENTS, data_dir
25from searx.version import VERSION_TAG
26from searx.sxng_locales import sxng_locales
27from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
28from searx import logger
29
30
31logger = logger.getChild('utils')
32
33XPathSpecType = Union[str, XPath]
34
35_BLOCKED_TAGS = ('script', 'style')
36
37_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
38_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
39
40_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
41_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
42_JS_DECIMAL_RE = re.compile(r":\s*\.")
43
44_STORAGE_UNIT_VALUE: Dict[str, int] = {
45 'TB': 1024 * 1024 * 1024 * 1024,
46 'GB': 1024 * 1024 * 1024,
47 'MB': 1024 * 1024,
48 'TiB': 1000 * 1000 * 1000 * 1000,
49 'GiB': 1000 * 1000 * 1000,
50 'MiB': 1000 * 1000,
51 'KiB': 1000,
52}
53
54_XPATH_CACHE: Dict[str, XPath] = {}
55_LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
56
57_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None # type: ignore
58"""fasttext model to predict laguage of a search term"""
59
60SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
61"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
62
63
64class _NotSetClass: # pylint: disable=too-few-public-methods
65 """Internal class for this module, do not create instance of this class.
66 Replaces the None value; allows None to be passed explicitly as a function argument"""
67
68
69_NOTSET = _NotSetClass()
70
71
72def searx_useragent() -> str:
73 """Return the searx User Agent"""
74 return 'searx/{searx_version} {suffix}'.format(
75 searx_version=VERSION_TAG, suffix=settings['outgoing']['useragent_suffix']
76 ).strip()
77
78
79def gen_useragent(os_string: Optional[str] = None) -> str:
80 """Return a random browser User Agent
81
82 See searx/data/useragents.json
83 """
84 return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
85
86
88 """Internal exception raised when the HTML is invalid"""
89
90
91class _HTMLTextExtractor(HTMLParser):
92 """Internal class to extract text from HTML"""
93
94 def __init__(self):
95 HTMLParser.__init__(self)
96 self.result = []
97 self.tags = []
98
99 def handle_starttag(self, tag, attrs):
100 self.tags.append(tag)
101 if tag == 'br':
102 self.result.append(' ')
103
104 def handle_endtag(self, tag):
105 if not self.tags:
106 return
107
108 if tag != self.tags[-1]:
109 raise _HTMLTextExtractorException()
110
111 self.tags.pop()
112
113 def is_valid_tag(self):
114 return not self.tags or self.tags[-1] not in _BLOCKED_TAGS
115
116 def handle_data(self, data):
117 if not self.is_valid_tag():
118 return
119 self.result.append(data)
120
121 def handle_charref(self, name):
122 if not self.is_valid_tag():
123 return
124 if name[0] in ('x', 'X'):
125 codepoint = int(name[1:], 16)
126 else:
127 codepoint = int(name)
128 self.result.append(chr(codepoint))
129
130 def handle_entityref(self, name):
131 if not self.is_valid_tag():
132 return
133 # codepoint = htmlentitydefs.name2codepoint[name]
134 # self.result.append(chr(codepoint))
135 self.result.append(name)
136
137 def get_text(self):
138 return ''.join(self.result).strip()
139
140 def error(self, message):
141 # error handler is needed in <py3.10
142 # https://github.com/python/cpython/pull/8562/files
143 raise AssertionError(message)
144
145
146def html_to_text(html_str: str) -> str:
147 """Extract text from a HTML string
148
149 Args:
150 * html_str (str): string HTML
151
152 Returns:
153 * str: extracted text
154
155 Examples:
156 >>> html_to_text('Example <span id="42">#2</span>')
157 'Example #2'
158
159 >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
160 'Example'
161
162 >>> html_to_text(r'regexp: (?<![a-zA-Z]')
163 'regexp: (?<![a-zA-Z]'
164 """
165 html_str = html_str.replace('\n', ' ').replace('\r', ' ')
166 html_str = ' '.join(html_str.split())
167 s = _HTMLTextExtractor()
168 try:
169 s.feed(html_str)
170 except AssertionError:
171 s = _HTMLTextExtractor()
172 s.feed(escape(html_str, quote=True))
173 except _HTMLTextExtractorException:
174 logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
175 return s.get_text()
176
177
178def markdown_to_text(markdown_str: str) -> str:
179 """Extract text from a Markdown string
180
181 Args:
182 * markdown_str (str): string Markdown
183
184 Returns:
185 * str: extracted text
186
187 Examples:
188 >>> markdown_to_text('[example](https://example.com)')
189 'example'
190
191 >>> markdown_to_text('## Headline')
192 'Headline'
193 """
194
195 html_str = (
196 MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(markdown_str)
197 )
198 return html_to_text(html_str)
199
200
201def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
202 """Extract text from a lxml result
203
204 * if xpath_results is a list, extract the text from each result and concat the list
205 * if xpath_results is an xml element, extract all the text nodes from it
206 ( text_content() method from lxml )
207 * if xpath_results is a string, then it's already done
208 """
209 if isinstance(xpath_results, list):
210 # it's a list of results: concat everything using a recursive call
211 result = ''
212 for e in xpath_results:
213 result = result + (extract_text(e) or '')
214 return result.strip()
215 if isinstance(xpath_results, ElementBase):
216 # it's an element
217 text: str = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
218 text = text.strip().replace('\n', ' ')
219 return ' '.join(text.split())
220 if isinstance(xpath_results, (str, Number, bool)):
221 return str(xpath_results)
222 if xpath_results is None and allow_none:
223 return None
224 if xpath_results is None and not allow_none:
225 raise ValueError('extract_text(None, allow_none=False)')
226 raise ValueError('unsupported type')
227
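# Usage sketch (illustrative, not part of the module): extract_text flattens
# the matched nodes into one whitespace-normalized string.
#
#   >>> dom = html.fromstring('<p>Hello <b>world</b></p>')
#   >>> extract_text(dom.xpath('//p'))
#   'Hello world'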
228
229def normalize_url(url: str, base_url: str) -> str:
230 """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path
231
232 Args:
233 * url (str): Relative URL
234 * base_url (str): Base URL, it must be an absolute URL.
235
236 Example:
237 >>> normalize_url('https://example.com', 'http://example.com/')
238 'https://example.com/'
239 >>> normalize_url('//example.com', 'http://example.com/')
240 'http://example.com/'
241 >>> normalize_url('//example.com', 'https://example.com/')
242 'https://example.com/'
243 >>> normalize_url('/path?a=1', 'https://example.com')
244 'https://example.com/path?a=1'
245 >>> normalize_url('', 'https://example.com')
246 'https://example.com/'
247 >>> normalize_url('/test', '/path')
248 raise ValueError
249
250 Raises:
251 * lxml.etree.ParserError
252
253 Returns:
254 * str: normalized URL
255 """
256 if url.startswith('//'):
257 # add http or https to this kind of url //example.com/
258 parsed_search_url = urlparse(base_url)
259 url = '{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
260 elif url.startswith('/'):
261 # fix relative url to the search engine
262 url = urljoin(base_url, url)
263
264 # fix relative urls that fall through the crack
265 if '://' not in url:
266 url = urljoin(base_url, url)
267
268 parsed_url = urlparse(url)
269
270 # add a / at the end of the url if there is no path
271 if not parsed_url.netloc:
272 raise ValueError('Cannot parse url')
273 if not parsed_url.path:
274 url += '/'
275
276 return url
277
278
279def extract_url(xpath_results, base_url) -> str:
280 """Extract and normalize URL from lxml Element
281
282 Args:
283 * xpath_results (Union[List[html.HtmlElement], html.HtmlElement]): lxml Element(s)
284 * base_url (str): Base URL
285
286 Example:
287 >>> def f(s, search_url):
288 >>> return searx.utils.extract_url(html.fromstring(s), search_url)
289 >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
290 'https://example.com/'
291 >>> f('https://example.com', 'http://example.com/')
292 'https://example.com/'
293 >>> f('//example.com', 'http://example.com/')
294 'http://example.com/'
295 >>> f('//example.com', 'https://example.com/')
296 'https://example.com/'
297 >>> f('/path?a=1', 'https://example.com')
298 'https://example.com/path?a=1'
299 >>> f('', 'https://example.com')
300 raise lxml.etree.ParserError
301 >>> searx.utils.extract_url([], 'https://example.com')
302 raise ValueError
303
304 Raises:
305 * ValueError
306 * lxml.etree.ParserError
307
308 Returns:
309 * str: normalized URL
310 """
311 if xpath_results == []:
312 raise ValueError('Empty url resultset')
313
314 url = extract_text(xpath_results)
315 if url:
316 return normalize_url(url, base_url)
317 raise ValueError('URL not found')
318
319
320def dict_subset(dictionary: MutableMapping, properties: Set[str]) -> Dict:
321 """Extract a subset of a dict
322
323 Examples:
324 >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
325 {'A': 'a', 'C': 'c'}
326 >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
327 {'A': 'a'}
328 """
329 return {k: dictionary[k] for k in properties if k in dictionary}
330
331
332def get_torrent_size(filesize: str, filesize_multiplier: str) -> Optional[int]:
333 """
334
335 Args:
336 * filesize (str): size
337 * filesize_multiplier (str): TB, GB, .... TiB, GiB...
338
339 Returns:
340 * int: number of bytes
341
342 Example:
343 >>> get_torrent_size('5', 'GB')
344 5368709120
345 >>> get_torrent_size('3.14', 'MiB')
346 3140000
347 """
348 try:
349 multiplier = _STORAGE_UNIT_VALUE.get(filesize_multiplier, 1)
350 return int(float(filesize) * multiplier)
351 except ValueError:
352 return None
353
354
355def humanize_bytes(size, precision=2):
356 """Determine the *human readable* value of bytes on 1024 base (1KB=1024B)."""
357 s = ['B ', 'KB', 'MB', 'GB', 'TB']
358
359 x = len(s)
360 p = 0
361 while size > 1024 and p < x:
362 p += 1
363 size = size / 1024.0
364 return "%.*f %s" % (precision, size, s[p])
365
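# Usage sketch (illustrative, not part of the module): the value is divided by
# 1024 until it fits the next unit.
#
#   >>> humanize_bytes(512)
#   '512.00 B '
#   >>> humanize_bytes(5 * 1024 * 1024)
#   '5.00 MB'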
366
367def convert_str_to_int(number_str: str) -> int:
368 """Convert number_str to int or 0 if number_str is not a number."""
369 if number_str.isdigit():
370 return int(number_str)
371 return 0
372
373
374def int_or_zero(num: Union[List[str], str]) -> int:
375 """Convert num to int or 0. num can be either a str or a list.
376 If num is a list, the first element is converted to int (or return 0 if the list is empty).
377 If num is a str, see convert_str_to_int
378 """
379 if isinstance(num, list):
380 if len(num) < 1:
381 return 0
382 num = num[0]
383 return convert_str_to_int(num)
384
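# Usage sketch (illustrative, not part of the module):
#
#   >>> int_or_zero('42')
#   42
#   >>> int_or_zero(['17', 'ignored'])
#   17
#   >>> int_or_zero([])
#   0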
385
386def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
387 """Return language code and name if lang describe a language.
388
389 Examples:
390 >>> is_valid_lang('zz')
391 None
392 >>> is_valid_lang('uk')
393 (True, 'uk', 'ukrainian')
394 >>> is_valid_lang(b'uk')
395 (True, 'uk', 'ukrainian')
396 >>> is_valid_lang('en')
397 (True, 'en', 'english')
398 >>> searx.utils.is_valid_lang('Español')
399 (True, 'es', 'spanish')
400 >>> searx.utils.is_valid_lang('Spanish')
401 (True, 'es', 'spanish')
402 """
403 if isinstance(lang, bytes):
404 lang = lang.decode()
405 is_abbr = len(lang) == 2
406 lang = lang.lower()
407 if is_abbr:
408 for l in sxng_locales:
409 if l[0][:2] == lang:
410 return (True, l[0][:2], l[3].lower())
411 return None
412 for l in sxng_locales:
413 if l[1].lower() == lang or l[3].lower() == lang:
414 return (True, l[0][:2], l[3].lower())
415 return None
416
417
418def load_module(filename: str, module_dir: str) -> types.ModuleType:
419 modname = splitext(filename)[0]
420 modpath = join(module_dir, filename)
421 # see https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
422 spec = importlib.util.spec_from_file_location(modname, modpath)
423 if not spec:
424 raise ValueError(f"Error loading '{modpath}' module")
425 module = importlib.util.module_from_spec(spec)
426 if not spec.loader:
427 raise ValueError(f"Error loading '{modpath}' module")
428 spec.loader.exec_module(module)
429 return module
430
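# Usage sketch (illustrative; the file name and directory are hypothetical):
# load a Python module directly from a file.
#
#   >>> mod = load_module('example_engine.py', '/path/to/engines')
#   >>> mod.__name__
#   'example_engine'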
431
432def to_string(obj: Any) -> str:
433 """Convert obj to its string representation."""
434 if isinstance(obj, str):
435 return obj
436 if hasattr(obj, '__str__'):
437 return str(obj)
438 return repr(obj)
439
440
441def ecma_unescape(string: str) -> str:
442 """Python implementation of the unescape javascript function
443
444 https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
445 https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
446
447 Examples:
448 >>> ecma_unescape('%u5409')
449 '吉'
450 >>> ecma_unescape('%20')
451 ' '
452 >>> ecma_unescape('%F3')
453 'ó'
454 """
455 # "%u5409" becomes "吉"
456 string = _ECMA_UNESCAPE4_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
457 # "%20" becomes " ", "%F3" becomes "ó"
458 string = _ECMA_UNESCAPE2_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
459 return string
460
461
462def get_string_replaces_function(replaces: Dict[str, str]) -> Callable[[str], str]:
463 rep = {re.escape(k): v for k, v in replaces.items()}
464 pattern = re.compile("|".join(rep.keys()))
465
466 def func(text):
467 return pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
468
469 return func
470
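# Usage sketch (illustrative, not part of the module): build the replacement
# function once, then apply it to many strings in a single regex pass.
#
#   >>> clean = get_string_replaces_function({'&#39;': "'", '\u00a0': ' '})
#   >>> clean('it&#39;s\u00a0here')
#   "it's here"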
471
472def get_engine_from_settings(name: str) -> Dict:
473 """Return engine configuration from settings.yml of a given engine name"""
474
475 if 'engines' not in settings:
476 return {}
477
478 for engine in settings['engines']:
479 if 'name' not in engine:
480 continue
481 if name == engine['name']:
482 return engine
483
484 return {}
485
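# Usage sketch (illustrative; the returned mapping depends on the local
# settings.yml):
#
#   >>> get_engine_from_settings('wikipedia').get('shortcut')
#   'wp'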
486
487def get_xpath(xpath_spec: XPathSpecType) -> XPath:
488 """Return cached compiled XPath
489
490 There is no thread lock.
491 Worst case scenario, xpath_str is compiled more than one time.
492
493 Args:
494 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
495
496 Returns:
497 * result (lxml.etree.XPath): compiled XPath object
498
499 Raises:
500 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
501 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
502 """
503 if isinstance(xpath_spec, str):
504 result = _XPATH_CACHE.get(xpath_spec, None)
505 if result is None:
506 try:
507 result = XPath(xpath_spec)
508 except XPathSyntaxError as e:
509 raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
510 _XPATH_CACHE[xpath_spec] = result
511 return result
512
513 if isinstance(xpath_spec, XPath):
514 return xpath_spec
515
516 raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')
517
518
519def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType):
520 """Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
521 See https://lxml.de/xpathxslt.html#xpath-return-values
522
523 Args:
524 * element (ElementBase): element on which the XPath is evaluated
525 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
526
527 Returns:
528 * result (bool, float, list, str): Results.
529
530 Raises:
531 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
532 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
533 * SearxEngineXPathException: Raise when the XPath can't be evaluated.
534 """
535 xpath = get_xpath(xpath_spec)
536 try:
537 return xpath(element)
538 except XPathError as e:
539 arg = ' '.join([str(i) for i in e.args])
540 raise SearxEngineXPathException(xpath_spec, arg) from e
541
542
543def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: Optional[int] = None):
544 """Same as eval_xpath, check if the result is a list
545
546 Args:
547 * element (ElementBase): element on which the XPath is evaluated
548 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath
549 * min_len (int, optional): minimum number of results expected. Defaults to None.
550
551 Raises:
552 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
553 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
554 * SearxEngineXPathException: raise if the result is not a list
555
556 Returns:
557 * result (bool, float, list, str): Results.
558 """
559 result = eval_xpath(element, xpath_spec)
560 if not isinstance(result, list):
561 raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
562 if min_len is not None and min_len > len(result):
563 raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
564 return result
565
566
567def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index: int, default=_NOTSET):
568 """Call eval_xpath_list then get one element using the index parameter.
569 If the index does not exist, either raise an exception if default is not set,
570 otherwise return the default value (can be None).
571
572 Args:
573 * elements (ElementBase): lxml element to apply the xpath.
574 * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath.
575 * index (int): index to get
576 * default (Object, optional): default value returned if the index doesn't exist.
577
578 Raises:
579 * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
580 * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
581 * SearxEngineXPathException: if the index is not found. Also see eval_xpath.
582
583 Returns:
584 * result (bool, float, list, str): Results.
585 """
586 result = eval_xpath_list(elements, xpath_spec)
587 if -len(result) <= index < len(result):
588 return result[index]
589 if default == _NOTSET:
590 # raise an SearxEngineXPathException instead of IndexError
591 # to record xpath_spec
592 raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
593 return default
594
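# Usage sketch (illustrative, not part of the module): the eval_xpath helpers
# share the compiled-XPath cache of get_xpath.
#
#   >>> dom = html.fromstring('<div><a href="/r/1">first</a><a href="/r/2">second</a></div>')
#   >>> len(eval_xpath_list(dom, '//a', min_len=1))
#   2
#   >>> eval_xpath_getindex(dom, '//a/@href', 0)
#   '/r/1'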
595
596def _get_fasttext_model() -> "fasttext.FastText._FastText": # type: ignore
597 global _FASTTEXT_MODEL # pylint: disable=global-statement
598 if _FASTTEXT_MODEL is None:
599 import fasttext # pylint: disable=import-outside-toplevel
600
601 # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
602 fasttext.FastText.eprint = lambda x: None
603 _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))
604 return _FASTTEXT_MODEL
605
606
607def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> Optional[str]:
608 """Detect the language of the ``text`` parameter.
609
610 :param str text: The string whose language is to be detected.
611
612 :param float threshold: Threshold filters the returned labels by a threshold
613 on probability. A choice of 0.3 will return labels with at least 0.3
614 probability.
615
616 :param bool only_search_languages: If ``True``, returns only supported
617 SearXNG search languages. see :py:obj:`searx.languages`
618
619 :rtype: str, None
620 :returns:
621 The detected language code or ``None``. See below.
622
623 :raises ValueError: If ``text`` is not a string.
624
625 The language detection is done by using `a fork`_ of the fastText_ library
626 (`python fasttext`_). fastText_ distributes the `language identification
627 model`_, for reference:
628
629 - `FastText.zip: Compressing text classification models`_
630 - `Bag of Tricks for Efficient Text Classification`_
631
632 The `language identification model`_ supports the language codes
633 (ISO-639-3)::
634
635 af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
636 bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
637 et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
638 id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
639 lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
640 nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
641 rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
642 tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
643
644 By using ``only_search_languages=True`` the `language identification model`_
645 is harmonized with SearXNG's language (locale) model. General
646 conditions of SearXNG's locale model are:
647
648 a. SearXNG's locale of a query is passed to the
649 :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
650 code that is used by an engine.
651
652 b. Most of SearXNG's engines do not support all the languages from `language
653 identification model`_ and there is also a discrepancy in the ISO-639-3
654 (fasttext) and ISO-639-2 (SearXNG) handling. Furthermore, in SearXNG the
655 locales like ``zh-TW`` (``zh-CN``) are mapped to ``zh_Hant``
656 (``zh_Hans``) while the `language identification model`_ reduces both to
657 ``zh``.
658
659 .. _a fork: https://github.com/searxng/fasttext-predict
660 .. _fastText: https://fasttext.cc/
661 .. _python fasttext: https://pypi.org/project/fasttext/
662 .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
663 .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
664 .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
665
666 """
667 if not isinstance(text, str):
668 raise ValueError('text must be a str')
669 r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)
670 if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:
671 language = r[0][0].split('__label__')[1]
672 if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
673 return None
674 return language
675 return None
676
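# Usage sketch (illustrative; requires the fasttext dependency and the bundled
# lid.176.ftz model, predictions are probabilistic):
#
#   >>> detect_language('Cette phrase est écrite en français.')
#   'fr'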
677
678def js_variable_to_python(js_variable):
679 """Convert a javascript variable into JSON and then load the value
680
681 It does not deal with all cases, but it is good enough for now.
682 chompjs has a better implementation.
683 """
684 # when in_string is not None, it contains the character that has opened the string
685 # either simple quote or double quote
686 in_string = None
687 # cut the string:
688 # r"""{ a:"f\"irst", c:'sec"ond'}"""
689 # becomes
690 # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
691 parts = re.split(r'(["\'])', js_variable)
693 # previous part (to check the escape character, a backslash)
693 previous_p = ""
694 for i, p in enumerate(parts):
695 # parse characters inside a ECMA string
696 if in_string:
697 # we are in a JS string: replace the colon by a temporary character
698 # so quote_keys_regex doesn't have to deal with colon inside the JS strings
699 parts[i] = parts[i].replace(':', chr(1))
700 if in_string == "'":
701 # the JS string is delimited by simple quote.
702 # This is not supported by JSON.
703 # simple quote delimited strings are converted to double quote delimited strings
704 # here, inside a JS string, we escape the double quote
705 parts[i] = parts[i].replace('"', r'\"')
706
707 # deal with delimiters and escape character
708 if not in_string and p in ('"', "'"):
709 # we are not in string
710 # but p is double or simple quote
711 # that's the start of a new string
712 # replace simple quote by double quote
713 # (JSON doesn't support simple quote)
714 parts[i] = '"'
715 in_string = p
716 continue
717 if p == in_string:
718 # we are in a string and the current part MAY close the string
719 if len(previous_p) > 0 and previous_p[-1] == '\\':
720 # there is a backslash just before: the ECMA string continues
721 continue
722 # the current p closes the string
723 # replace simple quote by double quote
724 parts[i] = '"'
725 in_string = None
726
727 if not in_string:
728 # replace void 0 by null
729 # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
730 # we are sure there is no string in p
731 parts[i] = _JS_VOID_RE.sub("null", p)
732 # update previous_p
733 previous_p = p
734 # join the string
735 s = ''.join(parts)
736 # add quote around the key
737 # { a: 12 }
738 # becomes
739 # { "a": 12 }
740 s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
741 s = _JS_DECIMAL_RE.sub(":0.", s)
742 # replace the surrogate character by a colon
743 s = s.replace(chr(1), ':')
744 # load the JSON and return the result
745 return json.loads(s)
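# Usage sketch (illustrative, not part of the module): object keys get quoted,
# void 0 becomes null, and the result is parsed with json.loads.
#
#   >>> js_variable_to_python('{ a: "1", b: void 0, c: 1.5 }')
#   {'a': '1', 'b': None, 'c': 1.5}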