94from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
174 """Get an actual ``sc`` argument from Startpage's search form (HTML page).
176 Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
177 <search_form_xpath>`. Without this argument Startpage considers the request
178 is from a bot. We do not know what is encoded in the value of the ``sc``
179 argument, but it seems to be a kind of a *timestamp*.
181 Startpage's search form generates a new sc-code on each request. This
182 function scrapes a new sc-code from Startpage's home page every
183 :py:obj:`sc_code_cache_sec` seconds."""
185 sc_code = CACHE.get(
"SC_CODE")
188 logger.debug(
"get_sc_code: using cached value: %s", sc_code)
191 headers = {**params[
'headers']}
194 if searxng_locale ==
'all':
195 searxng_locale =
'en-US'
196 locale = babel.Locale.parse(searxng_locale, sep=
'-')
198 if send_accept_language_header:
199 ac_lang = locale.language
201 ac_lang =
"%s-%s,%s;q=0.9,*;q=0.5" % (
206 headers[
'Accept-Language'] = ac_lang
208 get_sc_url = base_url +
'/'
209 logger.debug(
"get_sc_code: querying new sc timestamp @ %s", get_sc_url)
210 logger.debug(
"get_sc_code: request headers: %s", headers)
211 resp = get(get_sc_url, headers=headers)
217 if str(resp.url).startswith(
'https://www.startpage.com/sp/captcha'):
219 message=
"get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
222 dom = lxml.html.fromstring(resp.text)
225 sc_code = eval_xpath(dom, search_form_xpath +
'//input[@name="sc"]/@value')[0]
226 except IndexError
as exc:
227 logger.debug(
"suspend startpage API --> https://github.com/searxng/searxng/pull/695")
229 message=
"get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url,
232 sc_code = str(sc_code)
233 logger.debug(
"get_sc_code: new value is: %s", sc_code)
234 CACHE.set(key=
"SC_CODE", value=sc_code, expire=sc_code_cache_sec)
311 published_date =
None
314 if re.match(
r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
315 date_pos = content.find(
'...') + 4
316 date_string = content[0 : date_pos - 5]
318 content = content[date_pos:]
321 published_date = dateutil.parser.parse(date_string, dayfirst=
True)
326 elif re.match(
r"^[0-9]+ days? ago \.\.\. ", content):
327 date_pos = content.find(
'...') + 4
328 date_string = content[0 : date_pos - 5]
331 published_date = datetime.now() - timedelta(days=int(re.match(
r'\d+', date_string).group()))
334 content = content[date_pos:]
336 return content, published_date
374 url = result.get(
'altClickUrl')
379 if result.get(
'thumbnailUrl'):
380 thumbnailUrl = base_url + result[
'thumbnailUrl']
383 if result.get(
'width')
and result.get(
'height'):
384 resolution = f
"{result['width']}x{result['height']}"
387 if result.get(
'filesize'):
388 size_str =
''.join(filter(str.isdigit, result[
'filesize']))
389 filesize = humanize_bytes(int(size_str))
392 'template':
'images.html',
394 'title': html_to_text(result[
'title']),
396 'img_src': result.get(
'rawImageUrl'),
397 'thumbnail_src': thumbnailUrl,
398 'resolution': resolution,
399 'img_format': result.get(
'format'),
400 'filesize': filesize,
426 """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
427 regions>` from Startpage."""
431 'User-Agent': gen_useragent(),
432 'Accept-Language':
"en-US,en;q=0.5",
434 resp = get(
'https://www.startpage.com/do/settings', headers=headers)
437 print(
"ERROR: response from Startpage is not OK.")
439 dom = lxml.html.fromstring(resp.text)
444 for option
in dom.xpath(
'//form[@name="settings"]//select[@name="search_results_region"]/option'):
445 sp_region_names.append(option.get(
'value'))
447 for eng_tag
in sp_region_names:
450 babel_region_tag = {
'no_NO':
'nb_NO'}.get(eng_tag, eng_tag)
452 if '-' in babel_region_tag:
453 l, r = babel_region_tag.split(
'-')
455 sxng_tag = region_tag(babel.Locale.parse(l +
'_' + r, sep=
'_'))
459 sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep=
'_'))
461 except babel.UnknownLocaleError:
462 print(
"ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
465 conflict = engine_traits.regions.get(sxng_tag)
467 if conflict != eng_tag:
468 print(
"CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
470 engine_traits.regions[sxng_tag] = eng_tag
474 catalog_engine2code = {name.lower(): lang_code
for lang_code, name
in babel.Locale(
'en').languages.items()}
478 for lang_code
in filter(
lambda lang_code: lang_code.find(
'_') == -1, babel.localedata.locale_identifiers()):
479 native_name = babel.Locale(lang_code).get_language_name()
481 print(f
"ERROR: language name of startpage's language {lang_code} is unknown by babel")
483 native_name = native_name.lower()
485 catalog_engine2code[native_name] = lang_code
488 unaccented_name =
''.join(filter(
lambda c:
not combining(c), normalize(
'NFKD', native_name)))
489 if len(unaccented_name) == len(unaccented_name.encode()):
491 catalog_engine2code[unaccented_name] = lang_code
495 catalog_engine2code.update(
498 'fantizhengwen':
'zh_Hant',
512 for option
in dom.xpath(
'//form[@name="settings"]//select[@name="language"]/option'):
514 eng_tag = option.get(
'value')
515 if eng_tag
in skip_eng_tags:
517 name = extract_text(option).lower()
519 sxng_tag = catalog_engine2code.get(eng_tag)
521 sxng_tag = catalog_engine2code[name]
523 conflict = engine_traits.languages.get(sxng_tag)
525 if conflict != eng_tag:
526 print(
"CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
528 engine_traits.languages[sxng_tag] = eng_tag
get_sc_code(searxng_locale, params)
dict[str, t.Any]|None _get_image_result(result)
fetch_traits(EngineTraits engine_traits)
tuple[str, datetime|None] _parse_published_date(str content)