169 """Get an actual ``sc`` argument from Startpage's search form (HTML page).
171 Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
172 <search_form_xpath>`. Without this argument Startpage considers the request
173 to be from a bot. We do not know what is encoded in the value of the ``sc``
174 argument, but it seems to be a kind of *time-stamp*.
176 Startpage's search form generates a new sc-code on each request. This
177 function scrapes a new sc-code from Startpage's home page every
178 :py:obj:`sc_code_cache_sec` seconds.
182 global sc_code_ts, sc_code
184 if sc_code
and (time() < (sc_code_ts + sc_code_cache_sec)):
185 logger.debug(
"get_sc_code: reuse '%s'", sc_code)
188 headers = {**params[
'headers']}
189 headers[
'Origin'] = base_url
190 headers[
'Referer'] = base_url +
'/'
197 if searxng_locale ==
'all':
198 searxng_locale =
'en-US'
199 locale = babel.Locale.parse(searxng_locale, sep=
'-')
201 if send_accept_language_header:
202 ac_lang = locale.language
204 ac_lang =
"%s-%s,%s;q=0.9,*;q=0.5" % (
209 headers[
'Accept-Language'] = ac_lang
211 get_sc_url = base_url +
'/?sc=%s' % (sc_code)
212 logger.debug(
"query new sc time-stamp ... %s", get_sc_url)
213 logger.debug(
"headers: %s", headers)
214 resp = get(get_sc_url, headers=headers)
220 if str(resp.url).startswith(
'https://www.startpage.com/sp/captcha'):
222 message=
"get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
225 dom = lxml.html.fromstring(resp.text)
228 sc_code = eval_xpath(dom, search_form_xpath +
'//input[@name="sc"]/@value')[0]
229 except IndexError
as exc:
230 logger.debug(
"suspend startpage API --> https://github.com/searxng/searxng/pull/695")
232 message=
"get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,
236 logger.debug(
"get_sc_code: new value is: %s", sc_code)
335 for result
in eval_xpath(dom,
'//div[@class="w-gl"]/div[contains(@class, "result")]'):
336 links = eval_xpath(result,
'.//a[contains(@class, "result-title result-link")]')
340 url = link.attrib.get(
'href')
343 if re.match(
r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
347 if re.match(
r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
350 title = extract_text(eval_xpath(link,
'h2'))
351 content = eval_xpath(result,
'.//p[contains(@class, "description")]')
352 content = extract_text(content, allow_none=
True)
or ''
354 published_date =
None
357 if re.match(
r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
358 date_pos = content.find(
'...') + 4
359 date_string = content[0 : date_pos - 5]
361 content = content[date_pos:]
364 published_date = dateutil.parser.parse(date_string, dayfirst=
True)
369 elif re.match(
r"^[0-9]+ days? ago \.\.\. ", content):
370 date_pos = content.find(
'...') + 4
371 date_string = content[0 : date_pos - 5]
374 published_date = datetime.now() - timedelta(days=int(re.match(
r'\d+', date_string).group()))
377 content = content[date_pos:]
381 results.append({
'url': url,
'title': title,
'content': content,
'publishedDate': published_date})
384 results.append({
'url': url,
'title': title,
'content': content})
391 """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
392 regions>` from Startpage."""
396 'User-Agent': gen_useragent(),
397 'Accept-Language':
"en-US,en;q=0.5",
399 resp = get(
'https://www.startpage.com/do/settings', headers=headers)
402 print(
"ERROR: response from Startpage is not OK.")
404 dom = lxml.html.fromstring(resp.text)
409 for option
in dom.xpath(
'//form[@name="settings"]//select[@name="search_results_region"]/option'):
410 sp_region_names.append(option.get(
'value'))
412 for eng_tag
in sp_region_names:
415 babel_region_tag = {
'no_NO':
'nb_NO'}.get(eng_tag, eng_tag)
417 if '-' in babel_region_tag:
418 l, r = babel_region_tag.split(
'-')
420 sxng_tag = region_tag(babel.Locale.parse(l +
'_' + r, sep=
'_'))
424 sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep=
'_'))
426 except babel.UnknownLocaleError:
427 print(
"ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
430 conflict = engine_traits.regions.get(sxng_tag)
432 if conflict != eng_tag:
433 print(
"CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
435 engine_traits.regions[sxng_tag] = eng_tag
439 catalog_engine2code = {name.lower(): lang_code
for lang_code, name
in babel.Locale(
'en').languages.items()}
443 for lang_code
in filter(
lambda lang_code: lang_code.find(
'_') == -1, babel.localedata.locale_identifiers()):
444 native_name = babel.Locale(lang_code).get_language_name()
446 print(f
"ERROR: language name of startpage's language {lang_code} is unknown by babel")
448 native_name = native_name.lower()
450 catalog_engine2code[native_name] = lang_code
453 unaccented_name =
''.join(filter(
lambda c:
not combining(c), normalize(
'NFKD', native_name)))
454 if len(unaccented_name) == len(unaccented_name.encode()):
456 catalog_engine2code[unaccented_name] = lang_code
460 catalog_engine2code.update(
463 'fantizhengwen':
'zh_Hant',
477 for option
in dom.xpath(
'//form[@name="settings"]//select[@name="language"]/option'):
479 eng_tag = option.get(
'value')
480 if eng_tag
in skip_eng_tags:
482 name = extract_text(option).lower()
484 sxng_tag = catalog_engine2code.get(eng_tag)
486 sxng_tag = catalog_engine2code[name]
488 conflict = engine_traits.languages.get(sxng_tag)
490 if conflict != eng_tag:
491 print(
"CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
493 engine_traits.languages[sxng_tag] = eng_tag
_request_cat_web(query, params)
get_sc_code(searxng_locale, params)
fetch_traits(EngineTraits engine_traits)