# SPDX-License-Identifier: AGPL-3.0-or-later
"""Startpage's language & region selectors are a mess ..

.. _startpage regions:

Startpage regions
=================

In the list of regions there are tags we need to map to common region tags::

    pt-BR_BR --> pt_BR
    zh-CN_CN --> zh_Hans_CN
    zh-TW_TW --> zh_Hant_TW
    zh-TW_HK --> zh_Hant_HK
    en-GB_GB --> en_GB

and there is at least one tag with a three letter language tag (ISO 639-2)::

    fil_PH --> fil_PH

The locale code ``no_NO`` from Startpage does not exist and is mapped to
``nb-NO``::

    babel.core.UnknownLocaleError: unknown locale 'no_NO'

For reference see the language-subtag-registry at IANA; ``no`` is the
macrolanguage [1]_ and W3C recommends subtag over macrolanguage [2]_.

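A sketch (not engine code) of how one of the raw region tags listed above can
be normalized with babel; :py:obj:`fetch_traits` implements this mapping::

    import babel

    eng_tag = 'pt-BR_BR'                    # raw tag from Startpage
    l, r = eng_tag.split('-')               # 'pt', 'BR_BR'
    r = r.split('_')[-1]                    # 'BR'
    print(babel.Locale.parse(l + '_' + r, sep='_'))   # pt_BR
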
.. [1] `iana: language-subtag-registry
   <https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry>`_ ::

       type: language
       Subtag: nb
       Description: Norwegian Bokmål
       Added: 2005-10-16
       Suppress-Script: Latn
       Macrolanguage: no

.. [2]
   Use macrolanguages with care.  Some language subtags have a Scope field set
   to macrolanguage, i.e. this primary language subtag encompasses a number of
   more specific primary language subtags in the registry.  ...  As we
   recommended for the collection subtags mentioned above, in most cases you
   should try to use the more specific subtags ...  `W3: The primary language
   subtag
   <https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag>`_

.. _startpage languages:

Startpage languages
===================

:py:obj:`send_accept_language_header`:
   The displayed name in Startpage's settings page depends on the location of
   the IP when the ``Accept-Language`` HTTP header is unset.  In
   :py:obj:`fetch_traits` we use::

       'Accept-Language': "en-US,en;q=0.5",
       ..

   to get uniform names independent of the IP.

.. _startpage categories:

Startpage categories
====================

Startpage's category (for Web-search, News, Videos, ..) is set by
:py:obj:`startpage_categ` in settings.yml::

  - name: startpage
    engine: startpage
    startpage_categ: web
    ...

.. hint::

   Supported categories are ``web``, ``news`` and ``images``.

"""
# pylint: disable=too-many-statements
from __future__ import annotations

from typing import TYPE_CHECKING, Any
from collections import OrderedDict
import re
from unicodedata import normalize, combining
from datetime import datetime, timedelta
from json import loads

import dateutil.parser
import lxml.html
import babel.localedata

from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
from searx.network import get  # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import region_tag
from searx.enginelib.traits import EngineTraits
from searx.enginelib import EngineCache

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
about = {
    "website": 'https://startpage.com',
    "wikidata_id": 'Q2333295',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""

send_accept_language_header = True
"""Startpage tries to guess the user's language and territory from the HTTP
``Accept-Language`` header.  Optionally the user can select a search language
(which can be different from the UI language) and a region filter.
"""

# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 18
"""Tested a maximum of 18 pages (argument ``page``), so ``max_page`` is set to 18."""

time_range_support = True
safesearch = True

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
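# Startpage's "family filter" knows only two states, hence SearXNG's levels 1
# (moderate) and 2 (strict) map to the same value; the value is sent in the
# ``disable_family_filter`` cookie assembled in request().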
safesearch_dict = {0: '0', 1: '1', 2: '1'}

# search-url
base_url = 'https://www.startpage.com'
search_url = base_url + '/sp/search'

# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's original search form

.. code:: html

    <form action="/sp/search" method="post">
      <input type="text" name="query" value="" ..>
      <input type="hidden" name="t" value="device">
      <input type="hidden" name="lui" value="english">
      <input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
      <input type="hidden" name="cat" value="web">
      <input type="hidden" class="abp" id="abp-input" name="abp" value="1">
    </form>
"""


CACHE: EngineCache
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
seconds."""


def init(_):
    global CACHE  # pylint: disable=global-statement

    # hint: all three startpage engines (WEB, Images & News) can/should use the
    # same sc_code ..
    CACHE = EngineCache("startpage")  # type:ignore


sc_code_cache_sec = 3600
"""Time in seconds the sc-code is cached (see :py:obj:`get_sc_code`)."""


def get_sc_code(searxng_locale, params):
    """Get an up-to-date ``sc`` argument from Startpage's search form (HTML page).

    Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
    <search_form_xpath>`.  Without this argument Startpage considers the
    request to be from a bot.  We do not know what is encoded in the value of
    the ``sc`` argument, but it seems to be a kind of *time-stamp*.

    Startpage's search form generates a new sc-code on each request.  This
    function scrapes a new sc-code from Startpage's home page every
    :py:obj:`sc_code_cache_sec` seconds."""

    sc_code = CACHE.get("SC_CODE", "")
    if sc_code:
        return sc_code

    headers = {**params['headers']}
    headers['Origin'] = base_url
    headers['Referer'] = base_url + '/'
    # headers['Connection'] = 'keep-alive'
    # headers['Accept-Encoding'] = 'gzip, deflate, br'
    # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
    # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'

    # add Accept-Language header
    if searxng_locale == 'all':
        searxng_locale = 'en-US'
    locale = babel.Locale.parse(searxng_locale, sep='-')

    if send_accept_language_header:
        ac_lang = locale.language
        if locale.territory:
            ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
                locale.language,
                locale.territory,
                locale.language,
            )
        headers['Accept-Language'] = ac_lang
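        # e.g. for searxng_locale 'de-DE' the resulting header is
        # (illustrative): Accept-Language: de-DE,de;q=0.9,*;q=0.5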

    get_sc_url = base_url + '/?sc=%s' % (sc_code)
    logger.debug("query new sc time-stamp ... %s", get_sc_url)
    logger.debug("headers: %s", headers)
    resp = get(get_sc_url, headers=headers)

    # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
    # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
    # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21

    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):  # type: ignore
        raise SearxEngineCaptchaException(
            message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
        )

    dom = lxml.html.fromstring(resp.text)  # type: ignore

    try:
        sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
    except IndexError as exc:
        logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
        raise SearxEngineCaptchaException(
            message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,  # type: ignore
        ) from exc

    sc_code = str(sc_code)
    logger.debug("get_sc_code: new value is: %s", sc_code)
    CACHE.set(key="SC_CODE", value=sc_code, expire=sc_code_cache_sec)
    return sc_code


def request(query, params):
    """Assemble a Startpage request.

    To avoid a CAPTCHA we need to send a well-formed HTTP POST request with a
    cookie.  We need to form a request that is identical to the request built
    by Startpage's search form:

    - in the cookie the **region** is selected
    - in the HTTP POST data the **language** is selected

    Additionally, the arguments from Startpage's search form need to be set in
    the HTTP POST data / compare the ``<input>`` elements: :py:obj:`search_form_xpath`.
    """
    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    # build arguments
    args = {
        'query': query,
        'cat': startpage_categ,
        't': 'device',
        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers
        'with_date': time_range_dict.get(params['time_range'], ''),
    }

    if engine_language:
        args['language'] = engine_language
        args['lui'] = engine_language

    args['abp'] = '1'
    if params['pageno'] > 1:
        args['page'] = params['pageno']

    # build cookie
    lang_homepage = 'en'
    cookie = OrderedDict()
    cookie['date_time'] = 'world'
    cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
    cookie['disable_open_in_new_window'] = '0'
    cookie['enable_post_method'] = '1'  # hint: POST
    cookie['enable_proxy_safety_suggest'] = '1'
    cookie['enable_stay_control'] = '1'
    cookie['instant_answers'] = '1'
    cookie['lang_homepage'] = 's/device/%s/' % lang_homepage
    cookie['num_of_results'] = '10'
    cookie['suggestions'] = '1'
    cookie['wt_unit'] = 'celsius'

    if engine_language:
        cookie['language'] = engine_language
        cookie['language_ui'] = engine_language

    if engine_region:
        cookie['search_results_region'] = engine_region

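    # Serialize the preferences cookie: 'EEE' joins a key with its value and
    # 'N1N' joins the items, e.g. (illustrative):
    #   date_timeEEEworldN1Ndisable_family_filterEEE0N1N..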
    params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
    logger.debug('cookie preferences: %s', params['cookies']['preferences'])

    # POST request
    logger.debug("data: %s", args)
    params['data'] = args
    params['method'] = 'POST'
    params['url'] = search_url
    params['headers']['Origin'] = base_url
    params['headers']['Referer'] = base_url + '/'
    # is the Accept header needed?
    # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

    return params


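# A sketch of _parse_published_date()'s behavior (values illustrative):
#
#   _parse_published_date("2 Sep 2014 ... A snippet.")
#     --> ("A snippet.", datetime(2014, 9, 2, 0, 0))
#   _parse_published_date("5 days ago ... A snippet.")
#     --> ("A snippet.", datetime.now() - timedelta(days=5))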
def _parse_published_date(content: str) -> tuple[str, datetime | None]:
    published_date = None

    # check if search result starts with something like: "2 Sep 2014 ... "
    if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
        date_pos = content.find('...') + 4
        date_string = content[0 : date_pos - 5]
        # fix content string
        content = content[date_pos:]

        try:
            published_date = dateutil.parser.parse(date_string, dayfirst=True)
        except ValueError:
            pass

    # check if search result starts with something like: "5 days ago ... "
    elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
        date_pos = content.find('...') + 4
        date_string = content[0 : date_pos - 5]

        # calculate datetime
        published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore

        # fix content string
        content = content[date_pos:]

    return content, published_date


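# The _get_*_result helpers below map one raw item of Startpage's SERP payload
# (a dict, see response()) onto a SearXNG result dict; the keys accessed
# ('clickUrl', 'description', 'rawImageUrl', ..) are the ones observed in that
# payload.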
def _get_web_result(result):
    content = html_to_text(result.get('description'))
    content, publishedDate = _parse_published_date(content)

    return {
        'url': result['clickUrl'],
        'title': html_to_text(result['title']),
        'content': content,
        'publishedDate': publishedDate,
    }


def _get_news_result(result):

    title = remove_pua_from_str(html_to_text(result['title']))
    content = remove_pua_from_str(html_to_text(result.get('description')))

    publishedDate = None
    if result.get('date'):
        publishedDate = datetime.fromtimestamp(result['date'] / 1000)

    thumbnailUrl = None
    if result.get('thumbnailUrl'):
        thumbnailUrl = base_url + result['thumbnailUrl']

    return {
        'url': result['clickUrl'],
        'title': title,
        'content': content,
        'publishedDate': publishedDate,
        'thumbnail': thumbnailUrl,
    }


def _get_image_result(result) -> dict[str, Any] | None:
    url = result.get('altClickUrl')
    if not url:
        return None

    thumbnailUrl = None
    if result.get('thumbnailUrl'):
        thumbnailUrl = base_url + result['thumbnailUrl']

    resolution = None
    if result.get('width') and result.get('height'):
        resolution = f"{result['width']}x{result['height']}"

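    # Startpage reports the size as a string like "500 kB" (illustrative);
    # keep only the digits and let humanize_bytes() format the byte count.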
    filesize = None
    if result.get('filesize'):
        size_str = ''.join(filter(str.isdigit, result['filesize']))
        filesize = humanize_bytes(int(size_str))

    return {
        'template': 'images.html',
        'url': url,
        'title': html_to_text(result['title']),
        'content': '',
        'img_src': result.get('rawImageUrl'),
        'thumbnail_src': thumbnailUrl,
        'resolution': resolution,
        'img_format': result.get('format'),
        'filesize': filesize,
    }


def response(resp):
    categ = startpage_categ.capitalize()
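    # The SERP embeds its payload as the argument object of a JS call, roughly
    # (sketch): React.createElement(UIStartpage.AppSerpWeb, {.. 'render': ..}})
    # extr() below cuts that JSON object literal out of the HTML.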
    results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
    results_json = loads(results_raw)
    results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})

    results = []
    for results_categ in results_obj.get('mainline', []):
        for item in results_categ.get('results', []):
            if results_categ['display_type'] == 'web-google':
                results.append(_get_web_result(item))
            elif results_categ['display_type'] == 'news-bing':
                results.append(_get_news_result(item))
            elif 'images' in results_categ['display_type']:
                item = _get_image_result(item)
                if item:
                    results.append(item)

    return results


def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    regions>` from Startpage."""
    # pylint: disable=too-many-branches

    headers = {
        'User-Agent': gen_useragent(),
        'Accept-Language': "en-US,en;q=0.5",  # to get uniform names independent of the IP (see module docstring)
    }
    resp = get('https://www.startpage.com/do/settings', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Startpage is not OK.")

    dom = lxml.html.fromstring(resp.text)  # type: ignore

    # regions

    sp_region_names = []
    for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'):
        sp_region_names.append(option.get('value'))

    for eng_tag in sp_region_names:
        if eng_tag == 'all':
            continue
        babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag)  # norway

        if '-' in babel_region_tag:
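            # e.g. 'pt-BR_BR': l = 'pt', r = 'BR_BR' --> 'BR' --> babel 'pt_BR'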
            l, r = babel_region_tag.split('-')
            r = r.split('_')[-1]
            sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))

        else:
            try:
                sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_'))

            except babel.UnknownLocaleError:
                print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
                continue

        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.regions[sxng_tag] = eng_tag

    # languages

    catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}

    # get the native name of every language known by babel

    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
        native_name = babel.Locale(lang_code).get_language_name()
        if not native_name:
            print(f"ERROR: language name of startpage's language {lang_code} is unknown by babel")
            continue
        native_name = native_name.lower()
        # add native name exactly as it is
        catalog_engine2code[native_name] = lang_code

        # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if result is ascii (otherwise "normalization" didn't work)
            catalog_engine2code[unaccented_name] = lang_code

    # values that can't be determined by babel's languages names

    catalog_engine2code.update(
        {
            # traditional chinese used in ..
            'fantizhengwen': 'zh_Hant',
            # Korean alphabet
            'hangul': 'ko',
            # Malayalam is one of 22 scheduled languages of India.
            'malayam': 'ml',
            'norsk': 'nb',
            'sinhalese': 'si',
        }
    )

    skip_eng_tags = {
        'english_uk',  # SearXNG lang 'en' already maps to 'english'
    }

    for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):

        eng_tag = option.get('value')
        if eng_tag in skip_eng_tags:
            continue
        name = extract_text(option).lower()  # type: ignore

        sxng_tag = catalog_engine2code.get(eng_tag)
        if sxng_tag is None:
            sxng_tag = catalog_engine2code[name]

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag