.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
startpage.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Startpage's language & region selectors are a mess ..
3
4.. _startpage regions:
5
6Startpage regions
7=================
8
9In the list of regions there are tags we need to map to common region tags::
10
11 pt-BR_BR --> pt_BR
12 zh-CN_CN --> zh_Hans_CN
13 zh-TW_TW --> zh_Hant_TW
14 zh-TW_HK --> zh_Hant_HK
15 en-GB_GB --> en_GB
16
17and there is at least one tag with a three letter language tag (ISO 639-2)::
18
19 fil_PH --> fil_PH
20
The locale code ``no_NO`` from Startpage does not exist and is mapped to
``nb-NO``::
23
24 babel.core.UnknownLocaleError: unknown locale 'no_NO'
25
26For reference see languages-subtag at iana; ``no`` is the macrolanguage [1]_ and
27W3C recommends subtag over macrolanguage [2]_.
28
29.. [1] `iana: language-subtag-registry
30 <https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry>`_ ::
31
32 type: language
33 Subtag: nb
34 Description: Norwegian Bokmål
35 Added: 2005-10-16
36 Suppress-Script: Latn
37 Macrolanguage: no
38
39.. [2]
40 Use macrolanguages with care. Some language subtags have a Scope field set to
41 macrolanguage, i.e. this primary language subtag encompasses a number of more
42 specific primary language subtags in the registry. ... As we recommended for
43 the collection subtags mentioned above, in most cases you should try to use
44 the more specific subtags ... `W3: The primary language subtag
45 <https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag>`_
46
47.. _startpage languages:
48
49Startpage languages
50===================
51
:py:obj:`send_accept_language_header`:
    The displayed name in Startpage's settings page depends on the location of
    the IP when the ``Accept-Language`` HTTP header is unset.  In
    :py:obj:`fetch_traits` we use::

        'Accept-Language': "en-US,en;q=0.5",
        ..

    to get uniform names independent from the IP.
61
62.. _startpage categories:
63
64Startpage categories
65====================
66
67Startpage's category (for Web-search, News, Videos, ..) is set by
68:py:obj:`startpage_categ` in settings.yml::
69
70 - name: startpage
71 engine: startpage
72 startpage_categ: web
73 ...
74
75.. hint::
76
77 Supported categories are ``web``, ``news`` and ``images``.
78
79"""
80# pylint: disable=too-many-statements
81from __future__ import annotations
82
83from typing import TYPE_CHECKING, Any
84from collections import OrderedDict
85import re
86from unicodedata import normalize, combining
87from datetime import datetime, timedelta
88from json import loads
89
90import dateutil.parser
91import lxml.html
92import babel.localedata
93
94from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
95from searx.network import get # see https://github.com/searxng/searxng/issues/762
96from searx.exceptions import SearxEngineCaptchaException
97from searx.locales import region_tag
98from searx.enginelib.traits import EngineTraits
99from searx.enginelib import EngineCache
100
if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
about = {
    "website": 'https://startpage.com',
    "wikidata_id": 'Q2333295',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""

send_accept_language_header = True
"""Startpage tries to guess user's language and territory from the HTTP
``Accept-Language``.  Optionally the user can select a search-language (can be
different to the UI language) and a region filter.
"""

# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 18
"""Tested 18 pages maximum (argument ``page``), to be safe max is set to 20."""

time_range_support = True
safesearch = True

# maps SearXNG time-range / safesearch values to Startpage's form values
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
safesearch_dict = {0: '0', 1: '1', 2: '1'}

# search-url
base_url = 'https://www.startpage.com'
search_url = base_url + '/sp/search'

# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form

.. code: html

    <form action="/sp/search" method="post">
      <input type="text" name="query" value="" ..>
      <input type="hidden" name="t" value="device">
      <input type="hidden" name="lui" value="english">
      <input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
      <input type="hidden" name="cat" value="web">
      <input type="hidden" class="abp" id="abp-input" name="abp" value="1">
    </form>
"""
161
162
CACHE: EngineCache
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
seconds."""


def init(_):
    """Initialize the persistent key/value cache (engine setup hook)."""
    global CACHE  # pylint: disable=global-statement

    # hint: all three startpage engines (WEB, Images & News) can/should use the
    # same sc_code ..
    CACHE = EngineCache("startpage")  # type:ignore


sc_code_cache_sec = 3600
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
178
179
def get_sc_code(searxng_locale, params):
    """Get an actual ``sc`` argument from Startpage's search form (HTML page).

    Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
    <search_form_xpath>`.  Without this argument Startpage considers the request
    is from a bot.  We do not know what is encoded in the value of the ``sc``
    argument, but it seems to be a kind of a *timestamp*.

    Startpage's search form generates a new sc-code on each request.  This
    function scrapes a new sc-code from Startpage's home page every
    :py:obj:`sc_code_cache_sec` seconds.

    :raises SearxEngineCaptchaException: when the request is redirected to
        Startpage's CAPTCHA page, or when no ``sc`` input can be found in the
        scraped search form.
    """

    sc_code = CACHE.get("SC_CODE")

    if sc_code:
        logger.debug("get_sc_code: using cached value: %s", sc_code)
        return sc_code

    headers = {**params['headers']}

    # add Accept-Language header
    if searxng_locale == 'all':
        searxng_locale = 'en-US'
    locale = babel.Locale.parse(searxng_locale, sep='-')

    if send_accept_language_header:
        ac_lang = locale.language
        if locale.territory:
            ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
                locale.language,
                locale.territory,
                locale.language,
            )
        headers['Accept-Language'] = ac_lang

    get_sc_url = base_url + '/'
    logger.debug("get_sc_code: querying new sc timestamp @ %s", get_sc_url)
    logger.debug("get_sc_code: request headers: %s", headers)
    resp = get(get_sc_url, headers=headers)

    # a redirect to the CAPTCHA page means Startpage detected us as a bot
    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):  # type: ignore
        raise SearxEngineCaptchaException(
            message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
        )

    dom = lxml.html.fromstring(resp.text)  # type: ignore

    try:
        sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
    except IndexError as exc:
        logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
        raise SearxEngineCaptchaException(
            message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url,  # type: ignore
        ) from exc

    sc_code = str(sc_code)
    logger.debug("get_sc_code: new value is: %s", sc_code)
    CACHE.set(key="SC_CODE", value=sc_code, expire=sc_code_cache_sec)
    return sc_code
243
244
def request(query, params):
    """Assemble a Startpage request.

    To avoid CAPTCHAs we need to send a well formed HTTP POST request with a
    cookie.  The request mirrors the one built by Startpage's own search form:

    - the **region** is selected in the cookie
    - the **language** is selected in the HTTP POST data

    Additionally the arguments from Startpage's search form need to be set in
    the HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
    """
    eng_region = traits.get_region(params['searxng_locale'], 'en-US')
    eng_lang = traits.get_language(params['searxng_locale'], 'en')

    params['headers']['Origin'] = base_url
    params['headers']['Referer'] = base_url + '/'

    # assemble the POST form data (compare the <input> elements of the form)
    form_data = {
        'query': query,
        'cat': startpage_categ,
        't': 'device',
        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers
        'with_date': time_range_dict.get(params['time_range'], ''),
        'abp': '1',
        'abd': '1',
        'abe': '1',
    }

    if eng_lang:
        form_data['language'] = eng_lang
        form_data['lui'] = eng_lang

    if params['pageno'] > 1:
        form_data['page'] = params['pageno']
        form_data['segment'] = 'startpage.udog'

    # assemble the "preferences" cookie
    pref = OrderedDict(
        [
            ('date_time', 'world'),
            ('disable_family_filter', safesearch_dict[params['safesearch']]),
            ('disable_open_in_new_window', '0'),
            ('enable_post_method', '1'),  # hint: POST
            ('enable_proxy_safety_suggest', '1'),
            ('enable_stay_control', '1'),
            ('instant_answers', '1'),
            ('lang_homepage', 's/device/%s/' % 'en'),
            ('num_of_results', '10'),
            ('suggestions', '1'),
            ('wt_unit', 'celsius'),
        ]
    )

    if eng_lang:
        pref['language'] = eng_lang
        pref['language_ui'] = eng_lang

    if eng_region:
        pref['search_results_region'] = eng_region

    params['cookies']['preferences'] = 'N1N'.join("%sEEE%s" % item for item in pref.items())
    logger.debug('cookie preferences: %s', params['cookies']['preferences'])

    logger.debug("data: %s", form_data)
    params['data'] = form_data
    params['method'] = 'POST'
    params['url'] = search_url

    return params
315
316
317def _parse_published_date(content: str) -> tuple[str, datetime | None]:
318 published_date = None
319
320 # check if search result starts with something like: "2 Sep 2014 ... "
321 if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
322 date_pos = content.find('...') + 4
323 date_string = content[0 : date_pos - 5]
324 # fix content string
325 content = content[date_pos:]
326
327 try:
328 published_date = dateutil.parser.parse(date_string, dayfirst=True)
329 except ValueError:
330 pass
331
332 # check if search result starts with something like: "5 days ago ... "
333 elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
334 date_pos = content.find('...') + 4
335 date_string = content[0 : date_pos - 5]
336
337 # calculate datetime
338 published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) # type: ignore
339
340 # fix content string
341 content = content[date_pos:]
342
343 return content, published_date
344
345
def _get_web_result(result):
    """Map one Startpage web-search item onto a SearXNG result dict."""
    # a date prefix may be embedded in the description text
    content, published = _parse_published_date(html_to_text(result.get('description')))

    return {
        'url': result['clickUrl'],
        'title': html_to_text(result['title']),
        'content': content,
        'publishedDate': published,
    }
356
357
def _get_news_result(result):
    """Map one Startpage news item onto a SearXNG result dict.

    Private-use-area characters are stripped from title and description
    (:py:obj:`remove_pua_from_str`).
    """
    title = remove_pua_from_str(html_to_text(result['title']))
    content = remove_pua_from_str(html_to_text(result.get('description')))

    publishedDate = None
    if result.get('date'):
        # 'date' is a timestamp in milliseconds
        publishedDate = datetime.fromtimestamp(result['date'] / 1000)

    thumbnailUrl = None
    if result.get('thumbnailUrl'):
        # thumbnail URLs are relative to Startpage's base URL
        thumbnailUrl = base_url + result['thumbnailUrl']

    return {
        'url': result['clickUrl'],
        'title': title,
        'content': content,
        'publishedDate': publishedDate,
        'thumbnail': thumbnailUrl,
    }
378
379
def _get_image_result(result) -> dict[str, Any] | None:
    """Map one Startpage image item onto a SearXNG result dict.

    Returns ``None`` when the item carries no ``altClickUrl``.
    """
    url = result.get('altClickUrl')
    if not url:
        return None

    thumb = None
    if result.get('thumbnailUrl'):
        # thumbnail URLs are relative to Startpage's base URL
        thumb = base_url + result['thumbnailUrl']

    resolution = None
    if result.get('width') and result.get('height'):
        resolution = f"{result['width']}x{result['height']}"

    filesize = None
    if result.get('filesize'):
        # keep only the digits of strings like "123 kB"
        digits = ''.join(ch for ch in result['filesize'] if ch.isdigit())
        filesize = humanize_bytes(int(digits))

    return {
        'template': 'images.html',
        'url': url,
        'title': html_to_text(result['title']),
        'content': '',
        'img_src': result.get('rawImageUrl'),
        'thumbnail_src': thumb,
        'resolution': resolution,
        'img_format': result.get('format'),
        'filesize': filesize,
    }
409
410
def response(resp):
    """Extract the JSON payload embedded in Startpage's React markup and map
    its ``mainline`` items onto SearXNG result dicts."""
    categ = startpage_categ.capitalize()
    raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
    regions = loads(raw).get('render', {}).get('presenter', {}).get('regions', {})

    results = []
    for section in regions.get('mainline', []):
        display_type = section['display_type']
        for item in section.get('results', []):
            if display_type == 'web-google':
                results.append(_get_web_result(item))
            elif display_type == 'news-bing':
                results.append(_get_news_result(item))
            elif 'images' in display_type:
                mapped = _get_image_result(item)
                if mapped:
                    results.append(mapped)

    return results
430
431
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    regions>` from Startpage."""
    # pylint: disable=too-many-branches

    headers = {
        'User-Agent': gen_useragent(),
        'Accept-Language': "en-US,en;q=0.5",  # bing needs to set the English language
    }
    resp = get('https://www.startpage.com/do/settings', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Startpage is not OK.")

    dom = lxml.html.fromstring(resp.text)  # type: ignore

    # regions

    # collect the raw region values from Startpage's settings form
    sp_region_names = []
    for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'):
        sp_region_names.append(option.get('value'))

    for eng_tag in sp_region_names:
        if eng_tag == 'all':
            continue
        babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag)  # norway

        if '-' in babel_region_tag:
            # tags like "pt-BR_BR": take language before '-' and the last
            # region part after '_' (see :ref:`startpage regions`)
            l, r = babel_region_tag.split('-')
            r = r.split('_')[-1]
            sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))

        else:
            try:
                sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_'))

            except babel.UnknownLocaleError:
                print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
                continue

        # keep the first mapping, report diverging duplicates
        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.regions[sxng_tag] = eng_tag

    # languages

    # map lowercased English language names to babel language codes
    catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}

    # get the native name of every language known by babel

    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
        native_name = babel.Locale(lang_code).get_language_name()
        if not native_name:
            print(f"ERROR: language name of startpage's language {lang_code} is unknown by babel")
            continue
        native_name = native_name.lower()
        # add native name exactly as it is
        catalog_engine2code[native_name] = lang_code

        # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if result is ascii (otherwise "normalization" didn't work)
            catalog_engine2code[unaccented_name] = lang_code

    # values that can't be determined by babel's languages names

    catalog_engine2code.update(
        {
            # traditional chinese used in ..
            'fantizhengwen': 'zh_Hant',
            # Korean alphabet
            'hangul': 'ko',
            # Malayalam is one of 22 scheduled languages of India.
            'malayam': 'ml',
            'norsk': 'nb',
            'sinhalese': 'si',
        }
    )

    skip_eng_tags = {
        'english_uk',  # SearXNG lang 'en' already maps to 'english'
    }

    for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):

        eng_tag = option.get('value')
        if eng_tag in skip_eng_tags:
            continue
        name = extract_text(option).lower()  # type: ignore

        # prefer a lookup by value, fall back to the displayed name
        sxng_tag = catalog_engine2code.get(eng_tag)
        if sxng_tag is None:
            sxng_tag = catalog_engine2code[name]

        # keep the first mapping, report diverging duplicates
        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag
get_sc_code(searxng_locale, params)
Definition startpage.py:180
dict[str, Any]|None _get_image_result(result)
Definition startpage.py:380
fetch_traits(EngineTraits engine_traits)
Definition startpage.py:432
request(query, params)
Definition startpage.py:245
tuple[str, datetime|None] _parse_published_date(str content)
Definition startpage.py:317