.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
startpage.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Startpage's language & region selectors are a mess ..
3
4.. _startpage regions:
5
6Startpage regions
7=================
8
9In the list of regions there are tags we need to map to common region tags::
10
11 pt-BR_BR --> pt_BR
12 zh-CN_CN --> zh_Hans_CN
13 zh-TW_TW --> zh_Hant_TW
14 zh-TW_HK --> zh_Hant_HK
15 en-GB_GB --> en_GB
16
17and there is at least one tag with a three letter language tag (ISO 639-2)::
18
19 fil_PH --> fil_PH
20
21The locale code ``no_NO`` from Startpage does not exist and is mapped to
22``nb-NO``::
23
24 babel.core.UnknownLocaleError: unknown locale 'no_NO'
25
26For reference see languages-subtag at iana; ``no`` is the macrolanguage [1]_ and
27W3C recommends subtag over macrolanguage [2]_.
28
29.. [1] `iana: language-subtag-registry
30 <https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry>`_ ::
31
32 type: language
33 Subtag: nb
34 Description: Norwegian Bokmål
35 Added: 2005-10-16
36 Suppress-Script: Latn
37 Macrolanguage: no
38
39.. [2]
40 Use macrolanguages with care. Some language subtags have a Scope field set to
41 macrolanguage, i.e. this primary language subtag encompasses a number of more
42 specific primary language subtags in the registry. ... As we recommended for
43 the collection subtags mentioned above, in most cases you should try to use
44 the more specific subtags ... `W3: The primary language subtag
45 <https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag>`_
46
47.. _startpage languages:
48
49Startpage languages
50===================
51
52:py:obj:`send_accept_language_header`:
53 The displayed name in Startpage's settings page depends on the location of the
54 IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits`
55 we use::
56
57 'Accept-Language': "en-US,en;q=0.5",
58 ..
59
60 to get uniform names independent from the IP.
61
62.. _startpage categories:
63
64Startpage categories
65====================
66
67Startpage's category (for Web-search, News, Videos, ..) is set by
68:py:obj:`startpage_categ` in settings.yml::
69
70 - name: startpage
71 engine: startpage
72 startpage_categ: web
73 ...
74
75.. hint::
76
77 Supported categories are ``web``, ``news`` and ``images``.
78
79"""
80# pylint: disable=too-many-statements
81
82import typing as t
83
84from collections import OrderedDict
85import re
86from unicodedata import normalize, combining
87from datetime import datetime, timedelta
88from json import loads
89
90import dateutil.parser
91import lxml.html
92import babel.localedata
93
94from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
95from searx.network import get # see https://github.com/searxng/searxng/issues/762
96from searx.exceptions import SearxEngineCaptchaException
97from searx.locales import region_tag
98from searx.enginelib.traits import EngineTraits
99from searx.enginelib import EngineCache
100
# about
about = {
    "website": 'https://startpage.com',
    "wikidata_id": 'Q2333295',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""

send_accept_language_header = True
"""Startpage tries to guess user's language and territory from the HTTP
``Accept-Language``.  Optionally the user can select a search-language (can be
different to the UI language) and a region filter.
"""

# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 18
"""Tested 18 pages maximum (argument ``page``), to be safe max is set to 18."""

time_range_support = True
safesearch = True

# map SearXNG's time-range / safesearch values to Startpage's request arguments
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
safesearch_dict = {0: '0', 1: '1', 2: '1'}

# search-url
base_url = 'https://www.startpage.com'
search_url = base_url + '/sp/search'

# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form

.. code: html

    <form action="/sp/search" method="post">
      <input type="text" name="query" value="" ..>
      <input type="hidden" name="t" value="device">
      <input type="hidden" name="lui" value="english">
      <input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
      <input type="hidden" name="cat" value="web">
      <input type="hidden" class="abp" id="abp-input" name="abp" value="1">
    </form>
"""


CACHE: EngineCache
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
seconds."""
159
160
def init(_):
    """Engine setup hook: create the module scoped :py:obj:`CACHE`."""
    global CACHE  # pylint: disable=global-statement

    # hint: all three startpage engines (WEB, Images & News) can/should use the
    # same sc_code ..
    CACHE = EngineCache("startpage")  # type:ignore
167
168
sc_code_cache_sec = 3600
"""Time in seconds the sc-code is cached, compare :py:obj:`get_sc_code`."""
171
172
def get_sc_code(searxng_locale, params):
    """Get an actual ``sc`` argument from Startpage's search form (HTML page).

    Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
    <search_form_xpath>`.  Without this argument Startpage considers the request
    is from a bot.  We do not know what is encoded in the value of the ``sc``
    argument, but it seems to be a kind of a *timestamp*.

    Startpage's search form generates a new sc-code on each request.  This
    function scrapes a new sc-code from Startpage's home page every
    :py:obj:`sc_code_cache_sec` seconds.

    :raises SearxEngineCaptchaException: when Startpage redirects to its
        CAPTCHA page, or when no ``sc`` value can be scraped from the form.
    """

    sc_code = CACHE.get("SC_CODE")

    if sc_code:
        logger.debug("get_sc_code: using cached value: %s", sc_code)
        return sc_code

    headers = {**params['headers']}

    # add Accept-Language header
    if searxng_locale == 'all':
        searxng_locale = 'en-US'
    locale = babel.Locale.parse(searxng_locale, sep='-')

    if send_accept_language_header:
        ac_lang = locale.language
        if locale.territory:
            ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
                locale.language,
                locale.territory,
                locale.language,
            )
        headers['Accept-Language'] = ac_lang

    get_sc_url = base_url + '/'
    logger.debug("get_sc_code: querying new sc timestamp @ %s", get_sc_url)
    logger.debug("get_sc_code: request headers: %s", headers)
    resp = get(get_sc_url, headers=headers)

    # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
    # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
    # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21

    # a redirect to the CAPTCHA page means Startpage detected us as a bot
    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):  # type: ignore
        raise SearxEngineCaptchaException(
            message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
        )

    dom = lxml.html.fromstring(resp.text)  # type: ignore

    try:
        sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
    except IndexError as exc:
        logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
        raise SearxEngineCaptchaException(
            message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url,  # type: ignore
        ) from exc

    sc_code = str(sc_code)
    logger.debug("get_sc_code: new value is: %s", sc_code)
    CACHE.set(key="SC_CODE", value=sc_code, expire=sc_code_cache_sec)
    return sc_code
236
237
def request(query, params):
    """Assemble a Startpage request.

    To avoid CAPTCHAs the HTTP POST request has to mirror the request built by
    Startpage's own search form:

    - the **region** filter is transported in the ``preferences`` cookie
    - the **language** is transported in the HTTP POST data

    The remaining POST arguments correspond to the ``<input>`` elements of the
    search form, compare :py:obj:`search_form_xpath`.
    """
    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    params['headers']['Origin'] = base_url
    params['headers']['Referer'] = base_url + '/'

    # POST data, mirroring the fields of Startpage's search form
    form = {
        'query': query,
        'cat': startpage_categ,
        't': 'device',
        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers
        'with_date': time_range_dict.get(params['time_range'], ''),
        'abp': '1',
        'abd': '1',
        'abe': '1',
    }

    if engine_language:
        form['language'] = engine_language
        form['lui'] = engine_language

    if params['pageno'] > 1:
        form['page'] = params['pageno']
        form['segment'] = 'startpage.udog'

    # the "preferences" cookie: Startpage serializes it as keyEEEvalue pairs
    # joined by N1N (see the join below)
    prefs = OrderedDict()
    prefs['date_time'] = 'world'
    prefs['disable_family_filter'] = safesearch_dict[params['safesearch']]
    prefs['disable_open_in_new_window'] = '0'
    prefs['enable_post_method'] = '1'  # hint: POST
    prefs['enable_proxy_safety_suggest'] = '1'
    prefs['enable_stay_control'] = '1'
    prefs['instant_answers'] = '1'
    prefs['lang_homepage'] = 's/device/%s/' % 'en'
    prefs['num_of_results'] = '10'
    prefs['suggestions'] = '1'
    prefs['wt_unit'] = 'celsius'

    if engine_language:
        prefs['language'] = engine_language
        prefs['language_ui'] = engine_language

    if engine_region:
        prefs['search_results_region'] = engine_region

    params['cookies']['preferences'] = 'N1N'.join("%sEEE%s" % item for item in prefs.items())
    logger.debug('cookie preferences: %s', params['cookies']['preferences'])

    logger.debug("data: %s", form)
    params['data'] = form
    params['method'] = 'POST'
    params['url'] = search_url

    return params
308
309
310def _parse_published_date(content: str) -> tuple[str, datetime | None]:
311 published_date = None
312
313 # check if search result starts with something like: "2 Sep 2014 ... "
314 if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
315 date_pos = content.find('...') + 4
316 date_string = content[0 : date_pos - 5]
317 # fix content string
318 content = content[date_pos:]
319
320 try:
321 published_date = dateutil.parser.parse(date_string, dayfirst=True)
322 except ValueError:
323 pass
324
325 # check if search result starts with something like: "5 days ago ... "
326 elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
327 date_pos = content.find('...') + 4
328 date_string = content[0 : date_pos - 5]
329
330 # calculate datetime
331 published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) # type: ignore
332
333 # fix content string
334 content = content[date_pos:]
335
336 return content, published_date
337
338
def _get_web_result(result):
    """Build a web result from one item of Startpage's ``web-google`` category."""
    text, published = _parse_published_date(html_to_text(result.get('description')))
    return {
        'url': result['clickUrl'],
        'title': html_to_text(result['title']),
        'content': text,
        'publishedDate': published,
    }
349
350
def _get_news_result(result):
    """Build a news result from one item of Startpage's ``news-bing`` category."""

    title = remove_pua_from_str(html_to_text(result['title']))
    content = remove_pua_from_str(html_to_text(result.get('description')))

    publishedDate = None
    if result.get('date'):
        # Startpage reports a JavaScript timestamp (milliseconds since epoch)
        publishedDate = datetime.fromtimestamp(result['date'] / 1000)

    thumbnailUrl = None
    if result.get('thumbnailUrl'):
        # thumbnail URLs are relative to Startpage's origin
        thumbnailUrl = base_url + result['thumbnailUrl']

    return {
        'url': result['clickUrl'],
        'title': title,
        'content': content,
        'publishedDate': publishedDate,
        'thumbnail': thumbnailUrl,
    }
371
372
def _get_image_result(result) -> dict[str, t.Any] | None:
    """Build an image result from one item of Startpage's images category.

    Returns ``None`` when the item carries no ``altClickUrl`` (nothing to
    link to).
    """
    url = result.get('altClickUrl')
    if not url:
        return None

    thumbnail = None
    if result.get('thumbnailUrl'):
        # thumbnail URLs are relative to Startpage's origin
        thumbnail = base_url + result['thumbnailUrl']

    width = result.get('width')
    height = result.get('height')
    resolution = f"{width}x{height}" if width and height else None

    filesize = None
    if result.get('filesize'):
        # keep only the digits of e.g. "123 kB" before humanizing
        digits = ''.join(ch for ch in result['filesize'] if ch.isdigit())
        filesize = humanize_bytes(int(digits))

    return {
        'template': 'images.html',
        'url': url,
        'title': html_to_text(result['title']),
        'content': '',
        'img_src': result.get('rawImageUrl'),
        'thumbnail_src': thumbnail,
        'resolution': resolution,
        'img_format': result.get('format'),
        'filesize': filesize,
    }
402
403
def response(resp):
    """Parse Startpage's SERP: the result payload is a JSON blob embedded in a
    ``React.createElement(..)`` call inside the HTML page."""
    categ = startpage_categ.capitalize()
    marker = f"React.createElement(UIStartpage.AppSerp{categ}, {{"
    blob = '{' + extr(resp.text, marker, '}})') + '}}'
    regions = loads(blob).get('render', {}).get('presenter', {}).get('regions', {})

    results = []
    for section in regions.get('mainline', []):
        display_type = section['display_type']
        for item in section.get('results', []):
            if display_type == 'web-google':
                results.append(_get_web_result(item))
            elif display_type == 'news-bing':
                results.append(_get_news_result(item))
            elif 'images' in display_type:
                image = _get_image_result(item)
                if image:
                    results.append(image)

    return results
423
424
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    regions>` from Startpage."""
    # pylint: disable=too-many-branches

    headers = {
        'User-Agent': gen_useragent(),
        'Accept-Language': "en-US,en;q=0.5",  # bing needs to set the English language
    }
    resp = get('https://www.startpage.com/do/settings', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Startpage is not OK.")

    dom = lxml.html.fromstring(resp.text)  # type: ignore

    # regions

    # scrape the region tags from the <select name="search_results_region">
    sp_region_names = []
    for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'):
        sp_region_names.append(option.get('value'))

    for eng_tag in sp_region_names:
        if eng_tag == 'all':
            continue
        # babel does not know Startpage's 'no_NO', map it to 'nb_NO' (norway)
        babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag)  # norway

        if '-' in babel_region_tag:
            # tags like "pt-BR_BR" / "zh-TW_HK": language part before '-',
            # territory is the last '_'-separated element (see module docstring)
            l, r = babel_region_tag.split('-')
            r = r.split('_')[-1]
            sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))

        else:
            try:
                sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_'))

            except babel.UnknownLocaleError:
                print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
                continue

        # first mapping wins; report genuinely conflicting duplicates
        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.regions[sxng_tag] = eng_tag

    # languages

    # map babel's English language names (lower-cased) to language codes
    catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}

    # get the native name of every language known by babel

    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
        native_name = babel.Locale(lang_code).get_language_name()
        if not native_name:
            print(f"ERROR: language name of startpage's language {lang_code} is unknown by babel")
            continue
        native_name = native_name.lower()
        # add native name exactly as it is
        catalog_engine2code[native_name] = lang_code

        # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if result is ascii (otherwise "normalization" didn't work)
            catalog_engine2code[unaccented_name] = lang_code

    # values that can't be determined by babel's languages names

    catalog_engine2code.update(
        {
            # traditional chinese used in ..
            'fantizhengwen': 'zh_Hant',
            # Korean alphabet
            'hangul': 'ko',
            # Malayalam is one of 22 scheduled languages of India.
            'malayam': 'ml',
            'norsk': 'nb',
            'sinhalese': 'si',
        }
    )

    skip_eng_tags = {
        'english_uk',  # SearXNG lang 'en' already maps to 'english'
    }

    # scrape the language tags from the <select name="language">
    for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):

        eng_tag = option.get('value')
        if eng_tag in skip_eng_tags:
            continue
        name = extract_text(option).lower()  # type: ignore

        # resolve via the option's value first, fall back to its display name
        sxng_tag = catalog_engine2code.get(eng_tag)
        if sxng_tag is None:
            sxng_tag = catalog_engine2code[name]

        # first mapping wins; report genuinely conflicting duplicates
        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag
get_sc_code(searxng_locale, params)
Definition startpage.py:173
dict[str, t.Any]|None _get_image_result(result)
Definition startpage.py:373
fetch_traits(EngineTraits engine_traits)
Definition startpage.py:425
request(query, params)
Definition startpage.py:238
tuple[str, datetime|None] _parse_published_date(str content)
Definition startpage.py:310