startpage.py
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Startpage's language & region selectors are a mess ..

.. _startpage regions:

Startpage regions
=================

In the list of regions there are tags we need to map to common region tags::

    pt-BR_BR --> pt_BR
    zh-CN_CN --> zh_Hans_CN
    zh-TW_TW --> zh_Hant_TW
    zh-TW_HK --> zh_Hant_HK
    en-GB_GB --> en_GB

and there is at least one tag with a three letter language tag (ISO 639-2)::

    fil_PH --> fil_PH
The locale code ``no_NO`` from Startpage does not exist and is mapped to
``nb-NO``::

    babel.core.UnknownLocaleError: unknown locale 'no_NO'

For reference see the language-subtag-registry at IANA; ``no`` is the
macrolanguage [1]_ and the W3C recommends the more specific subtag over the
macrolanguage [2]_.

.. [1] `iana: language-subtag-registry
   <https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry>`_ ::

     type: language
     Subtag: nb
     Description: Norwegian Bokmål
     Added: 2005-10-16
     Suppress-Script: Latn
     Macrolanguage: no

.. [2]
   Use macrolanguages with care. Some language subtags have a Scope field set to
   macrolanguage, i.e. this primary language subtag encompasses a number of more
   specific primary language subtags in the registry. ... As we recommended for
   the collection subtags mentioned above, in most cases you should try to use
   the more specific subtags ... `W3: The primary language subtag
   <https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag>`_
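
An illustrative check with babel (not part of the engine code)::

    import babel
    babel.Locale.parse('nb_NO', sep='_')  # --> Locale('nb', territory='NO')
    babel.Locale.parse('no_NO', sep='_')  # --> raises babel.core.UnknownLocaleError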

.. _startpage languages:

Startpage languages
===================

:py:obj:`send_accept_language_header`:
    The displayed name in Startpage's settings page depends on the location of
    the IP when the ``Accept-Language`` HTTP header is unset. In
    :py:obj:`fetch_traits` we use::

        'Accept-Language': "en-US,en;q=0.5",
        ..

    to get uniform names, independent of the IP.

.. _startpage categories:

Startpage categories
====================

Startpage's category (for Web-search, News, Videos, ..) is set by
:py:obj:`startpage_categ` in settings.yml::

  - name: startpage
    engine: startpage
    startpage_categ: web
    ...

.. hint::

   Supported categories are ``web``, ``news`` and ``images``.

"""
# pylint: disable=too-many-statements
from __future__ import annotations

from typing import TYPE_CHECKING, Any
from collections import OrderedDict
import re
from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta
from json import loads

import dateutil.parser
import lxml.html
import babel.localedata

from searx.utils import extr, extract_text, eval_xpath, gen_useragent, html_to_text, humanize_bytes, remove_pua_from_str
from searx.network import get  # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import region_tag
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
about = {
    "website": 'https://startpage.com',
    "wikidata_id": 'Q2333295',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""

send_accept_language_header = True
"""Startpage tries to guess the user's language and territory from the HTTP
``Accept-Language`` header. Optionally, the user can select a search language
(which can differ from the UI language) and a region filter.
"""

# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 18
"""Tested 18 pages maximum (argument ``page``); to be safe, the maximum is set
to 18."""

time_range_support = True
safesearch = True

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
safesearch_dict = {0: '0', 1: '1', 2: '1'}
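# Note: Startpage's family filter has only two states (off / on), therefore
# SearXNG's safe-search levels 1 (moderate) and 2 (strict) both map to '1'.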

# search-url
base_url = 'https://www.startpage.com'
search_url = base_url + '/sp/search'

# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form

.. code:: html

    <form action="/sp/search" method="post">
      <input type="text" name="query" value="" ..>
      <input type="hidden" name="t" value="device">
      <input type="hidden" name="lui" value="english">
      <input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
      <input type="hidden" name="cat" value="web">
      <input type="hidden" class="abp" id="abp-input" name="abp" value="1">
    </form>
"""

# timestamp of the last fetch of the 'sc' code
sc_code_ts = 0
sc_code = ''
sc_code_cache_sec = 30
"""Time in seconds the sc-code is cached in memory (see :py:obj:`get_sc_code`)."""


def get_sc_code(searxng_locale, params):
    """Get an actual ``sc`` argument from Startpage's search form (HTML page).

    Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
    <search_form_xpath>`. Without this argument Startpage considers the request
    to be from a bot. We do not know what is encoded in the value of the ``sc``
    argument, but it seems to be a kind of *time-stamp*.

    Startpage's search form generates a new sc-code on each request. This
    function scrapes a new sc-code from Startpage's home page every
    :py:obj:`sc_code_cache_sec` seconds.

    """

    global sc_code_ts, sc_code  # pylint: disable=global-statement

    if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)):
        logger.debug("get_sc_code: reuse '%s'", sc_code)
        return sc_code

    headers = {**params['headers']}
    headers['Origin'] = base_url
    headers['Referer'] = base_url + '/'
    # headers['Connection'] = 'keep-alive'
    # headers['Accept-Encoding'] = 'gzip, deflate, br'
    # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
    # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'

    # add Accept-Language header
    if searxng_locale == 'all':
        searxng_locale = 'en-US'
    locale = babel.Locale.parse(searxng_locale, sep='-')

    if send_accept_language_header:
        ac_lang = locale.language
        if locale.territory:
            ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
                locale.language,
                locale.territory,
                locale.language,
            )
        headers['Accept-Language'] = ac_lang
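        # e.g. searxng_locale 'fr-BE' sets: Accept-Language: fr-BE,fr;q=0.9,*;q=0.5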

    get_sc_url = base_url + '/?sc=%s' % (sc_code)
    logger.debug("query new sc time-stamp ... %s", get_sc_url)
    logger.debug("headers: %s", headers)
    resp = get(get_sc_url, headers=headers)

    # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
    # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
    # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21

    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):  # type: ignore
        raise SearxEngineCaptchaException(
            message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
        )

    dom = lxml.html.fromstring(resp.text)  # type: ignore

    try:
        sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
    except IndexError as exc:
        logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
        raise SearxEngineCaptchaException(
            message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,  # type: ignore
        ) from exc

    sc_code_ts = time()
    logger.debug("get_sc_code: new value is: %s", sc_code)
    return sc_code


def request(query, params):
    """Assemble a Startpage request.

    To avoid a CAPTCHA we need to send a well formed HTTP POST request with a
    cookie. We need to form a request that is identical to the request built
    by Startpage's search form:

    - in the cookie the **region** is selected
    - in the HTTP POST data the **language** is selected

    Additionally the arguments from Startpage's search form need to be set in
    the HTTP POST data; compare the ``<input>`` elements in
    :py:obj:`search_form_xpath`.
    """
    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    # build arguments
    args = {
        'query': query,
        'cat': startpage_categ,
        't': 'device',
        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers
        'with_date': time_range_dict.get(params['time_range'], ''),
    }

    if engine_language:
        args['language'] = engine_language
        args['lui'] = engine_language

    args['abp'] = '1'
    if params['pageno'] > 1:
        args['page'] = params['pageno']

    # build cookie
    lang_homepage = 'en'
    cookie = OrderedDict()
    cookie['date_time'] = 'world'
    cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
    cookie['disable_open_in_new_window'] = '0'
    cookie['enable_post_method'] = '1'  # hint: POST
    cookie['enable_proxy_safety_suggest'] = '1'
    cookie['enable_stay_control'] = '1'
    cookie['instant_answers'] = '1'
    cookie['lang_homepage'] = 's/device/%s/' % lang_homepage
    cookie['num_of_results'] = '10'
    cookie['suggestions'] = '1'
    cookie['wt_unit'] = 'celsius'

    if engine_language:
        cookie['language'] = engine_language
        cookie['language_ui'] = engine_language

    if engine_region:
        cookie['search_results_region'] = engine_region

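    # Startpage serializes its preferences cookie as key/value pairs glued by
    # "EEE", with the pairs separated by "N1N", e.g.
    # "date_timeEEEworldN1Ndisable_family_filterEEE0N1N..."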
    params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
    logger.debug('cookie preferences: %s', params['cookies']['preferences'])

    # POST request
    logger.debug("data: %s", args)
    params['data'] = args
    params['method'] = 'POST'
    params['url'] = search_url
    params['headers']['Origin'] = base_url
    params['headers']['Referer'] = base_url + '/'
    # is the Accept header needed?
    # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

    return params


def _parse_published_date(content: str) -> tuple[str, datetime | None]:
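    """Strip a leading date from *content* (e.g. "2 Sep 2014 ... " or
    "5 days ago ... ") and return the remaining content together with the
    parsed datetime (or ``None`` if no date is found)."""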
    published_date = None

    # check if search result starts with something like: "2 Sep 2014 ... "
    if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
        date_pos = content.find('...') + 4
        date_string = content[0 : date_pos - 5]
        # fix content string
        content = content[date_pos:]

        try:
            published_date = dateutil.parser.parse(date_string, dayfirst=True)
        except ValueError:
            pass

    # check if search result starts with something like: "5 days ago ... "
    elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
        date_pos = content.find('...') + 4
        date_string = content[0 : date_pos - 5]

        # calculate datetime
        published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore

        # fix content string
        content = content[date_pos:]

    return content, published_date


def _get_web_result(result):
    content = html_to_text(result.get('description'))
    content, publishedDate = _parse_published_date(content)

    return {
        'url': result['clickUrl'],
        'title': html_to_text(result['title']),
        'content': content,
        'publishedDate': publishedDate,
    }


def _get_news_result(result):

    title = remove_pua_from_str(html_to_text(result['title']))
    content = remove_pua_from_str(html_to_text(result.get('description')))

    publishedDate = None
    if result.get('date'):
        publishedDate = datetime.fromtimestamp(result['date'] / 1000)

    thumbnailUrl = None
    if result.get('thumbnailUrl'):
        thumbnailUrl = base_url + result['thumbnailUrl']

    return {
        'url': result['clickUrl'],
        'title': title,
        'content': content,
        'publishedDate': publishedDate,
        'thumbnail': thumbnailUrl,
    }


def _get_image_result(result) -> dict[str, Any] | None:
    url = result.get('altClickUrl')
    if not url:
        return None

    thumbnailUrl = None
    if result.get('thumbnailUrl'):
        thumbnailUrl = base_url + result['thumbnailUrl']

    resolution = None
    if result.get('width') and result.get('height'):
        resolution = f"{result['width']}x{result['height']}"

    filesize = None
    if result.get('filesize'):
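        # the 'filesize' value may contain non-digit characters; keep only the
        # digits before converting to an int for humanize_bytes()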
        size_str = ''.join(filter(str.isdigit, result['filesize']))
        filesize = humanize_bytes(int(size_str))

    return {
        'template': 'images.html',
        'url': url,
        'title': html_to_text(result['title']),
        'content': '',
        'img_src': result.get('rawImageUrl'),
        'thumbnail_src': thumbnailUrl,
        'resolution': resolution,
        'img_format': result.get('format'),
        'filesize': filesize,
    }


def response(resp):
    categ = startpage_categ.capitalize()
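    # The SERP embeds its data as the JSON argument of a JavaScript
    # React.createElement(UIStartpage.AppSerp<categ>, {...}) call; extr() cuts
    # this JSON object out of the HTML source.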
    results_raw = '{' + extr(resp.text, f"React.createElement(UIStartpage.AppSerp{categ}, {{", '}})') + '}}'
    results_json = loads(results_raw)
    results_obj = results_json.get('render', {}).get('presenter', {}).get('regions', {})

    results = []
    for results_categ in results_obj.get('mainline', []):
        for item in results_categ.get('results', []):
            if results_categ['display_type'] == 'web-google':
                results.append(_get_web_result(item))
            elif results_categ['display_type'] == 'news-bing':
                results.append(_get_news_result(item))
            elif 'images' in results_categ['display_type']:
                item = _get_image_result(item)
                if item:
                    results.append(item)

    return results


def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    regions>` from Startpage."""
    # pylint: disable=too-many-branches

    headers = {
        'User-Agent': gen_useragent(),
        'Accept-Language': "en-US,en;q=0.5",  # get the English names, independent of the IP
    }
    resp = get('https://www.startpage.com/do/settings', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Startpage is not OK.")

    dom = lxml.html.fromstring(resp.text)  # type: ignore

    # regions

    sp_region_names = []
    for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'):
        sp_region_names.append(option.get('value'))

    for eng_tag in sp_region_names:
        if eng_tag == 'all':
            continue
        babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag)  # Norway

        if '-' in babel_region_tag:
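            # e.g. 'zh-TW_HK': take the language from the first part ('zh') and
            # the territory from the part after the underscore ('HK')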
            l, r = babel_region_tag.split('-')
            r = r.split('_')[-1]
            sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))

        else:
            try:
                sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_'))

            except babel.UnknownLocaleError:
                print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
                continue

        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.regions[sxng_tag] = eng_tag

    # languages

    catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}

    # get the native name of every language known by babel

    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
        native_name = babel.Locale(lang_code).get_language_name()
        if not native_name:
            print(f"ERROR: language name of startpage's language {lang_code} is unknown by babel")
            continue
        native_name = native_name.lower()
        # add native name exactly as it is
        catalog_engine2code[native_name] = lang_code

        # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if the result is ASCII (otherwise the "normalization" didn't work)
            catalog_engine2code[unaccented_name] = lang_code

    # values that can't be determined by babel's languages names

    catalog_engine2code.update(
        {
            # traditional chinese used in ..
            'fantizhengwen': 'zh_Hant',
            # Korean alphabet
            'hangul': 'ko',
            # Malayalam is one of 22 scheduled languages of India.
            'malayam': 'ml',
            'norsk': 'nb',
            'sinhalese': 'si',
        }
    )

    skip_eng_tags = {
        'english_uk',  # SearXNG lang 'en' already maps to 'english'
    }

    for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):

        eng_tag = option.get('value')
        if eng_tag in skip_eng_tags:
            continue
        name = extract_text(option).lower()  # type: ignore

        sxng_tag = catalog_engine2code.get(eng_tag)
        if sxng_tag is None:
            sxng_tag = catalog_engine2code[name]

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag