# SPDX-License-Identifier: AGPL-3.0-or-later
"""Startpage's language & region selectors are a mess ..

.. _startpage regions:

Startpage regions
=================

In the list of regions there are tags we need to map to common region tags::

    pt-BR_BR --> pt_BR
    zh-CN_CN --> zh_Hans_CN
    zh-TW_TW --> zh_Hant_TW
    zh-TW_HK --> zh_Hant_HK
    en-GB_GB --> en_GB

and there is at least one tag with a three letter language tag (ISO 639-2)::

    fil_PH --> fil_PH

The locale code ``no_NO`` from Startpage does not exist and is mapped to
``nb_NO``::

    babel.core.UnknownLocaleError: unknown locale 'no_NO'

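A minimal check with babel (illustrative only) shows why this mapping is
needed::

    babel.Locale.parse('nb_NO')   # --> Locale('nb', territory='NO')
    babel.Locale.parse('no_NO')   # --> raises babel.core.UnknownLocaleError
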
For reference see languages-subtag at iana; ``no`` is the macrolanguage [1]_ and
W3C recommends subtag over macrolanguage [2]_.

.. [1] `iana: language-subtag-registry
   <https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry>`_ ::

       type: language
       Subtag: nb
       Description: Norwegian Bokmål
       Added: 2005-10-16
       Suppress-Script: Latn
       Macrolanguage: no

.. [2]
   Use macrolanguages with care. Some language subtags have a Scope field set to
   macrolanguage, i.e. this primary language subtag encompasses a number of more
   specific primary language subtags in the registry. ... As we recommended for
   the collection subtags mentioned above, in most cases you should try to use
   the more specific subtags ... `W3: The primary language subtag
   <https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag>`_

.. _startpage languages:

Startpage languages
===================

:py:obj:`send_accept_language_header`:
    The displayed name on Startpage's settings page depends on the location of
    the IP when the ``Accept-Language`` HTTP header is unset.  In
    :py:obj:`fetch_traits` we use::

        'Accept-Language': "en-US,en;q=0.5",
        ..

    to get uniform names independent of the IP location.

.. _startpage categories:

Startpage categories
====================

Startpage's category (for Web-search, News, Videos, ..) is set by
:py:obj:`startpage_categ` in settings.yml::

    - name: startpage
      engine: startpage
      startpage_categ: web
      ...

.. hint::

   The default category is ``web``; categories other than ``web`` are not yet
   implemented.

"""
# pylint: disable=too-many-statements

from typing import TYPE_CHECKING
from collections import OrderedDict
import re
from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta

import dateutil.parser
import lxml.html
import babel.localedata

from searx.utils import extract_text, eval_xpath, gen_useragent
from searx.network import get  # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import region_tag
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
about = {
    "website": 'https://startpage.com',
    "wikidata_id": 'Q2333295',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""

send_accept_language_header = True
"""Startpage tries to guess the user's language and territory from the HTTP
``Accept-Language`` header.  Optionally, the user can select a search language
(which may differ from the UI language) and a region filter.
"""

# engine dependent config
categories = ['general', 'web']
paging = True
max_page = 18
"""Tested 18 pages maximum (argument ``page``); to be safe the maximum is set
to 18."""

time_range_support = True
safesearch = True

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
safesearch_dict = {0: '0', 1: '1', 2: '1'}

# search-url
base_url = 'https://www.startpage.com'
search_url = base_url + '/sp/search'

# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# not ads: div[@class="result"] are the direct children of div[@id="results"]
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's original search form

.. code:: html

    <form action="/sp/search" method="post">
      <input type="text" name="query" value="" ..>
      <input type="hidden" name="t" value="device">
      <input type="hidden" name="lui" value="english">
      <input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
      <input type="hidden" name="cat" value="web">
      <input type="hidden" class="abp" id="abp-input" name="abp" value="1">
    </form>
"""

# timestamp of the last fetch of 'sc' code
sc_code_ts = 0
sc_code = ''
sc_code_cache_sec = 30
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""


def get_sc_code(searxng_locale, params):
    """Get an actual ``sc`` argument from Startpage's search form (HTML page).

    Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
    <search_form_xpath>`.  Without this argument Startpage considers the
    request to be from a bot.  We do not know what is encoded in the value of
    the ``sc`` argument, but it seems to be a kind of *time-stamp*.

    Startpage's search form generates a new sc-code on each request.  This
    function scrapes a new sc-code from Startpage's home page every
    :py:obj:`sc_code_cache_sec` seconds.

    """

    global sc_code_ts, sc_code  # pylint: disable=global-statement

    if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)):
        logger.debug("get_sc_code: reuse '%s'", sc_code)
        return sc_code

    headers = {**params['headers']}
    headers['Origin'] = base_url
    headers['Referer'] = base_url + '/'
    # headers['Connection'] = 'keep-alive'
    # headers['Accept-Encoding'] = 'gzip, deflate, br'
    # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
    # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'

    # add Accept-Language header
    if searxng_locale == 'all':
        searxng_locale = 'en-US'
    locale = babel.Locale.parse(searxng_locale, sep='-')

    if send_accept_language_header:
        ac_lang = locale.language
        if locale.territory:
            ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
                locale.language,
                locale.territory,
                locale.language,
            )
        headers['Accept-Language'] = ac_lang
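        # Example (illustrative): for the SearXNG locale 'de-CH' this header
        # becomes 'de-CH,de;q=0.9,*;q=0.5'.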

    get_sc_url = base_url + '/?sc=%s' % (sc_code)
    logger.debug("query new sc time-stamp ... %s", get_sc_url)
    logger.debug("headers: %s", headers)
    resp = get(get_sc_url, headers=headers)

    # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
    # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
    # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21

    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):  # type: ignore
        raise SearxEngineCaptchaException(
            message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
        )

    dom = lxml.html.fromstring(resp.text)  # type: ignore

    try:
        sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
    except IndexError as exc:
        logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
        raise SearxEngineCaptchaException(
            message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,  # type: ignore
        ) from exc

    sc_code_ts = time()
    logger.debug("get_sc_code: new value is: %s", sc_code)
    return sc_code


def request(query, params):
    """Assemble a Startpage request.

    To avoid a CAPTCHA we need to send a well formed HTTP POST request with a
    cookie.  We need to form a request that is identical to the request built
    by Startpage's search form:

    - in the cookie the **region** is selected
    - in the HTTP POST data the **language** is selected

    Additionally the arguments from Startpage's search form need to be set in
    the HTTP POST data; compare the ``<input>`` elements of
    :py:obj:`search_form_xpath`.
    """
    if startpage_categ == 'web':
        return _request_cat_web(query, params)

    logger.error("Startpage's category '%s' is not yet implemented.", startpage_categ)
    return params


def _request_cat_web(query, params):

    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    # build arguments
    args = {
        'query': query,
        'cat': 'web',
        't': 'device',
        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers
        'with_date': time_range_dict.get(params['time_range'], ''),
    }

    if engine_language:
        args['language'] = engine_language
        args['lui'] = engine_language

    args['abp'] = '1'
    if params['pageno'] > 1:
        args['page'] = params['pageno']

    # build cookie
    lang_homepage = 'en'
    cookie = OrderedDict()
    cookie['date_time'] = 'world'
    cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
    cookie['disable_open_in_new_window'] = '0'
    cookie['enable_post_method'] = '1'  # hint: POST
    cookie['enable_proxy_safety_suggest'] = '1'
    cookie['enable_stay_control'] = '1'
    cookie['instant_answers'] = '1'
    cookie['lang_homepage'] = 's/device/%s/' % lang_homepage
    cookie['num_of_results'] = '10'
    cookie['suggestions'] = '1'
    cookie['wt_unit'] = 'celsius'

    if engine_language:
        cookie['language'] = engine_language
        cookie['language_ui'] = engine_language

    if engine_region:
        cookie['search_results_region'] = engine_region

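    # The 'preferences' cookie is a flat key/value list: each pair is joined
    # with 'EEE' and the pairs are concatenated with 'N1N', e.g. (illustrative):
    #   date_timeEEEworldN1Ndisable_family_filterEEE0N1N...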
    params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
    logger.debug('cookie preferences: %s', params['cookies']['preferences'])

    # POST request
    logger.debug("data: %s", args)
    params['data'] = args
    params['method'] = 'POST'
    params['url'] = search_url
    params['headers']['Origin'] = base_url
    params['headers']['Referer'] = base_url + '/'
    # is the Accept header needed?
    # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

    return params


# get response from search-request
def response(resp):
    dom = lxml.html.fromstring(resp.text)

    if startpage_categ == 'web':
        return _response_cat_web(dom)

    logger.error("Startpage's category '%s' is not yet implemented.", startpage_categ)
    return []


def _response_cat_web(dom):
    results = []

    # parse results
    for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
        links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # block google-ad url's
        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
            continue

        # block startpage search url's
        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
            continue

        title = extract_text(eval_xpath(link, 'h2'))
        content = eval_xpath(result, './/p[contains(@class, "description")]')
        content = extract_text(content, allow_none=True) or ''

        published_date = None

        # check if search result starts with something like: "2 Sep 2014 ... "
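        # e.g. content = "2 Sep 2014 ... Lorem ipsum"
        #      --> date_string = "2 Sep 2014", content = "Lorem ipsum"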
        if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0 : date_pos - 5]
            # fix content string
            content = content[date_pos:]

            try:
                published_date = dateutil.parser.parse(date_string, dayfirst=True)
            except ValueError:
                pass

        # check if search result starts with something like: "5 days ago ... "
        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0 : date_pos - 5]

            # calculate datetime
            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))  # type: ignore

            # fix content string
            content = content[date_pos:]

        if published_date:
            # append result
            results.append({'url': url, 'title': title, 'content': content, 'publishedDate': published_date})
        else:
            # append result
            results.append({'url': url, 'title': title, 'content': content})

    # return results
    return results


def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    regions>` from Startpage."""
    # pylint: disable=too-many-branches

    headers = {
        'User-Agent': gen_useragent(),
        'Accept-Language': "en-US,en;q=0.5",  # fetch the settings page in English, independent of the IP location
    }
    resp = get('https://www.startpage.com/do/settings', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Startpage is not OK.")

    dom = lxml.html.fromstring(resp.text)  # type: ignore

    # regions

    sp_region_names = []
    for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'):
        sp_region_names.append(option.get('value'))

    for eng_tag in sp_region_names:
        if eng_tag == 'all':
            continue
        babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag)  # Norway

        if '-' in babel_region_tag:
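            # e.g. 'pt-BR_BR' --> l='pt', r='BR' --> babel locale 'pt_BR'
            # (see the mapping table in the module docstring above)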
            l, r = babel_region_tag.split('-')
            r = r.split('_')[-1]
            sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))

        else:
            try:
                sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_'))

            except babel.UnknownLocaleError:
                print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
                continue

        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.regions[sxng_tag] = eng_tag

    # languages

    catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}
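    # e.g. {'french': 'fr', 'norwegian bokmål': 'nb', ...} (illustrative excerpt)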

    # get the native name of every language known by babel

    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
        native_name = babel.Locale(lang_code).get_language_name()
        if not native_name:
            print(f"ERROR: language name of startpage's language {lang_code} is unknown by babel")
            continue
        native_name = native_name.lower()
        # add native name exactly as it is
        catalog_engine2code[native_name] = lang_code

        # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
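        # e.g. NFKD turns 'français' into 'franc' + U+0327 (combining cedilla) + 'ais';
        # dropping the combining marks leaves 'francais'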
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if result is ascii (otherwise "normalization" didn't work)
            catalog_engine2code[unaccented_name] = lang_code

    # values that can't be determined by babel's languages names

    catalog_engine2code.update(
        {
            # traditional chinese used in ..
            'fantizhengwen': 'zh_Hant',
            # Korean alphabet
            'hangul': 'ko',
            # Malayalam is one of 22 scheduled languages of India.
            'malayam': 'ml',
            'norsk': 'nb',
            'sinhalese': 'si',
        }
    )

    skip_eng_tags = {
        'english_uk',  # SearXNG lang 'en' already maps to 'english'
    }

    for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):

        eng_tag = option.get('value')
        if eng_tag in skip_eng_tags:
            continue
        name = extract_text(option).lower()  # type: ignore

        sxng_tag = catalog_engine2code.get(eng_tag)
        if sxng_tag is None:
            sxng_tag = catalog_engine2code[name]

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue
        engine_traits.languages[sxng_tag] = eng_tag