.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
duckduckgo.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""
3DuckDuckGo Lite
4~~~~~~~~~~~~~~~
5"""
6
7from typing import TYPE_CHECKING
8import re
9from urllib.parse import urlencode
10import json
11import babel
12import lxml.html
13
14from searx import (
15 locales,
16 redislib,
17 external_bang,
18)
19from searx.utils import (
20 eval_xpath,
21 eval_xpath_getindex,
22 extract_text,
23)
24from searx.network import get # see https://github.com/searxng/searxng/issues/762
25from searx import redisdb
26from searx.enginelib.traits import EngineTraits
27
# ``logger`` and ``traits`` are only *declared* in this module (annotations
# without a value); the functions below use them as module globals.
# NOTE(review): presumably both are injected by the engine loader at runtime
# -- confirm.
if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# Descriptive metadata about the DuckDuckGo-Lite service this engine scrapes.
about = {
    "website": 'https://lite.duckduckgo.com/lite/',
    "wikidata_id": 'Q12805',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

send_accept_language_header = True
"""DuckDuckGo-Lite tries to guess user's prefered language from the HTTP
``Accept-Language``. Optional the user can select a region filter (but not a
language).
"""

# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True  # user can't select but the results are filtered

# Endpoint that request() sends its POST form to.
url = 'https://lite.duckduckgo.com/lite/'
# url_ping = 'https://duckduckgo.com/t/sl_l'

# Maps the ``params['time_range']`` names to the value request() sends as
# ``df`` (form field and cookie).
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

# Values of the hidden form inputs ``v``, ``api`` and ``o``.  request() sends
# them for every page after the first one and response() overwrites them with
# the values found in the last parsed result page.  This dict is module-level
# state, i.e. it is shared by all requests of this engine.
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
60
61
def cache_vqd(query, value):
    """Remember the ``vqd`` *value* that belongs to *query*.

    The value is written to the Redis DB under a key built from the prefix
    ``SearXNG_ddg_web_vqd`` and a secret hash of the query, with ``ex=600``
    (redis expiry in seconds).  Without a Redis client nothing is cached.
    """
    client = redisdb.client()
    if not client:
        return

    logger.debug("cache vqd value: %s", value)
    client.set('SearXNG_ddg_web_vqd' + redislib.secret_hash(query), value, ex=600)
69
70
def get_vqd(query):
    """Returns the ``vqd`` that fits to the *query*.  If there is no ``vqd`` cached
    (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the
    response.

    :param query: the search term the ``vqd`` is needed for
    :return: the ``vqd`` string (from the cache or freshly parsed from DDG's
        HTML page) or ``None`` if no inline script of the page contains a
        ``vqd="..."`` assignment.  Only values that are not ``None`` are
        cached.

    .. hint::

       If no ``vqd`` value (``None`` or an empty string) is returned there
       are no results for the ``query`` and therefore no ``vqd`` value.

    DDG's bot detection is sensitive to the ``vqd`` value.  For some search terms
    (such as extremely long search terms that are often sent by bots), no ``vqd``
    value can be determined.

    If SearXNG cannot determine a ``vqd`` value, then no request should go out
    to DDG:

        A request with a wrong ``vqd`` value leads to DDG temporarily putting
        SearXNG's IP on a block list.

        Requests from IPs in this block list run into timeouts.

    Not sure, but it seems the block list is a sliding window: to get my IP rid
    from the bot list I had to cool down my IP for 1h (send no requests from
    that IP to DDG).

    TL;DR; the ``vqd`` value is needed to pass DDG's bot protection and is used
    by all request to DDG:

    - DuckDuckGo Lite: ``https://lite.duckduckgo.com/lite`` (POST form data)
    - DuckDuckGo Web: ``https://links.duckduckgo.com/d.js?q=...&vqd=...``
    - DuckDuckGo Images: ``https://duckduckgo.com/i.js?q=...&vqd=...``
    - DuckDuckGo Videos: ``https://duckduckgo.com/v.js?q=...&vqd=...``
    - DuckDuckGo News: ``https://duckduckgo.com/news.js?q=...&vqd=...``

    """
    value = None
    c = redisdb.client()
    if c:
        key = 'SearXNG_ddg_web_vqd' + redislib.secret_hash(query)
        value = c.get(key)
        # an empty (bytes) string is a valid cached value, only ``None``
        # means "nothing cached"
        if value or value == b'':
            value = value.decode('utf-8')
            logger.debug("re-use cached vqd value: %s", value)
            return value

    query_url = 'https://duckduckgo.com/?' + urlencode({'q': query})
    res = get(query_url)
    doc = lxml.html.fromstring(res.text)
    for script in doc.xpath("//script[@type='text/javascript']"):
        script_text = script.text
        # <script> elements without inline code (e.g. ``<script src=...>``)
        # have ``text is None``; skip them instead of raising a TypeError
        if not script_text or 'vqd="' not in script_text:
            continue
        # take everything between ``vqd="`` and the closing quote
        value = script_text[script_text.index('vqd="') + 5 :]
        value = value[: value.index('"')]
        break
    logger.debug("new vqd value: '%s'", value)
    if value is not None:
        cache_vqd(query, value)
    return value
130
131
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
    """Translate a SearXNG locale into DuckDuckGo's language identifier.

    DuckDuckGo names its languages by region codes (see
    :py:obj:`fetch_traits`).  The lookup is done in two steps: an entry for
    ``sxng_locale`` in ``eng_traits.custom['lang_region']`` wins, otherwise
    the result of ``eng_traits.get_language(sxng_locale, default)`` is
    returned.

    Region and language of a DDG service are determined by:

    .. code:: python

       eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
       eng_lang = get_ddg_lang(traits, params['searxng_locale'])

    This may be confusing, but what the cookie stores in ``l`` is what SearXNG
    calls the *region*:

    .. code:: python

       # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
       params['cookies']['ad'] = eng_lang
       params['cookies']['ah'] = eng_region
       params['cookies']['l'] = eng_region

    .. hint::

       `DDG-lite <https://lite.duckduckgo.com/lite>`__ offers no language
       selection, the user can only select a region (``eng_region`` in the
       example above).  DDG-lite keeps the selected region in a cookie::

         params['cookies']['kl'] = eng_region  # 'ar-es'

    """
    lang_region = eng_traits.custom['lang_region']
    fallback = eng_traits.get_language(sxng_locale, default)
    return lang_region.get(sxng_locale, fallback)  # type: ignore
168
169
# Overrides used by fetch_traits() when DDG's region tag can't be converted
# mechanically (``<territory>-<lang>`` --> ``<lang>_<TERRITORY>``) into a
# locale babel is able to parse.  Key: DDG region tag, value: the locale
# string handed to babel.  The special value 'skip' drops the region.
ddg_reg_map = {
    'tw-tzh': 'zh_TW',
    'hk-tzh': 'zh_HK',
    'ct-ca': 'skip',  # ct-ca and es-ca both map to ca_ES
    'es-ca': 'ca_ES',
    'id-en': 'id_ID',
    'no-no': 'nb_NO',
    'jp-jp': 'ja_JP',
    'kr-kr': 'ko_KR',
    'xa-ar': 'ar_SA',
    'sl-sl': 'sl_SI',
    'th-en': 'th_TH',
    'vn-en': 'vi_VN',
}

# Controls how fetch_traits() treats DDG's language identifiers.  Key: DDG
# language identifier, value is one of:
#
# - 'lang_region': the identifier is registered by its *region* tag in
#   ``engine_traits.custom['lang_region']`` and not in the languages table
# - 'skip': the identifier is ignored
# - any other string: the tag that is parsed by babel in place of the DDG
#   identifier
#
# Identifiers not listed here are parsed by babel as they are.
ddg_lang_map = {
    # use ar --> ar_EG (Egypt's arabic)
    "ar_DZ": 'lang_region',
    "ar_JO": 'lang_region',
    "ar_SA": 'lang_region',
    # use bn --> bn_BD
    'bn_IN': 'lang_region',
    # use de --> de_DE
    'de_CH': 'lang_region',
    # use en --> en_US,
    'en_AU': 'lang_region',
    'en_CA': 'lang_region',
    'en_GB': 'lang_region',
    # Esperanto
    'eo_XX': 'eo',
    # use es --> es_ES,
    'es_AR': 'lang_region',
    'es_CL': 'lang_region',
    'es_CO': 'lang_region',
    'es_CR': 'lang_region',
    'es_EC': 'lang_region',
    'es_MX': 'lang_region',
    'es_PE': 'lang_region',
    'es_UY': 'lang_region',
    'es_VE': 'lang_region',
    # use fr --> fr_FR
    'fr_CA': 'lang_region',
    'fr_CH': 'lang_region',
    'fr_BE': 'lang_region',
    # use nl --> nl_NL
    'nl_BE': 'lang_region',
    # use pt --> pt_PT
    'pt_BR': 'lang_region',
    # skip these languages
    'od_IN': 'skip',
    'io_XX': 'skip',
    'tokipona_XX': 'skip',
}
223
224
def quote_ddg_bangs(query):
    """Wrap external bangs found in *query* into single quotes.

    The query is split at whitespace.  A term that starts with ``!`` and for
    which ``external_bang.get_node`` finds an entry in
    ``external_bang.EXTERNAL_BANGS`` is put into single quotes, all other
    terms are taken over unchanged.  The terms are joined by one space, so
    leading, trailing and repeated whitespace is normalized.
    """
    quoted_terms = []

    for term in re.split(r'(\s+)', query):
        # the pattern has a capture group, so the separators are part of the
        # result: drop them (and the empty strings at the borders)
        if term.strip() == '':
            continue
        is_bang = term.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, term[1:])
        quoted_terms.append(f"'{term}'" if is_bang else term)

    return ' '.join(quoted_terms)
237
238
def request(query, params):
    """Assemble the POST request sent to DuckDuckGo-Lite.

    The API is not documented: the form data emulates what
    https://lite.duckduckgo.com/lite/ sends when the "next page" link is
    pressed again and again.

    :param query: search term, external bangs get quoted by
        :py:obj:`quote_ddg_bangs`
    :param params: request parameters; ``url``, ``method``, ``data``,
        ``headers`` and ``cookies`` are modified in place
    :return: the modified ``params``
    """
    query = quote_ddg_bangs(query)

    # request needs a vqd argument
    # NOTE(review): get_vqd() may return None, the docstring of get_vqd() says
    # that no request should go out in this case, but the value is sent
    # nevertheless -- confirm whether the request should be aborted.
    vqd = get_vqd(query)

    eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
    # eng_lang = get_ddg_lang(traits, params['searxng_locale'])

    params['url'] = url
    params['method'] = 'POST'

    # fields are added in the order in which the original form sends them
    data = params['data']
    data['q'] = query

    params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'
    data['vqd'] = vqd

    pageno = params['pageno']

    # The initial page has no offset, the second page starts at 20 and each
    # following page adds 50: 20, 70, 120, ..
    if pageno >= 2:
        offset = 20 + (pageno - 2) * 50
        data['s'] = offset
        data['dc'] = offset + 1

    # Only the follow up pages carry the additional (hidden) form fields
    if pageno > 1:
        for field, fallback in (('o', 'json'), ('api', 'd.js'), ('nextParams', ''), ('v', 'l')):
            data[field] = form_data.get(field, fallback)
        params['headers']['Referer'] = 'https://lite.duckduckgo.com/'

    # the region is sent as form field and as cookie
    data['kl'] = eng_region
    params['cookies']['kl'] = eng_region

    # time filter: the form field is always sent, the cookie only if a known
    # time range is selected
    time_range = params['time_range']
    has_time_range = time_range in time_range_dict
    data['df'] = time_range_dict[time_range] if has_time_range else ''
    if has_time_range:
        params['cookies']['df'] = time_range_dict[time_range]

    logger.debug("param data: %s", data)
    logger.debug("param cookies: %s", params['cookies'])
    return params
293
294
def response(resp):
    """Scrape the results from the HTML page of DuckDuckGo-Lite.

    The result rows are taken from one of the ``<table>`` elements in
    ``//html/body/form/div[@class="filters"]``:

    - two tables: the second one holds the results (some locales, at least
      China, have no "next page" button and a different table layout)
    - three or more tables: the third one holds the results and the module
      level ``form_data`` is updated from the hidden inputs ``v``, ``api``
      and ``o`` of the page.  An input missing in the page keeps its previous
      value in ``form_data``.
    - otherwise there are no (more) results

    :param resp: HTTP response; ``status_code`` and ``text`` are read
    :return: list of result dicts with the keys ``title``, ``content`` and
        ``url``; empty list on a 303 response or if there is no result table
    """
    if resp.status_code == 303:
        return []

    doc = lxml.html.fromstring(resp.text)
    tables = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')

    if len(tables) == 2:
        # some locales (at least China) does not have a "next page" button and
        # the layout of the HTML tables is different.
        result_table = tables[1]
    elif len(tables) < 3:
        # no more results
        return []
    else:
        result_table = tables[2]
        # update form data from response
        form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
        if form:
            for name in ('v', 'api', 'o'):
                # index 0 with default None: a missing input no longer raises
                # an IndexError, the value from the previous page is kept
                value = eval_xpath_getindex(form[0], '//input[@name="%s"]/@value' % name, 0, None)
                if value is not None:
                    form_data[name] = value
            logger.debug('form_data: %s', form_data)

    # In the last <tr> is the form of the 'previous/next page' links
    tr_rows = eval_xpath(result_table, './/tr')[:-1]

    results = []

    # A result spans four table rows: only the first (title / link) and the
    # second (snippet) row are scraped.  Incomplete groups at the end are
    # ignored.
    for offset in range(0, len(tr_rows) - 3, 4):
        tr_title = tr_rows[offset]
        tr_content = tr_rows[offset + 1]

        # ignore sponsored Adds <tr class="result-sponsored">
        if tr_content.get('class') == 'result-sponsored':
            continue

        a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None)
        if a_tag is None:
            continue

        td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)
        if td_content is None:
            continue

        results.append(
            {
                'title': a_tag.text_content(),
                'content': extract_text(td_content),
                'url': a_tag.get('href'),
            }
        )

    return results
359
360
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages & regions from DuckDuckGo.

    SearXNG's ``all`` locale maps DuckDuckGo's "All regions" (``wt-wt``).
    DuckDuckGo's language "Browsers preferred language" (``wt_WT``) makes no
    sense in a SearXNG request since SearXNG's ``all`` will not add a
    ``Accept-Language`` HTTP header.  The value in ``engine_traits.all_locale``
    is ``wt-wt`` (the region).

    Beside regions DuckDuckGo also defines its languages by region codes.  By
    example these are the english languages in DuckDuckGo:

    - en_US
    - en_AU
    - en_CA
    - en_GB

    The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from
    SearXNG's locale.

    :param engine_traits: the traits to fill; ``all_locale``, ``regions``,
        ``languages`` and ``custom['lang_region']`` are written
    :raises RuntimeError: if the response from DuckDuckGo is not OK or if the
        ``regions:{`` / ``languages:{`` object is not found in the script

    """
    # pylint: disable=too-many-branches, too-many-statements

    engine_traits.all_locale = 'wt-wt'

    # updated from u661.js to u.7669f071a13a7daa57cb / should be updated automatically?
    resp = get('https://duckduckgo.com/dist/util/u.7669f071a13a7daa57cb.js')

    # Parsing an error page ends in a JSON error that hides the real cause:
    # stop here with a clear message.
    if not resp.ok:  # type: ignore
        raise RuntimeError("Response from DuckDuckGo is not OK.")

    js_text = resp.text  # type: ignore

    # fetch regions

    pos = js_text.find('regions:{')
    if pos == -1:
        raise RuntimeError("'regions:{' not found in the response from DuckDuckGo.")
    # skip the 8 characters of 'regions:', the object literal starts at '{' and
    # ends at the first '}'
    js_code = js_text[pos + 8 :]
    pos = js_code.find('}') + 1
    regions = json.loads(js_code[:pos])

    for eng_tag, name in regions.items():

        # 'wt-wt' is the all_locale already set above
        if eng_tag == 'wt-wt':
            continue

        region = ddg_reg_map.get(eng_tag)
        if region == 'skip':
            continue

        if not region:
            # DDG's tag is <territory>-<language>, babel needs
            # <language>_<TERRITORY>
            eng_territory, eng_lang = eng_tag.split('-')
            region = eng_lang + '_' + eng_territory.upper()

        try:
            sxng_tag = locales.region_tag(babel.Locale.parse(region))
        except babel.UnknownLocaleError:
            print(f"ERROR: {name} ({eng_tag}) -> {region} is unknown by babel")
            continue

        # first come, first served: a SearXNG tag already in use is not
        # overwritten
        conflict = engine_traits.regions.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {eng_tag}")
            continue
        engine_traits.regions[sxng_tag] = eng_tag

    # fetch languages

    engine_traits.custom['lang_region'] = {}

    pos = js_text.find('languages:{')
    if pos == -1:
        raise RuntimeError("'languages:{' not found in the response from DuckDuckGo.")
    # skip the 10 characters of 'languages:'
    js_code = js_text[pos + 10 :]
    pos = js_code.find('}') + 1
    # The keys of this object literal are not quoted: add the quotes to get
    # valid JSON ( {a:"x",b:"y"} --> {"a":"x","b":"y"} )
    js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"')
    languages = json.loads(js_code)

    for eng_lang, name in languages.items():

        if eng_lang == 'wt_WT':
            continue

        babel_tag = ddg_lang_map.get(eng_lang, eng_lang)
        if babel_tag == 'skip':
            continue

        try:

            if babel_tag == 'lang_region':
                # registered by its region tag, see get_ddg_lang()
                sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang))
                engine_traits.custom['lang_region'][sxng_tag] = eng_lang
                continue

            sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag))

        except babel.UnknownLocaleError:
            print(f"ERROR: language {name} ({eng_lang}) is unknown by babel")
            continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_lang:
                print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {eng_lang}")
            continue
        engine_traits.languages[sxng_tag] = eng_lang
cache_vqd(query, value)
Definition duckduckgo.py:62
get_ddg_lang(EngineTraits eng_traits, sxng_locale, default='en_US')