.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
brave.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Brave supports the categories listed in :py:obj:`brave_category` (General,
3news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
4<time_range_support>` is limited (see remarks).
5
6Configured ``brave`` engines:
7
8.. code:: yaml
9
10 - name: brave
11 engine: brave
12 ...
13 brave_category: search
14 time_range_support: true
15 paging: true
16
17 - name: brave.images
18 engine: brave
19 ...
20 brave_category: images
21
22 - name: brave.videos
23 engine: brave
24 ...
25 brave_category: videos
26
27 - name: brave.news
28 engine: brave
29 ...
30 brave_category: news
31
32 - name: brave.goggles
33 engine: brave
34 time_range_support: true
35 paging: true
36 ...
37 brave_category: goggles
38
39
40.. _brave regions:
41
42Brave regions
43=============
44
45Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with
46locales. To get a mapping, all *official de-facto* languages of the Brave
47region are mapped to regions in SearXNG (see :py:obj:`babel
48<babel.languages.get_official_languages>`):
49
50.. code:: python
51
52 "regions": {
53 ..
54 "en-CA": "ca",
55 "fr-CA": "ca",
56 ..
57 }
58
59
60.. note::
61
62 The language (aka region) support of Brave's index is limited to very basic
63 languages. The search results for languages like Chinese or Arabic are of
64 low quality.
65
66
67.. _brave googles:
68
69Brave Goggles
70=============
71
72.. _list of Goggles: https://search.brave.com/goggles/discover
73.. _Goggles Whitepaper: https://brave.com/static-assets/files/goggles.pdf
74.. _Goggles Quickstart: https://github.com/brave/goggles-quickstart
75
76Goggles allow you to choose, alter, or extend the ranking of Brave Search
77results (`Goggles Whitepaper`_). Goggles are openly developed by the community
78of Brave Search users.
79
80Select from the `list of Goggles`_ people have published, or create your own
81(`Goggles Quickstart`_).
82
83
84.. _brave languages:
85
86Brave languages
87===============
88
89Brave's language support is limited to the UI (menus, area local notations,
90etc). Brave's index only seems to support a locale, but it does not seem to
91support any languages in its index. The choice of available languages is very
92small (and it's not clear to me where the difference in UI is when switching
93from en-us to en-ca or en-gb).
94
95In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
96UI languages are stored in a custom field named ``ui_lang``:
97
98.. code:: python
99
100 "custom": {
101 "ui_lang": {
102 "ca": "ca",
103 "de-DE": "de-de",
104 "en-CA": "en-ca",
105 "en-GB": "en-gb",
106 "en-US": "en-us",
107 "es": "es",
108 "fr-CA": "fr-ca",
109 "fr-FR": "fr-fr",
110 "ja-JP": "ja-jp",
111 "pt-BR": "pt-br",
112 "sq-AL": "sq-al"
113 }
114 },
115
116Implementations
117===============
118
119"""
120
121from typing import Any, TYPE_CHECKING
122
123from urllib.parse import (
124 urlencode,
125 urlparse,
126)
127
128from dateutil import parser
129from lxml import html
130
131from searx import locales
132from searx.utils import (
133 extract_text,
134 eval_xpath,
135 eval_xpath_list,
136 eval_xpath_getindex,
137 js_variable_to_python,
138 get_embeded_stream_url,
139)
140from searx.enginelib.traits import EngineTraits
141from searx.result_types import EngineResults
142
143if TYPE_CHECKING:
144 import logging
145
146 logger: logging.Logger
147
148traits: EngineTraits
149
# Engine metadata: Brave is queried through its HTML front end, not an API.
about = {
    "website": 'https://search.brave.com/',
    "wikidata_id": 'Q22906900',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
"""Brave supports common web-search, videos, images, news, and goggles search.

- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
- ``goggles``: Common WEB search with custom rules
"""

Goggles = Any
"""Value sent as the ``goggles_id`` URL argument when :py:obj:`brave_category`
is ``goggles``.  ``Any`` looks like a placeholder default that is presumably
replaced by the engine configuration (to be confirmed).
"""

brave_spellcheck = False
"""Brave supports some kind of spell checking.  When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``.  In
the UI of Brave the user gets warned about this, since we can not warn the user
in SearXNG, the spellchecking is disabled by default.
"""

send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
max_page = 10
"""Tested 9 pages maximum (``&offset=8``), to be safe max is set to 10.  Trying
to do more won't return any result and you will most likely be flagged as a bot.
"""

safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off

time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""

time_range_map = {
    'day': 'pd',
    'week': 'pw',
    'month': 'pm',
    'year': 'py',
}
201
202
def request(query, params):
    """Prepare the HTTP request for a Brave search.

    Fills ``params`` in place: the ``Accept-Encoding`` header, the request
    ``url`` (built from :py:obj:`base_url`, :py:obj:`brave_category` and the
    URL arguments) and the cookies Brave reads its settings from.

    :param query: search term, sent as URL argument ``q``
    :param params: request parameters of the engine, modified in place
    """

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    params['headers']['Accept-Encoding'] = 'gzip, deflate'

    args = {'q': query}
    if brave_spellcheck:
        args['spellcheck'] = '1'

    # paging and time range are only sent for the WEB search and goggles
    if brave_category in ('search', 'goggles'):
        offset = params.get('pageno', 1) - 1
        if offset:
            args['offset'] = offset
        time_filter = time_range_map.get(params['time_range'])
        if time_filter:
            args['tf'] = time_filter

    if brave_category == 'goggles':
        args['goggles_id'] = Goggles

    params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"

    # set properties in the cookies
    cookies = params['cookies']
    sxng_locale = params['searxng_locale']

    cookies['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
    # the useLocation is IP based, we use cookie 'country' for the region
    cookies['useLocation'] = '0'
    cookies['summarizer'] = '0'

    # only the last part of the region tag is sent, in lower case
    engine_region = traits.get_region(sxng_locale, 'all')
    cookies['country'] = engine_region.split('-')[-1].lower()  # type: ignore

    cookies['ui_lang'] = locales.get_engine_locale(sxng_locale, traits.custom["ui_lang"], 'en-us')

    logger.debug("cookies %s", cookies)
239
240
def _extract_published_date(published_date_raw):
    """Parse a date string scraped from Brave into a ``datetime``.

    :param published_date_raw: raw date text, may be ``None`` or empty
    :return: the parsed ``datetime`` or ``None`` when there is nothing to
        parse or the text is not a usable date
    """
    # Nothing to parse: skip the parser for ``None`` and for empty strings.
    if not published_date_raw:
        return None

    try:
        return parser.parse(published_date_raw)
    except (parser.ParserError, OverflowError):
        # ``parse`` raises OverflowError when the parsed value does not fit
        # into a date; scraped text must never abort result parsing.
        return None
249
250
def parse_data_string(resp):
    """Extract the ``data`` array handed to ``kit.start(app, ...)`` in the
    inline JavaScript of the response and convert it to a Python object.

    :param resp: response whose ``text`` contains the ``kit.start`` call
    :return: result of :py:obj:`js_variable_to_python` for the ``data`` array
    :raises ValueError: if one of the two markers is not found in the text
    """
    # kit.start(app, element, {
    #    node_ids: [0, 19],
    #    data: [{"type":"data","data" .... ["q","goggles_id"],"route":1,"url":1}}]
    #          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    text = resp.text
    kit_start = text.index("kit.start(app,")
    # position of the opening bracket of the ``data`` array
    start = text.index('data: [{"type":"data"', kit_start) + len('data: ')

    # Find the bracket closing the array by counting the nesting depth.  Every
    # bracket counts, including brackets inside of string literals.
    depth = 0
    end = len(text)  # unbalanced input: take everything up to the end
    for pos in range(start, len(text)):
        char = text[pos]
        if char == "[":
            depth += 1
        elif char == "]":
            depth -= 1
            if depth == 0:
                end = pos + 1
                break

    return js_variable_to_python(text[start:end])
276
277
def response(resp) -> EngineResults:
    """Dispatch the response to the parser of the configured
    :py:obj:`brave_category`.

    ``search`` and ``goggles`` as well as ``news`` are parsed from the HTML,
    ``images`` and ``videos`` from the data embedded in the page's JavaScript.

    :param resp: HTTP response of the request
    :return: the results of the category specific parser
    :raises ValueError: if :py:obj:`brave_category` is not supported
    """

    if brave_category in ('search', 'goggles'):
        return _parse_search(resp)

    # compare by equality: ``in ('news')`` is a substring test on the string
    # 'news' and would also accept values like 'new' or ''
    if brave_category == 'news':
        return _parse_news(resp)

    json_data = parse_data_string(resp)
    # json_data is a list and at the second position (0,1) in this list we find the "response" data we need ..
    json_resp = json_data[1]['data']['body']['response']

    if brave_category == 'images':
        return _parse_images(json_resp)
    if brave_category == 'videos':
        return _parse_videos(json_resp)

    raise ValueError(f"Unsupported brave category: {brave_category}")
296
297
def _parse_search(resp) -> EngineResults:
    """Parse the HTML of a WEB (``search`` / ``goggles``) response.

    :param resp: HTTP response, its ``text`` is parsed as HTML
    :return: the answer (if any) and one result item per snippet that has an
        absolute URL and a title
    """
    result_list = EngineResults()

    dom = html.fromstring(resp.text)

    # I doubt that Brave is still providing the "answer" class / I haven't seen
    # answers in brave for a long time.
    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    # test for None: the truth value of a lxml element depends on its number
    # of children, an answer <div> containing only text would be falsy
    if answer_tag is not None:
        url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
        answer = extract_text(answer_tag)
        if answer is not None:
            result_list.add(result_list.types.Answer(answer=answer, url=url))

    # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
    xpath_results = '//div[contains(@class, "snippet ")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(
            result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None
        )
        if url is None or title_tag is None or not urlparse(url).netloc:  # partial url likely means it's an ad
            continue

        content: str = extract_text(
            eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
        )  # type: ignore
        # the text in front of the first "-" of the description may be a date
        pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
        pub_date = _extract_published_date(pub_date_raw)
        if pub_date and content.startswith(pub_date_raw):
            # cut off the date prefix; str.lstrip() would strip a *set of
            # characters* and not the prefix
            content = content[len(pub_date_raw) :].strip("- \n\t")

        thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': content,
            'publishedDate': pub_date,
            'thumbnail': thumbnail,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag is not None:

            # In my tests a video tag in the WEB search was most often not a
            # video, except the ones from youtube ..

            # the image of the video tag replaces the thumbnail in any case
            item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')

            iframe_src = get_embeded_stream_url(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                pub_date_raw = extract_text(
                    eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
                )
                item['publishedDate'] = _extract_published_date(pub_date_raw)

        result_list.append(item)

    return result_list
365
366
def _parse_news(resp) -> EngineResults:
    """Parse the HTML of a ``news`` response.

    :param resp: HTTP response, its ``text`` is parsed as HTML
    :return: one item (url, title, content, thumbnail) per news entry that
        has a link in its result header
    """
    news_items = EngineResults()
    news_xpath = '//div[contains(@class, "results")]//div[@data-type="news"]'

    for entry in eval_xpath_list(html.fromstring(resp.text), news_xpath):

        link = eval_xpath_getindex(entry, './/a[contains(@class, "result-header")]/@href', 0, default=None)
        if link is None:
            # entries without a link are skipped
            continue

        headline = extract_text(eval_xpath_list(entry, './/span[contains(@class, "snippet-title")]'))
        summary = extract_text(eval_xpath_list(entry, './/p[contains(@class, "desc")]'))
        image = eval_xpath_getindex(entry, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')

        news_items.append(
            {
                "url": link,
                "title": headline,
                "content": summary,
                "thumbnail": image,
            }
        )

    return news_items
395
396
def _parse_images(json_resp) -> EngineResults:
    """Build the image results from the embedded ``response`` data.

    :param json_resp: mapping with a ``results`` list; every entry is read
        by key, a missing key raises :py:obj:`KeyError`
    :return: one ``images.html`` item per entry
    """
    images = EngineResults()

    for entry in json_resp["results"]:
        images.append(
            {
                'url': entry['url'],
                'title': entry['title'],
                'content': entry['description'],
                'template': 'images.html',
                'resolution': entry['properties']['format'],
                'source': entry['source'],
                'img_src': entry['properties']['url'],
                'thumbnail_src': entry['thumbnail']['src'],
            }
        )

    return images
414
415
def _parse_videos(json_resp) -> EngineResults:
    """Build the video results from the embedded ``response`` data.

    :param json_resp: mapping with a ``results`` list; every entry is read
        by key, a missing key raises :py:obj:`KeyError`
    :return: one ``videos.html`` item per entry
    """
    videos = EngineResults()

    for entry in json_resp["results"]:

        video_url = entry['url']
        video = {
            'url': video_url,
            'title': entry['title'],
            'content': entry['description'],
            'template': 'videos.html',
            # the duration is stored in both fields
            'length': entry['video']['duration'],
            'duration': entry['video']['duration'],
            'publishedDate': _extract_published_date(entry['age']),
        }

        thumb = entry['thumbnail']
        if thumb is not None:
            video['thumbnail'] = thumb['src']

        # only set when a stream URL is returned for the video URL
        embed_url = get_embeded_stream_url(video_url)
        if embed_url:
            video['iframe_src'] = embed_url

        videos.append(video)

    return videos
442
443
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
    regions>` from Brave.

    The UI languages are read from the options of Brave's settings page and
    stored in ``engine_traits.custom["ui_lang"]``.  The regions are read from a
    JavaScript chunk of Brave's CDN; every official (de-facto) language of a
    country is mapped to the country tag in ``engine_traits.regions``.

    :param engine_traits: traits object that is filled in place
    """

    # pylint: disable=import-outside-toplevel, too-many-branches

    import babel.languages
    from searx.locales import region_tag, language_tag
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762

    engine_traits.custom["ui_lang"] = {}

    headers = {
        'Accept-Encoding': 'gzip, deflate',
    }
    lang_map = {'no': 'nb'}  # norway

    # languages (UI)

    resp = get('https://search.brave.com/settings', headers=headers)

    # best effort: a failed request is only reported, parsing goes on
    if not resp.ok:  # type: ignore
        print("ERROR: response from Brave is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore

    for option in dom.xpath('//section//option[@value="en-us"]/../option'):

        ui_lang = option.get('value')
        try:
            # parse once and reuse the locale for the territory check and the tag
            ui_locale = babel.Locale.parse(ui_lang, sep='-')
            if ui_locale.territory:
                sxng_tag = region_tag(ui_locale)
            else:
                sxng_tag = language_tag(ui_locale)

        except (babel.UnknownLocaleError, ValueError):
            # Locale.parse raises ValueError for a malformed identifier; skip
            # the option instead of aborting the whole fetch
            print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
            continue

        conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
        if conflict:
            # the first mapping wins, a different second one is only reported
            if conflict != ui_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang))
            continue
        engine_traits.custom["ui_lang"][sxng_tag] = ui_lang

    # search regions of brave

    resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Brave is not OK.")

    # cut the object literal between "options:" and "},k={default" out of the script
    country_js = resp.text[resp.text.index("options:{all") + len('options:') :]  # type: ignore
    country_js = country_js[: country_js.index("},k={default")]
    country_tags = js_variable_to_python(country_js)

    for k, v in country_tags.items():
        if k == 'all':
            engine_traits.all_locale = 'all'
            continue
        country_tag = v['value']

        # add official languages of the country ..
        for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
            lang_tag = lang_map.get(lang_tag, lang_tag)
            sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))
            # print("%-20s: %s <-- %s" % (v['label'], country_tag, sxng_tag))

            conflict = engine_traits.regions.get(sxng_tag)
            if conflict:
                if conflict != country_tag:
                    print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag))
                continue
            engine_traits.regions[sxng_tag] = country_tag
EngineResults _parse_search(resp)
Definition brave.py:298
request(query, params)
Definition brave.py:203
fetch_traits(EngineTraits engine_traits)
Definition brave.py:444
EngineResults _parse_videos(json_resp)
Definition brave.py:416
EngineResults _parse_news(resp)
Definition brave.py:367
parse_data_string(resp)
Definition brave.py:251
_extract_published_date(published_date_raw)
Definition brave.py:241
EngineResults _parse_images(json_resp)
Definition brave.py:397
EngineResults response(resp)
Definition brave.py:278