.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
brave.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Brave supports the categories listed in :py:obj:`brave_category` (General,
3news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
4<time_range_support>` is limited (see remarks).
5
6Configured ``brave`` engines:
7
8.. code:: yaml
9
10 - name: brave
11 engine: brave
12 ...
13 brave_category: search
14 time_range_support: true
15 paging: true
16
17 - name: brave.images
18 engine: brave
19 ...
20 brave_category: images
21
22 - name: brave.videos
23 engine: brave
24 ...
25 brave_category: videos
26
27 - name: brave.news
28 engine: brave
29 ...
30 brave_category: news
31
32 - name: brave.goggles
33 time_range_support: true
34 paging: true
35 ...
36 brave_category: goggles
37
38
39.. _brave regions:
40
41Brave regions
42=============
43
Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with
locales.  To get a mapping, all *official de-facto* languages of the Brave
region are mapped to regions in SearXNG (see :py:obj:`babel
<babel.languages.get_official_languages>`):
48
49.. code:: python
50
51 "regions": {
52 ..
53 "en-CA": "ca",
54 "fr-CA": "ca",
55 ..
56 }
57
58
59.. note::
60
61 The language (aka region) support of Brave's index is limited to very basic
62 languages. The search results for languages like Chinese or Arabic are of
63 low quality.
64
65
66.. _brave googles:
67
68Brave Goggles
69=============
70
71.. _list of Goggles: https://search.brave.com/goggles/discover
72.. _Goggles Whitepaper: https://brave.com/static-assets/files/goggles.pdf
73.. _Goggles Quickstart: https://github.com/brave/goggles-quickstart
74
75Goggles allow you to choose, alter, or extend the ranking of Brave Search
76results (`Goggles Whitepaper`_). Goggles are openly developed by the community
77of Brave Search users.
78
79Select from the `list of Goggles`_ people have published, or create your own
80(`Goggles Quickstart`_).
81
82
83.. _brave languages:
84
85Brave languages
86===============
87
Brave's language support is limited to the UI (menus, area local notations,
etc).  Brave's index only seems to support a locale, but it does not seem to
support any languages in its index.  The choice of available languages is very
small (and it's not clear to me where the difference in UI is when switching
from en-us to en-ca or en-gb).
93
94In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
95UI languages are stored in a custom field named ``ui_lang``:
96
97.. code:: python
98
99 "custom": {
100 "ui_lang": {
101 "ca": "ca",
102 "de-DE": "de-de",
103 "en-CA": "en-ca",
104 "en-GB": "en-gb",
105 "en-US": "en-us",
106 "es": "es",
107 "fr-CA": "fr-ca",
108 "fr-FR": "fr-fr",
109 "ja-JP": "ja-jp",
110 "pt-BR": "pt-br",
111 "sq-AL": "sq-al"
112 }
113 },
114
115Implementations
116===============
117
118"""
119
120from typing import Any, TYPE_CHECKING
121
122from urllib.parse import (
123 urlencode,
124 urlparse,
125)
126
127from dateutil import parser
128from lxml import html
129
130from searx import locales
131from searx.utils import (
132 extract_text,
133 eval_xpath,
134 eval_xpath_list,
135 eval_xpath_getindex,
136 js_variable_to_python,
137 get_embeded_stream_url,
138)
139from searx.enginelib.traits import EngineTraits
140from searx.result_types import EngineResults
141
142if TYPE_CHECKING:
143 import logging
144
145 logger: logging.Logger
146
147traits: EngineTraits
148
about = {
    "website": 'https://search.brave.com/',
    "wikidata_id": 'Q22906900',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
"""Brave supports common web-search, videos, images, news, and goggles search.

- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
- ``goggles``: Common WEB search with custom rules
"""

Goggles = Any
"""Value sent as the ``goggles_id`` URL argument when :py:obj:`brave_category`
is ``goggles``.  The default (``typing.Any``) is only a placeholder and is
presumably replaced by the engine's configuration."""

brave_spellcheck = False
"""Brave supports some kind of spell checking.  When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``.  In
the UI of Brave the user gets warned about this, since we can not warn the user
in SearXNG, the spellchecking is disabled by default.
"""

send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
max_page = 10
"""Tested 9 pages maximum (``&offset=8``), to be safe max is set to 10.  Trying
to do more won't return any result and you will most likely be flagged as a bot.
"""

safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off

time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""

time_range_map = {
    'day': 'pd',
    'week': 'pw',
    'month': 'pm',
    'year': 'py',
}
200
201
def request(query, params):
    """Fill ``params`` (URL, headers, cookies) for a request to Brave.

    The URL is built from :py:obj:`base_url`, :py:obj:`brave_category` and the
    URL-encoded query arguments.  Paging offset and time filter are only sent
    for the ``search`` and ``goggles`` categories.
    """
    headers = params['headers']

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    headers['Accept-Encoding'] = 'gzip, deflate'

    args = {'q': query, 'source': 'web'}
    if brave_spellcheck:
        args['spellcheck'] = '1'

    if brave_category in ('search', 'goggles'):
        # Brave's offset is zero based, the first page is sent without offset
        offset = params.get('pageno', 1) - 1
        if offset:
            args['offset'] = offset

        time_filter = time_range_map.get(params['time_range'])
        if time_filter:
            args['tf'] = time_filter

        if brave_category == 'goggles':
            args['goggles_id'] = Goggles

    params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"

    # set properties in the cookies
    cookies = params['cookies']
    sxng_locale = params['searxng_locale']

    cookies['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
    # the useLocation is IP based, we use cookie 'country' for the region
    cookies['useLocation'] = '0'
    cookies['summarizer'] = '0'

    # only the last part of the region tag, lower-cased, goes into the cookie
    engine_region = traits.get_region(sxng_locale, 'all')
    cookies['country'] = engine_region.split('-')[-1].lower()  # type: ignore

    cookies['ui_lang'] = locales.get_engine_locale(sxng_locale, traits.custom["ui_lang"], 'en-us')

    logger.debug("cookies %s", params['cookies'])

    headers.update(
        {
            'Sec-Fetch-Dest': "document",
            'Sec-Fetch-Mode': "navigate",
            'Sec-Fetch-Site': "same-origin",
            'Sec-Fetch-User': "?1",
        }
    )
244
245
246def _extract_published_date(published_date_raw):
247 if published_date_raw is None:
248 return None
249
250 try:
251 return parser.parse(published_date_raw)
252 except parser.ParserError:
253 return None
254
255
def parse_data_string(resp):
    """Extract the ``data: [...]`` array passed to ``kit.start(app, ...)``.

    :param resp: HTTP response; only ``resp.text`` is read.
    :return: the array converted by ``js_variable_to_python``.
    :raises ValueError: (from ``str.index``) when the ``kit.start(app,`` call
        or the ``data: [{"type":"data"`` marker is not found in the page.
    """
    # kit.start(app, element, {
    #    node_ids: [0, 19],
    #    data: [{"type":"data","data" .... ["q","goggles_id"],"route":1,"url":1}}]
    #          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    text = resp.text
    kit_start = text.index("kit.start(app,")
    start = text.index('data: [{"type":"data"', kit_start) + len('data: ')

    # Scan for the bracket that closes the array opened at ``start``.  Brackets
    # inside string literals (e.g. a result title like "foo :]") are skipped,
    # otherwise they would unbalance the nesting level.
    depth = 0
    quote = None  # quote character of the string literal we are inside of
    escaped = False  # previous character was a backslash inside a string
    end = len(text)  # fallback when no closing bracket is found

    for pos in range(start, len(text)):
        char = text[pos]

        if quote is not None:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == quote:
                quote = None
            continue

        if char in ('"', "'"):
            quote = char
        elif char == "[":
            depth += 1
        elif char == "]":
            depth -= 1
            if depth == 0:
                end = pos + 1
                break

    return js_variable_to_python(text[start:end])
281
282
def response(resp) -> EngineResults:
    """Dispatch the response to the parser of the configured
    :py:obj:`brave_category`.

    ``search``, ``goggles`` and ``news`` are parsed from the HTML, ``images``
    and ``videos`` from the data embedded in the page's script.

    :raises ValueError: for any other category (after the embedded data has
        been extracted).
    """
    if brave_category in ('search', 'goggles'):
        return _parse_search(resp)

    # Compare with ``==``: ``('news')`` is just the string 'news', so an
    # ``in ('news')`` test is a substring match that also accepts '' or 'new'.
    if brave_category == 'news':
        return _parse_news(resp)

    json_data = parse_data_string(resp)
    # json_data is a list and at the second position (0,1) in this list we find the "response" data we need ..
    json_resp = json_data[1]['data']['body']['response']

    if brave_category == 'images':
        return _parse_images(json_resp)
    if brave_category == 'videos':
        return _parse_videos(json_resp)

    raise ValueError(f"Unsupported brave category: {brave_category}")
301
302
def _parse_search(resp) -> EngineResults:
    """Parse the HTML of a Brave WEB result page (``search`` / ``goggles``).

    :param resp: HTTP response; only ``resp.text`` (the HTML) is read.
    :return: an optional answer plus one item per usable result snippet.
    """
    result_list = EngineResults()

    dom = html.fromstring(resp.text)

    # I doubt that Brave is still providing the "answer" class / I haven't seen
    # answers in brave for a long time.
    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    # Test against None: the truth value of an lxml element is its number of
    # child elements, an answer <div> holding only text would be skipped.
    if answer_tag is not None:
        url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
        answer = extract_text(answer_tag)
        if answer is not None:
            result_list.add(result_list.types.Answer(answer=answer, url=url))

    # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
    xpath_results = '//div[contains(@class, "snippet ")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(
            result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None
        )
        if url is None or title_tag is None or not urlparse(url).netloc:  # partial url likely means it's an ad
            continue

        content: str = extract_text(
            eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
        )  # type: ignore

        # The description may start with "<date> - <text>": try to read what
        # comes before the first "-" as the publication date.
        pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
        pub_date = _extract_published_date(pub_date_raw)
        if pub_date and content.startswith(pub_date_raw):
            # Cut the prefix by length; str.lstrip() would treat pub_date_raw
            # as a set of characters and not as a prefix.
            content = content[len(pub_date_raw) :].strip("- \n\t")

        thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': content,
            'publishedDate': pub_date,
            'thumbnail': thumbnail,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag is not None:

            # In my tests a video tag in the WEB search was most often not a
            # video, except the ones from youtube ..

            # the image of the video tag replaces the thumbnail in any case
            item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')

            iframe_src = get_embeded_stream_url(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                pub_date_raw = extract_text(
                    eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
                )
                item['publishedDate'] = _extract_published_date(pub_date_raw)

        result_list.append(item)

    return result_list
370
371
def _parse_news(resp) -> EngineResults:
    """Parse the HTML of a Brave ``news`` result page.

    :param resp: HTTP response; only ``resp.text`` (the HTML) is read.
    :return: one item (url, title, content, thumbnail) per news entry that
        has a link.
    """
    news_results = EngineResults()
    news_xpath = '//div[contains(@class, "results")]//div[@data-type="news"]'

    for entry in eval_xpath_list(html.fromstring(resp.text), news_xpath):

        link = eval_xpath_getindex(entry, './/a[contains(@class, "result-header")]/@href', 0, default=None)
        if link is None:
            # an entry without a link can't be presented as a result
            continue

        headline = extract_text(eval_xpath_list(entry, './/span[contains(@class, "snippet-title")]'))
        summary = extract_text(eval_xpath_list(entry, './/p[contains(@class, "desc")]'))
        image = eval_xpath_getindex(entry, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')

        news_results.append(
            {
                "url": link,
                "title": headline,
                "content": summary,
                "thumbnail": image,
            }
        )

    return news_results
400
401
def _parse_images(json_resp) -> EngineResults:
    """Build image results from the ``response`` data of an images page.

    :param json_resp: mapping with a ``results`` list; each entry is read for
        ``url``, ``title``, ``description``, ``source``, ``properties``
        (``format``, ``url``) and ``thumbnail`` (``src``).
    :return: one ``images.html`` item per entry.
    """
    images = EngineResults()

    for entry in json_resp["results"]:
        images.append(
            {
                'url': entry['url'],
                'title': entry['title'],
                'content': entry['description'],
                'template': 'images.html',
                'resolution': entry['properties']['format'],
                'source': entry['source'],
                'img_src': entry['properties']['url'],
                'thumbnail_src': entry['thumbnail']['src'],
            }
        )

    return images
419
420
def _parse_videos(json_resp) -> EngineResults:
    """Build video results from the ``response`` data of a videos page.

    :param json_resp: mapping with a ``results`` list; each entry is read for
        ``url``, ``title``, ``description``, ``video`` (``duration``), ``age``
        and ``thumbnail``.
    :return: one ``videos.html`` item per entry.
    """
    videos = EngineResults()

    for entry in json_resp["results"]:

        video_url = entry['url']
        duration = entry['video']['duration']

        video = {
            'url': video_url,
            'title': entry['title'],
            'content': entry['description'],
            'template': 'videos.html',
            'length': duration,
            'duration': duration,
            'publishedDate': _extract_published_date(entry['age']),
        }

        # the thumbnail is optional
        thumb = entry['thumbnail']
        if thumb is not None:
            video['thumbnail'] = thumb['src']

        # only set when an embeddable stream URL is known for this link
        embed_url = get_embeded_stream_url(video_url)
        if embed_url:
            video['iframe_src'] = embed_url

        videos.append(video)

    return videos
447
448
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
    regions>` from Brave.

    The UI languages are stored in ``engine_traits.custom["ui_lang"]``, the
    regions in ``engine_traits.regions``.

    :raises RuntimeError: when one of the two requests to Brave does not
        return an OK response.
    """

    # pylint: disable=import-outside-toplevel, too-many-branches

    import babel.languages
    from searx.locales import region_tag, language_tag
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762

    engine_traits.custom["ui_lang"] = {}

    headers = {
        'Accept-Encoding': 'gzip, deflate',
    }
    lang_map = {'no': 'nb'}  # norway

    # languages (UI)

    resp = get('https://search.brave.com/settings', headers=headers)

    if not resp.ok:  # type: ignore
        # don't build traits from an error page
        raise RuntimeError("Response from Brave (UI languages) is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore

    # all <option> siblings of the "en-us" option, i.e. the language selector
    for option in dom.xpath('//section//option[@value="en-us"]/../option'):

        ui_lang = option.get('value')
        try:
            ui_locale = babel.Locale.parse(ui_lang, sep='-')
            if ui_locale.territory:
                sxng_tag = region_tag(ui_locale)
            else:
                sxng_tag = language_tag(ui_locale)

        except babel.UnknownLocaleError:
            print(f"ERROR: can't determine babel locale of Brave's (UI) language {ui_lang}")
            continue

        conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
        if conflict:
            if conflict != ui_lang:
                print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {ui_lang}")
            # the tag is already mapped, keep the first mapping
            continue
        engine_traits.custom["ui_lang"][sxng_tag] = ui_lang

    # search regions of brave

    resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js', headers=headers)

    if not resp.ok:  # type: ignore
        raise RuntimeError("Response from Brave (regions) is not OK.")

    # cut the object literal between "options:" and "},k={default" out of the script
    country_js = resp.text[resp.text.index("options:{all") + len('options:') :]  # type: ignore
    country_js = country_js[: country_js.index("},k={default")]
    country_tags = js_variable_to_python(country_js)

    for k, v in country_tags.items():
        if k == 'all':
            engine_traits.all_locale = 'all'
            continue
        country_tag = v['value']

        # add official languages of the country ..
        for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
            lang_tag = lang_map.get(lang_tag, lang_tag)
            sxng_tag = region_tag(babel.Locale.parse(f"{lang_tag}_{country_tag.upper()}"))
            # print("%-20s: %s <-- %s" % (v['label'], country_tag, sxng_tag))

            conflict = engine_traits.regions.get(sxng_tag)
            if conflict:
                if conflict != country_tag:
                    print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {country_tag}")
                # the tag is already mapped, keep the first mapping
                continue
            engine_traits.regions[sxng_tag] = country_tag
EngineResults _parse_search(resp)
Definition brave.py:303
request(query, params)
Definition brave.py:202
fetch_traits(EngineTraits engine_traits)
Definition brave.py:449
EngineResults _parse_videos(json_resp)
Definition brave.py:421
EngineResults _parse_news(resp)
Definition brave.py:372
parse_data_string(resp)
Definition brave.py:256
_extract_published_date(published_date_raw)
Definition brave.py:246
EngineResults _parse_images(json_resp)
Definition brave.py:402
EngineResults response(resp)
Definition brave.py:283