.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
brave.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Brave supports the categories listed in :py:obj:`brave_category` (General,
3news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
4<time_range_support>` is limited (see remarks).
5
6Configured ``brave`` engines:
7
8.. code:: yaml
9
10 - name: brave
11 engine: brave
12 ...
13 brave_category: search
14 time_range_support: true
15 paging: true
16
17 - name: brave.images
18 engine: brave
19 ...
20 brave_category: images
21
22 - name: brave.videos
23 engine: brave
24 ...
25 brave_category: videos
26
27 - name: brave.news
28 engine: brave
29 ...
30 brave_category: news
31
32 - name: brave.goggles
33 time_range_support: true
34 paging: true
35 ...
36 brave_category: goggles
37
38
39.. _brave regions:
40
41Brave regions
42=============
43
44Brave uses two-letter tags for the regions like ``ca`` while SearXNG deals with
45locales. To get a mapping, all *official de-facto* languages of the Brave
46region are mapped to regions in SearXNG (see :py:obj:`babel
47<babel.languages.get_official_languages>`):
48
49.. code:: python
50
51 "regions": {
52 ..
53 "en-CA": "ca",
54 "fr-CA": "ca",
55 ..
56 }
57
58
59.. note::
60
61 The language (aka region) support of Brave's index is limited to very basic
62 languages. The search results for languages like Chinese or Arabic are of
63 low quality.
64
65
66.. _brave googles:
67
68Brave Goggles
69=============
70
71.. _list of Goggles: https://search.brave.com/goggles/discover
72.. _Goggles Whitepaper: https://brave.com/static-assets/files/goggles.pdf
73.. _Goggles Quickstart: https://github.com/brave/goggles-quickstart
74
75Goggles allow you to choose, alter, or extend the ranking of Brave Search
76results (`Goggles Whitepaper`_). Goggles are openly developed by the community
77of Brave Search users.
78
79Select from the `list of Goggles`_ people have published, or create your own
80(`Goggles Quickstart`_).
81
82
83.. _brave languages:
84
85Brave languages
86===============
87
88Brave's language support is limited to the UI (menus, area local notations,
89etc). Brave's index only seems to support a locale, but it does not seem to
90support any languages in its index. The choice of available languages is very
91small (and it's not clear to me where the difference in UI is when switching
92from en-us to en-ca or en-gb).
93
94In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
95UI languages are stored in a custom field named ``ui_lang``:
96
97.. code:: python
98
99 "custom": {
100 "ui_lang": {
101 "ca": "ca",
102 "de-DE": "de-de",
103 "en-CA": "en-ca",
104 "en-GB": "en-gb",
105 "en-US": "en-us",
106 "es": "es",
107 "fr-CA": "fr-ca",
108 "fr-FR": "fr-fr",
109 "ja-JP": "ja-jp",
110 "pt-BR": "pt-br",
111 "sq-AL": "sq-al"
112 }
113 },
114
115Implementations
116===============
117
118"""
119
120from typing import Any, TYPE_CHECKING
121
122from urllib.parse import (
123 urlencode,
124 urlparse,
125)
126
127from dateutil import parser
128from lxml import html
129
130from searx import locales
131from searx.utils import (
132 extr,
133 extract_text,
134 eval_xpath,
135 eval_xpath_list,
136 eval_xpath_getindex,
137 js_variable_to_python,
138 get_embeded_stream_url,
139)
140from searx.enginelib.traits import EngineTraits
141from searx.result_types import EngineResults
142
143if TYPE_CHECKING:
144 import logging
145
146 logger: logging.Logger
147
148traits: EngineTraits
149
# Engine metadata shown in the "about" section of the engine.
about = {
    "website": 'https://search.brave.com/',
    "wikidata_id": 'Q22906900',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
"""Brave supports common web-search, videos, images, news, and goggles search.

- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
- ``goggles``: Common WEB search with custom rules
"""

# NOTE(review): ``Any`` looks like a placeholder; presumably the engine
# settings override it with a real Goggles identifier -- confirm.
Goggles = Any
"""Value sent as the ``goggles_id`` URL argument when
:py:obj:`brave_category` is ``goggles``."""

brave_spellcheck = False
"""Brave supports some kind of spell checking.  When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``.  In
the UI of Brave the user gets warned about this, since we can not warn the user
in SearXNG, the spellchecking is disabled by default.
"""

send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
max_page = 10
"""Tested 9 pages maximum (``&offset=8``), to be safe max is set to 10.  Trying
to do more won't return any result and you will most likely be flagged as a bot.
"""

safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off

time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""

# SearXNG time range name --> value of Brave's ``tf`` URL argument
time_range_map = {
    'day': 'pd',
    'week': 'pw',
    'month': 'pm',
    'year': 'py',
}
201
202
def request(query, params):
    """Prepare the HTTP request sent to Brave.

    Fills ``params['url']`` and adds entries to ``params['headers']`` and
    ``params['cookies']`` in place; nothing is returned.
    """
    headers = params['headers']
    cookies = params['cookies']

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    headers['Accept-Encoding'] = 'gzip, deflate'

    # URL arguments
    args = {'q': query, 'source': 'web'}
    if brave_spellcheck:
        args['spellcheck'] = '1'

    if brave_category in ('search', 'goggles'):
        # offset is zero based and only sent for the second page and above
        offset = params.get('pageno', 1) - 1
        if offset:
            args['offset'] = offset
        time_filter = time_range_map.get(params['time_range'])
        if time_filter:
            args['tf'] = time_filter

    if brave_category == 'goggles':
        args['goggles_id'] = Goggles

    params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"

    # set properties in the cookies
    cookies['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
    # the useLocation is IP based, we use cookie 'country' for the region
    cookies['useLocation'] = '0'
    cookies['summarizer'] = '0'

    engine_region = traits.get_region(params['searxng_locale'], 'all')
    cookies['country'] = engine_region.split('-')[-1].lower()  # type: ignore

    cookies['ui_lang'] = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')

    logger.debug("cookies %s", cookies)

    headers.update(
        {
            'Sec-Fetch-Dest': "document",
            'Sec-Fetch-Mode': "navigate",
            'Sec-Fetch-Site': "same-origin",
            'Sec-Fetch-User': "?1",
        }
    )
245
246
247def _extract_published_date(published_date_raw):
248 if published_date_raw is None:
249 return None
250
251 try:
252 return parser.parse(published_date_raw)
253 except parser.ParserError:
254 return None
255
256
def response(resp) -> EngineResults:
    """Dispatch the response to the parser of the configured
    :py:obj:`brave_category`.

    ``search`` / ``goggles`` and ``news`` are parsed from the HTML, ``images``
    and ``videos`` from a JavaScript object embedded in the page.

    :raises ValueError: if :py:obj:`brave_category` is none of the supported
        categories.
    """

    if brave_category in ('search', 'goggles'):
        return _parse_search(resp)

    # compare by equality: ``in ('news')`` is a substring test on a string,
    # not a membership test on a tuple
    if brave_category == 'news':
        return _parse_news(resp)

    # reject unknown categories before trying to dig into the page
    if brave_category not in ('images', 'videos'):
        raise ValueError(f"Unsupported brave category: {brave_category}")

    # Example script source containing the data:
    #
    # kit.start(app, element, {
    #    node_ids: [0, 19],
    #    data: [{type:"data",data: .... ["q","goggles_id"],route:1,url:1}}]
    #          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    js_object = "[{" + extr(resp.text, "data: [{", "}}],") + "}}]"
    json_data = js_variable_to_python(js_object)

    # json_data is a list and at the second position (0,1) in this list we find the "response" data we need ..
    json_resp = json_data[1]['data']['body']['response']

    if brave_category == 'images':
        return _parse_images(json_resp)
    return _parse_videos(json_resp)
283
284
def _parse_search(resp) -> EngineResults:
    """Parse the HTML of a Brave WEB result page (``search`` / ``goggles``).

    Collects an optional answer plus one result item for each ``<div>`` whose
    class contains ``snippet ``.  Snippets without a title or without an
    absolute URL are skipped.
    """
    result_list = EngineResults()

    dom = html.fromstring(resp.text)

    # I doubt that Brave is still providing the "answer" class / I haven't seen
    # answers in brave for a long time.
    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    # Test against None: the truth value of a lxml element is False when the
    # element has no children, even if it contains text.
    if answer_tag is not None:
        url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
        answer = extract_text(answer_tag)
        if answer:
            result_list.add(result_list.types.Answer(answer=answer, url=url))

    # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
    xpath_results = '//div[contains(@class, "snippet ")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(
            result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None
        )
        if url is None or title_tag is None or not urlparse(url).netloc:  # partial url likely means it's an ad
            continue

        content: str = extract_text(
            eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
        )  # type: ignore
        # the text in front of the first "-" of the description may be a date
        pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
        pub_date = _extract_published_date(pub_date_raw)
        if pub_date and content.startswith(pub_date_raw):
            # cut off the date prefix by its length; str.lstrip() would strip a
            # *set of characters*, not the prefix
            content = content[len(pub_date_raw) :].strip("- \n\t")

        thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': content,
            'publishedDate': pub_date,
            'thumbnail': thumbnail,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag is not None:

            # In my tests a video tag in the WEB search was most often not a
            # video, except the ones from youtube ..

            # the image of the video tag replaces the thumbnail in any case
            item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')

            iframe_src = get_embeded_stream_url(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                pub_date_raw = extract_text(
                    eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
                )
                item['publishedDate'] = _extract_published_date(pub_date_raw)

        result_list.append(item)

    return result_list
352
353
def _parse_news(resp) -> EngineResults:
    """Parse the HTML of a Brave ``news`` result page.

    One item (url, title, content, thumbnail) is added for each news
    ``<div>`` that has a ``result-header`` link.
    """
    news_items = EngineResults()
    tree = html.fromstring(resp.text)
    news_nodes = eval_xpath_list(tree, '//div[contains(@class, "results")]//div[@data-type="news"]')

    for node in news_nodes:
        link = eval_xpath_getindex(node, './/a[contains(@class, "result-header")]/@href', 0, default=None)
        if link is None:
            # no link, nothing to show
            continue

        headline = extract_text(eval_xpath_list(node, './/span[contains(@class, "snippet-title")]'))
        summary = extract_text(eval_xpath_list(node, './/p[contains(@class, "desc")]'))
        image = eval_xpath_getindex(node, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default='')

        news_items.append(
            {
                "url": link,
                "title": headline,
                "content": summary,
                "thumbnail": image,
            }
        )

    return news_items
382
383
def _parse_images(json_resp) -> EngineResults:
    """Build ``images.html`` items from the entries of ``json_resp["results"]``."""
    images = EngineResults()

    for entry in json_resp["results"]:
        images.append(
            {
                'url': entry['url'],
                'title': entry['title'],
                'content': entry['description'],
                'template': 'images.html',
                'resolution': entry['properties']['format'],
                'source': entry['source'],
                'img_src': entry['properties']['url'],
                'thumbnail_src': entry['thumbnail']['src'],
            }
        )

    return images
401
402
def _parse_videos(json_resp) -> EngineResults:
    """Build ``videos.html`` items from the entries of ``json_resp["results"]``.

    ``thumbnail`` is only set when the entry has one and ``iframe_src`` only
    when an embeddable stream URL is found for the video URL.
    """
    videos = EngineResults()

    for entry in json_resp["results"]:
        link = entry['url']
        video = {
            'url': link,
            'title': entry['title'],
            'content': entry['description'],
            'template': 'videos.html',
        }
        # the duration is stored under both keys
        video['length'] = video['duration'] = entry['video']['duration']
        video['publishedDate'] = _extract_published_date(entry['age'])

        thumb = entry['thumbnail']
        if thumb is not None:
            video['thumbnail'] = thumb['src']

        embed_url = get_embeded_stream_url(link)
        if embed_url:
            video['iframe_src'] = embed_url

        videos.append(video)

    return videos
429
430
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
    regions>` from Brave.

    The UI languages are stored in ``engine_traits.custom["ui_lang"]`` and the
    regions in ``engine_traits.regions``.

    :raises RuntimeError: if one of the requests to Brave is not answered with
        an OK response.
    """

    # pylint: disable=import-outside-toplevel, too-many-branches

    import babel.languages
    from searx.locales import region_tag, language_tag
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762

    engine_traits.custom["ui_lang"] = {}

    headers = {
        'Accept-Encoding': 'gzip, deflate',
    }
    lang_map = {'no': 'nb'}  # norway

    # languages (UI)

    resp = get('https://search.brave.com/settings', headers=headers)

    # Stop here instead of parsing an error page: continuing would silently
    # leave the UI languages empty.
    if not resp.ok:  # type: ignore
        raise RuntimeError("Response from Brave (UI languages) is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore

    for option in dom.xpath('//section//option[@value="en-us"]/../option'):

        ui_lang = option.get('value')
        try:
            locale = babel.Locale.parse(ui_lang, sep='-')
        except babel.UnknownLocaleError:
            print(f"ERROR: can't determine babel locale of Brave's (UI) language {ui_lang}")
            continue

        # with a territory it is a region tag, otherwise a plain language tag
        sxng_tag = region_tag(locale) if locale.territory else language_tag(locale)

        conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
        if conflict:
            if conflict != ui_lang:
                print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {ui_lang}")
            continue
        engine_traits.custom["ui_lang"][sxng_tag] = ui_lang

    # search regions of brave

    resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js', headers=headers)

    # Without this check the slicing below fails with a meaningless
    # "substring not found" error.
    if not resp.ok:  # type: ignore
        raise RuntimeError("Response from Brave (regions) is not OK.")

    # cut the JS object with the country options out of the script
    country_js = resp.text[resp.text.index("options:{all") + len('options:') :]  # type: ignore
    country_js = country_js[: country_js.index("},k={default")]
    country_tags = js_variable_to_python(country_js)

    for key, option in country_tags.items():
        if key == 'all':
            engine_traits.all_locale = 'all'
            continue
        country_tag = option['value']

        # add official languages of the country ..
        for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
            lang_tag = lang_map.get(lang_tag, lang_tag)
            sxng_tag = region_tag(babel.Locale.parse(f"{lang_tag}_{country_tag.upper()}"))
            # print("%-20s: %s <-- %s" % (option['label'], country_tag, sxng_tag))

            conflict = engine_traits.regions.get(sxng_tag)
            if conflict:
                if conflict != country_tag:
                    print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {country_tag}")
                continue
            engine_traits.regions[sxng_tag] = country_tag
EngineResults _parse_search(resp)
Definition brave.py:285
request(query, params)
Definition brave.py:203
fetch_traits(EngineTraits engine_traits)
Definition brave.py:431
EngineResults _parse_videos(json_resp)
Definition brave.py:403
EngineResults _parse_news(resp)
Definition brave.py:354
_extract_published_date(published_date_raw)
Definition brave.py:247
EngineResults _parse_images(json_resp)
Definition brave.py:384
EngineResults response(resp)
Definition brave.py:257