.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
brave.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Brave supports the categories listed in :py:obj:`brave_category` (General,
3news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
4<time_range_support>` is limited (see remarks).
5
6Configured ``brave`` engines:
7
8.. code:: yaml
9
10 - name: brave
11 engine: brave
12 ...
13 brave_category: search
14 time_range_support: true
15 paging: true
16
17 - name: brave.images
18 engine: brave
19 ...
20 brave_category: images
21
22 - name: brave.videos
23 engine: brave
24 ...
25 brave_category: videos
26
27 - name: brave.news
28 engine: brave
29 ...
30 brave_category: news
31
32 - name: brave.goggles
33 time_range_support: true
34 paging: true
35 ...
36 brave_category: goggles
37
38
39.. _brave regions:
40
41Brave regions
42=============
43
44Brave uses two-digit tags for the regions like ``ca`` while SearXNG deals with
45locales. To get a mapping, all *official de-facto* languages of the Brave
46region are mapped to regions in SearXNG (see :py:obj:`babel
47<babel.languages.get_official_languages>`):
48
49.. code:: python
50
51 "regions": {
52 ..
53 "en-CA": "ca",
54 "fr-CA": "ca",
55 ..
56 }
57
58
59.. note::
60
61 The language (aka region) support of Brave's index is limited to very basic
62 languages. The search results for languages like Chinese or Arabic are of
63 low quality.
64
65
66.. _brave googles:
67
68Brave Goggles
69=============
70
71.. _list of Goggles: https://search.brave.com/goggles/discover
72.. _Goggles Whitepaper: https://brave.com/static-assets/files/goggles.pdf
73.. _Goggles Quickstart: https://github.com/brave/goggles-quickstart
74
75Goggles allow you to choose, alter, or extend the ranking of Brave Search
76results (`Goggles Whitepaper`_). Goggles are openly developed by the community
77of Brave Search users.
78
79Select from the `list of Goggles`_ people have published, or create your own
80(`Goggles Quickstart`_).
81
82
83.. _brave languages:
84
85Brave languages
86===============
87
88Brave's language support is limited to the UI (menus, area local notations,
89etc). Brave's index only seems to support a locale, but it does not seem to
90support any languages in its index. The choice of available languages is very
91small (and it's not clear to me where the difference in UI is when switching
92from en-us to en-ca or en-gb).
93
94In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
95UI languages are stored in a custom field named ``ui_lang``:
96
97.. code:: python
98
99 "custom": {
100 "ui_lang": {
101 "ca": "ca",
102 "de-DE": "de-de",
103 "en-CA": "en-ca",
104 "en-GB": "en-gb",
105 "en-US": "en-us",
106 "es": "es",
107 "fr-CA": "fr-ca",
108 "fr-FR": "fr-fr",
109 "ja-JP": "ja-jp",
110 "pt-BR": "pt-br",
111 "sq-AL": "sq-al"
112 }
113 },
114
115Implementations
116===============
117
118"""
119
120import typing as t
121
122from urllib.parse import (
123 urlencode,
124 urlparse,
125)
126
127from dateutil import parser
128from lxml import html
129
130from searx import locales
131from searx.utils import (
132 extr,
133 extract_text,
134 eval_xpath,
135 eval_xpath_list,
136 eval_xpath_getindex,
137 js_variable_to_python,
138 get_embeded_stream_url,
139)
140from searx.enginelib.traits import EngineTraits
141from searx.result_types import EngineResults
142from searx.extended_types import SXNG_Response
143
about = {
    "website": "https://search.brave.com/",
    "wikidata_id": "Q22906900",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

base_url: str = "https://search.brave.com/"
categories: list = []
brave_category: t.Literal["search", "videos", "images", "news", "goggles"] = "search"
"""Kind of Brave search performed by the engine instance.

- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
- ``goggles``: Common WEB search with custom rules, requires a :py:obj:`Goggles` URL.
"""

Goggles: str = ""
"""URL of the Goggle to apply; this should be a URL ending in ``.goggle``."""

brave_spellcheck: bool = False
"""Brave offers some kind of spell checking: when activated, Brave tries to fix
typos, e.g. it searches for ``food`` when the user queries for ``fooh``.
Brave's own UI warns the user about the substitution; SearXNG can not show such
a warning, therefore spell checking is disabled by default.
"""

send_accept_language_header: bool = True
paging: bool = False
"""Paging is only supported by Brave in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
max_page: int = 10
"""Tested 9 pages maximum (``&offset=8``), to be safe the max is set to 10.
Trying to do more won't return any result and you will most likely be flagged
as a bot.
"""

safesearch: bool = True
safesearch_map = {
    2: "strict",
    1: "moderate",
    0: "off",
}  # cookie: safesearch=off

time_range_support: bool = False
"""Time-range is only supported by Brave in :py:obj:`brave_category` ``search``
(UI category All) and in the goggles category."""

time_range_map: dict[str, str] = {
    "day": "pd",
    "week": "pw",
    "month": "pm",
    "year": "py",
}
197
198
def request(query: str, params: dict[str, t.Any]) -> None:
    """Fill ``params`` with the URL, cookies and headers of the Brave request.

    The query arguments depend on :py:obj:`brave_category`: paging offset and
    time filter are only sent for ``search`` and ``goggles``, and the Goggle
    URL is only sent for ``goggles``.  Safe-search level, region and UI
    language are passed to Brave through cookies.

    ``traits`` and ``logger`` are used as module globals that are not defined
    in this part of the file (presumably provided by the engine loader).
    """
    headers = params['headers']

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    headers['Accept-Encoding'] = 'gzip, deflate'

    args: dict[str, t.Any] = {'q': query, 'source': 'web'}
    if brave_spellcheck:
        args['spellcheck'] = '1'

    if brave_category in ('search', 'goggles'):
        # Brave's offset is zero based, the first page needs no offset
        offset = params.get('pageno', 1) - 1
        if offset:
            args['offset'] = offset
        time_filter = time_range_map.get(params['time_range'])
        if time_filter:
            args['tf'] = time_filter

    if brave_category == 'goggles':
        args['goggles_id'] = Goggles

    params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"

    # set properties in the cookies
    cookies = params['cookies']
    cookies['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
    # the useLocation is IP based, we use cookie 'country' for the region
    cookies['useLocation'] = '0'
    cookies['summarizer'] = '0'

    engine_region = traits.get_region(params['searxng_locale'], 'all')
    cookies['country'] = engine_region.split('-')[-1].lower()  # type: ignore

    cookies['ui_lang'] = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')

    logger.debug("cookies %s", params['cookies'])

    for name, value in (
        ('Sec-Fetch-Dest', "document"),
        ('Sec-Fetch-Mode', "navigate"),
        ('Sec-Fetch-Site', "same-origin"),
        ('Sec-Fetch-User', "?1"),
    ):
        headers[name] = value
241
242
243def _extract_published_date(published_date_raw):
244 if published_date_raw is None:
245 return None
246
247 try:
248 return parser.parse(published_date_raw)
249 except parser.ParserError:
250 return None
251
252
def response(resp: SXNG_Response) -> EngineResults:
    """Dispatch Brave's response to the parser matching :py:obj:`brave_category`.

    ``search`` / ``goggles`` and ``news`` are parsed from the HTML of the page,
    ``images`` and ``videos`` are read from a JS object embedded in the page.

    :raises ValueError: if :py:obj:`brave_category` is not handled by any of
        the parsers.
    """

    if brave_category in ('search', 'goggles'):
        return _parse_search(resp)

    # Equality, not ``in``: a membership test against the plain string 'news'
    # would be a substring match.
    if brave_category == 'news':
        return _parse_news(resp)

    # Example script source containing the data:
    #
    # kit.start(app, element, {
    #    node_ids: [0, 19],
    #    data: [{type:"data",data: .... ["q","goggles_id"],route:1,url:1}}]
    #          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    js_object = "[{" + extr(resp.text, "data: [{", "}}],") + "}}]"
    json_data = js_variable_to_python(js_object)

    # json_data is a list and at the second position (0,1) in this list we find the "response" data we need ..
    json_resp = json_data[1]['data']['body']['response']

    if brave_category == 'images':
        return _parse_images(json_resp)
    if brave_category == 'videos':
        return _parse_videos(json_resp)

    raise ValueError(f"Unsupported brave category: {brave_category}")
279
280
def _parse_search(resp) -> EngineResults:
    """Parse the HTML of Brave's WEB result page (``search`` and ``goggles``).

    Adds an answer when the page contains an ``answer`` box and one result per
    ``snippet`` container.  Containers without a title or without an absolute
    URL are skipped.

    :param resp: response object, only ``resp.text`` (the HTML) is read.
    :return: the collected results.
    """
    result_list = EngineResults()

    dom = html.fromstring(resp.text)

    # I doubt that Brave is still providing the "answer" class / I haven't seen
    # answers in brave for a long time.
    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    # Compare with None: the truth value of an lxml element reflects whether it
    # has child elements, a text-only answer box would otherwise be ignored.
    if answer_tag is not None:
        url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
        answer = extract_text(answer_tag)
        if answer:
            result_list.add(result_list.types.Answer(answer=answer, url=url))

    xpath_results = '//div[contains(@class, "snippet ")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(
            result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None
        )
        if url is None or title_tag is None or not urlparse(url).netloc:  # partial url likely means it's an ad
            continue

        content: str = extract_text(
            eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
        )  # type: ignore

        # The description may start with "<date> - <text>"; when the part in
        # front of the first "-" is a date, it is cut off from the content.
        pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
        pub_date = _extract_published_date(pub_date_raw)
        if pub_date and content.startswith(pub_date_raw):
            # removeprefix() drops exactly the date text; lstrip() would treat
            # its argument as a set of characters.
            content = content.removeprefix(pub_date_raw).strip("- \n\t")

        thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': content,
            'publishedDate': pub_date,
            'thumbnail': thumbnail,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag is not None:
            # a video snippet always provides the thumbnail
            item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')

            # In my tests a video tag in the WEB search was most often not a
            # video, except the ones from youtube ..
            iframe_src = get_embeded_stream_url(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                pub_date_raw = extract_text(
                    eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
                )
                item['publishedDate'] = _extract_published_date(pub_date_raw)

        result_list.append(item)

    return result_list
348
349
def _parse_news(resp) -> EngineResults:
    """Parse the HTML of Brave's news result page.

    One result is built per ``data-type="news"`` container; containers without
    a ``result-header`` link are skipped.

    :param resp: response object, only ``resp.text`` (the HTML) is read.
    :return: the collected results.
    """
    result_list = EngineResults()
    dom = html.fromstring(resp.text)
    news_xpath = '//div[contains(@class, "results")]//div[@data-type="news"]'

    for news_tag in eval_xpath_list(dom, news_xpath):

        url = eval_xpath_getindex(news_tag, './/a[contains(@class, "result-header")]/@href', 0, default=None)
        if url is None:
            continue

        result_list.append(
            {
                "url": url,
                "title": extract_text(eval_xpath_list(news_tag, './/span[contains(@class, "snippet-title")]')),
                "content": extract_text(eval_xpath_list(news_tag, './/p[contains(@class, "desc")]')),
                "thumbnail": eval_xpath_getindex(
                    news_tag, './/div[contains(@class, "image-wrapper")]//img/@src', 0, default=''
                ),
            }
        )

    return result_list
378
379
def _parse_images(json_resp) -> EngineResults:
    """Build image results from the ``results`` list of Brave's JSON response.

    :param json_resp: the "response" object extracted from the page's script.
    :return: one ``images.html`` result per entry in ``json_resp["results"]``.
    """
    result_list = EngineResults()

    for image in json_resp["results"]:
        result_list.append(
            {
                'url': image['url'],
                'title': image['title'],
                'content': image['description'],
                'template': 'images.html',
                'resolution': image['properties']['format'],
                'source': image['source'],
                'img_src': image['properties']['url'],
                'thumbnail_src': image['thumbnail']['src'],
            }
        )

    return result_list
397
398
def _parse_videos(json_resp) -> EngineResults:
    """Build video results from the ``results`` list of Brave's JSON response.

    The thumbnail is only set when the entry has one, and ``iframe_src`` is
    only set when an embeddable stream URL is found for the result URL.

    :param json_resp: the "response" object extracted from the page's script.
    :return: one ``videos.html`` result per entry in ``json_resp["results"]``.
    """
    result_list = EngineResults()

    for video in json_resp["results"]:

        url = video['url']
        item = {
            'url': url,
            'title': video['title'],
            'content': video['description'],
            'template': 'videos.html',
            'length': video['video']['duration'],
            'duration': video['video']['duration'],
            'publishedDate': _extract_published_date(video['age']),
        }

        if (thumbnail := video['thumbnail']) is not None:
            item['thumbnail'] = thumbnail['src']

        if iframe_src := get_embeded_stream_url(url):
            item['iframe_src'] = iframe_src

        result_list.append(item)

    return result_list
425
426
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
    regions>` from Brave.

    The UI languages are read from the options of Brave's settings page and
    stored in ``engine_traits.custom["ui_lang"]``.  The regions are read from a
    JS chunk served by Brave's CDN; each country is mapped to SearXNG region
    tags through its official (de-facto) languages and stored in
    ``engine_traits.regions``.  Conflicting mappings are reported on stdout and
    the first mapping wins.
    """

    # pylint: disable=import-outside-toplevel, too-many-branches

    import babel.languages
    from searx.locales import region_tag, language_tag
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762

    ui_langs: dict = {}
    engine_traits.custom["ui_lang"] = ui_langs

    headers = {'Accept-Encoding': 'gzip, deflate'}
    lang_map = {'no': 'nb'}  # norway

    # languages (UI)

    resp = get('https://search.brave.com/settings', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Brave is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore

    # all options of the <select> that offers "en-us"
    for option in dom.xpath('//section//option[@value="en-us"]/../option'):

        ui_lang = option.get('value')
        try:
            ui_locale = babel.Locale.parse(ui_lang, sep='-')
            # locales with a territory become region tags, others language tags
            to_sxng_tag = region_tag if ui_locale.territory else language_tag
            sxng_tag = to_sxng_tag(ui_locale)

        except babel.UnknownLocaleError:
            print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
            continue

        known = ui_langs.get(sxng_tag)
        if known:
            if known != ui_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, known, ui_lang))
            continue
        ui_langs[sxng_tag] = ui_lang

    # search regions of brave

    resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Brave is not OK.")

    # cut the JS object that follows "options:" out of the script
    start = resp.text.index("options:{all") + len('options:')  # type: ignore
    country_js = resp.text[start:]  # type: ignore
    country_js = country_js[: country_js.index("},k={default")]
    country_tags = js_variable_to_python(country_js)

    for key, country in country_tags.items():
        if key == 'all':
            engine_traits.all_locale = 'all'
            continue
        country_tag = country['value']

        # add official languages of the country ..
        for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
            lang_tag = lang_map.get(lang_tag, lang_tag)
            sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))

            known = engine_traits.regions.get(sxng_tag)
            if known:
                if known != country_tag:
                    print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, known, country_tag))
                continue
            engine_traits.regions[sxng_tag] = country_tag
EngineResults _parse_search(resp)
Definition brave.py:281
EngineResults response(SXNG_Response resp)
Definition brave.py:253
fetch_traits(EngineTraits engine_traits)
Definition brave.py:427
EngineResults _parse_videos(json_resp)
Definition brave.py:399
EngineResults _parse_news(resp)
Definition brave.py:350
_extract_published_date(published_date_raw)
Definition brave.py:243
EngineResults _parse_images(json_resp)
Definition brave.py:380
None request(str query, dict[str, t.Any] params)
Definition brave.py:199