.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
brave.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Brave supports the categories listed in :py:obj:`brave_category` (General,
3news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
4<time_range_support>` is limited (see remarks).
5
6Configured ``brave`` engines:
7
8.. code:: yaml
9
10 - name: brave
11 engine: brave
12 ...
13 brave_category: search
14 time_range_support: true
15 paging: true
16
17 - name: brave.images
18 engine: brave
19 ...
20 brave_category: images
21
22 - name: brave.videos
23 engine: brave
24 ...
25 brave_category: videos
26
27 - name: brave.news
28 engine: brave
29 ...
30 brave_category: news
31
32 - name: brave.goggles
33 engine: brave
34 time_range_support: true
35 paging: true
36 ...
37 brave_category: goggles
38
39
40.. _brave regions:
41
42Brave regions
43=============
44
45Brave uses two-letter tags for the regions like ``ca`` while SearXNG deals with
46locales. To get a mapping, all *official de-facto* languages of the Brave
47region are mapped to regions in SearXNG (see :py:obj:`babel
48<babel.languages.get_official_languages>`):
49
50.. code:: python
51
52 "regions": {
53 ..
54 "en-CA": "ca",
55 "fr-CA": "ca",
56 ..
57 }
58
59
60.. note::
61
62 The language (aka region) support of Brave's index is limited to very basic
63 languages. The search results for languages like Chinese or Arabic are of
64 low quality.
65
66
67.. _brave googles:
68
69Brave Goggles
70=============
71
72.. _list of Goggles: https://search.brave.com/goggles/discover
73.. _Goggles Whitepaper: https://brave.com/static-assets/files/goggles.pdf
74.. _Goggles Quickstart: https://github.com/brave/goggles-quickstart
75
76Goggles allow you to choose, alter, or extend the ranking of Brave Search
77results (`Goggles Whitepaper`_). Goggles are openly developed by the community
78of Brave Search users.
79
80Select from the `list of Goggles`_ people have published, or create your own
81(`Goggles Quickstart`_).
82
83
84.. _brave languages:
85
86Brave languages
87===============
88
89Brave's language support is limited to the UI (menus, area local notations,
90etc). Brave's index only seems to support a locale, but it does not seem to
91support any languages in its index. The choice of available languages is very
92small (and it's not clear to me where the difference in UI is when switching
93from en-us to en-ca or en-gb).
94
95In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
96UI languages are stored in a custom field named ``ui_lang``:
97
98.. code:: python
99
100 "custom": {
101 "ui_lang": {
102 "ca": "ca",
103 "de-DE": "de-de",
104 "en-CA": "en-ca",
105 "en-GB": "en-gb",
106 "en-US": "en-us",
107 "es": "es",
108 "fr-CA": "fr-ca",
109 "fr-FR": "fr-fr",
110 "ja-JP": "ja-jp",
111 "pt-BR": "pt-br",
112 "sq-AL": "sq-al"
113 }
114 },
115
116Implementations
117===============
118
119"""
120
121from typing import Any, TYPE_CHECKING
122
123from urllib.parse import (
124 urlencode,
125 urlparse,
126)
127
128from dateutil import parser
129from lxml import html
130
131from searx import locales
132from searx.utils import (
133 extract_text,
134 extr,
135 eval_xpath,
136 eval_xpath_list,
137 eval_xpath_getindex,
138 js_variable_to_python,
139 get_embeded_stream_url,
140)
141from searx.enginelib.traits import EngineTraits
142
143if TYPE_CHECKING:
144 import logging
145
146 logger: logging.Logger
147
148traits: EngineTraits
149
# Descriptive metadata about the Brave engine: website, Wikidata ID, whether an
# official API / API key is used, and the format of the parsed results.
about = {
    "website": 'https://search.brave.com/',
    "wikidata_id": 'Q22906900',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}
158
base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
"""Brave supports common web-search, videos, images, news, and goggles search.

- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
- ``goggles``: Common WEB search with custom rules
"""

Goggles = Any
"""Value sent as the ``goggles_id`` URL argument when :py:obj:`brave_category`
is ``goggles``.  The default ``Any`` is only a placeholder (presumably
overwritten by the engine's configuration -- verify against the settings).
"""

brave_spellcheck = False
"""Brave supports some kind of spell checking.  When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``.  In
the UI of Brave the user gets warned about this, since we can not warn the user
in SearXNG, the spellchecking is disabled by default.
"""

send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
max_page = 10
"""Tested 9 pages maximum (``&offset=8``), to be safe max is set to 10.  Trying
to do more won't return any result and you will most likely be flagged as a bot.
"""

safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off

time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""

# Maps SearXNG time-range names to the values sent in Brave's ``tf`` URL argument.
time_range_map = {
    'day': 'pd',
    'week': 'pw',
    'month': 'pm',
    'year': 'py',
}
201
202
def request(query, params):
    """Assemble the request to Brave for ``query``.

    The request URL, the ``Accept-Encoding`` header and the cookies
    (``safesearch``, ``useLocation``, ``summarizer``, ``country``, ``ui_lang``)
    are written into ``params``; nothing is returned.

    Paging (``offset``) and time range (``tf``) arguments are only added when
    :py:obj:`brave_category` is ``search`` or ``goggles``.
    """

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    params['headers']['Accept-Encoding'] = 'gzip, deflate'

    args = {
        'q': query,
    }
    if brave_spellcheck:
        args['spellcheck'] = '1'

    if brave_category in ('search', 'goggles'):
        # Brave's offset is zero based, the first page needs no offset at all
        offset = params.get('pageno', 1) - 1
        if offset:
            args['offset'] = offset
        time_filter = time_range_map.get(params['time_range'])
        if time_filter:
            args['tf'] = time_filter

    if brave_category == 'goggles':
        args['goggles_id'] = Goggles

    params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"

    # set properties in the cookies

    params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
    # the useLocation is IP based, we use cookie 'country' for the region
    params['cookies']['useLocation'] = '0'
    params['cookies']['summarizer'] = '0'

    engine_region = traits.get_region(params['searxng_locale'], 'all')
    # only the last, lower-cased component of the region tag is sent as country
    params['cookies']['country'] = engine_region.split('-')[-1].lower()  # type: ignore

    ui_lang = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')
    params['cookies']['ui_lang'] = ui_lang

    logger.debug("cookies %s", params['cookies'])
239
240
241def _extract_published_date(published_date_raw):
242 if published_date_raw is None:
243 return None
244
245 try:
246 return parser.parse(published_date_raw)
247 except parser.ParserError:
248 return None
249
250
def response(resp):
    """Turn Brave's response into a list of result items.

    For the categories ``search`` and ``goggles`` the HTML page is parsed by
    ``_parse_search``.  For all other categories the JavaScript assignment
    ``const data = ...;`` is extracted from the page, converted to Python and
    the ``response`` object in it is handed to the parser of the category.

    Raises :py:obj:`ValueError` when :py:obj:`brave_category` is unknown.
    """

    if brave_category in ('search', 'goggles'):
        return _parse_search(resp)

    datastr = extr(resp.text, "const data = ", ";\n").strip()

    json_data = js_variable_to_python(datastr)
    json_resp = json_data[1]['data']['body']['response']

    if brave_category == 'news':
        # the news results are nested one level deeper
        return _parse_news(json_resp['news'])

    if brave_category == 'images':
        return _parse_images(json_resp)
    if brave_category == 'videos':
        return _parse_videos(json_resp)

    raise ValueError(f"Unsupported brave category: {brave_category}")
270
271
def _parse_search(resp):
    """Parse the HTML of a WEB (``search`` / ``goggles``) result page.

    Returns a list of dicts: an optional answer item (``answer``, ``url``)
    followed by one item per result snippet (``url``, ``title``, ``content``,
    ``publishedDate``, ``thumbnail``).  Snippets that embed a playable video
    additionally get ``iframe_src`` and the ``videos.html`` template.
    """

    result_list = []
    dom = html.fromstring(resp.text)

    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    # Compare with None: the truth value of a lxml element is False when it
    # has no child elements, a text-only answer would be dropped otherwise.
    if answer_tag is not None:
        url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
        result_list.append({'answer': extract_text(answer_tag), 'url': url})

    # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
    xpath_results = '//div[contains(@class, "snippet ")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(
            result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None
        )
        if url is None or title_tag is None or not urlparse(url).netloc:  # partial url likely means it's an ad
            continue

        content_tag = eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
        # the text in front of the first "-" of the description is tried as date
        pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
        thumbnail = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': extract_text(content_tag),
            'publishedDate': _extract_published_date(pub_date_raw),
            'thumbnail': thumbnail,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag is not None:

            # In my tests a video tag in the WEB search was most often not a
            # video, except the ones from youtube ..

            # the thumbnail of the video tag is used in any case
            item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')

            iframe_src = get_embeded_stream_url(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                pub_date_raw = extract_text(
                    eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
                )
                item['publishedDate'] = _extract_published_date(pub_date_raw)

        result_list.append(item)

    return result_list
329
330
331def _parse_news(json_resp):
332 result_list = []
333
334 for result in json_resp["results"]:
335 item = {
336 'url': result['url'],
337 'title': result['title'],
338 'content': result['description'],
339 'publishedDate': _extract_published_date(result['age']),
340 }
341 if result['thumbnail'] is not None:
342 item['thumbnail'] = result['thumbnail']['src']
343 result_list.append(item)
344
345 return result_list
346
347
348def _parse_images(json_resp):
349 result_list = []
350
351 for result in json_resp["results"]:
352 item = {
353 'url': result['url'],
354 'title': result['title'],
355 'content': result['description'],
356 'template': 'images.html',
357 'resolution': result['properties']['format'],
358 'source': result['source'],
359 'img_src': result['properties']['url'],
360 'thumbnail_src': result['thumbnail']['src'],
361 }
362 result_list.append(item)
363
364 return result_list
365
366
367def _parse_videos(json_resp):
368 result_list = []
369
370 for result in json_resp["results"]:
371
372 url = result['url']
373 item = {
374 'url': url,
375 'title': result['title'],
376 'content': result['description'],
377 'template': 'videos.html',
378 'length': result['video']['duration'],
379 'duration': result['video']['duration'],
380 'publishedDate': _extract_published_date(result['age']),
381 }
382
383 if result['thumbnail'] is not None:
384 item['thumbnail'] = result['thumbnail']['src']
385
386 iframe_src = get_embeded_stream_url(url)
387 if iframe_src:
388 item['iframe_src'] = iframe_src
389
390 result_list.append(item)
391
392 return result_list
393
394
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
    regions>` from Brave.

    The UI languages are read from the options of Brave's settings page and
    stored in ``engine_traits.custom["ui_lang"]``.  The regions are read from a
    JavaScript chunk of Brave's CDN; every official (de-facto) language of a
    country is mapped to the country tag in ``engine_traits.regions``.
    Problems are reported by ``print``.
    """

    # pylint: disable=import-outside-toplevel, too-many-branches

    import babel.languages
    from searx.locales import region_tag, language_tag
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762

    engine_traits.custom["ui_lang"] = {}

    headers = {
        'Accept-Encoding': 'gzip, deflate',
    }
    lang_map = {'no': 'nb'}  # norway

    # languages (UI)

    resp = get('https://search.brave.com/settings', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Brave is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore

    # all options of the select box that offers "en-us"
    for option in dom.xpath('//section//option[@value="en-us"]/../option'):

        ui_lang = option.get('value')
        try:
            # parse once and reuse the locale for the region / language tag
            ui_locale = babel.Locale.parse(ui_lang, sep='-')
            if ui_locale.territory:
                sxng_tag = region_tag(ui_locale)
            else:
                sxng_tag = language_tag(ui_locale)

        except babel.UnknownLocaleError:
            print("ERROR: can't determine babel locale of Brave's (UI) language %s" % ui_lang)
            continue

        conflict = engine_traits.custom["ui_lang"].get(sxng_tag)
        if conflict:
            # the first mapping wins, only differing duplicates are reported
            if conflict != ui_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, ui_lang))
            continue
        engine_traits.custom["ui_lang"][sxng_tag] = ui_lang

    # search regions of brave

    resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Brave is not OK.")

    # cut the object literal between "options:" and "},k={default" out of the script
    country_js = resp.text[resp.text.index("options:{all") + len('options:') :]  # type: ignore
    country_js = country_js[: country_js.index("},k={default")]
    country_tags = js_variable_to_python(country_js)

    for k, v in country_tags.items():
        if k == 'all':
            engine_traits.all_locale = 'all'
            continue
        country_tag = v['value']

        # add official languages of the country ..
        for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
            lang_tag = lang_map.get(lang_tag, lang_tag)
            sxng_tag = region_tag(babel.Locale.parse('%s_%s' % (lang_tag, country_tag.upper())))
            # print("%-20s: %s <-- %s" % (v['label'], country_tag, sxng_tag))

            conflict = engine_traits.regions.get(sxng_tag)
            if conflict:
                if conflict != country_tag:
                    print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, country_tag))
                continue
            engine_traits.regions[sxng_tag] = country_tag
_parse_images(json_resp)
Definition brave.py:348
_parse_videos(json_resp)
Definition brave.py:367
request(query, params)
Definition brave.py:203
fetch_traits(EngineTraits engine_traits)
Definition brave.py:395
_parse_search(resp)
Definition brave.py:272
_extract_published_date(published_date_raw)
Definition brave.py:241
_parse_news(json_resp)
Definition brave.py:331