.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
brave.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Brave supports the categories listed in :py:obj:`brave_category` (General,
3news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range
4<time_range_support>` is limited (see remarks).
5
6Configured ``brave`` engines:
7
8.. code:: yaml
9
10 - name: brave
11 engine: brave
12 ...
13 brave_category: search
14 time_range_support: true
15 paging: true
16
17 - name: brave.images
18 engine: brave
19 ...
20 brave_category: images
21
22 - name: brave.videos
23 engine: brave
24 ...
25 brave_category: videos
26
27 - name: brave.news
28 engine: brave
29 ...
30 brave_category: news
31
32 - name: brave.goggles
33 engine: brave
34 time_range_support: true
35 paging: true
36 ...
37 brave_category: goggles
38
39
40.. _brave regions:
41
42Brave regions
43=============
44
45Brave uses two-letter tags for the regions like ``ca`` while SearXNG deals with
46locales. To get a mapping, all *official de-facto* languages of the Brave
47region are mapped to regions in SearXNG (see :py:obj:`babel
48<babel.languages.get_official_languages>`):
49
50.. code:: python
51
52 "regions": {
53 ..
54 "en-CA": "ca",
55 "fr-CA": "ca",
56 ..
57 }
58
59
60.. note::
61
62 The language (aka region) support of Brave's index is limited to very basic
63 languages. The search results for languages like Chinese or Arabic are of
64 low quality.
65
66
67.. _brave googles:
68
69Brave Goggles
70=============
71
72.. _list of Goggles: https://search.brave.com/goggles/discover
73.. _Goggles Whitepaper: https://brave.com/static-assets/files/goggles.pdf
74.. _Goggles Quickstart: https://github.com/brave/goggles-quickstart
75
76Goggles allow you to choose, alter, or extend the ranking of Brave Search
77results (`Goggles Whitepaper`_). Goggles are openly developed by the community
78of Brave Search users.
79
80Select from the `list of Goggles`_ people have published, or create your own
81(`Goggles Quickstart`_).
82
83
84.. _brave languages:
85
86Brave languages
87===============
88
89Brave's language support is limited to the UI (menus, area local notations,
90etc). Brave's index only seems to support a locale, but it does not seem to
91support any languages in its index. The choice of available languages is very
92small (and it's not clear to me where the difference in UI is when switching
93from en-us to en-ca or en-gb).
94
95In the :py:obj:`EngineTraits object <searx.enginelib.traits.EngineTraits>` the
96UI languages are stored in a custom field named ``ui_lang``:
97
98.. code:: python
99
100 "custom": {
101 "ui_lang": {
102 "ca": "ca",
103 "de-DE": "de-de",
104 "en-CA": "en-ca",
105 "en-GB": "en-gb",
106 "en-US": "en-us",
107 "es": "es",
108 "fr-CA": "fr-ca",
109 "fr-FR": "fr-fr",
110 "ja-JP": "ja-jp",
111 "pt-BR": "pt-br",
112 "sq-AL": "sq-al"
113 }
114 },
115
116Implementations
117===============
118
119"""
120
121from typing import Any, TYPE_CHECKING
122
123from urllib.parse import (
124 urlencode,
125 urlparse,
126 parse_qs,
127)
128
129from dateutil import parser
130from lxml import html
131
132from searx import locales
133from searx.utils import (
134 extract_text,
135 eval_xpath,
136 eval_xpath_list,
137 eval_xpath_getindex,
138 js_variable_to_python,
139)
140from searx.enginelib.traits import EngineTraits
141
142if TYPE_CHECKING:
143 import logging
144
145 logger: logging.Logger
146
147traits: EngineTraits
148
# Engine metadata: the results are scraped from the HTML pages, no official
# API (and therefore no API key) is used.
about = {
    "website": 'https://search.brave.com/',
    "wikidata_id": 'Q22906900',
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

base_url = "https://search.brave.com/"
categories = []
brave_category = 'search'
"""Brave supports common web-search, videos, images, news, and goggles search.

- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
- ``goggles``: Common WEB search with custom rules
"""

Goggles = Any
"""Value sent as the ``goggles_id`` URL argument when :py:obj:`brave_category`
is ``goggles``. The default (``Any``) is only a placeholder; presumably the
real Goggle is configured per engine in the settings (to be confirmed).
"""

brave_spellcheck = False
"""Brave supports some kind of spell checking. When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In
the UI of Brave the user gets warned about this, since we can not warn the user
in SearXNG, the spellchecking is disabled by default.
"""

send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
max_page = 10
"""Tested 9 pages maximum (``&offset=8``), to be safe max is set to 10. Trying
to do more won't return any result and you will most likely be flagged as a bot.
"""

safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off

time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""

# SearXNG time range name --> value of Brave's ``tf`` URL argument
time_range_map = {
    'day': 'pd',
    'week': 'pw',
    'month': 'pm',
    'year': 'py',
}
200
201
def request(query, params):
    """Fill ``params`` with the URL, headers and cookies of the Brave request.

    The URL is ``<base_url><brave_category>?<args>``.  Paging (``offset``) and
    time range (``tf``) arguments are only added for the ``search`` and
    ``goggles`` categories; ``goggles_id`` only for ``goggles``.  Safe-search,
    region and UI language are passed to Brave by cookies.

    :param query: the search term, sent as the ``q`` argument.
    :param params: request parameters of the engine; modified in place.
    """

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    params['headers']['Accept-Encoding'] = 'gzip, deflate'

    args = {'q': query}
    if brave_spellcheck:
        args['spellcheck'] = '1'

    if brave_category in ('search', 'goggles'):
        # Brave's offset is zero-based, the first page needs no argument
        offset = params.get('pageno', 1) - 1
        if offset:
            args['offset'] = offset
        time_filter = time_range_map.get(params['time_range'])
        if time_filter:
            args['tf'] = time_filter

    if brave_category == 'goggles':
        args['goggles_id'] = Goggles

    params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"

    # set properties in the cookies
    cookies = params['cookies']
    cookies['safesearch'] = safesearch_map.get(params['safesearch'], 'off')
    # the useLocation is IP based, we use cookie 'country' for the region
    cookies['useLocation'] = '0'
    cookies['summarizer'] = '0'

    engine_region = traits.get_region(params['searxng_locale'], 'all')
    # only the last '-' separated part of the region is sent, in lower case
    cookies['country'] = engine_region.split('-')[-1].lower()  # type: ignore

    cookies['ui_lang'] = locales.get_engine_locale(params['searxng_locale'], traits.custom["ui_lang"], 'en-us')

    logger.debug("cookies %s", cookies)
238
239
def _extract_published_date(published_date_raw):
    """Convert the raw date text of a result into a ``datetime``.

    :param published_date_raw: date string taken from the result, or ``None``.
    :return: the ``datetime`` parsed by :py:obj:`dateutil.parser.parse`, or
        ``None`` when no value was given or the value is not a usable date.
    """
    if published_date_raw is None:
        return None

    try:
        return parser.parse(published_date_raw)
    except (parser.ParserError, OverflowError):
        # ParserError: the text is not a date.  OverflowError: dateutil raises
        # it when a parsed number is too large; such a value is just as
        # unusable and must not abort the parsing of the whole result page.
        return None
248
249
def response(resp):
    """Turn Brave's response into a list of results.

    The ``search`` and ``goggles`` categories are parsed from the HTML.  For
    the other categories the results are read from the JavaScript variable
    assigned in the first line containing ``const data = ``.

    :param resp: response object; ``resp.text`` is read.
    :return: list of result dicts built by the category specific parser.
    :raises ValueError: if :py:obj:`brave_category` is not a supported category.
    """

    if brave_category in ('search', 'goggles'):
        return _parse_search(resp)

    marker = "const data = "
    # first line with the assignment; the last character of the stripped line
    # is cut off (presumably the trailing ``;`` of the statement)
    datastr = next(
        (line.replace(marker, "").strip()[:-1] for line in resp.text.split("\n") if marker in line),
        "",
    )

    json_resp = js_variable_to_python(datastr)[1]['data']['body']['response']

    if brave_category == 'news':
        return _parse_news(json_resp['news'])
    if brave_category == 'images':
        return _parse_images(json_resp)
    if brave_category == 'videos':
        return _parse_videos(json_resp)

    raise ValueError(f"Unsupported brave category: {brave_category}")
273
274
def _parse_search(resp):
    """Parse the HTML page returned for the ``search`` and ``goggles`` categories.

    :param resp: response object; only ``resp.text`` (the HTML) is read.
    :return: list of result dicts: an optional answer entry (keys ``answer``
        and ``url``) followed by one item per result snippet.
    """

    result_list = []
    dom = html.fromstring(resp.text)

    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    # Compare with None explicitly: an lxml element without child elements is
    # falsy, a plain truth test would drop answers that contain only text.
    if answer_tag is not None:
        url = eval_xpath_getindex(dom, '//div[@id="featured_snippet"]/a[@class="result-header"]/@href', 0, default=None)
        result_list.append({'answer': extract_text(answer_tag), 'url': url})

    # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
    xpath_results = '//div[contains(@class, "snippet ")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[contains(@class, "h")]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(
            result, './/a[contains(@class, "h")]//div[contains(@class, "title")]', 0, default=None
        )
        if url is None or title_tag is None or not urlparse(url).netloc:  # partial url likely means it's an ad
            continue

        content_tag = eval_xpath_getindex(result, './/div[contains(@class, "snippet-description")]', 0, default='')
        # the text in front of the first "-" of the description is tried as date
        pub_date_raw = eval_xpath(result, 'substring-before(.//div[contains(@class, "snippet-description")], "-")')
        img_src = eval_xpath_getindex(result, './/img[contains(@class, "thumb")]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': extract_text(content_tag),
            'publishedDate': _extract_published_date(pub_date_raw),
            'img_src': img_src,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag is not None:

            # In my tests a video tag in the WEB search was most often not a
            # video, except the ones from youtube ..

            iframe_src = _get_iframe_src(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
                pub_date_raw = extract_text(
                    eval_xpath(video_tag, './/div[contains(@class, "snippet-attributes")]/div/text()')
                )
                item['publishedDate'] = _extract_published_date(pub_date_raw)
            else:
                # not embeddable: keep the normal result and use the image of
                # the video snippet as its picture
                item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')

        result_list.append(item)

    return result_list
332
333
335 parsed_url = urlparse(url)
336 if parsed_url.path == '/watch' and parsed_url.query:
337 video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore
338 if video_id:
339 return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore
340 return None
341
342
def _parse_news(json_resp):
    """Build the result list of the ``news`` category.

    :param json_resp: mapping with a ``results`` list; each entry is read by
        the keys ``url``, ``title``, ``description``, ``age`` and ``thumbnail``.
    :return: list of result dicts; ``img_src`` is only set for entries whose
        ``thumbnail`` is not ``None``.
    """
    result_list = []

    for entry in json_resp["results"]:
        news_item = {
            'url': entry['url'],
            'title': entry['title'],
            'content': entry['description'],
            'publishedDate': _extract_published_date(entry['age']),
        }
        thumbnail = entry['thumbnail']
        if thumbnail is not None:
            news_item['img_src'] = thumbnail['src']
        result_list.append(news_item)

    return result_list
358
359
360def _parse_images(json_resp):
361 result_list = []
362
363 for result in json_resp["results"]:
364 item = {
365 'url': result['url'],
366 'title': result['title'],
367 'content': result['description'],
368 'template': 'images.html',
369 'resolution': result['properties']['format'],
370 'source': result['source'],
371 'img_src': result['properties']['url'],
372 'thumbnail_src': result['thumbnail']['src'],
373 }
374 result_list.append(item)
375
376 return result_list
377
378
def _parse_videos(json_resp):
    """Build the result list of the ``videos`` category.

    :param json_resp: mapping with a ``results`` list; each entry is read by
        the keys ``url``, ``title``, ``description``, ``video`` (``duration``),
        ``age`` and ``thumbnail``.
    :return: list of result dicts for the ``videos.html`` template;
        ``thumbnail`` and ``iframe_src`` are only set when available.
    """
    result_list = []

    for entry in json_resp["results"]:
        video_url = entry['url']
        video_item = {
            'url': video_url,
            'title': entry['title'],
            'content': entry['description'],
            'template': 'videos.html',
            # the duration is stored under both keys
            'length': entry['video']['duration'],
            'duration': entry['video']['duration'],
            'publishedDate': _extract_published_date(entry['age']),
        }

        thumbnail = entry['thumbnail']
        if thumbnail is not None:
            video_item['thumbnail'] = thumbnail['src']

        embed_url = _get_iframe_src(video_url)
        if embed_url:
            video_item['iframe_src'] = embed_url

        result_list.append(video_item)

    return result_list
405
406
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <brave languages>` and :ref:`regions <brave
    regions>` from Brave.

    The UI languages are read from the options of the language select box on
    Brave's settings page and stored in ``engine_traits.custom["ui_lang"]``.
    The regions are read from a JavaScript file of Brave's frontend and stored
    in ``engine_traits.regions``: every official (de-facto) language of a
    country is mapped to Brave's tag of the country.
    """

    # pylint: disable=import-outside-toplevel, too-many-branches

    import babel.languages
    from searx.locales import region_tag, language_tag
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762

    ui_languages = {}
    engine_traits.custom["ui_lang"] = ui_languages

    headers = {
        'Accept-Encoding': 'gzip, deflate',
    }
    lang_map = {'no': 'nb'}  # norway

    # languages (UI)

    resp = get('https://search.brave.com/settings', headers=headers)

    # a failed request is only reported, the response is parsed nevertheless
    if not resp.ok:  # type: ignore
        print("ERROR: response from Brave is not OK.")
    dom = html.fromstring(resp.text)  # type: ignore

    for option in dom.xpath('//div[@id="language-select"]//option'):

        ui_lang = option.get('value')
        try:
            if '-' in ui_lang:
                sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))
            else:
                sxng_tag = language_tag(babel.Locale.parse(ui_lang))

        except babel.UnknownLocaleError:
            print(f"ERROR: can't determine babel locale of Brave's (UI) language {ui_lang}")
            continue

        # the first value registered for a SearXNG tag wins, a different value
        # for the same tag is only reported
        known = ui_languages.get(sxng_tag)
        if not known:
            ui_languages[sxng_tag] = ui_lang
        elif known != ui_lang:
            print(f"CONFLICT: babel {sxng_tag} --> {known}, {ui_lang}")

    # search regions of brave

    resp = get('https://cdn.search.brave.com/serp/v2/_app/immutable/chunks/parameters.734c106a.js', headers=headers)

    if not resp.ok:  # type: ignore
        print("ERROR: response from Brave is not OK.")

    # cut the object literal out of the JavaScript: it starts behind
    # ``options:`` and ends in front of ``},k={default``
    start = resp.text.index("options:{all") + len('options:')
    country_js = resp.text[start:]
    country_js = country_js[: country_js.index("},k={default")]
    country_tags = js_variable_to_python(country_js)

    for name, country in country_tags.items():
        if name == 'all':
            engine_traits.all_locale = 'all'
            continue
        country_tag = country['value']

        # add official languages of the country ..
        for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True):
            lang_tag = lang_map.get(lang_tag, lang_tag)
            sxng_tag = region_tag(babel.Locale.parse(f"{lang_tag}_{country_tag.upper()}"))
            # print("%-20s: %s <-- %s" % (country['label'], country_tag, sxng_tag))

            known = engine_traits.regions.get(sxng_tag)
            if not known:
                engine_traits.regions[sxng_tag] = country_tag
            elif known != country_tag:
                print(f"CONFLICT: babel {sxng_tag} --> {known}, {country_tag}")
_parse_images(json_resp)
Definition brave.py:360
_parse_videos(json_resp)
Definition brave.py:379
request(query, params)
Definition brave.py:202
fetch_traits(EngineTraits engine_traits)
Definition brave.py:407
_parse_search(resp)
Definition brave.py:275
_extract_published_date(published_date_raw)
Definition brave.py:240
_parse_news(json_resp)
Definition brave.py:343