.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
bing_news.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Bing-News: description see :py:obj:`searx.engines.bing`.
3
4.. hint::
5
6 Bing News is *different* in some ways!
7
8"""
9
10# pylint: disable=invalid-name
11
12from urllib.parse import urlencode
13
14from lxml import html
15
16from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
17from searx.enginelib.traits import EngineTraits
18from searx.engines.bing import set_bing_cookies
19
# Engine metadata shown on the SearXNG "about" pages.
about = {
    "website": "https://www.bing.com/news",
    "wikidata_id": "Q2878637",
    "official_api_documentation": "https://www.microsoft.com/en-us/bing/apis/bing-news-search-api",
    "use_official_api": False,
    "require_api_key": False,
    "results": "RSS",
}

# Engine-specific configuration.
categories = ["news"]
paging = True
"""When paging past the last page that has new results, bing simply returns the
results of the last page again."""

time_range_support = True
time_map = {
    "day": 'interval="4"',
    "week": 'interval="7"',
    "month": 'interval="9"',
}
"""A string '4' means *last hour*.  *Last hour* is used for ``day`` because the
difference between *last day* and *last week* in the result list is only
marginal.  Bing has no news range ``year``; ``month`` is used instead."""

base_url = "https://www.bing.com/news/infinitescrollajax"
"""Bing (News) search URL"""
48
49
def request(query, params):
    """Build the Bing-News request for ``query`` and store it in ``params``.

    The region and language are looked up from ``params['searxng_locale']``
    via the module-level ``traits`` object (not defined in this file;
    presumably injected by the engine loader), handed to
    :py:obj:`set_bing_cookies`, and the final URL is written to
    ``params['url']``.  The (mutated) ``params`` dict is returned.
    """

    locale = params['searxng_locale']
    engine_region = traits.get_region(locale, traits.all_locale)  # type: ignore
    engine_language = traits.get_language(locale, 'en')  # type: ignore
    set_bing_cookies(params, engine_language, engine_region)

    # example: https://www.bing.com/news/infinitescrollajax?q=london&first=1

    # zero-based page index; 'pageno' is one-based and defaults to the first page
    page_index = int(params.get('pageno', 1)) - 1
    region_parts = engine_region.split('-')

    args = {
        'q': query,
        'InfiniteScroll': 1,
        # to keep the page arithmetic simple, assume 10 news items per page
        'first': page_index * 10 + 1,
        'SFX': page_index,
        'form': 'PTFTNR',
        'setlang': region_parts[0],
        'cc': region_parts[-1],
    }

    time_range = params['time_range']
    if time_range:
        # ranges missing from time_map fall back to the 'month' filter
        args['qft'] = time_map.get(time_range, 'interval="9"')

    params['url'] = f"{base_url}?{urlencode(args)}"
    return params
79
80
def response(resp):
    """Parse a Bing-News response into a list of result dicts.

    Returns an empty list when the response is not OK or has no body.  For
    every ``newsitem`` element that contains a title link, one dict with the
    keys ``url``, ``title``, ``content``, ``thumbnail`` and ``metadata`` is
    appended.  ``thumbnail`` is ``None`` when the item has no image or the
    image has no (non-empty) ``src`` attribute.
    """
    # pylint: disable=import-outside-toplevel
    from urllib.parse import urljoin

    results = []

    if not resp.ok or not resp.text:
        return results

    dom = html.fromstring(resp.text)

    for newsitem in eval_xpath_list(dom, '//div[contains(@class, "newsitem")]'):

        # items without a title link can not be turned into a result
        link = eval_xpath_getindex(newsitem, './/a[@class="title"]', 0, None)
        if link is None:
            continue
        url = link.attrib.get('href')
        title = extract_text(link)
        content = extract_text(eval_xpath(newsitem, './/div[@class="snippet"]'))

        # metadata: the aria-label found in the source element and the
        # author given on the title link, joined by ' | '
        metadata = []
        source = eval_xpath_getindex(newsitem, './/div[contains(@class, "source")]', 0, None)
        if source is not None:
            for item in (
                eval_xpath_getindex(source, './/span[@aria-label]/@aria-label', 0, None),
                link.attrib.get('data-author'),
            ):
                if item is not None:
                    t = extract_text(item)
                    if t and t.strip():
                        metadata.append(t.strip())
        metadata = ' | '.join(metadata)

        thumbnail = None
        imagelink = eval_xpath_getindex(newsitem, './/a[@class="imagelink"]//img', 0, None)
        if imagelink is not None:
            src = imagelink.attrib.get('src')
            # An <img> without a src attribute yields None: skip it instead of
            # calling a str method on None.
            if src:
                # urljoin resolves relative paths against bing.com and leaves
                # URLs that already carry a scheme or host untouched.
                thumbnail = urljoin('https://www.bing.com/', src)

        results.append(
            {
                'url': url,
                'title': title,
                'content': content,
                'thumbnail': thumbnail,
                'metadata': metadata,
            }
        )

    return results
132
133
def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from Bing-News.

    Delegates to ``fetch_traits`` of :py:obj:`searx.engines.bing` and then
    overrides the region entry for ``zh-CN``.
    """
    # pylint: disable=import-outside-toplevel

    from searx.engines.bing import fetch_traits as bing_fetch_traits

    bing_fetch_traits(engine_traits)

    # Fix market codes not known by bing news:
    #
    # The market code 'zh-cn' exists in bing, but bing has no 'news' category
    # for this market.  The market code of Hong Kong is used as a substitute.
    # Even if this is not correct, it is better than having no hits at all, or
    # sending false queries to bing that could raise the suspicion of a bot.
    #
    # HINT: 'en-hk' is the region code, it does not indicate the language en!!
    engine_traits.regions['zh-CN'] = 'en-hk'
request(query, params)
Definition bing_news.py:50
fetch_traits(EngineTraits engine_traits)
Definition bing_news.py:134