.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
google_news.py
Go to the documentation of this file.
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Google News engine.

Google News has a different region handling compared to Google WEB.

- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
- the hl_ argument has to be set correctly (and different to Google WEB)
- the gl_ argument is mandatory

If one of these arguments is not set correctly, the request is redirected to
the CONSENT dialog::

  https://consent.google.com/m?continue=

The google news API ignores some parameters from the common :ref:`google API`:

- num_ : the number of search results is ignored / there is no paging; all
  results for a query term are in the first response.
- save_ : is ignored / Google-News results are always *SafeSearch*

.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
"""
26
27from urllib.parse import urlencode
28import base64
29from lxml import html
30import babel
31
32from searx import locales
33from searx.utils import (
34 eval_xpath,
35 eval_xpath_list,
36 eval_xpath_getindex,
37 extract_text,
38)
39
40from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import
41from searx.engines.google import (
42 get_google_info,
43 detect_google_sorry,
44)
45from searx.enginelib.traits import EngineTraits
46
# about
about = {
    "website": 'https://news.google.com',
    "wikidata_id": 'Q12020',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['news']
paging = False
time_range_support = False

# Google-News results are always *SafeSearch*.
#
# NOTE(review): this comment used to state that option 'safesearch' is set to
# False here, because otherwise the checker reports safesearch-errors::
#
#   safesearch : results are identical for safesearch=0 and safesearch=2
#
# The value assigned below is True, which contradicts that statement.  The
# value is deliberately left unchanged; confirm which of the two is intended.
safesearch = True
# send_accept_language_header = True
68
69
def request(query, params):
    """Google-News search request

    Looks up the ``ceid`` value (``<REGION>:<lang>[-<suffix>]``) for the
    SearXNG locale found in ``params['searxng_locale']`` (``en-US`` when the
    key is missing, ``US:en`` when no ``ceid`` matches) and derives the ``hl``,
    ``lr`` and ``gl`` URL arguments from it.

    :param query: the search term, sent as URL argument ``q``
    :param params: request parameters; ``url`` and ``cookies`` are set and
        ``headers`` is updated in place
    :return: the modified ``params``

    ``traits`` is a module level name that is not defined in this file; it has
    to be available in the module namespace when this function is called.
    """

    sxng_locale = params.get('searxng_locale', 'en-US')
    ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
    google_info = get_google_info(params, traits)
    google_info['subdomain'] = 'news.google.com'  # google news has only one domain

    # 'AR:es-419' --> region 'AR', language 'es', suffix '419'
    # 'DE:de'     --> region 'DE', language 'de', suffix None
    ceid_region, ceid_lang = ceid.split(':')
    ceid_lang, ceid_suffix = (ceid_lang.split('-') + [None])[:2]

    google_info['params']['hl'] = ceid_lang

    if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
        # the ceid carries a non-script suffix (e.g. '419' or '150')

        if ceid_region.lower() == ceid_lang:
            google_info['params']['hl'] = ceid_lang + '-' + ceid_region
        else:
            google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix

    elif ceid_region.lower() != ceid_lang:
        # no suffix (or a script suffix) and the region code differs from the
        # language code

        if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
            # for these regions 'hl' is the bare language code
            google_info['params']['hl'] = ceid_lang
        else:
            google_info['params']['hl'] = ceid_lang + '-' + ceid_region

    # ceid_lang has already been stripped from its suffix above
    google_info['params']['lr'] = 'lang_' + ceid_lang
    google_info['params']['gl'] = ceid_region

    query_url = (
        'https://'
        + google_info['subdomain']
        + "/search?"
        + urlencode(
            {
                'q': query,
                **google_info['params'],
            }
        )
        # ceid includes a ':' character which must not be urlencoded
        + ('&ceid=%s' % ceid)
    )

    params['url'] = query_url
    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])
    return params
123
124
def _decode_article_url(href):
    """Return the article URL that is embedded in a Google News internal link.

    The last path segment of ``href`` (without query string) is decoded as
    URL-safe base64.  The article URL is the part of the decoded bytes that
    starts at the first ``http`` and ends before the first ``0xd2`` byte.

    :raises ValueError: if the decoded bytes do not contain ``http`` (invalid
        base64 input and undecodable bytes raise subclasses of ``ValueError``)
    """
    token = href.split('?')[0].split('/')[-1]
    # '====' is appended so that a token without '=' padding can be decoded
    raw = base64.urlsafe_b64decode(token + '====')
    start = raw.find(b'http')
    if start < 0:
        raise ValueError("no URL found in Google News link: %s" % href)
    return raw[start:].split(b'\xd2')[0].decode()


def response(resp):
    """Get response from google's search request

    :param resp: the HTTP response; its ``text`` attribute is parsed as HTML
    :return: list of result dicts with the keys ``url``, ``title``,
        ``content`` and ``thumbnail``
    """
    results = []
    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the article
        # The href attribute of the <a> tag is a google internal link, we have
        # to decode
        href = _decode_article_url(eval_xpath_getindex(result, './article/a/@href', 0))

        title = extract_text(eval_xpath(result, './article/h3[1]'))

        # The pub_date is mostly a string like 'yesterday', not a real
        # timezone date or time.  Therefore we can't use publishedDate.
        pub_date = extract_text(eval_xpath(result, './article//time'))
        pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))

        # origin and date are joined, a missing value is left out
        content = ' / '.join([x for x in [pub_origin, pub_date] if x])

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URL are long but not personalized (double checked via tor).
        thumbnail = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))

        results.append(
            {
                'url': href,
                'title': title,
                'content': content,
                'thumbnail': thumbnail,
            }
        )

    # return results
    return results
172
173
ceid_list = [
    'AE:ar',
    'AR:es-419',
    'AT:de',
    'AU:en',
    'BD:bn',
    'BE:fr',
    'BE:nl',
    'BG:bg',
    'BR:pt-419',
    'BW:en',
    'CA:en',
    'CA:fr',
    'CH:de',
    'CH:fr',
    'CL:es-419',
    'CN:zh-Hans',
    'CO:es-419',
    'CU:es-419',
    'CZ:cs',
    'DE:de',
    'EG:ar',
    'ES:es',
    'ET:en',
    'FR:fr',
    'GB:en',
    'GH:en',
    'GR:el',
    'HK:zh-Hant',
    'HU:hu',
    'ID:en',
    'ID:id',
    'IE:en',
    'IL:en',
    'IL:he',
    'IN:bn',
    'IN:en',
    'IN:hi',
    'IN:ml',
    'IN:mr',
    'IN:ta',
    'IN:te',
    'IT:it',
    'JP:ja',
    'KE:en',
    'KR:ko',
    'LB:ar',
    'LT:lt',
    'LV:en',
    'LV:lv',
    'MA:fr',
    'MX:es-419',
    'MY:en',
    'NA:en',
    'NG:en',
    'NL:nl',
    'NO:no',
    'NZ:en',
    'PE:es-419',
    'PH:en',
    'PK:en',
    'PL:pl',
    'PT:pt-150',
    'RO:ro',
    'RS:sr',
    'RU:ru',
    'SA:ar',
    'SE:sv',
    'SG:en',
    'SI:sl',
    'SK:sk',
    'SN:fr',
    'TH:th',
    'TR:tr',
    'TW:zh-Hant',
    'TZ:en',
    'UA:ru',
    'UA:uk',
    'UG:en',
    'US:en',
    'US:es-419',
    'VE:es-419',
    'VN:vi',
    'ZA:en',
    'ZW:en',
]
"""List of region/language combinations supported by Google News.  Values of the
``ceid`` argument of the Google News REST API.  Each value has the form
``<REGION>:<language>`` where the language may carry a suffix (``es-419``,
``zh-Hant``)."""
262
263
264_skip_values = [
265 'ET:en', # english (ethiopia)
266 'ID:en', # english (indonesia)
267 'LV:en', # english (latvia)
268]
269
270_ceid_locale_map = {'NO:no': 'nb-NO'}
271
272
def fetch_traits(engine_traits: EngineTraits):
    """Fetch the traits of the Google engine and add the ``ceid`` mapping.

    After calling the ``fetch_traits`` of the Google engine (without domains),
    ``engine_traits.custom['ceid']`` is (re)built as a dict that maps a region
    tag to a value of :py:obj:`ceid_list`.

    Values listed in ``_skip_values`` are ignored.  A value whose locale is not
    known by babel is reported by an ``ERROR`` message on stdout and ignored.

    :param engine_traits: the traits object to fill, modified in place
    """
    _fetch_traits(engine_traits, add_domains=False)

    engine_traits.custom['ceid'] = {}

    for ceid in ceid_list:
        if ceid in _skip_values:
            continue

        region, lang = ceid.split(':')
        lang_parts = lang.split('-')
        # A script suffix (zh-Hant, zh-Hans) stays part of the language, any
        # other suffix (es-419, pt-150) is dropped.
        if len(lang_parts) > 1 and lang_parts[1] not in ['Hant', 'Hans']:
            lang = lang_parts[0]

        sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
        try:
            locale = babel.Locale.parse(sxng_locale, sep='-')
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
            continue

        engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid
fetch_traits(EngineTraits engine_traits)