.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
google_news.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""This is the implementation of the Google News engine.
3
4Google News has a different region handling compared to Google WEB.
5
6- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
7- the hl_ argument has to be set correctly (and different to Google WEB)
8- the gl_ argument is mandatory
9
10If one of these arguments is not set correctly, the request is redirected to a
11CONSENT dialog::
12
13 https://consent.google.com/m?continue=
14
15The google news API ignores some parameters from the common :ref:`google API`:
16
17- num_ : the number of search results is ignored / there is no paging, all
18 results for a query term are in the first response.
19- save_ : is ignored / Google-News results are always *SafeSearch*
20
21.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
22.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
23.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
24.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
25"""
26
27from typing import TYPE_CHECKING
28
29from urllib.parse import urlencode
30import base64
31from lxml import html
32import babel
33
34from searx import locales
35from searx.utils import (
36 eval_xpath,
37 eval_xpath_list,
38 eval_xpath_getindex,
39 extract_text,
40)
41
42from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import
43from searx.engines.google import (
44 get_google_info,
45 detect_google_sorry,
46)
47from searx.enginelib.traits import EngineTraits
48
49if TYPE_CHECKING:
50 import logging
51
52 logger: logging.Logger
53
54traits: EngineTraits
55
# Engine metadata
about = {
    'website': 'https://news.google.com',
    'wikidata_id': 'Q12020',
    'official_api_documentation': 'https://developers.google.com/custom-search',
    'use_official_api': False,
    'require_api_key': False,
    'results': 'HTML',
}

# Engine dependent configuration: a news engine without paging and without
# time range filtering.
categories = ['news']
paging = False
time_range_support = False

# Google-News results are always *SafeSearch*.
#
# NOTE(review): this comment used to state that option 'safesearch' is set to
# False here, because otherwise the checker reports safesearch-errors::
#
#   safesearch : results are identical for safesearch=0 and safesearch=2
#
# The value assigned below is True, though -- confirm which one is intended.
safesearch = True
# send_accept_language_header = True
77
78
def request(query, params):
    """Build the Google-News search request.

    The ``ceid`` value matching the SearXNG locale found in ``params`` is
    looked up in ``traits.custom['ceid']`` (falling back to ``US:en``).  The
    ``hl``, ``lr`` and ``gl`` URL arguments are derived from that ``ceid`` and
    the resulting URL, cookies and headers are stored in ``params``.

    :param query: the search term, sent as the ``q`` URL argument
    :param params: request parameters; updated in place and returned
    """

    ceid = locales.get_engine_locale(
        params.get('searxng_locale', 'en-US'),
        traits.custom['ceid'],
        default='US:en',
    )
    google_info = get_google_info(params, traits)
    google_info['subdomain'] = 'news.google.com'  # google news has only one domain

    # A ceid looks like '<region>:<lang>' or '<region>:<lang>-<suffix>',
    # e.g. 'DE:de', 'US:es-419' or 'CN:zh-Hans'.
    region, lang_tag = ceid.split(':')
    tag_parts = lang_tag.split('-')
    lang = tag_parts[0]
    suffix = tag_parts[1] if len(tag_parts) > 1 else None

    region_is_lang = region.lower() == lang

    if suffix and suffix not in ('Hans', 'Hant'):
        # a non-script suffix (e.g. '419') is part of hl, unless region and
        # language share the same code
        hl = lang + '-' + (region if region_is_lang else suffix)
    elif not region_is_lang and region not in ('AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT'):
        hl = lang + '-' + region
    else:
        # for the regions listed above, and when region and language share
        # the same code, hl is the bare language code
        hl = lang

    url_args = google_info['params']
    url_args['hl'] = hl
    url_args['lr'] = 'lang_' + lang
    url_args['gl'] = region

    encoded_args = urlencode({'q': query, **url_args})

    # ceid includes a ':' character which must not be urlencoded, so it is
    # appended after the encoded arguments
    params['url'] = 'https://' + google_info['subdomain'] + "/search?" + encoded_args + ('&ceid=%s' % ceid)
    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])
    return params
132
133
def response(resp):
    """Parse the HTML of a Google-News response into a list of results.

    Each result is a dict with the keys ``url``, ``title``, ``content`` and
    ``img_src``.  ``detect_google_sorry`` is called on ``resp`` before the
    HTML is parsed.
    """
    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)
    results = []

    for item in eval_xpath_list(dom, '//div[@class="xrnccd"]'):

        # The first <a> tag in the <article> contains the link to the article.
        # Its href attribute is a google internal link: the last path segment
        # (without the query string) is base64 encoded and embeds the real URL.
        internal_link = eval_xpath_getindex(item, './article/a/@href', 0)
        token = internal_link.split('?')[0].split('/')[-1]

        # extra '=' padding lets the decoder accept tokens of any length
        payload = base64.urlsafe_b64decode(token + '====')

        # the URL starts at the first b'http' and ends before a b'\xd2' byte
        url_start = payload.index(b'http')
        url = payload[url_start:].split(b'\xd2')[0].decode()

        title = extract_text(eval_xpath(item, './article/h3[1]'))

        # The pub_date is mostly a string like 'yesterday', not a real
        # timezone date or time. Therefore we can't use publishedDate.
        pub_date = extract_text(eval_xpath(item, './article//time'))
        pub_origin = extract_text(eval_xpath(item, './article//a[@data-n-tid]'))

        # origin and date, whichever of them are non-empty
        content = ' / '.join(filter(None, (pub_origin, pub_date)))

        # The image URL is located in a preceding sibling <img> tag, e.g.:
        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
        # These URL are long but not personalized (double checked via tor).
        img_src = extract_text(item.xpath('preceding-sibling::a/figure/img/@src'))

        results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})

    return results
181
182
# fmt: off
ceid_list = [
    'AE:ar', 'AR:es-419', 'AT:de', 'AU:en',
    'BD:bn', 'BE:fr', 'BE:nl', 'BG:bg', 'BR:pt-419', 'BW:en',
    'CA:en', 'CA:fr', 'CH:de', 'CH:fr', 'CL:es-419', 'CN:zh-Hans', 'CO:es-419', 'CU:es-419', 'CZ:cs',
    'DE:de',
    'EG:ar', 'ES:es', 'ET:en',
    'FR:fr',
    'GB:en', 'GH:en', 'GR:el',
    'HK:zh-Hant', 'HU:hu',
    'ID:en', 'ID:id', 'IE:en', 'IL:en', 'IL:he',
    'IN:bn', 'IN:en', 'IN:hi', 'IN:ml', 'IN:mr', 'IN:ta', 'IN:te', 'IT:it',
    'JP:ja',
    'KE:en', 'KR:ko',
    'LB:ar', 'LT:lt', 'LV:en', 'LV:lv',
    'MA:fr', 'MX:es-419', 'MY:en',
    'NA:en', 'NG:en', 'NL:nl', 'NO:no', 'NZ:en',
    'PE:es-419', 'PH:en', 'PK:en', 'PL:pl', 'PT:pt-150',
    'RO:ro', 'RS:sr', 'RU:ru',
    'SA:ar', 'SE:sv', 'SG:en', 'SI:sl', 'SK:sk', 'SN:fr',
    'TH:th', 'TR:tr', 'TW:zh-Hant', 'TZ:en',
    'UA:ru', 'UA:uk', 'UG:en', 'US:en', 'US:es-419',
    'VE:es-419', 'VN:vi',
    'ZA:en', 'ZW:en',
]
# fmt: on
"""List of region/language combinations supported by Google News. Values of the
``ceid`` argument of the Google News REST API."""


# ceid values that fetch_traits() leaves out of the ceid mapping
_skip_values = [
    'ET:en',  # english (ethiopia)
    'ID:en',  # english (indonesia)
    'LV:en',  # english (latvia)
]

# ceid values whose SearXNG locale is not simply '<lang>-<region>'
_ceid_locale_map = {
    'NO:no': 'nb-NO',
}
280
281
def fetch_traits(engine_traits: EngineTraits):
    """Fetch the traits of the Google-News engine.

    The common Google traits are fetched by the ``fetch_traits`` function of
    the Google WEB engine (called with ``add_domains=False``).  After that,
    ``engine_traits.custom['ceid']`` is filled with a mapping from SearXNG
    region tags to the ``ceid`` values of :py:obj:`ceid_list`.
    """
    _fetch_traits(engine_traits, add_domains=False)

    ceid_by_region = {}
    engine_traits.custom['ceid'] = ceid_by_region

    for ceid in ceid_list:
        if ceid in _skip_values:
            continue

        region, lang = ceid.split(':')

        # Drop a suffix such as '419' or '150' from the language, but keep
        # the script suffixes 'Hant' / 'Hans'.
        lang_parts = lang.split('-')
        if len(lang_parts) > 1 and lang_parts[1] not in ('Hant', 'Hans'):
            lang = lang_parts[0]

        sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
        try:
            locale = babel.Locale.parse(sxng_locale, sep='-')
        except babel.UnknownLocaleError:
            # report the unknown locale and go on with the next ceid
            print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
            continue

        ceid_by_region[locales.region_tag(locale)] = ceid
fetch_traits(EngineTraits engine_traits)