# SPDX-License-Identifier: AGPL-3.0-or-later
"""This module implements the Wikipedia engine. Some of these implementations
are shared by other engines:

- :ref:`wikidata engine`

The list of supported languages is :py:obj:`fetched <fetch_wikimedia_traits>` from
the article linked by :py:obj:`list_of_wikipedias`.

Unlike traditional search engines, Wikipedia does not maintain one site for all
languages; there is a separate Wikipedia for each supported language. Some of
these Wikipedias have a LanguageConverter_ enabled
(:py:obj:`rest_v1_summary_url`).

A LanguageConverter_ (LC) is a system based on language variants that
automatically converts the content of a page into a different variant. A
variant is mostly the same language in a different script.

- `Wikipedias in multiple writing systems`_
- `Automatic conversion between traditional and simplified Chinese characters`_

PR-2554_:
  The Wikipedia link returned by the API is still the same in all cases
  (`https://zh.wikipedia.org/wiki/出租車`_), but if your browser's
  ``Accept-Language`` is set to ``zh``, ``zh-CN``, ``zh-TW``, ``zh-HK`` or one
  of the other ``zh`` variants, Wikipedia's LC automatically returns the
  desired script in its web page.

  - You can test the API here: https://reqbin.com/gesg2kvx

.. _https://zh.wikipedia.org/wiki/出租車:
   https://zh.wikipedia.org/wiki/%E5%87%BA%E7%A7%9F%E8%BB%8A

To support Wikipedia's LanguageConverter_, a SearXNG request to Wikipedia uses
:py:obj:`get_wiki_params` and :py:obj:`wiki_lc_locale_variants` in the
:py:obj:`fetch_wikimedia_traits` function.

To test in SearXNG, query for ``!wp 出租車`` with each of the available Chinese
options:

- ``!wp 出租車 :zh`` should show 出租車
- ``!wp 出租車 :zh-CN`` should show 出租车
- ``!wp 出租車 :zh-TW`` should show 計程車
- ``!wp 出租車 :zh-HK`` should show 的士
- ``!wp 出租車 :zh-SG`` should show 德士

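The effect described in PR-2554_ can also be reproduced outside of SearXNG with
a plain HTTP client. The sketch below is illustrative only; it uses just the
Python standard library and the percent-encoded form of 出租車 shown above:

.. code:: python

   import json
   import urllib.request

   url = 'https://zh.wikipedia.org/api/rest_v1/page/summary/%E5%87%BA%E7%A7%9F%E8%BB%8A'
   for variant in ('zh', 'zh-CN', 'zh-TW', 'zh-HK', 'zh-SG'):
       req = urllib.request.Request(url, headers={'Accept-Language': variant})
       with urllib.request.urlopen(req) as resp:
           summary = json.load(resp)
       # fields such as 'title' and 'extract' follow the requested variant
       print(variant, summary['title'])
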
.. _LanguageConverter:
   https://www.mediawiki.org/wiki/Writing_systems#LanguageConverter
.. _Wikipedias in multiple writing systems:
   https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems
.. _Automatic conversion between traditional and simplified Chinese characters:
   https://en.wikipedia.org/wiki/Chinese_Wikipedia#Automatic_conversion_between_traditional_and_simplified_Chinese_characters
.. _PR-2554: https://github.com/searx/searx/pull/2554

"""

import urllib.parse
import babel

from lxml import html

from searx import utils
from searx import network as _network
from searx import locales
from searx.enginelib.traits import EngineTraits

# about
about = {
    "website": 'https://www.wikipedia.org/',
    "wikidata_id": 'Q52',
    "official_api_documentation": 'https://en.wikipedia.org/api/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

display_type = ["infobox"]
"""A list of display types composed from ``infobox`` and ``list``. The latter
adds a hit to the result list, the former shows a hit in the info box. Both
values can be set, or only one of the two; e.g. ``display_type = ["infobox",
"list"]`` enables both."""

send_accept_language_header = True
"""The HTTP ``Accept-Language`` header is needed for wikis where
LanguageConverter_ is enabled."""

list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
"""

wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators
of the encyclopedia's collaborative quality, showing how frequently its articles
are updated. The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""

rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""
`wikipedia rest_v1 summary API`_:
  The summary response includes an extract of the first paragraph of the page in
  plain text and HTML as well as the type of page. This is useful for page
  previews (fka. Hovercards, aka. Popups) on the web and link previews in the
  apps.

HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`):
  The desired language variant code for wikis where LanguageConverter_ is
  enabled.

.. _wikipedia rest_v1 summary API:
   https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_

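:py:obj:`request` fills the placeholders from :py:obj:`get_wiki_params` and the
percent-encoded query; an illustrative example (the netloc and title are made
up):

.. code:: python

   rest_v1_summary_url.format(
       wiki_netloc='en.wikipedia.org',
       title=urllib.parse.quote('Computer science'),
   )
   # --> 'https://en.wikipedia.org/api/rest_v1/page/summary/Computer%20science'
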
"""

wiki_lc_locale_variants = {
    "zh": (
        "zh-CN",
        "zh-HK",
        "zh-MO",
        "zh-MY",
        "zh-SG",
        "zh-TW",
    ),
    "zh-classical": ("zh-classical",),
}
"""Mapping rule of the LanguageConverter_ to map a language and its variants to
a Locale (used in the HTTP ``Accept-Language`` header). For example see `LC
Chinese`_.

.. _LC Chinese:
   https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems#Chinese
"""

wikipedia_script_variants = {
    "zh": (
        "zh_Hant",
        "zh_Hans",
    )
}
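"""Script variants of a Wikipedia: the SearXNG language tags ``zh_Hant`` and
``zh_Hans`` are registered as aliases of the ``zh`` Wikipedia in
:py:obj:`fetch_wikimedia_traits`."""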


def get_wiki_params(sxng_locale, eng_traits):
    """Returns the Wikipedia language tag and the netloc that fits to the
    ``sxng_locale``. To support LanguageConverter_ this function rates a locale
    (region) higher than a language (compare :py:obj:`wiki_lc_locale_variants`).

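    A hypothetical call, assuming the traits were built by
    :py:obj:`fetch_wikimedia_traits`:

    .. code:: python

       eng_tag, wiki_netloc = get_wiki_params('zh-TW', traits)
       # eng_tag     --> 'zh' (the zh-TW region is an alias of the zh Wikipedia)
       # wiki_netloc --> 'zh.wikipedia.org'
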
    """
    eng_tag = eng_traits.get_region(sxng_locale, eng_traits.get_language(sxng_locale, 'en'))
    wiki_netloc = eng_traits.custom['wiki_netloc'].get(eng_tag, 'en.wikipedia.org')
    return eng_tag, wiki_netloc


def request(query, params):
    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
    if query.islower():
        query = query.title()

    _eng_tag, wiki_netloc = get_wiki_params(params['searxng_locale'], traits)
    title = urllib.parse.quote(query)
    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)

    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2

    return params


# get response from search-request
def response(resp):

    results = []
    if resp.status_code == 404:
        return []
    if resp.status_code == 400:
        try:
            api_result = resp.json()
        except Exception:  # pylint: disable=broad-except
            pass
        else:
            if (
                api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request'
                and api_result['detail'] == 'title-invalid-characters'
            ):
                return []

    _network.raise_for_httperror(resp)

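    # A reduced, illustrative example of the summary payload parsed below (only
    # the fields used by this engine are shown, values are made up):
    #
    #   {
    #     "type": "standard",
    #     "title": "Zürich",
    #     "titles": {"display": "Zürich"},
    #     "description": "largest city in Switzerland",
    #     "extract": "Zürich is the largest city in Switzerland ...",
    #     "thumbnail": {"source": "https://upload.wikimedia.org/..."},
    #     "content_urls": {"desktop": {"page": "https://en.wikipedia.org/wiki/Z%C3%BCrich"}}
    #   }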
    api_result = resp.json()
    title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title'))
    wikipedia_link = api_result['content_urls']['desktop']['page']

    if "list" in display_type or api_result.get('type') != 'standard':
        # show the hit in the result list if 'list' is in the display options or
        # if it is an item that can't be displayed in an infobox.
        results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})

    if "infobox" in display_type:
        if api_result.get('type') == 'standard':
            results.append(
                {
                    'infobox': title,
                    'id': wikipedia_link,
                    'content': api_result.get('extract', ''),
                    'img_src': api_result.get('thumbnail', {}).get('source'),
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
                }
            )

    return results


# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).

lang_map = locales.LOCALE_BEST_MATCH.copy()
lang_map.update(
    {
        'be-tarask': 'bel',
        'ak': 'aka',
        'als': 'gsw',
        'bat-smg': 'sgs',
        'cbk-zam': 'cbk',
        'fiu-vro': 'vro',
        'map-bms': 'map',
        'no': 'nb-NO',
        'nrm': 'nrf',
        'roa-rup': 'rup',
        'nds-nl': 'nds',
        # 'simple': invented code used for the Simple English Wikipedia (not
        # the official IETF code en-simple)
        'zh-min-nan': 'nan',
        'zh-yue': 'yue',
        'an': 'arg',
    }
)
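# Example of the effect of lang_map (illustrative): the Alemannic Wikipedia is
# served from the subdomain als.wikipedia.org, but its content language is
# Swiss German, so its traits are stored under the babel tag 'gsw' (compare the
# reduced example in the fetch_wikimedia_traits docstring):
#
#   lang_map['als']  --> 'gsw'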


def fetch_traits(engine_traits: EngineTraits):
    fetch_wikimedia_traits(engine_traits)
    print("WIKIPEDIA_LANGUAGES: %s" % len(engine_traits.custom['WIKIPEDIA_LANGUAGES']))


def fetch_wikimedia_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia. Not all languages from the
    :py:obj:`list_of_wikipedias` are supported by SearXNG locales; only those
    known from :py:obj:`searx.locales.LOCALE_NAMES` or those with a minimal
    :py:obj:`editing depth <wikipedia_article_depth>` are added.

    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``). Here is a reduced example:

    .. code:: python

       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "gsw": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }
    """
    # pylint: disable=too-many-branches
    engine_traits.custom['wiki_netloc'] = {}
    engine_traits.custom['WIKIPEDIA_LANGUAGES'] = []

    # insert alias to map from a script or region to a wikipedia variant

    for eng_tag, sxng_tag_list in wikipedia_script_variants.items():
        for sxng_tag in sxng_tag_list:
            engine_traits.languages[sxng_tag] = eng_tag
    for eng_tag, sxng_tag_list in wiki_lc_locale_variants.items():
        for sxng_tag in sxng_tag_list:
            engine_traits.regions[sxng_tag] = eng_tag
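    # After these loops the alias tables contain entries like the following
    # (derived from the two mappings above):
    #
    #   engine_traits.languages['zh_Hant'] == 'zh'
    #   engine_traits.regions['zh-TW']     == 'zh'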

    resp = _network.get(list_of_wikipedias)
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            continue
        cols = [c.text_content().strip() for c in cols]

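        # Columns of the table on the list_of_wikipedias page that are used
        # below (index positions as parsed at the time of writing, they may
        # shift if the wiki table changes):
        #   cols[3]  language (subdomain) code
        #   cols[4]  article count
        #   cols[11] editing depth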
        depth = float(cols[11].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', ''))

        eng_tag = cols[3]
        wiki_url = row.xpath('./td[4]/a/@href')[0]
        wiki_url = urllib.parse.urlparse(wiki_url)

        try:
            sxng_tag = locales.language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
            # print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue
        finally:
            engine_traits.custom['WIKIPEDIA_LANGUAGES'].append(eng_tag)

        if sxng_tag not in locales.LOCALE_NAMES:

            if articles < 10000:
                # exclude languages with too few articles
                continue

            if int(depth) < 20:
                # Rough indicator of a Wikipedia’s quality, showing how
                # frequently its articles are updated.
                continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue

        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc

    engine_traits.custom['WIKIPEDIA_LANGUAGES'].sort()