wikipedia.py
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This module implements the Wikipedia engine. Parts of this implementation
are shared by other engines:

- :ref:`wikidata engine`

The list of supported languages is :py:obj:`fetched <fetch_wikimedia_traits>` from
the article linked by :py:obj:`list_of_wikipedias`.

Unlike traditional search engines, there is not one Wikipedia for all languages;
instead, there is a separate Wikipedia for each supported language. Some of
these Wikipedias have a LanguageConverter_ enabled
(:py:obj:`rest_v1_summary_url`).

A LanguageConverter_ (LC) is a system based on language variants that
automatically converts the content of a page into a different variant. A variant
is mostly the same language in a different script.

- `Wikipedias in multiple writing systems`_
- `Automatic conversion between traditional and simplified Chinese characters`_

PR-2554_:
  The Wikipedia link returned by the API is still the same in all cases
  (`https://zh.wikipedia.org/wiki/出租車`_), but if your browser's
  ``Accept-Language`` is set to one of ``zh``, ``zh-CN``, ``zh-TW``, ``zh-HK``
  and so on, Wikipedia's LC automatically returns the desired script in its
  web page.

  - You can test the API here: https://reqbin.com/gesg2kvx (compare the sketch
    below)

.. _https://zh.wikipedia.org/wiki/出租車:
   https://zh.wikipedia.org/wiki/%E5%87%BA%E7%A7%9F%E8%BB%8A

To support Wikipedia's LanguageConverter_, a SearXNG request to Wikipedia uses
:py:obj:`get_wiki_params` and :py:obj:`wiki_lc_locale_variants` in the
:py:obj:`fetch_wikimedia_traits` function.

To test in SearXNG, query for ``!wp 出租車`` with each of the available Chinese
options:

- ``!wp 出租車 :zh`` should show 出租車
- ``!wp 出租車 :zh-CN`` should show 出租车
- ``!wp 出租車 :zh-TW`` should show 計程車
- ``!wp 出租車 :zh-HK`` should show 的士
- ``!wp 出租車 :zh-SG`` should show 德士

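A minimal sketch of such an API request outside of SearXNG (standard library
only); the URL-encoded title is the one from the PR-2554_ example above, the
``Accept-Language`` value selects the variant, everything else is illustrative:

.. code:: python

   import urllib.request

   req = urllib.request.Request(
       'https://zh.wikipedia.org/api/rest_v1/page/summary/%E5%87%BA%E7%A7%9F%E8%BB%8A',
       headers={'Accept-Language': 'zh-TW'},  # ask the LanguageConverter for zh-TW
   )
   with urllib.request.urlopen(req) as resp:
       print(resp.read().decode('utf-8'))  # summary JSON, extract in the zh-TW variant
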
.. _LanguageConverter:
   https://www.mediawiki.org/wiki/Writing_systems#LanguageConverter
.. _Wikipedias in multiple writing systems:
   https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems
.. _Automatic conversion between traditional and simplified Chinese characters:
   https://en.wikipedia.org/wiki/Chinese_Wikipedia#Automatic_conversion_between_traditional_and_simplified_Chinese_characters
.. _PR-2554: https://github.com/searx/searx/pull/2554

"""

import urllib.parse
import babel

from lxml import html

from searx import utils
from searx import network as _network
from searx import locales
from searx.enginelib.traits import EngineTraits

traits: EngineTraits

# about
about = {
    "website": 'https://www.wikipedia.org/',
    "wikidata_id": 'Q52',
    "official_api_documentation": 'https://en.wikipedia.org/api/',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

display_type = ["infobox"]
"""A list of display types composed from ``infobox`` and ``list``. ``list`` adds
a hit to the result list, ``infobox`` shows a hit in the infobox. One or both
values can be set."""

send_accept_language_header = True
"""The HTTP ``Accept-Language`` header is needed for wikis where
LanguageConverter_ is enabled."""

list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
"""

wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators
of the encyclopedia's collaborative quality, showing how frequently its articles
are updated. The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""

rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""
`wikipedia rest_v1 summary API`_:
  The summary response includes an extract of the first paragraph of the page in
  plain text and HTML as well as the type of page. This is useful for page
  previews (fka. Hovercards, aka. Popups) on the web and link previews in the
  apps.

HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`):
  The desired language variant code for wikis where LanguageConverter_ is
  enabled.

.. _wikipedia rest_v1 summary API:
   https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_

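An abridged, illustrative sketch of such a response; only the fields that are
evaluated in :py:obj:`response` are shown and the values are placeholders:

.. code:: python

   {
       "type": "standard",
       "title": "Taxi",
       "titles": {"display": "Taxi"},
       "description": "...",
       "extract": "...",
       "thumbnail": {"source": "https://upload.wikimedia.org/..."},
       "content_urls": {"desktop": {"page": "https://en.wikipedia.org/wiki/Taxi"}},
   }
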
114"""
115
116wiki_lc_locale_variants = {
117 "zh": (
118 "zh-CN",
119 "zh-HK",
120 "zh-MO",
121 "zh-MY",
122 "zh-SG",
123 "zh-TW",
124 ),
125 "zh-classical": ("zh-classical",),
126}
127"""Mapping rule of the LanguageConverter_ to map a language and its variants to
128a Locale (used in the HTTP ``Accept-Language`` header). For example see `LC
129Chinese`_.
130
131.. _LC Chinese:
132 https://meta.wikimedia.org/wiki/Wikipedias_in_multiple_writing_systems#Chinese
133"""
134
135wikipedia_script_variants = {
136 "zh": (
137 "zh_Hant",
138 "zh_Hans",
139 )
140}


def get_wiki_params(sxng_locale, eng_traits):
    """Returns the Wikipedia language tag and the netloc that fits to the
    ``sxng_locale``. To support LanguageConverter_ this function rates a locale
    (region) higher than a language (compare :py:obj:`wiki_lc_locale_variants`).

    """
    eng_tag = eng_traits.get_region(sxng_locale, eng_traits.get_language(sxng_locale, 'en'))
    wiki_netloc = eng_traits.custom['wiki_netloc'].get(eng_tag, 'en.wikipedia.org')
    return eng_tag, wiki_netloc


def request(query, params):
    """Assemble a request (`wikipedia rest_v1 summary API`_)."""
    if query.islower():
        # article titles on Wikipedia are capitalized; title-case an
        # all-lowercase query before it is used as the article title
        query = query.title()

    _eng_tag, wiki_netloc = get_wiki_params(params['searxng_locale'], traits)
    title = urllib.parse.quote(query)
    params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)

    params['raise_for_httperror'] = False
    params['soft_max_redirects'] = 2

    return params

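# Illustrative only: for the query 出租車 and the SearXNG locale zh-TW the
# assembled URL is
#
#     https://zh.wikipedia.org/api/rest_v1/page/summary/%E5%87%BA%E7%A7%9F%E8%BB%8A
#
# The desired variant (zh-TW) is carried by the HTTP Accept-Language header,
# see send_accept_language_header.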

# get response from search-request
def response(resp):

    results = []
    if resp.status_code == 404:
        return []
    if resp.status_code == 400:
        try:
            api_result = resp.json()
        except Exception:  # pylint: disable=broad-except
            pass
        else:
            if (
                api_result['type'] == 'https://mediawiki.org/wiki/HyperSwitch/errors/bad_request'
                and api_result['detail'] == 'title-invalid-characters'
            ):
                return []

    _network.raise_for_httperror(resp)

    api_result = resp.json()
    title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title'))
    wikipedia_link = api_result['content_urls']['desktop']['page']

    if "list" in display_type or api_result.get('type') != 'standard':
        # show the item in the result list if 'list' is in the display options
        # or if it is an item that can't be displayed in an infobox.
        results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})

    if "infobox" in display_type:
        if api_result.get('type') == 'standard':
            results.append(
                {
                    'infobox': title,
                    'id': wikipedia_link,
                    'content': api_result.get('extract', ''),
                    'img_src': api_result.get('thumbnail', {}).get('source'),
                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}],
                }
            )

    return results


# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).

lang_map = locales.LOCALE_BEST_MATCH.copy()
lang_map.update(
    {
        'be-tarask': 'bel',
        'ak': 'aka',
        'als': 'gsw',
        'bat-smg': 'sgs',
        'cbk-zam': 'cbk',
        'fiu-vro': 'vro',
        'map-bms': 'map',
        'no': 'nb-NO',
        'nrm': 'nrf',
        'roa-rup': 'rup',
        'nds-nl': 'nds',
        # 'simple': invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
        'zh-min-nan': 'nan',
        'zh-yue': 'yue',
        'an': 'arg',
    }
)
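
# Illustrative: the nonstandard subdomain ``als`` (als.wikipedia.org) is mapped
# to the language tag ``gsw`` before the tag is parsed by babel in
# fetch_wikimedia_traits(); compare the ``wiki_netloc`` example in its docstring.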


def fetch_traits(engine_traits: EngineTraits):
    fetch_wikimedia_traits(engine_traits)
    print("WIKIPEDIA_LANGUAGES: %s" % len(engine_traits.custom['WIKIPEDIA_LANGUAGES']))


def fetch_wikimedia_traits(engine_traits: EngineTraits):
    """Fetch languages from Wikipedia. Not all languages from the
    :py:obj:`list_of_wikipedias` are supported by SearXNG locales, only those
    known from :py:obj:`searx.locales.LOCALE_NAMES` or those with a minimal
    :py:obj:`editing depth <wikipedia_article_depth>`.

    The location of the Wikipedia address of a language is mapped in a
    :py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
    (``wiki_netloc``). Here is a reduced example:

    .. code:: python

       traits.custom['wiki_netloc'] = {
           "en": "en.wikipedia.org",
           ..
           "gsw": "als.wikipedia.org",
           ..
           "zh": "zh.wikipedia.org",
           "zh-classical": "zh-classical.wikipedia.org"
       }
    """
    # pylint: disable=too-many-branches
    engine_traits.custom['wiki_netloc'] = {}
    engine_traits.custom['WIKIPEDIA_LANGUAGES'] = []

    # insert alias to map from a script or region to a wikipedia variant

    for eng_tag, sxng_tag_list in wikipedia_script_variants.items():
        for sxng_tag in sxng_tag_list:
            engine_traits.languages[sxng_tag] = eng_tag
    for eng_tag, sxng_tag_list in wiki_lc_locale_variants.items():
        for sxng_tag in sxng_tag_list:
            engine_traits.regions[sxng_tag] = eng_tag

    resp = _network.get(list_of_wikipedias)
    if not resp.ok:
        print("ERROR: response from Wikipedia is not OK.")

    dom = html.fromstring(resp.text)
    for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):

        cols = row.xpath('./td')
        if not cols:
            continue
        cols = [c.text_content().strip() for c in cols]

        # depth and article count are used below to filter out languages that
        # are not listed in searx.locales.LOCALE_NAMES
        depth = float(cols[11].replace('-', '0').replace(',', ''))
        articles = int(cols[4].replace(',', ''))

        eng_tag = cols[3]
        wiki_url = row.xpath('./td[4]/a/@href')[0]
        wiki_url = urllib.parse.urlparse(wiki_url)

        try:
            sxng_tag = locales.language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
        except babel.UnknownLocaleError:
            # print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
            continue
        finally:
            engine_traits.custom['WIKIPEDIA_LANGUAGES'].append(eng_tag)

        if sxng_tag not in locales.LOCALE_NAMES:

            if articles < 10000:
                # exclude languages with too few articles
                continue

            if int(depth) < 20:
                # Rough indicator of a Wikipedia's quality, showing how
                # frequently its articles are updated.
                continue

        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_tag:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
            continue

        engine_traits.languages[sxng_tag] = eng_tag
        engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc

    engine_traits.custom['WIKIPEDIA_LANGUAGES'].sort()