.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
mediawiki.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
3the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have
4endpoints that follow this pattern::
5
6 https://{base_url}/w/api.php?action=query&list=search&format=json
7
8.. note::
9
10 In its current state, this engine is implemented to parse the JSON result
11 (`format=json`_) from a search query (`list=search`_). If you need other
12 ``action`` and ``list`` types ask SearXNG developers to extend the
13 implementation according to your needs.
14
15.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
16.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
17.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
18.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
19
20Configuration
21=============
22
23Request:
24
25- :py:obj:`base_url`
26- :py:obj:`search_type`
27- :py:obj:`srenablerewrites`
28- :py:obj:`srsort`
29- :py:obj:`srprop`
30
31Implementations
32===============
33
34"""
35from __future__ import annotations
36from typing import TYPE_CHECKING
37
38from datetime import datetime
39from urllib.parse import urlencode, quote
40
41from searx.utils import html_to_text
42from searx.enginelib.traits import EngineTraits
43
44if TYPE_CHECKING:
45 import logging
46
47 logger: logging.Logger
48
49traits: EngineTraits
50
# about
about = {
    "website": None,
    "wikidata_id": None,
    "official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
    "use_official_api": True,
    "require_api_key": False,
    "results": 'JSON',
}

# engine dependent config
categories = ['general']
paging = True
number_of_results = 5

search_type: str = 'nearmatch'
"""Which type of search to perform.  One of the following values: ``nearmatch``,
``text`` or ``title``.

See ``srwhat`` argument in `list=search`_ documentation.
"""

srenablerewrites: bool = True
"""Enable internal query rewriting (Type: boolean).  Some search backends can
rewrite the query into another which is thought to provide better results, for
instance by correcting spelling errors.

See ``srenablerewrites`` argument in `list=search`_ documentation.
"""

srsort: str = 'relevance'
"""Set the sort order of returned results.  One of the following values:
``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
``none``, ``random``, ``relevance``, ``user_random``.

See ``srsort`` argument in `list=search`_ documentation.
"""

srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
"""Which properties to return.

See ``srprop`` argument in `list=search`_ documentation.
"""

base_url: str = 'https://{language}.wikipedia.org/'
"""Base URL of the Wikimedia wiki.

``{language}``:
    ISO 639-1 language code (en, de, fr ..) of the search language.
"""

api_path: str = 'w/api.php'
"""The path the PHP api is listening on.

The default path should work fine usually.
"""

timestamp_format: str = '%Y-%m-%dT%H:%M:%SZ'
"""The longhand version of MediaWiki time strings."""
111
112
def request(query, params):
    """Assemble the ``list=search`` API request for *query*.

    The search language in ``params['language']`` is reduced to its bare
    language code (``all`` falls back to ``en``) and written back, then
    ``params['url']`` is set to the API endpoint plus the URL-encoded query
    arguments.  The mutated ``params`` dict is returned.
    """

    # The normalized language is stored back in ``params``; ``response``
    # reads the language from ``resp.search_params`` to build result URLs.
    lang = params['language']
    params['language'] = 'en' if lang == 'all' else lang.split('-')[0]

    # Substitute ``{language}`` only after the endpoint has been joined, so
    # the placeholder is resolved over the complete URL template.
    endpoint = (base_url.rstrip('/') + '/' + api_path + '?').format(language=params['language'])

    args = {
        'action': 'query',
        'list': 'search',
        'format': 'json',
        'srsearch': query,
        # results are paged in steps of ``number_of_results``
        'sroffset': (params['pageno'] - 1) * number_of_results,
        'srlimit': number_of_results,
        'srwhat': search_type,
        'srprop': srprop,
        'srsort': srsort,
    }
    # the flag is only sent when query rewriting is enabled
    if srenablerewrites:
        args['srenablerewrites'] = '1'

    params['url'] = endpoint + urlencode(args)
    return params
141
142
# get response from search-request
def response(resp):
    """Convert the JSON answer of a ``list=search`` query into result items.

    Each hit becomes a dict with ``url``, ``title``, ``content`` and
    ``metadata`` keys, plus ``publishedDate`` when the hit carries a
    timestamp.  Hits whose snippet starts with ``#REDIRECT`` are skipped.  An
    empty list is returned when the answer contains no search hits.
    """

    search_results = resp.json()

    # return empty array if there are no results
    hits = search_results.get('query', {}).get('search')
    if not hits:
        return []

    # The page URL prefix is the same for every hit.  Normalizing the trailing
    # slash keeps the URL valid when ``base_url`` is configured without one
    # (``request`` tolerates that form as well).
    wiki_url = base_url.format(language=resp.search_params['language']).rstrip('/') + '/wiki/'

    results = []
    for result in hits:

        if result.get('snippet', '').startswith('#REDIRECT'):
            continue

        title = result['title']
        sectiontitle = result.get('sectiontitle')
        content = html_to_text(result.get('snippet', ''))
        metadata = html_to_text(result.get('categorysnippet', ''))
        timestamp = result.get('timestamp')

        url = wiki_url + quote(title.replace(' ', '_').encode())
        if sectiontitle:
            # in case of sectiontitle create a link to the section in the wiki page
            url += '#' + quote(sectiontitle.replace(' ', '_').encode())
            title += ' / ' + sectiontitle

        item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}

        if timestamp:
            item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)

        results.append(item)

    # return results
    return results
request(query, params)
Definition mediawiki.py:113