.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
presearch.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Presearch supports the search types listed in :py:obj:`search_type` (general,
3images, videos, news).
4
5Configured ``presearch`` engines:
6
7.. code:: yaml
8
9 - name: presearch
10 engine: presearch
11 search_type: search
12 categories: [general, web]
13
14 - name: presearch images
15 ...
16 search_type: images
17 categories: [images, web]
18
19 - name: presearch videos
20 ...
21 search_type: videos
22 categories: [general, web]
23
24 - name: presearch news
25 ...
26 search_type: news
27 categories: [news, web]
28
29.. hint::
30
31 By default Presearch's video category is intentionally placed into::
32
33 categories: [general, web]
34
35
36Search type ``videos``
37======================
38
39The results in the video category are most often links to pages that contain a
40video, for instance many links from Presearch's video category link content
41from facebook (aka Meta) or Twitter (aka X). Since these are not real links to
42video streams SearXNG can't use the video template for this and if SearXNG can't
43use this template, then the user doesn't want to see these hits in the videos
44category.
45
46
47Languages & Regions
48===================
49
50In Presearch there are languages for the UI and regions for narrowing down the
51search. If we set "auto" for the region in the WEB-UI of Presearch and cookie
52``use_local_search_results=false``, then the defaults are set for both (the
53language and the region) from the ``Accept-Language`` header.
54
55Since the region is already "auto" by default, we only need to set the
56``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
57have to set these values in both requests we send to Presearch; in the first
58request to get the request-ID from Presearch and in the final request to get the
59result list (see ``send_accept_language_header``).
60
61
62Implementations
63===============
64
65"""
66
67from urllib.parse import urlencode
68from searx import locales
69from searx.network import get
70from searx.utils import gen_useragent, html_to_text
71
# Engine metadata shown in SearXNG's preferences / about pages.
about = {
    "website": "https://presearch.io",
    # NOTE: key was misspelled "wikidiata_id"; SearXNG's about-metadata
    # convention expects "wikidata_id" (Q7240905 = Presearch).
    "wikidata_id": "Q7240905",
    "official_api_documentation": "https://docs.presearch.io/nodes/api",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
categories = ["general", "web"]  # general, images, videos, news

search_type = "search"
"""must be any of ``search``, ``images``, ``videos``, ``news``"""

base_url = "https://presearch.com"
# Presearch's safe-search cookie only knows on/off -> map SearXNG's
# moderate (1) and strict (2) levels both to 'true'.
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
92
def init(_):
    """Engine-init hook: fail early when the configured ``search_type`` is
    not one of the types this engine supports."""
    supported = ('search', 'images', 'videos', 'news')
    if search_type not in supported:
        raise ValueError(f'presearch search_type: {search_type}')
96
97
def _get_request_id(query, params):
    """First of the two Presearch requests: fetch the search page and scrape
    the ``window.searchId`` value out of it, or return ``None`` when no
    request-ID can be found."""
    query_args = {"q": query, "page": params["pageno"]}
    if params["time_range"]:
        query_args["time"] = params["time_range"]

    url = f"{base_url}/{search_type}?{urlencode(query_args)}"

    # use_local_search_results=false -> don't let Presearch geolocate us,
    # region/language are driven by the Accept-Language header instead.
    cookie = (
        "b=1;"
        " presearch_session=;"
        " use_local_search_results=false;"
        f" use_safe_search={safesearch_map[params['safesearch']]}"
    )
    headers = {'User-Agent': gen_useragent(), 'Cookie': cookie}

    if params['searxng_locale'] != 'all':
        loc = locales.get_locale(params['searxng_locale'])

        # Presearch narrows down the search by region. In SearXNG when the user
        # does not set a region (e.g. 'en-CA' / canada) we cannot hand over a region.

        # We could possibly use searx.locales.get_official_locales to determine
        # in which regions this language is an official one, but then we still
        # wouldn't know which region should be given more weight / Presearch
        # performs an IP-based geolocation of the user, we don't want that in
        # SearXNG ;-)

        if loc.territory:
            headers['Accept-Language'] = f"{loc.language}-{loc.territory},{loc.language};q=0.9,*;q=0.5"

    resp_text = get(url, headers=headers).text  # type: ignore

    # keep split("\n") (not splitlines) so a possible trailing '\r' is what
    # the [:-1] below chops off, exactly like the original scraping did
    for line in resp_text.split("\n"):
        if "window.searchId = " in line:
            return line.split("= ")[1][:-1].replace('"', "")

    return None
141
142
def request(query, params):
    """Second of the two Presearch requests: ask for the JSON result list
    belonging to the request-ID obtained by :py:obj:`_get_request_id`."""
    req_id = _get_request_id(query, params)
    params["headers"]["Accept"] = "application/json"
    params["url"] = f"{base_url}/results?id={req_id}"
    return params
149
150
152 for x in ['wikipedia', 'google']:
153 if text.lower().endswith(x):
154 text = text[: -len(x)]
155 return text.strip()
156
157
def parse_search_query(json_results):
    """Parse the JSON body of a Presearch ``search`` type response.

    Returns, in this order: the *top stories* compact items, the *standard*
    web results and --- when the response carries an ``infoSection`` ---
    one infobox result built from it.
    """
    results = []

    for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
        result = {
            'url': item['link'],
            'title': item['title'],
            'thumbnail': item['image'],
            'content': '',
            'metadata': item.get('source'),
        }
        results.append(result)

    for item in json_results.get('standardResults', []):
        result = {
            'url': item['link'],
            'title': item['title'],
            'content': html_to_text(item['description']),
        }
        results.append(result)

    info = json_results.get('infoSection', {}).get('data')
    if info:
        attributes = []
        for item in info.get('about', []):

            text = html_to_text(item)
            if ':' in text:
                # split text into key / value
                label, value = text.split(':', 1)
            elif ' ' in text:
                # In other languages (tested with zh-TW) a colon is represented
                # by a different symbol --> then we split at the first space
                # and drop that symbol from the end of the label.
                label, value = text.split(' ', 1)
                label = label[:-1]
            else:
                # neither a colon nor a space: no key/value pair to build;
                # skipping avoids the ValueError an unguarded split raises
                continue

            value = _strip_leading_strings(value)
            attributes.append({'label': label, 'value': value})

        content = []
        for item in [info.get('subtitle'), info.get('description')]:
            if not item:
                continue
            item = _strip_leading_strings(html_to_text(item))
            if item:
                content.append(item)

        results.append(
            {
                'infobox': info['title'],
                'id': info['title'],
                'img_src': info.get('image'),
                'content': ' | '.join(content),
                'attributes': attributes,
            }
        )
    return results
214
215
def response(resp):
    """Build SearXNG results from a Presearch JSON response.

    Dispatches on the module-level :py:obj:`search_type`: ``search`` is
    delegated to :py:obj:`parse_search_query`, the other types are parsed
    inline.  Returns an empty list for an unknown ``search_type``.
    """
    results = []
    json_resp = resp.json()

    if search_type == 'search':
        # guard a missing 'results' key: .get() would hand None to
        # parse_search_query, which expects a dict
        results = parse_search_query(json_resp.get('results') or {})

    elif search_type == 'images':
        for item in json_resp.get('images', []):
            results.append(
                {
                    'template': 'images.html',
                    'title': item['title'],
                    'url': item.get('link'),
                    'img_src': item.get('image'),
                    'thumbnail_src': item.get('thumbnail'),
                }
            )

    elif search_type == 'videos':
        # The results in the video category are most often links to pages that contain
        # a video and not to a video stream --> SearXNG can't use the video template.

        for item in json_resp.get('videos', []):
            metadata = [x for x in [item.get('description'), item.get('duration')] if x]
            results.append(
                {
                    'title': item['title'],
                    'url': item.get('link'),
                    'content': '',
                    'metadata': ' / '.join(metadata),
                    'thumbnail': item.get('image'),
                }
            )

    elif search_type == 'news':
        for item in json_resp.get('news', []):
            metadata = [x for x in [item.get('source'), item.get('time')] if x]
            results.append(
                {
                    'title': item['title'],
                    'url': item.get('link'),
                    'content': item.get('description', ''),
                    'metadata': ' / '.join(metadata),
                    'thumbnail': item.get('image'),
                }
            )

    return results
parse_search_query(json_results)
Definition presearch.py:158
request(query, params)
Definition presearch.py:143
_get_request_id(query, params)
Definition presearch.py:98