.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
presearch.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Presearch supports the search types listed in :py:obj:`search_type` (general,
3images, videos, news).
4
5Configured ``presearch`` engines:
6
7.. code:: yaml
8
9 - name: presearch
10 engine: presearch
11 search_type: search
12 categories: [general, web]
13
14 - name: presearch images
15 ...
16 search_type: images
17 categories: [images, web]
18
19 - name: presearch videos
20 ...
21 search_type: videos
22 categories: [general, web]
23
24 - name: presearch news
25 ...
26 search_type: news
27 categories: [news, web]
28
29.. hint::
30
31 By default Presearch's video category is intentionally placed into::
32
33 categories: [general, web]
34
35
36Search type ``video``
37=====================
38
39The results in the video category are most often links to pages that contain a
40video, for instance many links from Presearch's video category link content
41from facebook (aka Meta) or Twitter (aka X). Since these are not real links to
42video streams SearXNG can't use the video template for this and if SearXNG can't
43use this template, then the user doesn't want to see these hits in the videos
44category.
45
46
47Languages & Regions
48===================
49
50In Presearch there are languages for the UI and regions for narrowing down the
51search. If we set "auto" for the region in the WEB-UI of Presearch and cookie
52``use_local_search_results=false``, then the defaults are set for both (the
53language and the region) from the ``Accept-Language`` header.
54
55Since the region is already "auto" by default, we only need to set the
56``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
57have to set these values in both requests we send to Presearch; in the first
58request to get the request-ID from Presearch and in the final request to get the
59result list (see ``send_accept_language_header``).
60
61The time format returned by Presearch varies depending on the language set.
62Multiple different formats can be supported by using ``dateutil`` parser, but
63it doesn't support formats such as "N time ago", "vor N time" (German),
64"Hace N time" (Spanish). Because of this, the dates are simply joined together
65with the rest of the metadata.
66
67
68Implementations
69===============
70
71"""
72
73from urllib.parse import urlencode, urlparse
74from searx import locales
75from searx.network import get
76from searx.utils import gen_useragent, html_to_text, parse_duration_string
77
# Engine metadata read by SearXNG's engine framework.
about = {
    "website": "https://presearch.io",
    # fixed typo: the framework expects the key ``wikidata_id``
    "wikidata_id": "Q7240905",
    "official_api_documentation": "https://docs.presearch.io/nodes/api",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
categories = ["general", "web"]  # general, images, videos, news

search_type = "search"
"""must be any of ``search``, ``images``, ``videos``, ``news``"""

base_url = "https://presearch.com"
# Presearch's safe-search cookie accepts only 'true' / 'false'; SearXNG's
# moderate level (1) maps to 'true' as well.
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}
97
98
def init(_):
    """Validate the engine configuration: ``search_type`` must be one of the
    values Presearch supports."""
    valid_types = ('search', 'images', 'videos', 'news')
    if search_type not in valid_types:
        raise ValueError(f'presearch search_type: {search_type}')
102
103
def _get_request_id(query, params):
    """Fetch Presearch's HTML search page and extract the request-ID from it.

    Returns a tuple ``(request_id, cookies)``; the cookies of this first
    round-trip have to be sent along with the final JSON request (see
    :py:obj:`request`).

    Raises ``RuntimeError`` when no ``window.searchId`` assignment can be
    found in the HTML page.
    """
    query_args = {
        "q": query,
        "page": params["pageno"],
    }
    if params["time_range"]:
        query_args["time"] = params["time_range"]

    url = f"{base_url}/{search_type}?{urlencode(query_args)}"

    # ``use_local_search_results=false`` keeps Presearch from narrowing the
    # search by an IP-based geolocation (see module docstring).
    cookie = (
        "b=1;"
        " presearch_session=;"
        " use_local_search_results=false;"
        f" use_safe_search={safesearch_map[params['safesearch']]}"
    )
    headers = {
        'User-Agent': gen_useragent(),
        'Cookie': cookie,
    }

    if params['searxng_locale'] != 'all':
        locale = locales.get_locale(params['searxng_locale'])

        # Presearch narrows down the search by region.  In SearXNG, when the
        # user does not set a region (e.g. 'en-CA' / canada) we cannot hand
        # over a region.
        #
        # We could possibly use searx.locales.get_official_locales to
        # determine in which regions this language is an official one, but
        # then we still wouldn't know which region should be given more
        # weight / Presearch performs an IP-based geolocation of the user, we
        # don't want that in SearXNG ;-)
        if locale.territory:
            headers['Accept-Language'] = f"{locale.language}-{locale.territory},{locale.language};q=0.9,*;q=0.5"

    resp = get(url, headers=headers)

    # The request-ID is embedded in an inline script of the HTML page.
    for line in resp.text.split("\n"):
        if "window.searchId = " in line:
            return line.split("= ")[1][:-1].replace('"', ""), resp.cookies

    raise RuntimeError("Couldn't find any request id for presearch")
147
148
def request(query, params):
    """Build the final JSON request: a first round-trip to Presearch yields
    the request-ID (and session cookies) needed to fetch the result list."""
    request_id, cookies = _get_request_id(query, params)

    params["headers"]["Accept"] = "application/json"
    params["cookies"] = cookies
    params["url"] = f"{base_url}/results?id={request_id}"
    return params
156
157
159 for x in ['wikipedia', 'google']:
160 if text.lower().endswith(x):
161 text = text[: -len(x)]
162 return text.strip()
163
164
def _fix_title(title, url):
    """Return *title* cleaned from HTML markup and from a glued-on domain.

    Presearch prefixes titles with the result's domain without any spacing
    and includes HTML markup, e.g.
    ``translate.google.co.in<em>Google</em> Translate`` becomes
    ``Google Translate``.
    """
    domain = urlparse(url).netloc
    title = html_to_text(title)

    # Drop the domain only when it is glued directly to the real title: a
    # following "/" or " " means the title genuinely starts with the domain.
    glued_domain = (
        len(title) > len(domain)
        and title.startswith(domain)
        and not title.startswith((domain + "/", domain + " "))
    )
    if glued_domain:
        title = title.removeprefix(domain)
    return title
184
185
def parse_search_query(json_results):
    """Parse the JSON body of a ``search_type: search`` response into a
    SearXNG result list: top stories, standard web results and (optionally)
    an infobox built from Presearch's "info section".
    """
    results = []
    if not json_results:
        return results

    for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
        result = {
            'url': item['link'],
            'title': _fix_title(item['title'], item['link']),
            'thumbnail': item['image'],
            'content': '',
            'metadata': item.get('source'),
        }
        results.append(result)

    for item in json_results.get('standardResults', []):
        result = {
            'url': item['link'],
            'title': _fix_title(item['title'], item['link']),
            # 'description' is not guaranteed --> use .get() like the other
            # optional fields to avoid a KeyError
            'content': html_to_text(item.get('description', '')),
        }
        results.append(result)

    info = json_results.get('infoSection', {}).get('data')
    if info:
        attributes = []
        for item in info.get('about', []):

            text = html_to_text(item)
            if ':' in text:
                # split text into key / value
                label, value = text.split(':', 1)
            elif ' ' in text:
                # In other languages (tested with zh-TW) a colon is
                # represented by a different symbol --> then we split at the
                # first space and drop that symbol from the label.
                label, value = text.split(' ', 1)
                label = label[:-1]
            else:
                # single token without any separator --> there is no
                # key/value pair to extract (previously raised a ValueError)
                continue

            value = _strip_leading_strings(value)
            attributes.append({'label': label, 'value': value})

        content = []
        for item in [info.get('subtitle'), info.get('description')]:
            if not item:
                continue
            item = _strip_leading_strings(html_to_text(item))
            if item:
                content.append(item)

        results.append(
            {
                'infobox': info['title'],
                'id': info['title'],
                'img_src': info.get('image'),
                'content': ' | '.join(content),
                'attributes': attributes,
            }
        )
    return results
244
245
def response(resp):
    """Dispatch the JSON response to the parsing matching the configured
    :py:obj:`search_type` and return the SearXNG result list."""
    results = []
    json_resp = resp.json()

    if search_type == 'search':
        results = parse_search_query(json_resp.get('results', {}))

    elif search_type == 'images':
        for item in json_resp.get('images', []):
            results.append(
                {
                    'template': 'images.html',
                    'title': html_to_text(item['title']),
                    'url': item.get('link'),
                    'img_src': item.get('image'),
                    'thumbnail_src': item.get('thumbnail'),
                }
            )

    elif search_type == 'videos':
        # The results in the video category are most often links to pages that contain
        # a video and not to a video stream --> SearXNG can't use the video template.

        for item in json_resp.get('videos', []):
            duration = item.get('duration')
            if duration:
                duration = parse_duration_string(duration)

            results.append(
                {
                    'title': html_to_text(item['title']),
                    'url': item.get('link'),
                    # descriptions contain HTML markup just like in the other
                    # categories --> strip it for consistency
                    'content': html_to_text(item.get('description', '')),
                    'thumbnail': item.get('image'),
                    'length': duration,
                }
            )

    elif search_type == 'news':
        for item in json_resp.get('news', []):
            source = item.get('source')
            # Bug on their end, time sometimes returns "</a>"; the field may
            # also be missing entirely --> default to an empty string.
            time = html_to_text(item.get('time', '')).strip()
            # keep only non-empty metadata items (source may be None, which
            # previously crashed the ' / '.join below)
            metadata = [x for x in (source, time) if x]

            results.append(
                {
                    'title': html_to_text(item['title']),
                    'url': item.get('link'),
                    'content': html_to_text(item.get('description', '')),
                    'metadata': ' / '.join(metadata),
                    'thumbnail': item.get('image'),
                }
            )

    return results
parse_search_query(json_results)
Definition presearch.py:186
request(query, params)
Definition presearch.py:149
_get_request_id(query, params)
Definition presearch.py:104