# SPDX-License-Identifier: AGPL-3.0-or-later
"""Presearch supports the search types listed in :py:obj:`search_type` (general,
images, videos, news).

Configured ``presearch`` engines:

.. code:: yaml

  - name: presearch
    engine: presearch
    search_type: search
    categories: [general, web]

  - name: presearch images
    ...
    search_type: images
    categories: [images, web]

  - name: presearch videos
    ...
    search_type: videos
    categories: [general, web]

  - name: presearch news
    ...
    search_type: news
    categories: [news, web]

.. hint::

   By default Presearch's video category is intentionally placed into::

       categories: [general, web]


Search type ``video``
=====================

The results in the video category are most often links to pages that contain a
video; for instance, many links from Presearch's video category point to
content on Facebook (aka Meta) or Twitter (aka X). Since these are not direct
links to video streams, SearXNG can't use the video template for them, and
results that can't use this template don't belong in the videos category.


Languages & Regions
===================

In Presearch there are languages for the UI and regions for narrowing down the
search. If we set "auto" for the region in Presearch's web UI and the cookie
``use_local_search_results=false``, then the defaults for both (the language
and the region) are taken from the ``Accept-Language`` header.

Since the region is already "auto" by default, we only need to set the
``use_local_search_results`` cookie and send the ``Accept-Language`` header.
We have to set these values in both requests we send to Presearch: in the
first request to get the request ID from Presearch and in the final request
to get the result list (see ``send_accept_language_header``).

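A minimal sketch (for illustration only) of the values these settings add to
both requests, assuming the SearXNG locale ``en-US``:

.. code:: python

  headers = {
      'Accept-Language': 'en-US,en;q=0.9,*;q=0.5',
      'Cookie': 'use_local_search_results=false',
  }
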
The time format returned by Presearch varies depending on the language set.
Multiple formats can be supported by using the ``dateutil`` parser, but it
doesn't support relative formats such as "N time ago", "vor N time" (German)
or "Hace N time" (Spanish). Because of this, the dates are simply joined
together with the rest of the metadata.

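For illustration, a sketch of why these relative phrases fail (``dateutil``
is not used by this engine; the date strings are only examples):

.. code:: python

  from dateutil import parser

  parser.parse("Jan 4, 2024")          # absolute dates parse fine

  try:
      parser.parse("vor 2 Stunden")    # relative phrases are not supported
  except parser.ParserError as exc:
      print(exc)                       # Unknown string format: vor 2 Stunden

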
Implementations
===============

"""

from urllib.parse import urlencode, urlparse

from searx import locales
from searx.network import get
from searx.utils import gen_useragent, html_to_text, parse_duration_string

about = {
    "website": "https://presearch.io",
    "wikidata_id": "Q7240905",
    "official_api_documentation": "https://docs.presearch.io/nodes/api",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
categories = ["general", "web"]  # general, images, videos, news

search_type = "search"
"""must be any of ``search``, ``images``, ``videos``, ``news``"""

base_url = "https://presearch.com"
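# Presearch's safe-search appears to be a boolean switch, so SearXNG's
# "moderate" (1) and "strict" (2) levels both map to 'true'.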
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}


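# Engine setup hook: SearXNG calls init() once when the engine is loaded;
# failing fast keeps a misconfigured ``search_type`` from going unnoticed.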
def init(_):
    if search_type not in ['search', 'images', 'videos', 'news']:
        raise ValueError(f'presearch search_type: {search_type}')


def _get_request_id(query, params):

    args = {
        "q": query,
        "page": params["pageno"],
    }

    if params["time_range"]:
        args["time"] = params["time_range"]

    url = f"{base_url}/{search_type}?{urlencode(args)}"

    headers = {
        'User-Agent': gen_useragent(),
        'Cookie': (
            f"b=1;"
            f" presearch_session=;"
            f" use_local_search_results=false;"
            f" use_safe_search={safesearch_map[params['safesearch']]}"
        ),
    }
    if params['searxng_locale'] != 'all':
        l = locales.get_locale(params['searxng_locale'])

        # Presearch narrows down the search by region. In SearXNG, when the
        # user does not set a region (e.g. 'en-CA' / Canada) we cannot hand
        # over a region.

        # We could possibly use searx.locales.get_official_locales to
        # determine in which regions this language is an official one, but
        # then we still wouldn't know which region should be given more
        # weight / Presearch performs an IP-based geolocation of the user,
        # and we don't want that in SearXNG ;-)

        if l.territory:
            headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};q=0.9,*;q=0.5"

    resp_text = get(url, headers=headers).text  # type: ignore

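    # The result page is rendered asynchronously; its inline JavaScript
    # exposes the ID of the prepared search as ``window.searchId``.  Scrape
    # that ID so request() can exchange it for the JSON result list.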
    for line in resp_text.split("\n"):
        if "window.searchId = " in line:
            return line.split("= ")[1][:-1].replace('"', "")

    return None


def request(query, params):
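    # Second request: exchange the request ID scraped by _get_request_id()
    # for the JSON result list.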
    request_id = _get_request_id(query, params)
    params["headers"]["Accept"] = "application/json"
    params["url"] = f"{base_url}/results?id={request_id}"

    return params


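# Note: despite its name, this helper strips *trailing* provider names
# ("wikipedia", "google") that Presearch apparently appends to infobox text.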
def _strip_leading_strings(text):
    for x in ['wikipedia', 'google']:
        if text.lower().endswith(x):
            text = text[: -len(x)]
    return text.strip()


def _fix_title(title, url):
    """
    Titles from Presearch contain HTML and show the domain glued to the title
    without spacing.  This function fixes both issues, transforming
    "translate.google.co.in<em>Google</em> Translate" into "Google Translate".
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    title = html_to_text(title)
    # Fixes issue where the domain would show up in the title:
    # translate.google.co.inGoogle Translate -> Google Translate
    if (
        title.startswith(domain)
        and len(title) > len(domain)
        and not title.startswith(domain + "/")
        and not title.startswith(domain + " ")
    ):
        title = title.removeprefix(domain)
    return title


def parse_search_query(json_results):
    results = []
    if not json_results:
        return results

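    # "specialSections" carries extra result blocks such as compact top
    # stories; map them onto plain results with the source as metadata.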
    for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
        result = {
            'url': item['link'],
            'title': _fix_title(item['title'], item['link']),
            'thumbnail': item['image'],
            'content': '',
            'metadata': item.get('source'),
        }
        results.append(result)

    for item in json_results.get('standardResults', []):
        result = {
            'url': item['link'],
            'title': _fix_title(item['title'], item['link']),
            'content': html_to_text(item['description']),
        }
        results.append(result)

    info = json_results.get('infoSection', {}).get('data')
    if info:
        attributes = []
        for item in info.get('about', []):

            text = html_to_text(item)
            if ':' in text:
                # split text into key / value
                label, value = text.split(':', 1)
            else:
                # In other languages (tested with zh-TW) a colon is represented
                # by a different symbol --> then we split at the first space
                # and drop the trailing symbol from the label.
                label, value = text.split(' ', 1)
                label = label[:-1]

            value = _strip_leading_strings(value)
            attributes.append({'label': label, 'value': value})
        content = []
        for item in [info.get('subtitle'), info.get('description')]:
            if not item:
                continue
            item = _strip_leading_strings(html_to_text(item))
            if item:
                content.append(item)

        results.append(
            {
                'infobox': info['title'],
                'id': info['title'],
                'img_src': info.get('image'),
                'content': ' | '.join(content),
                'attributes': attributes,
            }
        )
    return results


def response(resp):
    results = []
    json_resp = resp.json()

    if search_type == 'search':
        results = parse_search_query(json_resp.get('results', {}))

    elif search_type == 'images':
        for item in json_resp.get('images', []):
            results.append(
                {
                    'template': 'images.html',
                    'title': html_to_text(item['title']),
                    'url': item.get('link'),
                    'img_src': item.get('image'),
                    'thumbnail_src': item.get('thumbnail'),
                }
            )

    elif search_type == 'videos':
        # The results in the video category are most often links to pages that
        # contain a video and not to a video stream --> SearXNG can't use the
        # video template.

        for item in json_resp.get('videos', []):
            duration = item.get('duration')
            if duration:
                duration = parse_duration_string(duration)

            results.append(
                {
                    'title': html_to_text(item['title']),
                    'url': item.get('link'),
                    'content': item.get('description', ''),
                    'thumbnail': item.get('image'),
                    'length': duration,
                }
            )

    elif search_type == 'news':
        for item in json_resp.get('news', []):
            source = item.get('source')
            # Bug on their end, time sometimes returns "</a>"
            time = html_to_text(item.get('time')).strip()
            metadata = [source]
            if time != "":
                metadata.append(time)

            results.append(
                {
                    'title': html_to_text(item['title']),
                    'url': item.get('link'),
                    'content': html_to_text(item.get('description', '')),
                    'metadata': ' / '.join(metadata),
                    'thumbnail': item.get('image'),
                }
            )

    return results