# SPDX-License-Identifier: AGPL-3.0-or-later
"""Presearch supports the search types listed in :py:obj:`search_type` (general,
images, videos, news).

Configured ``presearch`` engines:

.. code:: yaml

  - name: presearch
    engine: presearch
    search_type: search
    categories: [general, web]

  - name: presearch images
    ...
    search_type: images
    categories: [images, web]

  - name: presearch videos
    ...
    search_type: videos
    categories: [general, web]

  - name: presearch news
    ...
    search_type: news
    categories: [news, web]

.. hint::

   By default Presearch's video category is intentionally placed into::

       categories: [general, web]


Search type ``videos``
======================

The results in the videos category are most often links to pages that contain
a video, for instance many links from Presearch's video category link content
from facebook (aka Meta) or Twitter (aka X).  Since these are not direct links
to video streams, SearXNG can't use the video template for them, and hits that
don't fit this template are not what the user expects to see in the videos
category.


Languages & Regions
===================

In Presearch there are languages for the UI and regions for narrowing down the
search.  If we set "auto" for the region in the web UI of Presearch and the
cookie ``use_local_search_results=false``, then the defaults for both (the
language and the region) are taken from the ``Accept-Language`` header.

Since the region is already "auto" by default, we only need to set the
``use_local_search_results`` cookie and send the ``Accept-Language`` header.
We have to set these values in both requests we send to Presearch: in the
first request to get the request ID from Presearch and in the final request to
get the result list (see ``send_accept_language_header``).
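
For example, with the SearXNG locale ``en-CA`` and safe search enabled, both
requests would carry header values along these lines (illustrative)::

    Cookie: b=1; presearch_session=; use_local_search_results=false; use_safe_search=true
    Accept-Language: en-CA,en;q=0.9,*;q=0.5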

The time format returned by Presearch varies depending on the language set.
Multiple formats can be parsed with the ``dateutil`` parser, but it doesn't
support relative formats such as "N time ago", "vor N time" (German) or
"Hace N time" (Spanish).  Because of this, the dates are simply joined with
the rest of the metadata.
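
For example, a German result time such as ``"vor 2 Stunden"`` is not parsed
into a date; for news results it simply ends up in the metadata string, e.g.
``"Tagesschau / vor 2 Stunden"`` (source name illustrative).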


Implementations
===============

"""

from urllib.parse import urlencode, urlparse

from searx import locales
from searx.network import get
from searx.utils import gen_useragent, html_to_text, parse_duration_string

about = {
    "website": "https://presearch.io",
    "wikidata_id": "Q7240905",
    "official_api_documentation": "https://docs.presearch.io/nodes/api",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
categories = ["general", "web"]  # general, images, videos, news

search_type = "search"
"""must be any of ``search``, ``images``, ``videos``, ``news``"""

base_url = "https://presearch.com"
# Presearch only distinguishes safe search on/off: moderate (1) and
# strict (2) both map to 'true'.
safesearch_map = {0: 'false', 1: 'true', 2: 'true'}


def init(_):
    if search_type not in ['search', 'images', 'videos', 'news']:
        raise ValueError(f'presearch search_type: {search_type}')


def _get_request_id(query, params):

    args = {
        "q": query,
        "page": params["pageno"],
    }

    if params["time_range"]:
        args["time"] = params["time_range"]

    url = f"{base_url}/{search_type}?{urlencode(args)}"

    headers = {
        'User-Agent': gen_useragent(),
        'Cookie': (
            f"b=1;"
            f" presearch_session=;"
            f" use_local_search_results=false;"
            f" use_safe_search={safesearch_map[params['safesearch']]}"
        ),
    }
    if params['searxng_locale'] != 'all':
        l = locales.get_locale(params['searxng_locale'])

        # Presearch narrows down the search by region.  In SearXNG when the
        # user does not set a region (e.g. 'en-CA' / canada) we cannot hand
        # over a region.

        # We could possibly use searx.locales.get_official_locales to
        # determine in which regions this language is an official one, but
        # then we still wouldn't know which region should be given more
        # weight / Presearch performs an IP-based geolocation of the user,
        # we don't want that in SearXNG ;-)

        if l.territory:
            # e.g. "en-CA,en;q=0.9,*;q=0.5"
            headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};q=0.9,*;q=0.5"
    resp_text = get(url, headers=headers).text  # type: ignore

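    # The request ID is only available client-side, as an inline JavaScript
    # assignment (``window.searchId = "..."``), so it has to be scraped from
    # the raw HTML of the response.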
    for line in resp_text.split("\n"):
        if "window.searchId = " in line:
            return line.split("= ")[1][:-1].replace('"', "")

    return None


def request(query, params):
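    # Presearch is queried in two steps: _get_request_id() scrapes a request
    # ID from the HTML search page, then the JSON result list for that ID is
    # fetched from /results.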
    request_id = _get_request_id(query, params)
    params["headers"]["Accept"] = "application/json"
    params["url"] = f"{base_url}/results?id={request_id}"

    return params


def _strip_leading_strings(text):
    for x in ['wikipedia', 'google']:
        if text.lower().endswith(x):
            text = text[: -len(x)]
    return text.strip()


def _fix_title(title, url):
    """
    Titles from Presearch have the result's domain glued to the title without
    spacing, and they may contain HTML markup.  This function fixes both
    issues, transforming "translate.google.co.in<em>Google</em> Translate"
    into "Google Translate".
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    title = html_to_text(title)
    # Fixes issue where the domain would show up in the title
    # translate.google.co.inGoogle Translate -> Google Translate
    if (
        title.startswith(domain)
        and len(title) > len(domain)
        and not title.startswith(domain + "/")
        and not title.startswith(domain + " ")
    ):
        title = title.removeprefix(domain)
    return title


def parse_search_query(json_results):
    results = []

    for item in json_results.get('specialSections', {}).get('topStoriesCompact', {}).get('data', []):
        result = {
            'url': item['link'],
            'title': _fix_title(item['title'], item['link']),
            'thumbnail': item['image'],
            'content': '',
            'metadata': item.get('source'),
        }
        results.append(result)

    for item in json_results.get('standardResults', []):
        result = {
            'url': item['link'],
            'title': _fix_title(item['title'], item['link']),
            'content': html_to_text(item['description']),
        }
        results.append(result)

    info = json_results.get('infoSection', {}).get('data')
    if info:
        attributes = []
        for item in info.get('about', []):

            text = html_to_text(item)
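            # An "about" item reads like "<label>: <value>", e.g. the
            # (illustrative) "Founded: 2017" --> label "Founded", value "2017"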
            if ':' in text:
                # split text into key / value
                label, value = text.split(':', 1)
            else:
                # In other languages (tested with zh-TW) a colon is
                # represented by a different symbol --> then we split at the
                # first space and strip the trailing symbol from the label.
                label, value = text.split(' ', 1)
                label = label[:-1]

            value = _strip_leading_strings(value)
            attributes.append({'label': label, 'value': value})

        content = []
        for item in [info.get('subtitle'), info.get('description')]:
            if not item:
                continue
            item = _strip_leading_strings(html_to_text(item))
            if item:
                content.append(item)

        results.append(
            {
                'infobox': info['title'],
                'id': info['title'],
                'img_src': info.get('image'),
                'content': ' | '.join(content),
                'attributes': attributes,
            }
        )
    return results


def response(resp):
    results = []
    json_resp = resp.json()

    if search_type == 'search':
        results = parse_search_query(json_resp.get('results'))

    elif search_type == 'images':
        for item in json_resp.get('images', []):
            results.append(
                {
                    'template': 'images.html',
                    'title': html_to_text(item['title']),
                    'url': item.get('link'),
                    'img_src': item.get('image'),
                    'thumbnail_src': item.get('thumbnail'),
                }
            )

    elif search_type == 'videos':
        # The results in the video category are most often links to pages
        # that contain a video and not to a video stream --> SearXNG can't
        # use the video template.

        for item in json_resp.get('videos', []):
            duration = item.get('duration')
            if duration:
                duration = parse_duration_string(duration)

            results.append(
                {
                    'title': html_to_text(item['title']),
                    'url': item.get('link'),
                    'content': item.get('description', ''),
                    'thumbnail': item.get('image'),
                    'length': duration,
                }
            )

    elif search_type == 'news':
        for item in json_resp.get('news', []):
            source = item.get('source')
            # Bug on their end: time sometimes returns "</a>"
            time = html_to_text(item.get('time')).strip()
            metadata = [source]
            if time != "":
                metadata.append(time)

            results.append(
                {
                    'title': html_to_text(item['title']),
                    'url': item.get('link'),
                    'content': html_to_text(item.get('description', '')),
                    'metadata': ' / '.join(metadata),
                    'thumbnail': item.get('image'),
                }
            )

    return results