.oO SearXNG Developer Documentation Oo.
results.py
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
from __future__ import annotations

import warnings
import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote

from searx import logger
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error

from searx.result_types import Result, LegacyResult
from searx.result_types.answer import AnswerSet, BaseAnswer

# NOTE: the '-' is escaped so it matches a literal hyphen instead of forming a ')-_' character range
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()\-_]', re.M | re.U)

# return the meaningful length of the content for a result
def result_content_len(content):
    if isinstance(content, str):
        return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
    return 0

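# Illustrative doctest-style sketch (editorial addition, not part of the
# upstream module): characters matched by CONTENT_LEN_IGNORED_CHARS_REGEX
# (punctuation, separators, whitespace) do not count toward the length.
#
#   >>> result_content_len('lorem ipsum, dolor sit!')
#   18
#   >>> result_content_len(None)  # non-string content has length 0
#   0
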
def compare_urls(url_a, url_b):
    """Lazy comparison between two URLs.
    "www.example.com" and "example.com" are equal.
    "www.example.com/path/" and "www.example.com/path" are equal.
    "https://www.example.com/" and "http://www.example.com/" are equal.

    Args:
        url_a (ParseResult): first URL
        url_b (ParseResult): second URL

    Returns:
        bool: True if url_a and url_b are equal
    """
    # ignore www. in comparison
    if url_a.netloc.startswith('www.'):
        host_a = url_a.netloc.replace('www.', '', 1)
    else:
        host_a = url_a.netloc
    if url_b.netloc.startswith('www.'):
        host_b = url_b.netloc.replace('www.', '', 1)
    else:
        host_b = url_b.netloc

    if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
        return False

    # remove / from the end of the url if required
    path_a = url_a.path[:-1] if url_a.path.endswith('/') else url_a.path
    path_b = url_b.path[:-1] if url_b.path.endswith('/') else url_b.path

    return unquote(path_a) == unquote(path_b)

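# Illustrative doctest-style sketch (editorial addition): the scheme, a
# leading 'www.' and a trailing slash are ignored, while query and fragment
# must match exactly.
#
#   >>> from urllib.parse import urlparse
#   >>> compare_urls(urlparse('https://www.example.com/path/'),
#   ...              urlparse('http://example.com/path'))
#   True
#   >>> compare_urls(urlparse('https://example.com/?q=a'),
#   ...              urlparse('https://example.com/?q=b'))
#   False
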
def merge_two_infoboxes(infobox1, infobox2):  # pylint: disable=too-many-branches, too-many-statements
    # get engine weights
    if hasattr(engines[infobox1['engine']], 'weight'):
        weight1 = engines[infobox1['engine']].weight
    else:
        weight1 = 1
    if hasattr(engines[infobox2['engine']], 'weight'):
        weight2 = engines[infobox2['engine']].weight
    else:
        weight2 = 1

    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    infobox1['engines'] |= infobox2['engines']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            unique_url = True
            parsed_url2 = urlparse(url2.get('url', ''))
            entity_url2 = url2.get('entity')
            for url1 in urls1:
                if (entity_url2 is not None and url1.get('entity') == entity_url2) or compare_urls(
                    urlparse(url1.get('url', '')), parsed_url2
                ):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2
        elif weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes')
        if attributes1 is None:
            infobox1['attributes'] = attributes1 = []

        attributeSet = set()
        for attribute in attributes1:
            label = attribute.get('label')
            if label not in attributeSet:
                attributeSet.add(label)
            entity = attribute.get('entity')
            if entity not in attributeSet:
                attributeSet.add(entity)

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label') not in attributeSet and attribute.get('entity') not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2

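# Illustrative sketch (editorial addition; the engine names are hypothetical
# and must be registered in searx.engines for the weight lookup to work):
#
#   >>> box1 = {'engine': 'engine_a', 'engines': {'engine_a'}, 'content': 'short'}
#   >>> box2 = {'engine': 'engine_b', 'engines': {'engine_b'},
#   ...         'content': 'a noticeably longer description'}
#   >>> merge_two_infoboxes(box1, box2)
#   >>> box1['content']  # the longer content wins
#   'a noticeably longer description'
#   >>> sorted(box1['engines'])
#   ['engine_a', 'engine_b']
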
def result_score(result, priority):
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(engines.get(result_engine), 'weight'):
            weight *= float(engines[result_engine].weight)

    weight *= len(result['positions'])
    score = 0

    for position in result['positions']:
        if priority == 'low':
            continue
        if priority == 'high':
            score += weight
        else:
            score += weight / position

    return score

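# Illustrative walk-through (editorial addition): for a result reported by two
# engines of weight 1.0 at positions 1 and 3, the engine weights leave weight
# at 1.0 and weight *= len(positions) gives 2.0; the default score is then
# 2.0/1 + 2.0/3 ≈ 2.67.  With priority 'high' every position adds the full
# weight (score 4.0); with priority 'low' nothing is added and the score stays 0.
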
class Timing(NamedTuple):  # pylint: disable=missing-class-docstring
    engine: str
    total: float
    load: float


class UnresponsiveEngine(NamedTuple):  # pylint: disable=missing-class-docstring
    engine: str
    error_type: str
    suspended: bool


class ResultContainer:
    """docstring for ResultContainer"""

    __slots__ = (
        '_merged_results',
        'infoboxes',
        'suggestions',
        'answers',
        'corrections',
        '_number_of_results',
        '_closed',
        'paging',
        'unresponsive_engines',
        'timings',
        'redirect_url',
        'engine_data',
        'on_result',
        '_lock',
    )

    def __init__(self):
        super().__init__()
        self._merged_results: list[LegacyResult] = []
        self.infoboxes: list[dict] = []
        self.suggestions: set[str] = set()
        self.answers = AnswerSet()
        self.corrections = set()
        self._number_of_results: list[int] = []
        self.engine_data: dict[str, str | dict] = defaultdict(dict)
        self._closed: bool = False
        self.paging: bool = False
        self.unresponsive_engines: Set[UnresponsiveEngine] = set()
        self.timings: List[Timing] = []
        self.redirect_url = None
        self.on_result = lambda _: True
        self._lock = RLock()

    def extend(self, engine_name: str | None, results):  # pylint: disable=too-many-branches
        if self._closed:
            return

        standard_result_count = 0
        error_msgs = set()

        for result in list(results):

            if isinstance(result, Result):
                result.engine = result.engine or engine_name
                result.normalize_result_fields()

                if isinstance(result, BaseAnswer) and self.on_result(result):
                    self.answers.add(result)
                else:
                    # more types need to be implemented in the future ..
                    raise NotImplementedError(f"no handler implemented to process the result of type {result}")

            else:
                result['engine'] = result.get('engine') or engine_name or ""
                result = LegacyResult(result)  # for backward compatibility, will be removed one day

                if 'suggestion' in result and self.on_result(result):
                    self.suggestions.add(result['suggestion'])
                elif 'answer' in result and self.on_result(result):
                    warnings.warn(
                        f"answer results from engine {result.engine}"
                        " are untyped, migrate to the Answer class.",
                        DeprecationWarning,
                    )
                    self.answers.add(result)
                elif 'correction' in result and self.on_result(result):
                    self.corrections.add(result['correction'])
                elif 'infobox' in result and self.on_result(result):
                    self._merge_infobox(result)
                elif 'number_of_results' in result and self.on_result(result):
                    self._number_of_results.append(result['number_of_results'])
                elif 'engine_data' in result and self.on_result(result):
                    self.engine_data[result.engine][result['key']] = result['engine_data']
                elif result.url:
                    # standard result (url, title, content)
                    if not self._is_valid_url_result(result, error_msgs):
                        continue
                    # normalize the result
                    result.normalize_result_fields()
                    # on_result calls searx.search.SearchWithPlugins._on_result,
                    # which in turn calls the plugins
                    if not self.on_result(result):
                        continue
                    self.__merge_url_result(result, standard_result_count + 1)
                    standard_result_count += 1
                elif self.on_result(result):
                    self.__merge_result_no_url(result, standard_result_count + 1)
                    standard_result_count += 1

        if len(error_msgs) > 0:
            for msg in error_msgs:
                count_error(engine_name, 'some results are invalid: ' + msg, secondary=True)

        if engine_name in engines:
            histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count')

        if not self.paging and engine_name in engines and engines[engine_name].paging:
            self.paging = True

    def _merge_infobox(self, infobox):
        add_infobox = True
        infobox_id = infobox.get('id', None)
        infobox['engines'] = set([infobox['engine']])
        if infobox_id is not None:
            parsed_url_infobox_id = urlparse(infobox_id)
            with self._lock:
                for existingIndex in self.infoboxes:
                    if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
                        merge_two_infoboxes(existingIndex, infobox)
                        add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)

    def _is_valid_url_result(self, result, error_msgs):
        if 'url' in result:
            if not isinstance(result['url'], str):
                logger.debug('result: invalid URL: %s', str(result))
                error_msgs.add('invalid URL')
                return False

        if 'title' in result and not isinstance(result['title'], str):
            logger.debug('result: invalid title: %s', str(result))
            error_msgs.add('invalid title')
            return False

        if 'content' in result:
            if not isinstance(result['content'], str):
                logger.debug('result: invalid content: %s', str(result))
                error_msgs.add('invalid content')
                return False

        return True

    def __merge_url_result(self, result, position):
        result['engines'] = set([result['engine']])
        with self._lock:
            duplicated = self.__find_duplicated_http_result(result)
            if duplicated:
                self.__merge_duplicated_http_result(duplicated, result, position)
                return

            # if there is no duplicate found, append result
            result['positions'] = [position]
            self._merged_results.append(result)

    def __find_duplicated_http_result(self, result):
        result_template = result.get('template')
        for merged_result in self._merged_results:
            if not merged_result.get('parsed_url'):
                continue

            if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
                'template'
            ):
                if result_template != 'images.html':
                    # not an image, same template, same url : it's a duplicate
                    return merged_result

                # it's an image
                # it's a duplicate only if the img_src is equal as well
                if result.get('img_src', '') == merged_result.get('img_src', ''):
                    return merged_result
        return None

    def __merge_duplicated_http_result(self, duplicated, result, position):
        # use content with more text
        if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
            duplicated['content'] = result['content']

        # use title with more text
        if result_content_len(result.get('title', '')) > result_content_len(duplicated.get('title', '')):
            duplicated['title'] = result['title']

        # merge all result's parameters not found in duplicate
        for key in result.keys():
            if not duplicated.get(key):
                duplicated[key] = result.get(key)

        # add the new position
        duplicated['positions'].append(position)

        # add engine to list of result-engines
        duplicated['engines'].add(result['engine'])

        # use https if possible
        if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
            duplicated['url'] = result['parsed_url'].geturl()
            duplicated['parsed_url'] = result['parsed_url']

    def __merge_result_no_url(self, result, position):
        result['engines'] = set([result['engine']])
        result['positions'] = [position]
        with self._lock:
            self._merged_results.append(result)

    def close(self):
        self._closed = True

        for result in self._merged_results:
            result['score'] = result_score(result, result.get('priority'))
            # strip surrounding whitespace; for titles, also collapse internal whitespace runs
            if result.get('content'):
                result['content'] = result['content'].strip()
            if result.get('title'):
                result['title'] = ' '.join(result['title'].strip().split())

            for result_engine in result['engines']:
                counter_add(result['score'], 'engine', result_engine, 'score')

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        categoryPositions = {}

        for res in results:
            if not res.get('url'):
                continue

            # do we need to handle more than one category per engine?
            engine = engines[res['engine']]
            res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''

            category = (
                res['category']
                + ':'
                + res.get('template', '')
                + ':'
                + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')
            )

            current = None if category not in categoryPositions else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more results and is not too far
            # from the current position
            if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:  # pylint: disable=consider-using-dict-items
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1

            else:
                # start a new group for this category
                gresults.append(res)

                # update categoryPositions
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        # update _merged_results
        self._merged_results = gresults

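    # Illustrative walk-through (editorial addition): if three web results
    # w1, w2, w3 are followed in score order by an image result i1 and then
    # another web result w4, the grouping pass inserts w4 directly after w3,
    # yielding [w1, w2, w3, w4, i1] instead of the raw order
    # [w1, w2, w3, i1, w4]; each category key accepts up to 8 grouped
    # follow-ups within a distance of 20 slots.
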
    def get_ordered_results(self):
        if not self._closed:
            self.close()
        return self._merged_results

    def results_length(self):
        return len(self._merged_results)

    @property
    def number_of_results(self) -> int:
        """Returns the average of the result counts reported by the engines, or
        zero if that average is smaller than the number of results actually
        collected."""

        with self._lock:
            if not self._closed:
                logger.error("call to ResultContainer.number_of_results before ResultContainer.close")
                return 0

            resultnum_sum = sum(self._number_of_results)
            if not resultnum_sum or not self._number_of_results:
                return 0

            average = int(resultnum_sum / len(self._number_of_results))
            if average < self.results_length():
                average = 0
            return average

    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
        with self._lock:
            if self._closed:
                logger.error("call to ResultContainer.add_unresponsive_engine after ResultContainer.close")
                return
            if engines[engine_name].display_error_messages:
                self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
        with self._lock:
            if self._closed:
                logger.error("call to ResultContainer.add_timing after ResultContainer.close")
                return
            self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))

    def get_timings(self):
        with self._lock:
            if not self._closed:
                logger.error("call to ResultContainer.get_timings before ResultContainer.close")
                return []
            return self.timings
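

# Illustrative usage sketch (editorial addition; 'example_engine' is a
# hypothetical name that must be registered in searx.engines, and plugin
# wiring via on_result is omitted):
#
#   container = ResultContainer()
#   container.extend('example_engine', [
#       {'url': 'https://example.com/', 'title': 'Example', 'content': '...'},
#   ])
#   container.close()
#   for res in container.get_ordered_results():
#       print(res['url'], res['score'])
#
# extend() validates, normalizes and deduplicates incoming results, close()
# scores them and groups them by category/template, and get_ordered_results()
# returns the final ranking.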