SearXNG Developer Documentation
results.py
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring

import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote

from searx import logger
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error


CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)


# return the meaningful length of the content for a result
def result_content_len(content):
    if isinstance(content, str):
        return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
    return 0

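# NOTE (editorial comment, not in the original module): inside the character
# class above, ')-_' forms a range from ')' (0x29) to '_' (0x5F), so digits
# and uppercase letters are stripped as well. For example,
# result_content_len('Hello, world!') returns 9: 'H', the comma, the space
# and '!' are all removed, leaving 'elloworld'.
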
def compare_urls(url_a, url_b):
    """Lazy comparison between two URLs.
    "www.example.com" and "example.com" are equal.
    "www.example.com/path/" and "www.example.com/path" are equal.
    "https://www.example.com/" and "http://www.example.com/" are equal.

    Args:
        url_a (ParseResult): first URL
        url_b (ParseResult): second URL

    Returns:
        bool: True if url_a and url_b are equal
    """
    # ignore www. in comparison
    if url_a.netloc.startswith('www.'):
        host_a = url_a.netloc.replace('www.', '', 1)
    else:
        host_a = url_a.netloc
    if url_b.netloc.startswith('www.'):
        host_b = url_b.netloc.replace('www.', '', 1)
    else:
        host_b = url_b.netloc

    if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
        return False

    # remove / from the end of the url if required
    path_a = url_a.path[:-1] if url_a.path.endswith('/') else url_a.path
    path_b = url_b.path[:-1] if url_b.path.endswith('/') else url_b.path

    return unquote(path_a) == unquote(path_b)

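# Illustrative usage (doctest-style sketch, not part of the original module):
#
#   >>> from urllib.parse import urlparse
#   >>> compare_urls(urlparse('https://www.example.com/path/'),
#   ...              urlparse('http://example.com/path'))
#   True
#
# Scheme, a leading 'www.', a trailing slash and percent-encoding are
# ignored; host, query, fragment and the decoded path must all match.


# merge_two_infoboxes folds infobox2 into infobox1 in place: the engine with
# the higher weight wins the 'engine' and 'img_src' fields, URLs and
# attributes are deduplicated (by entity/URL and by label/entity), and the
# longer 'content' (per result_content_len) is kept.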
def merge_two_infoboxes(infobox1, infobox2):  # pylint: disable=too-many-branches, too-many-statements
    # get engine weights
    if hasattr(engines[infobox1['engine']], 'weight'):
        weight1 = engines[infobox1['engine']].weight
    else:
        weight1 = 1
    if hasattr(engines[infobox2['engine']], 'weight'):
        weight2 = engines[infobox2['engine']].weight
    else:
        weight2 = 1

    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    infobox1['engines'] |= infobox2['engines']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            unique_url = True
            parsed_url2 = urlparse(url2.get('url', ''))
            entity_url2 = url2.get('entity')
            for url1 in urls1:
                if (entity_url2 is not None and url1.get('entity') == entity_url2) or compare_urls(
                    urlparse(url1.get('url', '')), parsed_url2
                ):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2
        elif weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes')
        if attributes1 is None:
            infobox1['attributes'] = attributes1 = []

        attributeSet = set()
        for attribute in attributes1:
            label = attribute.get('label')
            if label not in attributeSet:
                attributeSet.add(label)
            entity = attribute.get('entity')
            if entity not in attributeSet:
                attributeSet.add(entity)

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label') not in attributeSet and attribute.get('entity') not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2

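# Scoring sketch (illustrative, default priority): a result found at
# positions p1..pn by engines whose weights multiply to w scores
#
#   score = (w * n) * sum(1 / p_i for each position p_i)
#
# e.g. w = 1.0 and positions [1, 3] give 2 * (1/1 + 1/3) ~= 2.67. A 'high'
# priority replaces each 1/p_i term with 1, and a 'low' priority yields 0.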
def result_score(result, priority):
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(engines[result_engine], 'weight'):
            weight *= float(engines[result_engine].weight)

    weight *= len(result['positions'])
    score = 0

    for position in result['positions']:
        if priority == 'low':
            continue
        if priority == 'high':
            score += weight
        else:
            score += weight / position

    return score


class Timing(NamedTuple):  # pylint: disable=missing-class-docstring
    engine: str
    total: float
    load: float


class UnresponsiveEngine(NamedTuple):  # pylint: disable=missing-class-docstring
    engine: str
    error_type: str
    suspended: bool


166 """docstring for ResultContainer"""
167
    __slots__ = (
        '_merged_results',
        'infoboxes',
        'suggestions',
        'answers',
        'corrections',
        '_number_of_results',
        '_closed',
        'paging',
        'unresponsive_engines',
        'timings',
        'redirect_url',
        'engine_data',
        'on_result',
        '_lock',
    )

    def __init__(self):
        super().__init__()
        self._merged_results = []
        self.infoboxes = []
        self.suggestions = set()
        self.answers = {}
        self.corrections = set()
        self._number_of_results = []
        self.engine_data = defaultdict(dict)
        self._closed = False
        self.paging = False
        self.unresponsive_engines: Set[UnresponsiveEngine] = set()
        self.timings: List[Timing] = []
        self.redirect_url = None
        self.on_result = lambda _: True
        self._lock = RLock()

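    # Lifecycle sketch (editorial comment): engines call extend(), possibly
    # concurrently (hence the RLock), while the search runs; close() then
    # scores and groups the merged results, and get_ordered_results() returns
    # them. extend() dispatches each raw result by key: 'suggestion',
    # 'answer', 'correction', 'infobox', 'number_of_results', 'engine_data',
    # and finally 'url' for standard results.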
    def extend(self, engine_name, results):  # pylint: disable=too-many-branches
        if self._closed:
            return

        standard_result_count = 0
        error_msgs = set()
        for result in list(results):
            result['engine'] = engine_name
            if 'suggestion' in result and self.on_result(result):
                self.suggestions.add(result['suggestion'])
            elif 'answer' in result and self.on_result(result):
                self.answers[result['answer']] = result
            elif 'correction' in result and self.on_result(result):
                self.corrections.add(result['correction'])
            elif 'infobox' in result and self.on_result(result):
                self._merge_infobox(result)
            elif 'number_of_results' in result and self.on_result(result):
                self._number_of_results.append(result['number_of_results'])
            elif 'engine_data' in result and self.on_result(result):
                self.engine_data[engine_name][result['key']] = result['engine_data']
            elif 'url' in result:
                # standard result (url, title, content)
                if not self._is_valid_url_result(result, error_msgs):
                    continue
                # normalize the result
                self._normalize_url_result(result)
                # on_result calls searx.search.SearchWithPlugins._on_result,
                # which calls the plugins
                if not self.on_result(result):
                    continue
                self.__merge_url_result(result, standard_result_count + 1)
                standard_result_count += 1
            elif self.on_result(result):
                self.__merge_result_no_url(result, standard_result_count + 1)
                standard_result_count += 1

        if len(error_msgs) > 0:
            for msg in error_msgs:
                count_error(engine_name, 'some results are invalid: ' + msg, secondary=True)

        if engine_name in engines:
            histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count')

        if not self.paging and engine_name in engines and engines[engine_name].paging:
            self.paging = True

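    # Infoboxes are deduplicated by their 'id' URL: when a new infobox's id
    # matches an existing one (per compare_urls), the two are merged in place
    # by merge_two_infoboxes() instead of a new entry being appended.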
    def _merge_infobox(self, infobox):
        add_infobox = True
        infobox_id = infobox.get('id', None)
        infobox['engines'] = set([infobox['engine']])
        if infobox_id is not None:
            parsed_url_infobox_id = urlparse(infobox_id)
            with self._lock:
                for existingIndex in self.infoboxes:
                    if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
                        merge_two_infoboxes(existingIndex, infobox)
                        add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)

    def _is_valid_url_result(self, result, error_msgs):
        if 'url' in result:
            if not isinstance(result['url'], str):
                logger.debug('result: invalid URL: %s', str(result))
                error_msgs.add('invalid URL')
                return False

        if 'title' in result and not isinstance(result['title'], str):
            logger.debug('result: invalid title: %s', str(result))
            error_msgs.add('invalid title')
            return False

        if 'content' in result:
            if not isinstance(result['content'], str):
                logger.debug('result: invalid content: %s', str(result))
                error_msgs.add('invalid content')
                return False

        return True

    def _normalize_url_result(self, result):
        """Normalize a URL result in place: parse the URL, default to http,
        drop content that duplicates the title, ensure a template and
        collapse whitespace."""
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        # avoid duplicate content between the content and title fields
        if result.get('content') == result.get('title'):
            del result['content']

        # make sure there is a template
        if 'template' not in result:
            result['template'] = 'default.html'

        # strip multiple spaces and carriage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

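    # Duplicate detection below: two results collide when their URLs compare
    # equal (compare_urls) and they share the same template; image results
    # ('images.html') must additionally share the same img_src.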
    def __merge_url_result(self, result, position):
        result['engines'] = set([result['engine']])
        with self._lock:
            duplicated = self.__find_duplicated_http_result(result)
            if duplicated:
                self.__merge_duplicated_http_result(duplicated, result, position)
                return

            # if there is no duplicate found, append result
            result['positions'] = [position]
            self._merged_results.append(result)

    def __find_duplicated_http_result(self, result):
        result_template = result.get('template')
        for merged_result in self._merged_results:
            if 'parsed_url' not in merged_result:
                continue
            if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
                'template'
            ):
                if result_template != 'images.html':
                    # not an image, same template, same url: it's a duplicate
                    return merged_result

                # it's an image
                # it's a duplicate if the parsed_url, template and img_src are the same
                if result.get('img_src', '') == merged_result.get('img_src', ''):
                    return merged_result
        return None

    def __merge_duplicated_http_result(self, duplicated, result, position):
        # use the content with more text
        if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
            duplicated['content'] = result['content']

        # merge all of the result's parameters not found in the duplicate
        for key in result.keys():
            if not duplicated.get(key):
                duplicated[key] = result.get(key)

        # add the new position
        duplicated['positions'].append(position)

        # add engine to list of result-engines
        duplicated['engines'].add(result['engine'])

        # use https if possible
        if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
            duplicated['url'] = result['parsed_url'].geturl()
            duplicated['parsed_url'] = result['parsed_url']

    def __merge_result_no_url(self, result, position):
        result['engines'] = set([result['engine']])
        result['positions'] = [position]
        with self._lock:
            self._merged_results.append(result)

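    # close() runs two passes: pass 1 scores each merged result and strips
    # stray whitespace; pass 2 re-orders the list so that results sharing a
    # category/template key stay grouped -- a group accepts up to 8 follow-up
    # results, and only while it sits fewer than 20 positions back.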
    def close(self):
        self._closed = True

        for result in self._merged_results:
            result['score'] = result_score(result, result.get('priority'))
            # strip whitespace from content and collapse whitespace in titles
            if result.get('content'):
                result['content'] = result['content'].strip()
            if result.get('title'):
                result['title'] = ' '.join(result['title'].strip().split())

            for result_engine in result['engines']:
                counter_add(result['score'], 'engine', result_engine, 'score')

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2: group results by category and template
        gresults = []
        categoryPositions = {}

        for res in results:
            # do we need to handle more than one category per engine?
            engine = engines[res['engine']]
            res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''

            # build the grouping key: category, template and image flag
            category = (
                res['category']
                + ':'
                + res.get('template', '')
                + ':'
                + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')
            )

            current = None if category not in categoryPositions else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more results and is not too far
            # from the current position
            if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:  # pylint: disable=consider-using-dict-items
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1

            else:
                # start a new group for this category
                gresults.append(res)

                # update categoryPositions
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        # update _merged_results
        self._merged_results = gresults

    def get_ordered_results(self):
        if not self._closed:
            self.close()
        return self._merged_results

    def results_length(self):
        return len(self._merged_results)

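    # Example (illustrative): engines reporting number_of_results values of
    # [1000, 3000] with 25 merged results give an average of 2000; if the
    # average fell below 25, the property below would return 0 instead.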
    @property
    def number_of_results(self) -> int:
        """Returns the average of the result counts reported by the engines,
        or zero if that average is smaller than the number of results actually
        collected."""

        with self._lock:
            if not self._closed:
                logger.error("call to ResultContainer.number_of_results before ResultContainer.close")
                return 0

            resultnum_sum = sum(self._number_of_results)
            if not resultnum_sum or not self._number_of_results:
                return 0

            average = int(resultnum_sum / len(self._number_of_results))
            if average < self.results_length():
                average = 0
            return average

    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
        with self._lock:
            if self._closed:
                logger.error("call to ResultContainer.add_unresponsive_engine after ResultContainer.close")
                return
            if engines[engine_name].display_error_messages:
                self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
        with self._lock:
            if self._closed:
                logger.error("call to ResultContainer.add_timing after ResultContainer.close")
                return
            self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))

    def get_timings(self):
        with self._lock:
            if not self._closed:
                logger.error("call to ResultContainer.get_timings before ResultContainer.close")
                return []
            return self.timings