.oO SearXNG Developer Documentation Oo.
results.py
1# SPDX-License-Identifier: AGPL-3.0-or-later
2# pylint: disable=missing-module-docstring
3
4import re
5from collections import defaultdict
6from operator import itemgetter
7from threading import RLock
8from typing import List, NamedTuple, Set
9from urllib.parse import urlparse, unquote
10
11from searx import logger
12from searx.engines import engines
13from searx.metrics import histogram_observe, counter_add, count_error
14
15CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
16WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
17
18
19# return the meaningful length of the content for a result
20def result_content_len(content):
21 if isinstance(content, str):
22 return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
23 return 0
24
25
26def compare_urls(url_a, url_b):
27 """Lazy compare between two URL.
28 "www.example.com" and "example.com" are equals.
29 "www.example.com/path/" and "www.example.com/path" are equals.
30 "https://www.example.com/" and "http://www.example.com/" are equals.
31
32 Args:
33 url_a (ParseResult): first URL
34 url_b (ParseResult): second URL
35
36 Returns:
37 bool: True if url_a and url_b are equal
38 """
39 # ignore www. in comparison
40 if url_a.netloc.startswith('www.'):
41 host_a = url_a.netloc.replace('www.', '', 1)
42 else:
43 host_a = url_a.netloc
44 if url_b.netloc.startswith('www.'):
45 host_b = url_b.netloc.replace('www.', '', 1)
46 else:
47 host_b = url_b.netloc
48
49 if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
50 return False
51
52 # remove / from the end of the url if required
53 path_a = url_a.path[:-1] if url_a.path.endswith('/') else url_a.path
54 path_b = url_b.path[:-1] if url_b.path.endswith('/') else url_b.path
55
56 return unquote(path_a) == unquote(path_b)
57
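# For instance (per the rules above): compare_urls(urlparse('https://www.example.com/path/'),
# urlparse('http://example.com/path')) is True, while two URLs that differ only
# in their query strings ('?q=a' vs. '?q=b') compare as False.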
58
59def merge_two_infoboxes(infobox1, infobox2): # pylint: disable=too-many-branches, too-many-statements
60 # get engines weights
61 if hasattr(engines[infobox1['engine']], 'weight'):
62 weight1 = engines[infobox1['engine']].weight
63 else:
64 weight1 = 1
65 if hasattr(engines[infobox2['engine']], 'weight'):
66 weight2 = engines[infobox2['engine']].weight
67 else:
68 weight2 = 1
69
70 if weight2 > weight1:
71 infobox1['engine'] = infobox2['engine']
72
73 infobox1['engines'] |= infobox2['engines']
74
75 if 'urls' in infobox2:
76 urls1 = infobox1.get('urls', None)
77 if urls1 is None:
78 urls1 = []
79
80 for url2 in infobox2.get('urls', []):
81 unique_url = True
82 parsed_url2 = urlparse(url2.get('url', ''))
83 entity_url2 = url2.get('entity')
84 for url1 in urls1:
85 if (entity_url2 is not None and url1.get('entity') == entity_url2) or compare_urls(
86 urlparse(url1.get('url', '')), parsed_url2
87 ):
88 unique_url = False
89 break
90 if unique_url:
91 urls1.append(url2)
92
93 infobox1['urls'] = urls1
94
95 if 'img_src' in infobox2:
96 img1 = infobox1.get('img_src', None)
97 img2 = infobox2.get('img_src')
98 if img1 is None:
99 infobox1['img_src'] = img2
100 elif weight2 > weight1:
101 infobox1['img_src'] = img2
102
103 if 'attributes' in infobox2:
104 attributes1 = infobox1.get('attributes')
105 if attributes1 is None:
106 infobox1['attributes'] = attributes1 = []
107
108 attributeSet = set()
109 for attribute in attributes1:
110 label = attribute.get('label')
111 if label not in attributeSet:
112 attributeSet.add(label)
113 entity = attribute.get('entity')
114 if entity not in attributeSet:
115 attributeSet.add(entity)
116
117 for attribute in infobox2.get('attributes', []):
118 if attribute.get('label') not in attributeSet and attribute.get('entity') not in attributeSet:
119 attributes1.append(attribute)
120
121 if 'content' in infobox2:
122 content1 = infobox1.get('content', None)
123 content2 = infobox2.get('content', '')
124 if content1 is not None:
125 if result_content_len(content2) > result_content_len(content1):
126 infobox1['content'] = content2
127 else:
128 infobox1['content'] = content2
129
130
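# A result's score is the product of the weights of all engines that returned it,
# multiplied by the number of positions it was returned at, and accumulated per
# position: the full weight for 'high' priority, weight/position otherwise, and
# nothing at all for 'low' priority.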
131def result_score(result, priority):
132 weight = 1.0
133
134 for result_engine in result['engines']:
135 if hasattr(engines.get(result_engine), 'weight'):
136 weight *= float(engines[result_engine].weight)
137
138 weight *= len(result['positions'])
139 score = 0
140
141 for position in result['positions']:
142 if priority == 'low':
143 continue
144 if priority == 'high':
145 score += weight
146 else:
147 score += weight / position
148
149 return score
150
151
152class Timing(NamedTuple): # pylint: disable=missing-class-docstring
153 engine: str
154 total: float
155 load: float
156
157
158class UnresponsiveEngine(NamedTuple): # pylint: disable=missing-class-docstring
159 engine: str
160 error_type: str
161 suspended: bool
162
163
165 """docstring for ResultContainer"""
166
167 __slots__ = (
168 '_merged_results',
169 'infoboxes',
170 'suggestions',
171 'answers',
172 'corrections',
173 '_number_of_results',
174 '_closed',
175 'paging',
176 'unresponsive_engines',
177 'timings',
178 'redirect_url',
179 'engine_data',
180 'on_result',
181 '_lock',
182 )
183
184 def __init__(self):
185 super().__init__()
186 self._merged_results = []
187 self.infoboxes = []
188 self.suggestions = set()
189 self.answers = {}
190 self.corrections = set()
191 self._number_of_results = []
192 self.engine_data = defaultdict(dict)
193 self._closed = False
194 self.paging = False
195 self.unresponsive_engines: Set[UnresponsiveEngine] = set()
196 self.timings: List[Timing] = []
197 self.redirect_url = None
198 self.on_result = lambda _: True
199 self._lock = RLock()
200
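    # extend() dispatches raw engine results by type: suggestions, answers,
    # corrections, infoboxes, result counts and engine_data are collected in
    # their own containers, while standard URL results are validated,
    # normalized, passed through the plugins (on_result) and merged with any
    # duplicates already stored.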
201 def extend(self, engine_name, results): # pylint: disable=too-many-branches
202 if self._closed:
203 return
204
205 standard_result_count = 0
206 error_msgs = set()
207 for result in list(results):
208 result['engine'] = engine_name
209 if 'suggestion' in result and self.on_result(result):
210 self.suggestions.add(result['suggestion'])
211 elif 'answer' in result and self.on_result(result):
212 self.answers[result['answer']] = result
213 elif 'correction' in result and self.on_result(result):
214 self.corrections.add(result['correction'])
215 elif 'infobox' in result and self.on_result(result):
216 self._merge_infobox(result)
217 elif 'number_of_results' in result and self.on_result(result):
218 self._number_of_results.append(result['number_of_results'])
219 elif 'engine_data' in result and self.on_result(result):
220 self.engine_data[engine_name][result['key']] = result['engine_data']
221 elif 'url' in result:
222 # standard result (url, title, content)
223 if not self._is_valid_url_result(result, error_msgs):
224 continue
225 # normalize the result
226 self._normalize_url_result(result)
227 # on_result calls searx.search.SearchWithPlugins._on_result,
228 # which in turn calls the plugins
229 if not self.on_result(result):
230 continue
231 self.__merge_url_result(result, standard_result_count + 1)
232 standard_result_count += 1
233 elif self.on_result(result):
234 self.__merge_result_no_url(result, standard_result_count + 1)
235 standard_result_count += 1
236
237 if len(error_msgs) > 0:
238 for msg in error_msgs:
239 count_error(engine_name, 'some results are invalids: ' + msg, secondary=True)
240
241 if engine_name in engines:
242 histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count')
243
244 if not self.paging and engine_name in engines and engines[engine_name].paging:
245 self.paging = True
246
247 def _merge_infobox(self, infobox):
248 add_infobox = True
249 infobox_id = infobox.get('id', None)
250 infobox['engines'] = set([infobox['engine']])
251 if infobox_id is not None:
252 parsed_url_infobox_id = urlparse(infobox_id)
253 with self._lock:
254 for existingIndex in self.infoboxes:
255 if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
256 merge_two_infoboxes(existingIndex, infobox)
257 add_infobox = False
258
259 if add_infobox:
260 self.infoboxes.append(infobox)
261
262 def _is_valid_url_result(self, result, error_msgs):
263 if 'url' in result:
264 if not isinstance(result['url'], str):
265 logger.debug('result: invalid URL: %s', str(result))
266 error_msgs.add('invalid URL')
267 return False
268
269 if 'title' in result and not isinstance(result['title'], str):
270 logger.debug('result: invalid title: %s', str(result))
271 error_msgs.add('invalid title')
272 return False
273
274 if 'content' in result:
275 if not isinstance(result['content'], str):
276 logger.debug('result: invalid content: %s', str(result))
277 error_msgs.add('invalid content')
278 return False
279
280 return True
281
282 def _normalize_url_result(self, result):
283 """Return True if the result is valid"""
284 result['parsed_url'] = urlparse(result['url'])
285
286 # if the result has no scheme, use http as default
287 if not result['parsed_url'].scheme:
288 result['parsed_url'] = result['parsed_url']._replace(scheme="http")
289 result['url'] = result['parsed_url'].geturl()
290
291 # avoid duplicate content between the content and title fields
292 if result.get('content') == result.get('title'):
293 del result['content']
294
295 # make sure there is a template
296 if 'template' not in result:
297 result['template'] = 'default.html'
298
299 # strip multiple spaces and carriage returns from content
300 if result.get('content'):
301 result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
302
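    # A new URL result is first checked against the already merged results: if
    # an entry with the same URL (per compare_urls) and the same template
    # exists, the two are merged instead of appending a duplicate.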
303 def __merge_url_result(self, result, position):
304 result['engines'] = set([result['engine']])
305 with self._lock:
306 duplicated = self.__find_duplicated_http_result(result)
307 if duplicated:
308 self.__merge_duplicated_http_result(duplicated, result, position)
309 return
310
311 # if there is no duplicate found, append result
312 result['positions'] = [position]
313 self._merged_results.append(result)
314
315 def __find_duplicated_http_result(self, result):
316 result_template = result.get('template')
317 for merged_result in self._merged_results:
318 if 'parsed_url' not in merged_result:
319 continue
320 if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
321 'template'
322 ):
323 if result_template != 'images.html':
324 # not an image, same template, same url : it's a duplicate
325 return merged_result
326
327 # it's an image
328 # at this point parsed_url and template already match; it's a duplicate only if the img_src is the same as well
329 if result.get('img_src', '') == merged_result.get('img_src', ''):
330 return merged_result
331 return None
332
333 def __merge_duplicated_http_result(self, duplicated, result, position):
334 # use content with more text
335 if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
336 duplicated['content'] = result['content']
337
338 # use title with more text
339 if result_content_len(result.get('title', '')) > len(duplicated.get('title', '')):
340 duplicated['title'] = result['title']
341
342 # merge all result's parameters not found in duplicate
343 for key in result.keys():
344 if not duplicated.get(key):
345 duplicated[key] = result.get(key)
346
347 # add the new position
348 duplicated['positions'].append(position)
349
350 # add engine to list of result-engines
351 duplicated['engines'].add(result['engine'])
352
353 # use https if possible
354 if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
355 duplicated['url'] = result['parsed_url'].geturl()
356 duplicated['parsed_url'] = result['parsed_url']
357
358 def __merge_result_no_url(self, result, position):
359 result['engines'] = set([result['engine']])
360 result['positions'] = [position]
361 with self._lock:
362 self._merged_results.append(result)
363
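    # close() finalizes the container: each merged result gets a score, results
    # are sorted by score, and a second pass groups results sharing the same
    # category/template (at most 8 per group, only within a window of 20
    # positions) so that similar results end up next to each other.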
364 def close(self):
365 self._closed = True
366
367 for result in self._merged_results:
368 result['score'] = result_score(result, result.get('priority'))
369 # removing html content and whitespace duplications
370 if result.get('content'):
371 result['content'] = result['content'].strip()
372 if result.get('title'):
373 result['title'] = ' '.join(result['title'].strip().split())
374
375 for result_engine in result['engines']:
376 counter_add(result['score'], 'engine', result_engine, 'score')
377
378 results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)
379
380 # pass 2 : group results by category and template
381 gresults = []
382 categoryPositions = {}
383
384 for res in results:
385 # do we need to handle more than one category per engine?
386 engine = engines[res['engine']]
387 res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''
388
389 # group key: category, template and whether the result carries an image
390 category = (
391 res['category']
392 + ':'
393 + res.get('template', '')
394 + ':'
395 + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')
396 )
397
398 current = None if category not in categoryPositions else categoryPositions[category]
399
400 # group with previous results using the same category
401 # if the group can accept more result and is not too far
402 # from the current position
403 if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
404 # group with the previous results using
405 # the same category with this one
406 index = current['index']
407 gresults.insert(index, res)
408
409 # update every index after the current one
410 # (including the current one)
411 for k in categoryPositions: # pylint: disable=consider-using-dict-items
412 v = categoryPositions[k]['index']
413 if v >= index:
414 categoryPositions[k]['index'] = v + 1
415
416 # update this category
417 current['count'] -= 1
418
419 else:
420 # no suitable group found: append at the end and start a new group for this category
421 gresults.append(res)
422
423 # update categoryPositions for this category
424 categoryPositions[category] = {'index': len(gresults), 'count': 8}
425
426 # update _merged_results
427 self._merged_results = gresults
428
429 def get_ordered_results(self):
430 if not self._closed:
431 self.close()
432 return self._merged_results
433
434 def results_length(self):
435 return len(self._merged_results)
436
437 @property
438 def number_of_results(self) -> int:
439 """Returns the average of results number, returns zero if the average
440 result number is smaller than the actual result count."""
441
442 with self._lock:
443 if not self._closed:
444 logger.error("call to ResultContainer.number_of_results before ResultContainer.close")
445 return 0
446
447 resultnum_sum = sum(self._number_of_results)
448 if not resultnum_sum or not self._number_of_results:
449 return 0
450
451 average = int(resultnum_sum / len(self._number_of_results))
452 if average < self.results_length():
453 average = 0
454 return average
455
456 def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
457 with self._lock:
458 if self._closed:
459 logger.error("call to ResultContainer.add_unresponsive_engine after ResultContainer.close")
460 return
461 if engines[engine_name].display_error_messages:
462 self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))
463
464 def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
465 with self._lock:
466 if self._closed:
467 logger.error("call to ResultContainer.add_timing after ResultContainer.close")
468 return
469 self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))
470
471 def get_timings(self):
472 with self._lock:
473 if not self._closed:
474 logger.error("call to ResultContainer.get_timings before ResultContainer.close")
475 return []
476 return self.timings
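Below is a minimal sketch of how a ResultContainer is typically driven, mirroring what searx.search does with it: results from each engine are fed in with extend(), close() scores, sorts and groups them, and get_ordered_results() returns the merged list. The engine names and result dicts are illustrative only; the sketch assumes those engines are configured in searx.engines.engines and that the metrics storage has been initialized, as in a normal SearXNG run.

from searx.results import ResultContainer

# 'engine-a' and 'engine-b' are placeholder names for configured engines
container = ResultContainer()
container.extend('engine-a', [
    {'url': 'https://example.com/', 'title': 'Example', 'content': 'an example page'},
    {'suggestion': 'example domains'},
])
container.extend('engine-b', [
    # same URL as above (modulo "www." and the trailing slash): merged into one
    # result, with both engines recorded in its 'engines' set
    {'url': 'http://www.example.com/', 'title': 'Example', 'content': 'example page'},
])
container.close()
for res in container.get_ordered_results():
    print(res['score'], res['url'], sorted(res['engines']))
print(container.number_of_results)  # 0 here, since no engine reported 'number_of_results'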