results.py
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring

import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote

from searx import logger
from searx import utils
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error


# characters ignored when computing a result's "meaningful" content length;
# the '-' is placed last in the character class so it matches a literal hyphen
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\ ()_-]', re.M | re.U)
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)


# return the meaningful length of the content for a result
def result_content_len(content):
    if isinstance(content, str):
        return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
    return 0
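

# Illustrative usage sketch (not part of the upstream module): separators,
# punctuation and whitespace are stripped, so only the remaining characters
# count toward the "meaningful" length.
#
#   >>> result_content_len('Hello, world! (test)')
#   14
#   >>> result_content_len(None)
#   0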


def compare_urls(url_a, url_b):
    """Lazily compare two URLs.
    "www.example.com" and "example.com" are equal.
    "www.example.com/path/" and "www.example.com/path" are equal.
    "https://www.example.com/" and "http://www.example.com/" are equal.

    Args:
        url_a (ParseResult): first URL
        url_b (ParseResult): second URL

    Returns:
        bool: True if url_a and url_b are equal
    """
    # ignore www. in comparison
    if url_a.netloc.startswith('www.'):
        host_a = url_a.netloc.replace('www.', '', 1)
    else:
        host_a = url_a.netloc
    if url_b.netloc.startswith('www.'):
        host_b = url_b.netloc.replace('www.', '', 1)
    else:
        host_b = url_b.netloc

    if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
        return False

    # remove / from the end of the url if required
    path_a = url_a.path[:-1] if url_a.path.endswith('/') else url_a.path
    path_b = url_b.path[:-1] if url_b.path.endswith('/') else url_b.path

    return unquote(path_a) == unquote(path_b)
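

# Illustrative sketch (not upstream code): compare_urls() ignores the scheme,
# a leading "www." and a trailing slash, but not the query or fragment.
#
#   >>> from urllib.parse import urlparse
#   >>> compare_urls(urlparse('https://www.example.com/path/'),
#   ...              urlparse('http://example.com/path'))
#   True
#   >>> compare_urls(urlparse('http://example.com/?q=a'),
#   ...              urlparse('http://example.com/?q=b'))
#   False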


def merge_two_infoboxes(infobox1, infobox2):  # pylint: disable=too-many-branches, too-many-statements
    """Merge infobox2 into infobox1 (in place)."""
    # get the engine weights
    weight1 = getattr(engines[infobox1['engine']], 'weight', 1)
    weight2 = getattr(engines[infobox2['engine']], 'weight', 1)

    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    infobox1['engines'] |= infobox2['engines']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            unique_url = True
            parsed_url2 = urlparse(url2.get('url', ''))
            entity_url2 = url2.get('entity')
            for url1 in urls1:
                if (entity_url2 is not None and url1.get('entity') == entity_url2) or compare_urls(
                    urlparse(url1.get('url', '')), parsed_url2
                ):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        # take the second image if the first is missing or the second engine
        # has the greater weight
        if img1 is None or weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes')
        if attributes1 is None:
            infobox1['attributes'] = attributes1 = []

        attributeSet = set()
        for attribute in attributes1:
            label = attribute.get('label')
            if label not in attributeSet:
                attributeSet.add(label)
            entity = attribute.get('entity')
            if entity not in attributeSet:
                attributeSet.add(entity)

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label') not in attributeSet and attribute.get('entity') not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        # keep whichever content has the greater meaningful length
        if content1 is None or result_content_len(content2) > result_content_len(content1):
            infobox1['content'] = content2
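

# Illustrative sketch (not upstream code), assuming 'wikidata' and 'wikipedia'
# are configured engines: merging keeps the union of the engine names and the
# content with the greater meaningful length.
#
#   ib1 = {'engine': 'wikidata', 'engines': {'wikidata'}, 'content': 'short'}
#   ib2 = {'engine': 'wikipedia', 'engines': {'wikipedia'},
#          'content': 'a noticeably longer description'}
#   merge_two_infoboxes(ib1, ib2)
#   # ib1['engines'] == {'wikidata', 'wikipedia'}
#   # ib1['content'] == 'a noticeably longer description'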


# score a merged result: the weights of the reporting engines are multiplied
# together, and positions near the top of an engine's list (small numbers)
# contribute more
def result_score(result):
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(engines[result_engine], 'weight'):
            weight *= float(engines[result_engine].weight)

    occurrences = len(result['positions'])

    return sum((occurrences * weight) / position for position in result['positions'])
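

# Worked example (illustrative): a result reported at positions 1 and 3 by
# engines with a combined weight of 1.0 has occurrences = 2, so its score is
# 2/1 + 2/3 ≈ 2.67 -- more occurrences and smaller positions both raise it.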


class Timing(NamedTuple):  # pylint: disable=missing-class-docstring
    engine: str
    total: float
    load: float


class UnresponsiveEngine(NamedTuple):  # pylint: disable=missing-class-docstring
    engine: str
    error_type: str
    suspended: bool
158 """docstring for ResultContainer"""

    __slots__ = (
        '_merged_results',
        'infoboxes',
        'suggestions',
        'answers',
        'corrections',
        '_number_of_results',
        '_closed',
        'paging',
        'unresponsive_engines',
        'timings',
        'redirect_url',
        'engine_data',
        'on_result',
        '_lock',
    )

    def __init__(self):
        super().__init__()
        self._merged_results = []
        self.infoboxes = []
        self.suggestions = set()
        self.answers = {}
        self.corrections = set()
        self._number_of_results = []
        self.engine_data = defaultdict(dict)
        self._closed = False
        self.paging = False
        self.unresponsive_engines: Set[UnresponsiveEngine] = set()
        self.timings: List[Timing] = []
        self.redirect_url = None
        self.on_result = lambda _: True
        self._lock = RLock()

    def extend(self, engine_name, results):  # pylint: disable=too-many-branches
        if self._closed:
            return

        standard_result_count = 0
        error_msgs = set()
        for result in list(results):
            result['engine'] = engine_name
            if 'suggestion' in result and self.on_result(result):
                self.suggestions.add(result['suggestion'])
            elif 'answer' in result and self.on_result(result):
                self.answers[result['answer']] = result
            elif 'correction' in result and self.on_result(result):
                self.corrections.add(result['correction'])
            elif 'infobox' in result and self.on_result(result):
                self._merge_infobox(result)
            elif 'number_of_results' in result and self.on_result(result):
                self._number_of_results.append(result['number_of_results'])
            elif 'engine_data' in result and self.on_result(result):
                self.engine_data[engine_name][result['key']] = result['engine_data']
            elif 'url' in result:
                # standard result (url, title, content)
                if not self._is_valid_url_result(result, error_msgs):
                    continue
                # normalize the result
                self._normalize_url_result(result)
                # on_result is searx.search.SearchWithPlugins._on_result,
                # which calls the plugins
                if not self.on_result(result):
                    continue
                self.__merge_url_result(result, standard_result_count + 1)
                standard_result_count += 1
            elif self.on_result(result):
                self.__merge_result_no_url(result, standard_result_count + 1)
                standard_result_count += 1

        if len(error_msgs) > 0:
            for msg in error_msgs:
                count_error(engine_name, 'some results are invalid: ' + msg, secondary=True)

        if engine_name in engines:
            histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count')

        if not self.paging and engine_name in engines and engines[engine_name].paging:
            self.paging = True
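
    # Illustrative sketch (not upstream code): engine modules yield plain
    # dicts and extend() dispatches on the keys present in each one.
    # Assuming 'ddg' is a configured engine:
    #
    #   container = ResultContainer()
    #   container.extend('ddg', [
    #       {'url': 'https://example.com', 'title': 'Example', 'content': '...'},
    #       {'suggestion': 'example query'},
    #   ])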

    def _merge_infobox(self, infobox):
        add_infobox = True
        infobox_id = infobox.get('id', None)
        infobox['engines'] = set([infobox['engine']])
        if infobox_id is not None:
            parsed_url_infobox_id = urlparse(infobox_id)
            with self._lock:
                for existingIndex in self.infoboxes:
                    if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
                        merge_two_infoboxes(existingIndex, infobox)
                        add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)

    def _is_valid_url_result(self, result, error_msgs):
        if 'url' in result:
            if not isinstance(result['url'], str):
                logger.debug('result: invalid URL: %s', str(result))
                error_msgs.add('invalid URL')
                return False

        if 'title' in result and not isinstance(result['title'], str):
            logger.debug('result: invalid title: %s', str(result))
            error_msgs.add('invalid title')
            return False

        if 'content' in result:
            if not isinstance(result['content'], str):
                logger.debug('result: invalid content: %s', str(result))
                error_msgs.add('invalid content')
                return False

        return True

    def _normalize_url_result(self, result):
        """Normalize the result in place (parsed_url, scheme, template,
        whitespace); the result is already known to be valid."""
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        # avoid duplicate content between the content and title fields
        if result.get('content') == result.get('title'):
            del result['content']

        # make sure there is a template
        if 'template' not in result:
            result['template'] = 'default.html'

        # strip multiple spaces and carriage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
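
    # Illustrative sketch (not upstream code): a schemeless URL gets "http"
    # and whitespace runs in the content are collapsed.
    #
    #   result = {'url': '//example.com/a', 'title': 'A', 'content': 'x  \n y'}
    #   # after _normalize_url_result(result):
    #   #   result['url']      == 'http://example.com/a'
    #   #   result['template'] == 'default.html'
    #   #   result['content']  == 'x y'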

    def __merge_url_result(self, result, position):
        result['engines'] = set([result['engine']])
        with self._lock:
            duplicated = self.__find_duplicated_http_result(result)
            if duplicated:
                self.__merge_duplicated_http_result(duplicated, result, position)
                return

            # if there is no duplicate found, append result
            result['positions'] = [position]
            self._merged_results.append(result)

    def __find_duplicated_http_result(self, result):
        result_template = result.get('template')
        for merged_result in self._merged_results:
            if 'parsed_url' not in merged_result:
                continue
            if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
                'template'
            ):
                if result_template != 'images.html':
                    # not an image, same template, same url : it's a duplicate
                    return merged_result

                # it's an image
                # it's a duplicate only if the img_src is the same as well
                if result.get('img_src', '') == merged_result.get('img_src', ''):
                    return merged_result
        return None

    def __merge_duplicated_http_result(self, duplicated, result, position):
        # use the content with more text
        if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
            duplicated['content'] = result['content']

        # merge all result's parameters not found in duplicate
        for key in result.keys():
            if not duplicated.get(key):
                duplicated[key] = result.get(key)

        # add the new position
        duplicated['positions'].append(position)

        # add engine to list of result-engines
        duplicated['engines'].add(result['engine'])

        # use https if possible
        if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
            duplicated['url'] = result['parsed_url'].geturl()
            duplicated['parsed_url'] = result['parsed_url']

    def __merge_result_no_url(self, result, position):
        result['engines'] = set([result['engine']])
        result['positions'] = [position]
        with self._lock:
            self._merged_results.append(result)

    def close(self):
        self._closed = True

        for result in self._merged_results:
            score = result_score(result)
            result['score'] = score

            # remove html content and whitespace duplications
            if result.get('content'):
                result['content'] = utils.html_to_text(result['content']).strip()
            if result.get('title'):
                result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split())

            for result_engine in result['engines']:
                counter_add(score, 'engine', result_engine, 'score')

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        categoryPositions = {}

        for res in results:
            # do we need to handle more than one category per engine?
            engine = engines[res['engine']]
            res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''

            category = (
                res['category']
                + ':'
                + res.get('template', '')
                + ':'
                + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')
            )

            current = None if category not in categoryPositions else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more results and is not too far
            # from the current position
            if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
                # group with the previous results of the same category
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:  # pylint: disable=consider-using-dict-items
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1

            else:
                # start a new group for this category
                gresults.append(res)

                # remember the position of the new group
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        # update _merged_results
        self._merged_results = gresults
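
    # Worked example (illustrative): a result appended at position 2 opens its
    # category group as {'index': 3, 'count': 8}. A later result of the same
    # category is inserted at index 3, the group becomes {'index': 4,
    # 'count': 7}, and every other stored index >= 3 also shifts up by one.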

    def get_ordered_results(self):
        if not self._closed:
            self.close()
        return self._merged_results

    def results_length(self):
        return len(self._merged_results)

    @property
    def number_of_results(self) -> int:
        """Return the average of the result counts reported by the engines;
        return zero if no engine reported a count or if that average is
        smaller than the number of results actually collected."""

        resultnum_sum = sum(self._number_of_results)
        if not resultnum_sum or not self._number_of_results:
            return 0

        average = int(resultnum_sum / len(self._number_of_results))
        if average < self.results_length():
            average = 0
        return average
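
    # Worked example (illustrative): reported counts [1000, 2000] average to
    # 1500, which is returned as long as fewer than 1500 results were actually
    # merged; otherwise the property returns 0.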

    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
        if engines[engine_name].display_error_messages:
            self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
        self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))

    def get_timings(self):
        return self.timings