283 def _normalize_url_result(self, result):
284 """Return True if the result is valid"""
285 result['parsed_url'] = urlparse(result['url'])
287 # if the result has no scheme, use http as default
288 if not result['parsed_url'].scheme:
289 result['parsed_url'] = result['parsed_url']._replace(scheme="http")
290 result['url'] = result['parsed_url'].geturl()
292 # avoid duplicate content between the content and title fields
293 if result.get('content') == result.get('title'):
294 del result['content']
296 # make sure there is a template
297 if 'template' not in result:
298 result['template'] = 'default.html'
300 # strip multiple spaces and carriage returns from content
301 if result.get('content'):
302 result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
def __merge_url_result(self, result, position):
    """Merge a url result into the merged list.

    If an equivalent result was already collected, fold this one into it;
    otherwise append it as a new entry.
    """
    result['engines'] = set([result['engine']])
    duplicated = self.__find_duplicated_http_result(result)
    # only merge when a duplicate actually exists; merging into None (and
    # then appending anyway) would both crash and duplicate entries
    if duplicated:
        self.__merge_duplicated_http_result(duplicated, result, position)
        return

    # if there is no duplicate found, append result
    result['positions'] = [position]
    self._merged_results.append(result)
def __find_duplicated_http_result(self, result):
    """Return an already merged result that duplicates ``result``, or None."""
    result_template = result.get('template')
    for merged_result in self._merged_results:
        # results without a parsed_url (no-url results) can never duplicate
        # a url result
        if 'parsed_url' not in merged_result:
            continue
        if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
            'template'
        ):
            if result_template != 'images.html':
                # not an image, same template, same url : it's a duplicate
                return merged_result

            # for images it's only a duplicate when img_src also matches
            if result.get('img_src', '') == merged_result.get('img_src', ''):
                return merged_result
    return None
def __merge_duplicated_http_result(self, duplicated, result, position):
    """Fold ``result`` into the previously collected ``duplicated`` entry."""
    # keep whichever content field carries more text
    if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
        duplicated['content'] = result['content']

    # copy over every parameter that is missing (or falsy) in the duplicate
    for key, value in result.items():
        if not duplicated.get(key):
            duplicated[key] = value

    # record the new ranking position and the contributing engine
    duplicated['positions'].append(position)
    duplicated['engines'].add(result['engine'])

    # prefer the https variant of the url when one of the two offers it
    if result['parsed_url'].scheme == 'https' and duplicated['parsed_url'].scheme != 'https':
        duplicated['url'] = result['parsed_url'].geturl()
        duplicated['parsed_url'] = result['parsed_url']
def __merge_result_no_url(self, result, position):
    # a result without a url is never deduplicated: tag it with its engine
    # and ranking position, then append it to the merged list as-is
    result['engines'] = set([result['engine']])
    result['positions'] = [position]
    # NOTE(review): this appends to the shared _merged_results without any
    # visible synchronization — confirm callers serialize access (or whether
    # this append is meant to run under the container's lock)
    self._merged_results.append(result)
def close(self):
    """Finalize the container: score, clean and order the merged results,
    then regroup them so results sharing a category/template stay adjacent.

    Pass 1 scores each result and normalizes its text; pass 2 orders by
    score and groups results by a category/template/image key.
    """
    self._closed = True

    # pass 1 : score the results and normalize their text fields
    for result in self._merged_results:
        result['score'] = result_score(result, result.get('priority'))
        # removing html content and whitespace duplications
        if result.get('content'):
            result['content'] = result['content'].strip()
        if result.get('title'):
            result['title'] = ' '.join(result['title'].strip().split())

        # account each contributing engine's score in the metrics counters
        for result_engine in result['engines']:
            counter_add(result['score'], 'engine', result_engine, 'score')

    results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

    # pass 2 : group results by category and template
    gresults = []
    categoryPositions = {}

    for res in results:
        # do we need to handle more than one category per engine?
        engine = engines[res['engine']]
        res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''

        # grouping key: category, template and whether the result is an image
        category = (
            res['category']
            + ':'
            + res.get('template', '')
            + ':'
            + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')
        )

        current = None if category not in categoryPositions else categoryPositions[category]

        # group with previous results using the same category
        # if the group can accept more result and is not too far
        # from the current position
        if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
            # group with the previous results using
            # the same category with this one
            index = current['index']
            gresults.insert(index, res)

            # update every index after the current one
            # (including the current one)
            for k in categoryPositions:  # pylint: disable=consider-using-dict-items
                v = categoryPositions[k]['index']
                if v >= index:
                    categoryPositions[k]['index'] = v + 1

            # update this category
            current['count'] -= 1

        else:
            # start a new group for this category at the end of the list
            gresults.append(res)

            # update categoryIndex
            categoryPositions[category] = {'index': len(gresults), 'count': 8}

    # update _merged_results
    self._merged_results = gresults
@property
def number_of_results(self) -> int:
    """Returns the average of results number, returns zero if the average
    result number is smaller than the actual result count."""
    # only meaningful once close() has run; log and bail out otherwise
    if not self._closed:
        logger.error("call to ResultContainer.number_of_results before ResultContainer.close")
        return 0

    resultnum_sum = sum(self._number_of_results)
    if not resultnum_sum or not self._number_of_results:
        return 0

    average = int(resultnum_sum / len(self._number_of_results))
    # an engine-reported average below what we actually hold is implausible
    if average < self.results_length():
        return 0
    return average
def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
    """Record an engine that failed to answer, if its errors are displayable."""
    # reject late registrations: the log text documents the intended guard
    if self._closed:
        logger.error("call to ResultContainer.add_unresponsive_engine after ResultContainer.close")
        return
    if engines[engine_name].display_error_messages:
        self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))
def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
    """Record total and page-load timings for an engine."""
    # reject late registrations: the log text documents the intended guard
    if self._closed:
        logger.error("call to ResultContainer.add_timing after ResultContainer.close")
        return
    self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))