    def _normalize_url_result(self, result):
        """Return True if the result is valid"""
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        # avoid duplicate content between the content and title fields
        if result.get('content') == result.get('title'):
            del result['content']

        # make sure there is a template
        if 'template' not in result:
            result['template'] = 'default.html'

        # strip multiple spaces and carriage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

        return True

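    # A minimal illustration (not part of the class) of the default-scheme fix
    # above, using only stdlib calls; the URL is invented for the example. A
    # protocol-relative URL gains its scheme, and geturl() rebuilds the string:
    #
    #   >>> from urllib.parse import urlparse
    #   >>> u = urlparse('//example.org/page')
    #   >>> u.scheme
    #   ''
    #   >>> u._replace(scheme='http').geturl()
    #   'http://example.org/page'
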
    def __merge_url_result(self, result, position):
        result['engines'] = set([result['engine']])
        with self._lock:
            duplicated = self.__find_duplicated_http_result(result)
            if duplicated:
                self.__merge_duplicated_http_result(duplicated, result, position)
                return

            # if there is no duplicate found, append result
            result['positions'] = [position]
            self._merged_results.append(result)

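    # Sketch of the merge flow with invented values: when results from two
    # engines point at the same page, the second one is folded into the first
    # instead of being appended, e.g.
    #
    #   {'url': 'https://example.org/', 'engine': 'alpha'}   at position 1
    #   {'url': 'http://example.org/',  'engine': 'beta'}    at position 3
    #
    # end up as a single merged entry with engines == {'alpha', 'beta'} and
    # positions == [1, 3] (assuming compare_urls treats the two URLs as equal).
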
    def __find_duplicated_http_result(self, result):
        result_template = result.get('template')
        for merged_result in self._merged_results:
            if 'parsed_url' not in merged_result:
                continue
            if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
                'template'
            ):
                if result_template != 'images.html':
                    # not an image, same template, same url: it's a duplicate
                    return merged_result

                # it's an image: only a duplicate if the img_src is also the same
                if result.get('img_src', '') == merged_result.get('img_src', ''):
                    return merged_result
        return None

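    # Note: compare_urls() is imported from elsewhere in searx; it is assumed
    # here to ignore minor URL variants (such as the http vs https scheme), so
    # that the https preference in __merge_duplicated_http_result can apply.
    # For 'images.html' results the URL alone is not discriminating enough,
    # since one gallery page can host many distinct images, hence the extra
    # img_src comparison.
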
    def __merge_duplicated_http_result(self, duplicated, result, position):
        # use content with more text
        if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
            duplicated['content'] = result['content']

        # use title with more text
        if result_content_len(result.get('title', '')) > result_content_len(duplicated.get('title', '')):
            duplicated['title'] = result['title']

        # merge all result's parameters not found in duplicate
        for key in result.keys():
            if not duplicated.get(key):
                duplicated[key] = result.get(key)

        # add the new position
        duplicated['positions'].append(position)

        # add engine to list of result-engines
        duplicated['engines'].add(result['engine'])

        # use https if possible
        if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
            duplicated['url'] = result['parsed_url'].geturl()
            duplicated['parsed_url'] = result['parsed_url']

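    # Worked example with invented values for the merge above:
    #
    #   duplicated = {'content': 'short', 'positions': [2], 'engines': {'alpha'},
    #                 'url': 'http://example.org/', ...}
    #   result     = {'content': 'a much longer snippet', 'engine': 'beta',
    #                 'url': 'https://example.org/', ...}
    #
    # After __merge_duplicated_http_result(duplicated, result, 5):
    #   content   -> 'a much longer snippet'  (longer text wins)
    #   positions -> [2, 5]
    #   engines   -> {'alpha', 'beta'}
    #   url       -> 'https://example.org/'   (https preferred over http)
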
    def __merge_result_no_url(self, result, position):
        result['engines'] = set([result['engine']])
        result['positions'] = [position]
        with self._lock:
            self._merged_results.append(result)

    def close(self):
        self._closed = True

        for result in self._merged_results:
            result['score'] = result_score(result, result.get('priority'))
            # strip leading/trailing whitespace; collapse whitespace runs in the title
            if result.get('content'):
                result['content'] = result['content'].strip()
            if result.get('title'):
                result['title'] = ' '.join(result['title'].strip().split())

            for result_engine in result['engines']:
                counter_add(result['score'], 'engine', result_engine, 'score')

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        categoryPositions = {}

        for res in results:
            # do we need to handle more than one category per engine?
            engine = engines[res['engine']]
            res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''

            # grouping key: category, template and media type (img_src / thumbnail)
            category = (
                res['category']
                + ':'
                + res.get('template', '')
                + ':'
                + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')
            )

            current = None if category not in categoryPositions else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more results and is not too far
            # from the current position
            if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:  # pylint: disable=consider-using-dict-items
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1
            else:
                # start a new group at the end of the list
                gresults.append(res)

                # update the category's position entry
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        # update _merged_results
        self._merged_results = gresults

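    # Example of the grouping key built above (engine and category invented):
    # an image result rendered with 'images.html' that carries an img_src gets
    # the key 'images:images.html:img_src'. At most 8 results sharing a key are
    # pulled together, and only while the group's start is fewer than 20
    # positions behind the end of gresults.
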
    def get_ordered_results(self):
        if not self._closed:
            self.close()
        return self._merged_results

    def results_length(self):
        return len(self._merged_results)

    @property
    def number_of_results(self) -> int:
        """Returns the average of the engines' "number of results" values;
        returns zero if that average is smaller than the actual result count."""
        if not self._closed:
            logger.error("call to ResultContainer.number_of_results before ResultContainer.close")
            return 0

        resultnum_sum = sum(self._number_of_results)
        if not resultnum_sum or not self._number_of_results:
            return 0

        average = int(resultnum_sum / len(self._number_of_results))
        if average < self.results_length():
            average = 0
        return average

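    # Worked example: three engines reported totals of 120, 80 and 100, so the
    # average is int(300 / 3) == 100. If the container actually holds, say, 24
    # merged results, 100 >= 24 and 100 is returned; had the engines' totals
    # averaged below 24, the property would return 0 instead.
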
    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
        if self._closed:
            logger.error("call to ResultContainer.add_unresponsive_engine after ResultContainer.close")
            return
        if engines[engine_name].display_error_messages:
            self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
        if self._closed:
            logger.error("call to ResultContainer.add_timing after ResultContainer.close")
            return
        self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))