275 def _normalize_url_result(self, result):
276 """Return True if the result is valid"""
277 result['parsed_url'] = urlparse(result['url'])
279 # if the result has no scheme, use http as default
280 if not result['parsed_url'].scheme:
281 result['parsed_url'] = result['parsed_url']._replace(scheme="http")
282 result['url'] = result['parsed_url'].geturl()
284 # avoid duplicate content between the content and title fields
285 if result.get('content') == result.get('title'):
286 del result['content']
288 # make sure there is a template
289 if 'template' not in result:
290 result['template'] = 'default.html'
292 # strip multiple spaces and carriage returns from content
293 if result.get('content'):
294 result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])
def __merge_url_result(self, result, position):
    """Merge *result* (ranked at *position*) into the collected results.

    If an already merged result duplicates it, fold the new result into
    that entry; otherwise append it as a fresh merged result.
    """
    result['engines'] = set([result['engine']])

    duplicated = self.__find_duplicated_http_result(result)
    # only merge when a duplicate actually exists — the unguarded call
    # would pass None into __merge_duplicated_http_result and then
    # append the result a second time
    if duplicated:
        self.__merge_duplicated_http_result(duplicated, result, position)
        return

    # if there is no duplicate found, append result
    result['positions'] = [position]
    self._merged_results.append(result)
def __find_duplicated_http_result(self, result):
    """Return an already merged result that duplicates *result*, or None.

    Two results are duplicates when their parsed URLs compare equal and
    they use the same template; image results must additionally share
    the same ``img_src``.
    """
    result_template = result.get('template')
    for merged_result in self._merged_results:
        # results without a URL (infoboxes, answers, …) never duplicate
        # an URL result
        if 'parsed_url' not in merged_result:
            continue
        if compare_urls(result['parsed_url'], merged_result['parsed_url']) and result_template == merged_result.get(
            'template'
        ):
            if result_template != 'images.html':
                # not an image, same template, same url : it's a duplicate
                return merged_result

            # it's an image: only a duplicate when the img_src matches too
            if result.get('img_src', '') == merged_result.get('img_src', ''):
                return merged_result
    return None
def __merge_duplicated_http_result(self, duplicated, result, position):
    """Fold *result* (ranked at *position*) into its already merged
    *duplicated* counterpart, keeping the richer of the two entries."""
    # keep whichever content field carries more text
    if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
        duplicated['content'] = result['content']

    # copy over every field the merged entry has no truthy value for yet
    for key, value in result.items():
        if not duplicated.get(key):
            duplicated[key] = value

    # remember where this duplicate ranked
    duplicated['positions'].append(position)

    # record the engine that produced the duplicate as well
    duplicated['engines'].add(result['engine'])

    # prefer the https variant of the URL when only the new result has it
    if result['parsed_url'].scheme == 'https' and duplicated['parsed_url'].scheme != 'https':
        duplicated['url'] = result['parsed_url'].geturl()
        duplicated['parsed_url'] = result['parsed_url']
def __merge_result_no_url(self, result, position):
    """Register a result that carries no URL: tag it with its engine and
    position and append it to the merged list (no deduplication)."""
    result['engines'] = set([result['engine']])
    result['positions'] = [position]
    # NOTE(review): appended without an explicit lock here — confirm the
    # caller serializes access to self._merged_results
    self._merged_results.append(result)
# NOTE(review): this span is the body of the scoring/ordering pass — the
# enclosing `def` header is not visible in this chunk, and several control
# lines appear to have been lost; the comments below flag each suspected gap.

# pass 1 : score each merged result and strip html markup from its text
for result in self._merged_results:
    score = result_score(result)
    result['score'] = score

    # removing html content and whitespace duplications
    if result.get('content'):
        result['content'] = utils.html_to_text(result['content']).strip()
    if result.get('title'):
        result['title'] = ' '.join(utils.html_to_text(result['title']).strip().split())

    # per-engine score accounting (counter_add is a project metrics helper)
    for result_engine in result['engines']:
        counter_add(score, 'engine', result_engine, 'score')

# order results by descending score before grouping
results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

# pass 2 : group results by category and template
categoryPositions = {}

# NOTE(review): a `for res in results:` loop header appears to be missing
# here — the statements below reference `res`
# do we need to handle more than one category per engine?
engine = engines[res['engine']]
res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''

# do we need to handle more than one category per engine?
# NOTE(review): the start of the `category` key expression is missing;
# the next two lines are continuations of a string concatenation
+ res.get('template', '')
+ ('img_src' if 'img_src' in res or 'thumbnail' in res else '')

current = None if category not in categoryPositions else categoryPositions[category]

# group with previous results using the same category
# if the group can accept more result and is not too far
# from the current position
if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
    # group with the previous results using
    # the same category with this one
    index = current['index']
    gresults.insert(index, res)

    # update every index after the current one
    # (including the current one)
    for k in categoryPositions:  # pylint: disable=consider-using-dict-items
        v = categoryPositions[k]['index']
        # NOTE(review): an `if v >= index:` guard appears to be missing here —
        # unconditionally bumping every index looks wrong; confirm upstream
        categoryPositions[k]['index'] = v + 1

    # update this category
    current['count'] -= 1

# NOTE(review): the `else:` branch that appends `res` to `gresults` for a
# new category appears to be missing before the line below
# update categoryIndex
categoryPositions[category] = {'index': len(gresults), 'count': 8}

# update _merged_results
self._merged_results = gresults
def number_of_results(self) -> int:
    """Returns the average of results number, returns zero if the average
    result number is smaller than the actual result count."""
    resultnum_sum = sum(self._number_of_results)
    # nothing reported (or empty list): there is no meaningful average
    if not resultnum_sum or not self._number_of_results:
        return 0

    average = int(resultnum_sum / len(self._number_of_results))
    if average < self.results_length():
        # the engines' reported totals are implausibly small — ignore them
        return 0
    return average
def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
    """Record *engine_name* as unresponsive with the given *error_type*,
    unless the engine is configured to hide error messages."""
    engine = engines[engine_name]
    if not engine.display_error_messages:
        return
    self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))
def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
    """Append a Timing record: total engine time and page-load time."""
    timing = Timing(engine_name, total=engine_time, load=page_load_time)
    self.timings.append(timing)