.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
_base.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2# pylint: disable=too-few-public-methods, missing-module-docstring
3"""Basic types for the typification of results.
4
5- :py:obj:`Result` base class
6- :py:obj:`LegacyResult` for internal use only
7
8----
9
10.. autoclass:: Result
11 :members:
12
13.. _LegacyResult:
14
15.. autoclass:: LegacyResult
16 :members:
17"""
18
19
20from __future__ import annotations
21
22__all__ = ["Result"]
23
24import re
25import urllib.parse
26import warnings
27import typing
28import time
29import datetime
30
31from collections.abc import Callable
32
33import msgspec
34
35from searx import logger as log
36
37WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
38UNKNOWN = object()
39
40
def _normalize_url_fields(result: Result | LegacyResult):
    """Normalize ``url`` / ``parsed_url`` of *result* in place.

    - A non-string ``url`` is logged and reset (``url = ""``,
      ``parsed_url = None``); a string ``url`` without ``parsed_url`` is parsed.
    - A ``parsed_url`` without scheme gets ``http`` and ``url`` is rebuilt
      from it.
    - For a :py:obj:`LegacyResult` with a truthy ``infobox`` field, the same
      default scheme is applied to each ``url`` in ``urls`` and to ``id``.
    """

    # As soon we need LegacyResult not any longer, we can move this function to
    # method Result.normalize_result_fields

    def with_default_scheme(parts):
        # if the URL has no scheme, use http as default
        return parts._replace(scheme=parts.scheme or "http")

    if result.url and not result.parsed_url:
        if isinstance(result.url, str):
            result.parsed_url = urllib.parse.urlparse(result.url)
        else:
            log.debug('result: invalid URL: %s', str(result))
            result.url = ""
            result.parsed_url = None

    if result.parsed_url:
        result.parsed_url = with_default_scheme(result.parsed_url)
        result.url = result.parsed_url.geturl()

    if not (isinstance(result, LegacyResult) and getattr(result, "infobox", None)):
        return

    # As soon we have InfoboxResult, we can move the part below to method
    # InfoboxResult.normalize_result_fields

    for link in getattr(result, "urls", []):
        raw_url = link.get("url")
        if raw_url:
            link["url"] = with_default_scheme(urllib.parse.urlparse(raw_url)).geturl()

    infobox_id = getattr(result, "id", None)
    if infobox_id:
        result.id = with_default_scheme(urllib.parse.urlparse(infobox_id)).geturl()
86
87
def _normalize_text_fields(result: MainResult | LegacyResult):
    """Normalize the ``title`` and ``content`` fields of *result* in place.

    - A truthy value that is not a ``str`` is logged and converted with
      ``str()`` (the value of the field itself, not the whole result).
    - Runs of spaces, tabs and newlines are collapsed to one space and the
      text is stripped.
    - If ``content`` equals ``title``, ``content`` is emptied.
    """

    # As soon we need LegacyResult not any longer, we can move this function to
    # method MainResult.normalize_result_fields

    # Actually, a type check should not be necessary if the engine is
    # implemented correctly.  Historically, however, we have always had a type
    # check here.

    if result.title and not isinstance(result.title, str):
        log.debug("result: invalid type of field 'title': %s", str(result))
        # convert the field value; str(result) would put the representation of
        # the entire result into the title
        result.title = str(result.title)
    if result.content and not isinstance(result.content, str):
        log.debug("result: invalid type of field 'content': %s", str(result))
        result.content = str(result.content)

    # normalize title and content
    if result.title:
        result.title = WHITESPACE_REGEX.sub(" ", result.title).strip()
    if result.content:
        result.content = WHITESPACE_REGEX.sub(" ", result.content).strip()
    if result.content == result.title:
        # avoid duplicate content between the content and title fields
        result.content = ""
112
113
def _filter_urls(result: Result | LegacyResult, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]):
    """Apply *filter_func* to every URL carried by *result* (in place).

    ``filter_func(result, field_name, url)`` is called for each non-empty URL
    and returns ``True`` (keep), ``False`` (drop) or a replacement value.

    Handled are the plain URL fields, the ``url`` of each item in ``urls``
    (field name ``"infobox_urls"``) and the ``image.src`` of each item in
    ``attributes`` (field name ``"infobox_attributes"``).  Finally
    ``result.normalize_result_fields()`` is called.
    """
    # pylint: disable=too-many-branches, too-many-statements

    # As soon we need LegacyResult not any longer, we can move this function to
    # method Result.

    for name in ("url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"):
        current = getattr(result, name, "")
        if not current:
            continue

        verdict = filter_func(result, name, current)
        if verdict is True:
            # URL stays unchanged
            continue
        if verdict is False:
            log.debug("filter_urls: drop field %s URL %s", name, current)
            verdict = None
        else:
            log.debug("filter_urls: modify field %s URL %s -> %s", name, current, verdict)

        setattr(result, name, verdict)
        if name != "url":
            continue

        # sync parsed_url with the new value of url
        if not verdict:
            result.parsed_url = None
        elif isinstance(verdict, str):
            result.parsed_url = urllib.parse.urlparse(verdict)

    # "urls": are from infobox
    #
    # As soon we have InfoboxResult, we can move this part to method
    # InfoboxResult.normalize_result_fields

    links: list[dict[str, str]] = getattr(result, "urls", [])
    if links:
        kept_links: list[dict[str, str]] = []
        for link in links:
            current = link.get("url")
            if not current:
                kept_links.append(link)
                continue

            verdict = filter_func(result, "infobox_urls", current)
            if verdict is True:
                kept_links.append(link)
                continue
            if verdict is False:
                log.debug("filter_urls: remove URL from field 'urls' ('infobox_urls') URL %s", current)
                continue
            if verdict:
                log.debug("filter_urls: modify URL from field 'urls' ('infobox_urls') URL %s -> %s", current, verdict)
                link["url"] = verdict
                kept_links.append(link)

        setattr(result, "urls", kept_links)

    # "attributes": are from infobox
    #
    # The infobox has additional subsections for attributes, urls and relatedTopics:

    attributes: list[dict[str, dict]] = getattr(result, "attributes", [])
    if attributes:
        kept_attributes: list[dict[str, dict]] = []
        for attribute in attributes:
            current = attribute.get("image", {}).get("src", "")
            if not current:
                kept_attributes.append(attribute)
                continue

            verdict = filter_func(result, "infobox_attributes", current)
            if verdict is True:
                kept_attributes.append(attribute)
                continue
            if verdict is False:
                log.debug("filter_urls: drop field 'image.src' ('infobox_attributes') URL %s", current)
                continue
            if verdict:
                log.debug(
                    "filter_urls: modify 'image.src' ('infobox_attributes') URL %s -> %s",
                    current,
                    verdict,
                )
                attribute["image"]["src"] = verdict
                kept_attributes.append(attribute)

        setattr(result, "attributes", kept_attributes)

    result.normalize_result_fields()
216
217
def _normalize_date_fields(result: MainResult | LegacyResult):
    """Derive ``pubdate`` from ``publishedDate`` (in place).

    Nothing happens when ``publishedDate`` is falsy.  If formatting raises
    :py:obj:`ValueError`, ``publishedDate`` is reset to ``None`` and
    ``pubdate`` is left untouched.
    """
    published = result.publishedDate
    if not published:
        # do not try to get a date from an empty string or a None type
        return

    try:
        # test if publishedDate >= 1900 (datetime module bug)
        formatted = published.strftime('%Y-%m-%d %H:%M:%S%z')
    except ValueError:
        result.publishedDate = None
    else:
        result.pubdate = formatted
225
226
227class Result(msgspec.Struct, kw_only=True):
228 """Base class of all result types :ref:`result types`."""
229
230 url: str | None = None
231 """A link related to this *result*"""
232
233 template: str = "default.html"
234 """Name of the template used to render the result.
235
236 By default :origin:`result_templates/default.html
237 <searx/templates/simple/result_templates/default.html>` is used.
238 """
239
240 engine: str | None = ""
241 """Name of the engine *this* result comes from. In case of *plugins* a
242 prefix ``plugin:`` is set, in case of *answerer* prefix ``answerer:`` is
243 set.
244
245 The field is optional and is initialized from the context if necessary.
246 """
247
248 parsed_url: urllib.parse.ParseResult | None = None
249 """:py:obj:`urllib.parse.ParseResult` of :py:obj:`Result.url`.
250
251 The field is optional and is initialized from the context if necessary.
252 """
253
255 """Normalize fields ``url`` and ``parsed_url``.
256
257 - If field ``url`` is set and field ``parsed_url`` is unset, init
258 ``parsed_url`` from field ``url``. The ``url`` field is initialized
259 with the resulting value in ``parsed_url``, if ``url`` and
260 ``parsed_url`` are not equal.
261 """
263
264 def __post_init__(self):
265 pass
266
267 def filter_urls(self, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]):
268 """A filter function is passed in the ``filter_func`` argument to
269 filter and/or modify the URLs.
270
271 The filter function receives the :py:obj:`result object <Result>` as
272 the first argument and the field name (``str``) in the second argument.
273 In the third argument the URL string value is passed to the filter function.
274
275 The filter function is applied to all fields that contain a URL,
276 in addition to the familiar ``url`` field, these include fields such as::
277
278 ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"]
279
280 and the ``urls`` list of items of the infobox.
281
282 For each field, the filter function is called and returns a bool or a
283 string value:
284
285 - ``True``: leave URL in field unchanged
286 - ``False``: remove URL field from result (or remove entire result)
287 - ``str``: modified URL to be used instead
288
289 See :ref:`filter urls example`.
290
291 """
292 _filter_urls(self, filter_func=filter_func)
293
294 def __hash__(self) -> int:
295 """Generates a hash value that uniquely identifies the content of *this*
296 result. The method can be adapted in the inheritance to compare results
297 from different sources.
298
299 If two result objects are not identical but have the same content, their
300 hash values should also be identical.
301
302 The hash value is used in contexts, e.g. when checking for equality to
303 identify identical results from different sources (engines).
304 """
305 return id(self)
306
307 def __eq__(self, other):
308 """py:obj:`Result` objects are equal if the hash values of the two
309 objects are equal. If needed, its recommended to overwrite
310 "py:obj:`Result.__hash__`."""
311
312 return hash(self) == hash(other)
313
314 # for legacy code where a result is treated as a Python dict
315
316 def __setitem__(self, field_name, value):
317
318 return setattr(self, field_name, value)
319
320 def __getitem__(self, field_name):
321
322 if field_name not in self.__struct_fields__:
323 raise KeyError(f"{field_name}")
324 return getattr(self, field_name)
325
326 def __iter__(self):
327
328 return iter(self.__struct_fields__)
329
330 def as_dict(self):
331 return {f: getattr(self, f) for f in self.__struct_fields__}
332
333 def defaults_from(self, other: Result):
334 """Fields not set in *self* will be updated from the field values of the
335 *other*.
336 """
337 for field_name in self.__struct_fields__:
338 self_val = getattr(self, field_name, False)
339 other_val = getattr(other, field_name, False)
340 if self_val:
341 setattr(self, field_name, other_val)
342
343
344class MainResult(Result): # pylint: disable=missing-class-docstring
345 """Base class of all result types displayed in :ref:`area main results`."""
346
347 title: str = ""
348 """Link title of the result item."""
349
350 content: str = ""
351 """Extract or description of the result item"""
352
353 img_src: str = ""
354 """URL of a image that is displayed in the result item."""
355
356 thumbnail: str = ""
357 """URL of a thumbnail that is displayed in the result item."""
358
359 publishedDate: datetime.datetime | None = None
360 """The date on which the object was published."""
361
362 pubdate: str = ""
363 """String representation of :py:obj:`MainResult.publishedDate`"""
364
365 length: time.struct_time | None = None
366 """Playing duration in seconds."""
367
368 views: str = ""
369 """View count in humanized number format."""
370
371 author: str = ""
372 """Author of the title."""
373
374 metadata: str = ""
375 """Miscellaneous metadata."""
376
377 priority: typing.Literal["", "high", "low"] = ""
378 """The priority can be set via :ref:`hostnames plugin`, for example."""
379
380 engines: set[str] = set()
381 """In a merged results list, the names of the engines that found this result
382 are listed in this field."""
383
384 # open_group and close_group should not manged in the Result
385 # class (we should drop it from here!)
386 open_group: bool = False
387 close_group: bool = False
388 positions: list[int] = []
389 score: float = 0
390 category: str = ""
391
392 def __hash__(self) -> int:
393 """Ordinary url-results are equal if their values for
394 :py:obj:`Result.template`, :py:obj:`Result.parsed_url` (without scheme)
395 and :py:obj:`MainResult.img_src` are equal.
396 """
397 if not self.parsed_url:
398 raise ValueError(f"missing a value in field 'parsed_url': {self}")
399
400 url = self.parsed_url
401 return hash(
402 f"{self.template}"
403 + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
404 + f"|{self.img_src}"
405 )
406
411 if self.engine:
412 self.engines.add(self.engine)
413
414
415class LegacyResult(dict):
416 """A wrapper around a legacy result item. The SearXNG core uses this class
417 for untyped dictionaries / to be downward compatible.
418
419 This class is needed until we have implemented an :py:obj:`Result` class for
420 each result type and the old usages in the codebase have been fully
421 migrated.
422
423 There is only one place where this class is used, in the
424 :py:obj:`searx.results.ResultContainer`.
425
426 .. attention::
427
428 Do not use this class in your own implementations!
429 """
430
431 UNSET = object()
432
433 # emulate field types from type class Result
434 url: str | None
435 template: str
436 engine: str
437 parsed_url: urllib.parse.ParseResult | None
438
439 # emulate field types from type class MainResult
440 title: str
441 content: str
442 img_src: str
443 thumbnail: str
444 priority: typing.Literal["", "high", "low"]
445 engines: set[str]
446 positions: list[int]
447 score: float
448 category: str
449 publishedDate: datetime.datetime | None = None
450 pubdate: str = ""
451
452 # infobox result
453 urls: list[dict[str, str]]
454 attributes: list[dict[str, str]]
455
456 def as_dict(self):
457 return self
458
459 def __init__(self, *args, **kwargs):
460
461 super().__init__(*args, **kwargs)
462
463 # emulate field types from type class Result
464 self["url"] = self.get("url")
465 self["template"] = self.get("template", "default.html")
466 self["engine"] = self.get("engine", "")
467 self["parsed_url"] = self.get("parsed_url")
468
469 # emulate field types from type class MainResult
470 self["title"] = self.get("title", "")
471 self["content"] = self.get("content", "")
472 self["img_src"] = self.get("img_src", "")
473 self["thumbnail"] = self.get("thumbnail", "")
474 self["priority"] = self.get("priority", "")
475 self["engines"] = self.get("engines", set())
476 self["positions"] = self.get("positions", "")
477 self["score"] = self.get("score", 0)
478 self["category"] = self.get("category", "")
479
480 if "infobox" in self:
481 self["urls"] = self.get("urls", [])
482 self["attributes"] = self.get("attributes", [])
483
484 # Legacy types that have already been ported to a type ..
485
486 if "answer" in self:
487 warnings.warn(
488 f"engine {self.engine} is using deprecated `dict` for answers"
489 f" / use a class from searx.result_types.answer",
490 DeprecationWarning,
491 )
492 self.template = "answer/legacy.html"
493
494 if self.template == "keyvalue.html":
495 warnings.warn(
496 f"engine {self.engine} is using deprecated `dict` for key/value results"
497 f" / use a class from searx.result_types",
498 DeprecationWarning,
499 )
500
501 def __getattr__(self, name: str, default=UNSET) -> typing.Any:
502 if default == self.UNSET and name not in self:
503 raise AttributeError(f"LegacyResult object has no field named: {name}")
504 return self[name]
505
506 def __setattr__(self, name: str, val):
507 self[name] = val
508
509 def __hash__(self) -> int: # type: ignore
510
511 if "answer" in self:
512 # deprecated ..
513 return hash(self["answer"])
514
515 if self.template == "images.html":
516 # image results are equal if their values for template, the url and
517 # the img_src are equal.
518 return hash(f"{self.template}|{self.url}|{self.img_src}")
519
520 if not any(cls in self for cls in ["suggestion", "correction", "infobox", "number_of_results", "engine_data"]):
521 # Ordinary url-results are equal if their values for template,
522 # parsed_url (without schema) and img_src` are equal.
523
524 # Code copied from with MainResult.__hash__:
525 if not self.parsed_url:
526 raise ValueError(f"missing a value in field 'parsed_url': {self}")
527
528 url = self.parsed_url
529 return hash(
530 f"{self.template}"
531 + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
532 + f"|{self.img_src}"
533 )
534
535 return id(self)
536
537 def __eq__(self, other):
538
539 return hash(self) == hash(other)
540
541 def __repr__(self) -> str:
542
543 return f"LegacyResult: {super().__repr__()}"
544
549 if self.engine:
550 self.engines.add(self.engine)
551
552 def defaults_from(self, other: LegacyResult):
553 for k, v in other.items():
554 if not self.get(k):
555 self[k] = v
556
557 def filter_urls(self, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]):
558 """See :py:obj:`Result.filter_urls`"""
559 _filter_urls(self, filter_func=filter_func)
defaults_from(self, LegacyResult other)
Definition _base.py:552
typing.Any __getattr__(self, str name, default=UNSET)
Definition _base.py:501
__setattr__(self, str name, val)
Definition _base.py:506
__init__(self, *args, **kwargs)
Definition _base.py:459
filter_urls(self, Callable[[Result|LegacyResult, str, str], str|bool] filter_func)
Definition _base.py:557
__setitem__(self, field_name, value)
Definition _base.py:316
defaults_from(self, Result other)
Definition _base.py:333
__getitem__(self, field_name)
Definition _base.py:320
filter_urls(self, Callable[[Result|LegacyResult, str, str], str|bool] filter_func)
Definition _base.py:267
_normalize_url_fields(Result|LegacyResult result)
Definition _base.py:41
_normalize_text_fields(MainResult|LegacyResult result)
Definition _base.py:88
_normalize_date_fields(MainResult|LegacyResult result)
Definition _base.py:218
_filter_urls(Result|LegacyResult result, Callable[[Result|LegacyResult, str, str], str|bool] filter_func)
Definition _base.py:114