.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
_base.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2# pylint: disable=too-few-public-methods, missing-module-docstring
3"""Basic types for the typification of results.
4
5- :py:obj:`Result` base class
6- :py:obj:`LegacyResult` for internal use only
7
8----
9
10.. autoclass:: Result
11 :members:
12
13.. _LegacyResult:
14
15.. autoclass:: LegacyResult
16 :members:
17"""
18
19__all__ = ["Result"]
20
21import typing as t
22
23import re
24import urllib.parse
25import warnings
26import time
27import datetime
28
29from collections.abc import Callable
30
31import msgspec
32
33from searx import logger as log
34
# One or more consecutive spaces, tabs or newlines; used to collapse runs of
# whitespace in the text fields of a result into a single blank.
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.MULTILINE | re.UNICODE)

# Unique sentinel object (distinct from ``None``), compare it by identity.
UNKNOWN = object()
37
38
def _normalize_url_fields(result: "Result | LegacyResult"):
    """Normalize the URL related fields of *result* in place.

    - If ``url`` is set and ``parsed_url`` is not, ``parsed_url`` is initialized
      from ``url``.  A ``url`` that is not a string is logged, ``url`` is reset
      to an empty string and ``parsed_url`` to ``None``.
    - If ``parsed_url`` is set, a missing scheme defaults to ``http`` and
      ``url`` is rebuilt from ``parsed_url``.
    - For a :py:obj:`LegacyResult` with a truthy ``infobox`` item, the same
      scheme default is applied to the ``url`` of each item in ``urls`` and to
      the ``id`` of the infobox.
    """

    # As soon we need LegacyResult not any longer, we can move this function to
    # method Result.normalize_result_fields

    if result.url and not result.parsed_url:
        if isinstance(result.url, str):
            result.parsed_url = urllib.parse.urlparse(result.url)
        else:
            log.debug('result: invalid URL: %s', str(result))
            result.url = ""
            result.parsed_url = None

    parsed = result.parsed_url
    if parsed:
        # if the result has no scheme, use http as default
        parsed = parsed._replace(scheme=parsed.scheme or "http", path=parsed.path)
        result.parsed_url = parsed
        result.url = parsed.geturl()

    if not (isinstance(result, LegacyResult) and getattr(result, "infobox", None)):
        return

    # As soon we have InfoboxResult, we can move the code below to method
    # InfoboxResult.normalize_result_fields

    for entry in getattr(result, "urls", []):
        entry_url = entry.get("url")
        if entry_url:
            split = urllib.parse.urlparse(entry_url)
            entry["url"] = split._replace(scheme=split.scheme or "http", path=split.path).geturl()

    infobox_id = getattr(result, "id", None)
    if infobox_id:
        split = urllib.parse.urlparse(infobox_id)
        result.id = split._replace(scheme=split.scheme or "http", path=split.path).geturl()
84
85
def _normalize_text_fields(result: "MainResult | LegacyResult"):
    """Normalize the fields ``title`` and ``content`` of *result* in place.

    - A truthy value that is not a string is logged and converted to a string.
    - Runs of spaces, tabs and newlines are collapsed into a single blank and
      leading / trailing whitespace is stripped.
    - If ``content`` is equal to ``title``, ``content`` is cleared.
    """

    # As soon we need LegacyResult not any longer, we can move this function to
    # method MainResult.normalize_result_fields

    # Actually, a type check should not be necessary if the engine is
    # implemented correctly.  Historically, however, we have always had a type
    # check here.

    if result.title and not isinstance(result.title, str):
        log.debug("result: invalid type of field 'title': %s", str(result))
        # convert the value of the field, not the entire result object
        result.title = str(result.title)
    if result.content and not isinstance(result.content, str):
        log.debug("result: invalid type of field 'content': %s", str(result))
        result.content = str(result.content)

    # normalize title and content
    if result.title:
        result.title = WHITESPACE_REGEX.sub(" ", result.title).strip()
    if result.content:
        result.content = WHITESPACE_REGEX.sub(" ", result.content).strip()
        if result.content == result.title:
            # avoid duplicate content between the content and title fields
            result.content = ""
110
111
def _filter_urls(
    result: "Result | LegacyResult", filter_func: "Callable[[Result | LegacyResult, str, str], str | bool]"
):
    """Apply *filter_func* to every URL of *result* and update *result* in place.

    The filter function is called with the result object, a field name and the
    URL string.  Its return value decides what happens to the URL:

    - ``True``: the URL is left unchanged
    - ``False``: the URL is dropped
    - ``str``: the URL is replaced by the returned string

    The filter is applied to:

    - the fields ``url``, ``iframe_src``, ``audio_src``, ``img_src``,
      ``thumbnail_src`` and ``thumbnail``; a dropped field is set to ``None``
      and ``parsed_url`` is kept in sync with ``url``,
    - the ``url`` of the items in ``urls`` (field name ``infobox_urls``); an
      item whose URL is dropped is removed from the list,
    - the ``image.src`` of the items in ``attributes`` (field name
      ``infobox_attributes``); an item whose image URL is dropped is removed
      from the list.

    Finally ``result.normalize_result_fields()`` is called.
    """
    # pylint: disable=too-many-branches, too-many-statements

    # As soon we need LegacyResult not any longer, we can move this function to
    # method Result.

    url_fields = ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"]

    url_src: str

    for field_name in url_fields:
        url_src = getattr(result, field_name, "")
        if not url_src:
            continue

        new_url = filter_func(result, field_name, url_src)
        if isinstance(new_url, bool):
            if new_url:
                # True: leave the field unchanged
                continue
            log.debug("filter_urls: drop field %s URL %s", field_name, url_src)
            new_url = None
        else:
            log.debug("filter_urls: modify field %s URL %s -> %s", field_name, url_src, new_url)

        setattr(result, field_name, new_url)
        if field_name == "url":
            # sync parsed_url with new_url
            if not new_url:
                result.parsed_url = None
            elif isinstance(new_url, str):
                result.parsed_url = urllib.parse.urlparse(new_url)

    # "urls": are from infobox
    #
    # As soon we have InfoboxResult, we can move this function to method
    # InfoboxResult.normalize_result_fields

    infobox_urls: list[dict[str, str]] = getattr(result, "urls", [])

    if infobox_urls:
        new_infobox_urls: list[dict[str, str]] = []

        for item in infobox_urls:
            url_src = item.get("url", "")
            if not url_src:
                # nothing to filter, keep the item as it is
                new_infobox_urls.append(item)
                continue

            new_url = filter_func(result, "infobox_urls", url_src)
            if isinstance(new_url, bool):
                if new_url:
                    new_infobox_urls.append(item)
                    continue
                log.debug("filter_urls: remove URL from field 'urls' ('infobox_urls') URL %s", url_src)
                new_url = None
            if new_url:
                log.debug("filter_urls: modify URL from field 'urls' ('infobox_urls') URL %s -> %s", url_src, new_url)
                item["url"] = new_url
                new_infobox_urls.append(item)

        setattr(result, "urls", new_infobox_urls)

    # "attributes": are from infobox
    #
    # The infobox has additional subsections for attributes, urls and relatedTopics:

    infobox_attributes: list[dict[str, t.Any]] = getattr(result, "attributes", [])

    if infobox_attributes:
        new_infobox_attributes: list[dict[str, str | list[dict[str, str]]]] = []

        for item in infobox_attributes:
            image: dict[str, str] = item.get("image", {})
            url_src = image.get("src", "")
            if not url_src:
                # attribute without an image URL, keep the item as it is
                new_infobox_attributes.append(item)
                continue

            new_url = filter_func(result, "infobox_attributes", url_src)
            if isinstance(new_url, bool):
                if new_url:
                    new_infobox_attributes.append(item)
                    continue
                log.debug("filter_urls: drop field 'image.src' ('infobox_attributes') URL %s", url_src)
                new_url = None

            if new_url:
                log.debug(
                    "filter_urls: modify 'image.src' ('infobox_attributes') URL %s -> %s",
                    url_src,
                    new_url,
                )
                item["image"]["src"] = new_url
                new_infobox_attributes.append(item)

        setattr(result, "attributes", new_infobox_attributes)

    result.normalize_result_fields()
218
219
def _normalize_date_fields(result: "MainResult | LegacyResult"):
    """Set ``pubdate`` to the string representation of ``publishedDate``.

    Nothing is done when ``publishedDate`` is unset.  If the date can not be
    formatted (``ValueError``), ``publishedDate`` is reset to ``None``.
    """

    published = result.publishedDate
    if not published:
        # do not try to get a date from an empty string or a None type
        return

    try:
        # test if publishedDate >= 1900 (datetime module bug)
        formatted = published.strftime('%Y-%m-%d %H:%M:%S%z')
    except ValueError:
        result.publishedDate = None
    else:
        result.pubdate = formatted
227
228
class Result(msgspec.Struct, kw_only=True):
    """Base class of all result types :ref:`result types`."""

    url: str | None = None
    """A link related to this *result*"""

    template: str = "default.html"
    """Name of the template used to render the result.

    By default :origin:`result_templates/default.html
    <searx/templates/simple/result_templates/default.html>` is used.
    """

    engine: str | None = ""
    """Name of the engine *this* result comes from.  In case of *plugins* a
    prefix ``plugin:`` is set, in case of *answerer* prefix ``answerer:`` is
    set.

    The field is optional and is initialized from the context if necessary.
    """

    parsed_url: urllib.parse.ParseResult | None = None
    """:py:obj:`urllib.parse.ParseResult` of :py:obj:`Result.url`.

    The field is optional and is initialized from the context if necessary.
    """

    def normalize_result_fields(self):
        """Normalize fields ``url`` and ``parsed_url``.

        - If field ``url`` is set and field ``parsed_url`` is unset, init
          ``parsed_url`` from field ``url``.  The ``url`` field is initialized
          with the resulting value in ``parsed_url``, if ``url`` and
          ``parsed_url`` are not equal.
        """
        # NOTE(review): the header and the body line of this method are elided
        # in the reviewed listing; the call below is reconstructed from the
        # docstring and from the comment in _normalize_url_fields -- confirm
        # against the original file.
        _normalize_url_fields(self)

    def __post_init__(self):
        pass

    def filter_urls(self, filter_func: "Callable[[Result | LegacyResult, str, str], str | bool]"):
        """A filter function is passed in the ``filter_func`` argument to
        filter and/or modify the URLs.

        The filter function receives the :py:obj:`result object <Result>` as
        the first argument and the field name (``str``) in the second argument.
        In the third argument the URL string value is passed to the filter function.

        The filter function is applied to all fields that contain a URL,
        in addition to the familiar ``url`` field, these include fields such as::

             ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"]

        and the ``urls`` list of items of the infobox.

        For each field, the filter function is called and returns a bool or a
        string value:

        - ``True``: leave URL in field unchanged
        - ``False``: remove URL field from result (or remove entire result)
        - ``str``: modified URL to be used instead

        See :ref:`filter urls example`.

        """
        _filter_urls(self, filter_func=filter_func)

    def __hash__(self) -> int:
        """Generates a hash value that uniquely identifies the content of *this*
        result.  The method can be adapted in the inheritance to compare results
        from different sources.

        If two result objects are not identical but have the same content, their
        hash values should also be identical.

        The hash value is used in contexts, e.g. when checking for equality to
        identify identical results from different sources (engines).
        """
        return id(self)

    def __eq__(self, other: object):
        """:py:obj:`Result` objects are equal if the hash values of the two
        objects are equal.  If needed, it is recommended to overwrite
        :py:obj:`Result.__hash__`."""

        return hash(self) == hash(other)

    # for legacy code where a result is treated as a Python dict

    def __setitem__(self, field_name: str, value: t.Any):
        """Set the field ``field_name`` to ``value`` (dict like access)."""
        return setattr(self, field_name, value)

    def __getitem__(self, field_name: str) -> t.Any:
        """Return the value of the field ``field_name`` (dict like access).

        :raises KeyError: if ``field_name`` is not a field of the struct.
        """
        if field_name not in self.__struct_fields__:
            raise KeyError(f"{field_name}")
        return getattr(self, field_name)

    def __iter__(self):
        """Iterate over the names of the struct fields (like the keys of a dict)."""
        return iter(self.__struct_fields__)

    def as_dict(self):
        """Return a new dict that maps the field names to the field values."""
        return {f: getattr(self, f) for f in self.__struct_fields__}

    def defaults_from(self, other: "Result"):
        """Fields not set in *self* will be updated from the field values of the
        *other*.
        """
        for field_name in self.__struct_fields__:
            if getattr(self, field_name, False):
                # the field is already set in *self*: keep its value
                continue
            if hasattr(other, field_name):
                setattr(self, field_name, getattr(other, field_name))
344
345
class MainResult(Result):  # pylint: disable=missing-class-docstring
    """Base class of all result types displayed in :ref:`area main results`."""

    title: str = ""
    """Link title of the result item."""

    content: str = ""
    """Extract or description of the result item"""

    img_src: str = ""
    """URL of a image that is displayed in the result item."""

    thumbnail: str = ""
    """URL of a thumbnail that is displayed in the result item."""

    publishedDate: datetime.datetime | None = None
    """The date on which the object was published."""

    pubdate: str = ""
    """String representation of :py:obj:`MainResult.publishedDate`"""

    length: time.struct_time | None = None
    """Playing duration in seconds."""

    views: str = ""
    """View count in humanized number format."""

    author: str = ""
    """Author of the title."""

    metadata: str = ""
    """Miscellaneous metadata."""

    PriorityType = t.Literal["", "high", "low"]  # pyright: ignore[reportUnannotatedClassAttribute]
    priority: "MainResult.PriorityType" = ""
    """The priority can be set via :ref:`hostnames plugin`, for example."""

    engines: set[str] = set()
    """In a merged results list, the names of the engines that found this result
    are listed in this field."""

    # open_group and close_group should not be managed in the Result
    # class (we should drop it from here!)
    open_group: bool = False
    close_group: bool = False
    positions: list[int] = []
    score: float = 0
    category: str = ""

    def __hash__(self) -> int:
        """Ordinary url-results are equal if their values for
        :py:obj:`Result.template`, :py:obj:`Result.parsed_url` (without scheme)
        and :py:obj:`MainResult.img_src` are equal.

        :raises ValueError: if field ``parsed_url`` is not set.
        """
        if not self.parsed_url:
            raise ValueError(f"missing a value in field 'parsed_url': {self}")

        url = self.parsed_url
        return hash(
            f"{self.template}"
            + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
            + f"|{self.img_src}"
        )

    def normalize_result_fields(self):
        """Normalize the fields of the result and add the name of the
        ``engine`` (if set) to the ``engines`` of the result."""
        # NOTE(review): the header and the first statements of this method are
        # elided in the reviewed listing; the three calls below are
        # reconstructed from the helper functions of this module -- confirm
        # against the original file.
        super().normalize_result_fields()
        _normalize_text_fields(self)
        _normalize_date_fields(self)
        if self.engine:
            self.engines.add(self.engine)
416
417
418class LegacyResult(dict[str, t.Any]):
419 """A wrapper around a legacy result item. The SearXNG core uses this class
420 for untyped dictionaries / to be downward compatible.
421
422 This class is needed until we have implemented an :py:obj:`Result` class for
423 each result type and the old usages in the codebase have been fully
424 migrated.
425
426 There is only one place where this class is used, in the
427 :py:obj:`searx.results.ResultContainer`.
428
429 .. attention::
430
431 Do not use this class in your own implementations!
432 """
433
434 UNSET: object = object()
435
436 # emulate field types from type class Result
437 url: str | None
438 template: str
439 engine: str
440 parsed_url: urllib.parse.ParseResult | None
441
442 # emulate field types from type class MainResult
443 title: str
444 content: str
445 img_src: str
446 thumbnail: str
447 priority: t.Literal["", "high", "low"]
448 engines: set[str]
449 positions: list[int]
450 score: float
451 category: str
452 publishedDate: datetime.datetime | None
453 pubdate: str = ""
454
455 # infobox result
456 urls: list[dict[str, str]]
457 attributes: list[dict[str, str]]
458
459 def as_dict(self):
460 return self
461
462 def __init__(self, *args: t.Any, **kwargs: t.Any):
463
464 super().__init__(*args, **kwargs)
465
466 # emulate field types from type class Result
467 self["url"] = self.get("url")
468 self["template"] = self.get("template", "default.html")
469 self["engine"] = self.get("engine", "")
470 self["parsed_url"] = self.get("parsed_url")
471
472 # emulate field types from type class MainResult
473 self["title"] = self.get("title", "")
474 self["content"] = self.get("content", "")
475 self["img_src"] = self.get("img_src", "")
476 self["thumbnail"] = self.get("thumbnail", "")
477 self["priority"] = self.get("priority", "")
478 self["engines"] = self.get("engines", set())
479 self["positions"] = self.get("positions", "")
480 self["score"] = self.get("score", 0)
481 self["category"] = self.get("category", "")
482 self["publishedDate"] = self.get("publishedDate")
483
484 if "infobox" in self:
485 self["urls"] = self.get("urls", [])
486 self["attributes"] = self.get("attributes", [])
487
488 # Legacy types that have already been ported to a type ..
489
490 if "answer" in self:
491 warnings.warn(
492 f"engine {self.engine} is using deprecated `dict` for answers"
493 f" / use a class from searx.result_types.answer",
494 DeprecationWarning,
495 )
496 self.template = "answer/legacy.html"
497
498 if self.template == "keyvalue.html":
499 warnings.warn(
500 f"engine {self.engine} is using deprecated `dict` for key/value results"
501 f" / use a class from searx.result_types",
502 DeprecationWarning,
503 )
504
505 def __getattr__(self, name: str, default: t.Any = UNSET) -> t.Any:
506 if default == self.UNSET and name not in self:
507 raise AttributeError(f"LegacyResult object has no field named: {name}")
508 return self[name]
509
510 def __setattr__(self, name: str, val: t.Any):
511 self[name] = val
512
513 def __hash__(self) -> int: # pyright: ignore[reportIncompatibleVariableOverride]
514
515 if "answer" in self:
516 # deprecated ..
517 return hash(self["answer"])
518
519 if self.template == "images.html":
520 # image results are equal if their values for template, the url and
521 # the img_src are equal.
522 return hash(f"{self.template}|{self.url}|{self.img_src}")
523
524 if not any(cls in self for cls in ["suggestion", "correction", "infobox", "number_of_results", "engine_data"]):
525 # Ordinary url-results are equal if their values for template,
526 # parsed_url (without schema) and img_src` are equal.
527
528 # Code copied from with MainResult.__hash__:
529 if not self.parsed_url:
530 raise ValueError(f"missing a value in field 'parsed_url': {self}")
531
532 url = self.parsed_url
533 return hash(
534 f"{self.template}"
535 + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
536 + f"|{self.img_src}"
537 )
538
539 return id(self)
540
541 def __eq__(self, other: object):
542
543 return hash(self) == hash(other)
544
545 def __repr__(self) -> str:
546
547 return f"LegacyResult: {super().__repr__()}"
548
553 if self.engine:
554 self.engines.add(self.engine)
555
556 def defaults_from(self, other: "LegacyResult"):
557 for k, v in other.items():
558 if not self.get(k):
559 self[k] = v
560
561 def filter_urls(self, filter_func: "Callable[[Result | LegacyResult, str, str], str | bool]"):
562 """See :py:obj:`Result.filter_urls`"""
563 _filter_urls(self, filter_func=filter_func)
defaults_from(self, "LegacyResult" other)
Definition _base.py:556
__init__(self, *t.Any args, **t.Any kwargs)
Definition _base.py:462
filter_urls(self, "Callable[[Result | LegacyResult, str, str], str | bool]" filter_func)
Definition _base.py:561
t.Any __getattr__(self, str name, t.Any default=UNSET)
Definition _base.py:505
__setattr__(self, str name, t.Any val)
Definition _base.py:510
t.Any __getitem__(self, str field_name)
Definition _base.py:322
filter_urls(self, "Callable[[Result | LegacyResult, str, str], str | bool]" filter_func)
Definition _base.py:269
__setitem__(self, str field_name, t.Any value)
Definition _base.py:318
defaults_from(self, "Result" other)
Definition _base.py:335
__eq__(self, object other)
Definition _base.py:309
_normalize_url_fields("Result | LegacyResult" result)
Definition _base.py:39
_normalize_text_fields("MainResult | LegacyResult" result)
Definition _base.py:86
_filter_urls("Result | LegacyResult" result, "Callable[[Result | LegacyResult, str, str], str | bool]" filter_func)
Definition _base.py:114
_normalize_date_fields("MainResult | LegacyResult" result)
Definition _base.py:220