.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
_base.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2# pylint: disable=too-few-public-methods, missing-module-docstring
3"""Basic types for the typification of results.
4
5- :py:obj:`Result` base class
6- :py:obj:`LegacyResult` for internal use only
7
8----
9
10.. autoclass:: Result
11 :members:
12
13.. _LegacyResult:
14
15.. autoclass:: LegacyResult
16 :members:
17"""
18
19__all__ = ["Result"]
20
21import typing as t
22
23import re
24import urllib.parse
25import warnings
26import time
27import datetime
28
29from collections.abc import Callable
30
31import msgspec
32
33from searx import logger as log
34
# Matches one or more consecutive blanks, tabs or newlines.  The text
# normalization in this module substitutes every match with a single blank.
WHITESPACE_REGEX = re.compile('( |\t|\n)+', flags=re.MULTILINE | re.UNICODE)

# Module-level sentinel object (a unique instance that compares equal only to
# itself).
UNKNOWN = object()
37
38
def _normalize_url_fields(result: "Result | LegacyResult"):
    """Bring the URL related fields of *result* into a consistent state.

    - If ``url`` is set but ``parsed_url`` is not, ``parsed_url`` is
      initialized from ``url``.  A ``url`` that is not a string is logged and
      reset (``url = ""``, ``parsed_url = None``).
    - If ``parsed_url`` is set, a missing scheme is defaulted to ``http`` and
      ``url`` is regenerated from ``parsed_url``.
    - For a :py:obj:`LegacyResult` with a truthy ``infobox`` entry, the same
      scheme default is applied to the ``url`` of each item in ``urls`` and to
      the ``id`` of the infobox.
    """

    # As soon we need LegacyResult not any longer, we can move this function to
    # method Result.normalize_result_fields

    def _default_scheme(parsed: urllib.parse.ParseResult) -> urllib.parse.ParseResult:
        # if the URL has no scheme, use http as default
        return parsed._replace(scheme=parsed.scheme or "http", path=parsed.path)

    if result.url and not result.parsed_url:
        if isinstance(result.url, str):
            result.parsed_url = urllib.parse.urlparse(result.url)
        else:
            log.debug('result: invalid URL: %s', str(result))
            result.url = ""
            result.parsed_url = None

    if result.parsed_url:
        result.parsed_url = _default_scheme(result.parsed_url)
        result.url = result.parsed_url.geturl()

    # Everything below is infobox specific.  As soon we have InfoboxResult, we
    # can move it to method InfoboxResult.normalize_result_fields
    if not isinstance(result, LegacyResult) or not getattr(result, "infobox", None):
        return

    infobox_urls: list[dict[str, str]] = getattr(result, "urls", [])
    for entry in infobox_urls:
        raw_url = entry.get("url")
        if raw_url:
            entry["url"] = _default_scheme(urllib.parse.urlparse(raw_url)).geturl()

    infobox_id: str | None = getattr(result, "id", None)
    if infobox_id:
        result.id = _default_scheme(urllib.parse.urlparse(infobox_id)).geturl()
84
85
def _normalize_text_fields(result: "MainResult | LegacyResult"):
    """Normalize the text fields ``title`` and ``content`` of *result*.

    - A truthy value that is not a string is logged and converted to its
      string representation.
    - Runs of blanks, tabs and newlines are collapsed to a single blank and
      leading / trailing whitespace is stripped.
    - If ``content`` is equal to ``title`` after normalization, ``content``
      is emptied to avoid showing the same text twice.
    """

    # As soon we need LegacyResult not any longer, we can move this function to
    # method MainResult.normalize_result_fields

    # Actually, a type check should not be necessary if the engine is
    # implemented correctly.  Historically, however, we have always had a type
    # check here.

    if result.title and not isinstance(result.title, str):
        log.debug("result: invalid type of field 'title': %s", str(result))
        # convert the field value itself, not the whole result object
        result.title = str(result.title)
    if result.content and not isinstance(result.content, str):
        log.debug("result: invalid type of field 'content': %s", str(result))
        result.content = str(result.content)

    # normalize title and content
    if result.title:
        result.title = WHITESPACE_REGEX.sub(" ", result.title).strip()
    if result.content:
        result.content = WHITESPACE_REGEX.sub(" ", result.content).strip()
    if result.content == result.title:
        # avoid duplicate content between the content and title fields
        result.content = ""
110
111
113 result: "Result | LegacyResult", filter_func: "Callable[[Result | LegacyResult, str, str], str | bool]"
114):
115 # pylint: disable=too-many-branches, too-many-statements
116
117 # As soon we need LegacyResult not any longer, we can move this function to
118 # method Result.
119
120 url_fields = ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"]
121
122 url_src: str
123
124 for field_name in url_fields:
125 url_src = getattr(result, field_name, "")
126 if not url_src:
127 continue
128
129 new_url = filter_func(result, field_name, url_src)
130 # log.debug("filter_urls: filter_func(result, %s) '%s' -> '%s'", field_name, field_value, new_url)
131 if isinstance(new_url, bool):
132 if new_url:
133 # log.debug("filter_urls: unchanged field %s URL %s", field_name, field_value)
134 continue
135 log.debug("filter_urls: drop field %s URL %s", field_name, url_src)
136 new_url = None
137 else:
138 log.debug("filter_urls: modify field %s URL %s -> %s", field_name, url_src, new_url)
139
140 setattr(result, field_name, new_url)
141 if field_name == "url":
142 # sync parsed_url with new_url
143 if not new_url:
144 result.parsed_url = None
145 elif isinstance(new_url, str):
146 result.parsed_url = urllib.parse.urlparse(new_url)
147
148 # "urls": are from infobox
149 #
150 # As soon we have InfoboxResult, we can move this function to method
151 # InfoboxResult.normalize_result_fields
152
153 infobox_urls: list[dict[str, str]] = getattr(result, "urls", [])
154
155 if infobox_urls:
156 # log.debug("filter_urls: infobox_urls .. %s", infobox_urls)
157 new_infobox_urls: list[dict[str, str]] = []
158
159 for item in infobox_urls:
160 url_src = item.get("url", "")
161 if not url_src:
162 new_infobox_urls.append(item)
163 continue
164
165 new_url = filter_func(result, "infobox_urls", url_src)
166 if isinstance(new_url, bool):
167 if new_url:
168 new_infobox_urls.append(item)
169 # log.debug("filter_urls: leave URL in field 'urls' ('infobox_urls') unchanged -> %s", _url)
170 continue
171 log.debug("filter_urls: remove URL from field 'urls' ('infobox_urls') URL %s", url_src)
172 new_url = None
173 if new_url:
174 log.debug("filter_urls: modify URL from field 'urls' ('infobox_urls') URL %s -> %s", url_src, new_url)
175 item["url"] = new_url
176 new_infobox_urls.append(item)
177
178 setattr(result, "urls", new_infobox_urls)
179
180 # "attributes": are from infobox
181 #
182 # The infobox has additional subsections for attributes, urls and relatedTopics:
183
184 infobox_attributes: list[dict[str, t.Any]] = getattr(result, "attributes", [])
185
186 if infobox_attributes:
187 # log.debug("filter_urls: infobox_attributes .. %s", infobox_attributes)
188 new_infobox_attributes: list[dict[str, str | list[dict[str, str]]]] = []
189
190 for item in infobox_attributes:
191 image: dict[str, str] = item.get("image", {})
192 url_src = image.get("src", "")
193 if not url_src:
194 new_infobox_attributes.append(item)
195 continue
196
197 new_url = filter_func(result, "infobox_attributes", url_src)
198 if isinstance(new_url, bool):
199 if new_url:
200 new_infobox_attributes.append(item)
201 # log.debug("filter_urls: leave URL in field 'image.src' unchanged -> %s", url_src)
202 continue
203 log.debug("filter_urls: drop field 'image.src' ('infobox_attributes') URL %s", url_src)
204 new_url = None
205
206 if new_url:
207 log.debug(
208 "filter_urls: modify 'image.src' ('infobox_attributes') URL %s -> %s",
209 url_src,
210 new_url,
211 )
212 item["image"]["src"] = new_url
213 new_infobox_attributes.append(item)
214
215 setattr(result, "attributes", new_infobox_attributes)
216
217 result.normalize_result_fields()
218
219
220def _normalize_date_fields(result: "MainResult | LegacyResult"):
221
222 if result.publishedDate: # do not try to get a date from an empty string or a None type
223 try: # test if publishedDate >= 1900 (datetime module bug)
224 result.pubdate = result.publishedDate.strftime('%Y-%m-%d %H:%M:%S%z')
225 except ValueError:
226 result.publishedDate = None
227
228
class Result(msgspec.Struct, kw_only=True):
    """Base class of all result types :ref:`result types`."""

    url: str | None = None
    """A link related to this *result*"""

    template: str = "default.html"
    """Name of the template used to render the result.

    By default :origin:`result_templates/default.html
    <searx/templates/simple/result_templates/default.html>` is used.
    """

    engine: str | None = ""
    """Name of the engine *this* result comes from.  In case of *plugins* a
    prefix ``plugin:`` is set, in case of *answerer* prefix ``answerer:`` is
    set.

    The field is optional and is initialized from the context if necessary.
    """

    parsed_url: urllib.parse.ParseResult | None = None
    """:py:obj:`urllib.parse.ParseResult` of :py:obj:`Result.url`.

    The field is optional and is initialized from the context if necessary.
    """

    def normalize_result_fields(self):
        """Normalize fields ``url`` and ``parsed_url``.

        - If field ``url`` is set and field ``parsed_url`` is unset, init
          ``parsed_url`` from field ``url``.  The ``url`` field is initialized
          with the resulting value in ``parsed_url``, if ``url`` and
          ``parsed_url`` are not equal.
        """
        _normalize_url_fields(self)

    def __post_init__(self):
        pass

    def filter_urls(self, filter_func: "Callable[[Result | LegacyResult, str, str], str | bool]"):
        """A filter function is passed in the ``filter_func`` argument to
        filter and/or modify the URLs.

        The filter function receives the :py:obj:`result object <Result>` as
        the first argument and the field name (``str``) in the second argument.
        In the third argument the URL string value is passed to the filter function.

        The filter function is applied to all fields that contain a URL,
        in addition to the familiar ``url`` field, these include fields such as::

             ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"]

        and the ``urls`` list of items of the infobox.

        For each field, the filter function is called and returns a bool or a
        string value:

        - ``True``: leave URL in field unchanged
        - ``False``: remove URL field from result (or remove entire result)
        - ``str``: modified URL to be used instead

        See :ref:`filter urls example`.

        """
        _filter_urls(self, filter_func=filter_func)

    def __hash__(self) -> int:
        """Generates a hash value that uniquely identifies the content of *this*
        result.  The method can be adapted in the inheritance to compare results
        from different sources.

        If two result objects are not identical but have the same content, their
        hash values should also be identical.

        The hash value is used in contexts, e.g. when checking for equality to
        identify identical results from different sources (engines).

        The base implementation returns ``id(self)``, i.e. every object only
        equals itself.
        """
        return id(self)

    def __eq__(self, other: object):
        """:py:obj:`Result` objects are equal if the hash values of the two
        objects are equal.  If needed, it is recommended to override
        :py:obj:`Result.__hash__`."""

        return hash(self) == hash(other)

    # for legacy code where a result is treated as a Python dict

    def __setitem__(self, field_name: str, value: t.Any):
        """Dict-style write access: ``result[field_name] = value``."""
        return setattr(self, field_name, value)

    def __getitem__(self, field_name: str) -> t.Any:
        """Dict-style read access; raises :py:obj:`KeyError` for names that are
        not fields of the struct."""
        if field_name not in self.__struct_fields__:
            raise KeyError(f"{field_name}")
        return getattr(self, field_name)

    def __iter__(self):
        """Iterate over the field names (like iterating over the keys of a
        dict)."""
        return iter(self.__struct_fields__)

    def as_dict(self):
        """Return a new ``dict`` that maps each field name to its value."""
        return {f: getattr(self, f) for f in self.__struct_fields__}

    def defaults_from(self, other: "Result"):
        """Fields not set in *self* will be updated from the field values of the
        *other*.

        A field counts as *not set* when its value is falsy.  Fields that
        *other* does not have are skipped.
        """
        for field_name in self.__struct_fields__:
            if getattr(self, field_name, False):
                # already set in *self*: keep the value
                continue
            if hasattr(other, field_name):
                setattr(self, field_name, getattr(other, field_name))
344
345
class MainResult(Result):  # pylint: disable=missing-class-docstring
    """Base class of all result types displayed in :ref:`area main results`."""

    title: str = ""
    """Link title of the result item."""

    content: str = ""
    """Extract or description of the result item"""

    img_src: str = ""
    """URL of a image that is displayed in the result item."""

    thumbnail: str = ""
    """URL of a thumbnail that is displayed in the result item."""

    publishedDate: datetime.datetime | None = None
    """The date on which the object was published."""

    pubdate: str = ""
    """String representation of :py:obj:`MainResult.publishedDate`

    Deprecated: it is still partially used in the templates, but will one day be
    completely eliminated.
    """

    length: time.struct_time | None = None
    """Playing duration in seconds."""

    views: str = ""
    """View count in humanized number format."""

    author: str = ""
    """Author of the title."""

    metadata: str = ""
    """Miscellaneous metadata."""

    PriorityType = t.Literal["", "high", "low"]  # pyright: ignore[reportUnannotatedClassAttribute]
    priority: "MainResult.PriorityType" = ""
    """The priority can be set via :ref:`hostnames plugin`, for example."""

    engines: set[str] = set()
    """In a merged results list, the names of the engines that found this result
    are listed in this field."""

    # open_group and close_group should not be managed in the Result
    # class (we should drop it from here!)
    open_group: bool = False
    close_group: bool = False
    positions: list[int] = []
    score: float = 0
    category: str = ""

    def __hash__(self) -> int:
        """Ordinary url-results are equal if their values for
        :py:obj:`Result.template`, :py:obj:`Result.parsed_url` (without scheme)
        and :py:obj:`MainResult.img_src` are equal.

        Raises :py:obj:`ValueError` if field ``parsed_url`` is not set.
        """
        if not self.parsed_url:
            raise ValueError(f"missing a value in field 'parsed_url': {self}")

        url = self.parsed_url
        return hash(
            f"{self.template}"
            + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
            + f"|{self.img_src}"
        )

    def normalize_result_fields(self):
        """Normalize the URL fields (see :py:obj:`Result`), the text fields
        ``title`` / ``content`` and the date fields, then register
        :py:obj:`Result.engine` in the ``engines`` set.
        """
        super().normalize_result_fields()

        _normalize_text_fields(self)
        _normalize_date_fields(self)
        if self.engine:
            self.engines.add(self.engine)
420
421
422class LegacyResult(dict[str, t.Any]):
423 """A wrapper around a legacy result item. The SearXNG core uses this class
424 for untyped dictionaries / to be downward compatible.
425
426 This class is needed until we have implemented an :py:obj:`Result` class for
427 each result type and the old usages in the codebase have been fully
428 migrated.
429
430 There is only one place where this class is used, in the
431 :py:obj:`searx.results.ResultContainer`.
432
433 .. attention::
434
435 Do not use this class in your own implementations!
436 """
437
438 UNSET: object = object()
439
440 # emulate field types from type class Result
441 url: str | None
442 template: str
443 engine: str
444 parsed_url: urllib.parse.ParseResult | None
445
446 # emulate field types from type class MainResult
447 title: str
448 content: str
449 img_src: str
450 thumbnail: str
451 priority: t.Literal["", "high", "low"]
452 engines: set[str]
453 positions: list[int]
454 score: float
455 category: str
456 publishedDate: datetime.datetime | None
457 pubdate: str = ""
458
459 # infobox result
460 urls: list[dict[str, str]]
461 attributes: list[dict[str, str]]
462
463 def as_dict(self):
464 return self
465
466 def __init__(self, *args: t.Any, **kwargs: t.Any):
467
468 super().__init__(*args, **kwargs)
469
470 # emulate field types from type class Result
471 self["url"] = self.get("url")
472 self["template"] = self.get("template", "default.html")
473 self["engine"] = self.get("engine", "")
474 self["parsed_url"] = self.get("parsed_url")
475
476 # emulate field types from type class MainResult
477 self["title"] = self.get("title", "")
478 self["content"] = self.get("content", "")
479 self["img_src"] = self.get("img_src", "")
480 self["thumbnail"] = self.get("thumbnail", "")
481 self["priority"] = self.get("priority", "")
482 self["engines"] = self.get("engines", set())
483 self["positions"] = self.get("positions", "")
484 self["score"] = self.get("score", 0)
485 self["category"] = self.get("category", "")
486 self["publishedDate"] = self.get("publishedDate")
487
488 if "infobox" in self:
489 self["urls"] = self.get("urls", [])
490 self["attributes"] = self.get("attributes", [])
491
492 # Legacy types that have already been ported to a type ..
493
494 if "answer" in self:
495 warnings.warn(
496 f"engine {self.engine} is using deprecated `dict` for answers"
497 f" / use a class from searx.result_types.answer",
498 DeprecationWarning,
499 )
500 self.template = "answer/legacy.html"
501
502 if self.template == "keyvalue.html":
503 warnings.warn(
504 f"engine {self.engine} is using deprecated `dict` for key/value results"
505 f" / use a class from searx.result_types",
506 DeprecationWarning,
507 )
508
509 def __getattr__(self, name: str, default: t.Any = UNSET) -> t.Any:
510 if default == self.UNSET and name not in self:
511 raise AttributeError(f"LegacyResult object has no field named: {name}")
512 return self[name]
513
514 def __setattr__(self, name: str, val: t.Any):
515 self[name] = val
516
517 def __hash__(self) -> int: # pyright: ignore[reportIncompatibleVariableOverride]
518
519 if "answer" in self:
520 # deprecated ..
521 return hash(self["answer"])
522
523 if self.template == "images.html":
524 # image results are equal if their values for template, the url and
525 # the img_src are equal.
526 return hash(f"{self.template}|{self.url}|{self.img_src}")
527
528 if not any(cls in self for cls in ["suggestion", "correction", "infobox", "number_of_results", "engine_data"]):
529 # Ordinary url-results are equal if their values for template,
530 # parsed_url (without schema) and img_src` are equal.
531
532 # Code copied from with MainResult.__hash__:
533 if not self.parsed_url:
534 raise ValueError(f"missing a value in field 'parsed_url': {self}")
535
536 url = self.parsed_url
537 return hash(
538 f"{self.template}"
539 + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
540 + f"|{self.img_src}"
541 )
542
543 return id(self)
544
545 def __eq__(self, other: object):
546
547 return hash(self) == hash(other)
548
549 def __repr__(self) -> str:
550
551 return f"LegacyResult: {super().__repr__()}"
552
557 if self.engine:
558 self.engines.add(self.engine)
559
560 def defaults_from(self, other: "LegacyResult"):
561 for k, v in other.items():
562 if not self.get(k):
563 self[k] = v
564
565 def filter_urls(self, filter_func: "Callable[[Result | LegacyResult, str, str], str | bool]"):
566 """See :py:obj:`Result.filter_urls`"""
567 _filter_urls(self, filter_func=filter_func)
defaults_from(self, "LegacyResult" other)
Definition _base.py:560
__init__(self, *t.Any args, **t.Any kwargs)
Definition _base.py:466
filter_urls(self, "Callable[[Result | LegacyResult, str, str], str | bool]" filter_func)
Definition _base.py:565
t.Any __getattr__(self, str name, t.Any default=UNSET)
Definition _base.py:509
__setattr__(self, str name, t.Any val)
Definition _base.py:514
t.Any __getitem__(self, str field_name)
Definition _base.py:322
filter_urls(self, "Callable[[Result | LegacyResult, str, str], str | bool]" filter_func)
Definition _base.py:269
__setitem__(self, str field_name, t.Any value)
Definition _base.py:318
defaults_from(self, "Result" other)
Definition _base.py:335
__eq__(self, object other)
Definition _base.py:309
_normalize_url_fields("Result | LegacyResult" result)
Definition _base.py:39
_normalize_text_fields("MainResult | LegacyResult" result)
Definition _base.py:86
_filter_urls("Result | LegacyResult" result, "Callable[[Result | LegacyResult, str, str], str | bool]" filter_func)
Definition _base.py:114
_normalize_date_fields("MainResult | LegacyResult" result)
Definition _base.py:220