.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
_base.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2# pylint: disable=too-few-public-methods, missing-module-docstring
3"""Basic types for the typification of results.
4
5- :py:obj:`Result` base class
6- :py:obj:`LegacyResult` for internal use only
7
8----
9
10.. autoclass:: Result
11 :members:
12
13.. _LegacyResult:
14
15.. autoclass:: LegacyResult
16 :members:
17"""
18
19
20from __future__ import annotations
21
22__all__ = ["Result"]
23
24import re
25import urllib.parse
26import warnings
27import typing
28import time
29import datetime
30
31from collections.abc import Callable
32
33import msgspec
34
35from searx import logger as log
36
# Matches a run of one or more space, tab or newline characters.  It is used
# by ``_normalize_text_fields`` to collapse such runs into a single space.
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
# Module level sentinel object (not referenced in the visible part of this
# module).
UNKNOWN = object()
39
40
def _normalize_url_fields(result: Result | LegacyResult):
    """Bring the URL related fields of *result* into a canonical form (in place).

    - If ``url`` is set but ``parsed_url`` is not, ``parsed_url`` is derived
      from ``url``.  A ``url`` that is not a ``str`` is logged and reset
      (``url = ""``, ``parsed_url = None``).
    - A missing scheme defaults to ``http`` and trailing slashes are removed
      from the path; ``url`` is then rebuilt from ``parsed_url``.
    - For a :py:obj:`LegacyResult` with a truthy ``infobox`` field, the same
      canonical form is applied to every ``url`` in the ``urls`` list and to
      the ``id`` field.
    """

    # As soon we need LegacyResult not any longer, we can move this function to
    # method Result.normalize_result_fields

    def canonical(parts):
        # no scheme -> http / ``example.com/path/`` -> ``example.com/path``
        return parts._replace(
            scheme=parts.scheme or "http",
            path=parts.path.rstrip("/"),
        )

    if result.url and not result.parsed_url:
        if isinstance(result.url, str):
            result.parsed_url = urllib.parse.urlparse(result.url)
        else:
            log.debug('result: invalid URL: %s', str(result))
            result.url = ""
            result.parsed_url = None

    if result.parsed_url:
        result.parsed_url = canonical(result.parsed_url)
        result.url = result.parsed_url.geturl()

    if not (isinstance(result, LegacyResult) and getattr(result, "infobox", None)):
        return

    # As soon we have InfoboxResult, we can move the remaining part to method
    # InfoboxResult.normalize_result_fields

    links: list[dict[str, str]] = getattr(result, "urls", [])
    for entry in links:
        link = entry.get("url")
        if link:
            entry["url"] = canonical(urllib.parse.urlparse(link)).geturl()

    ident = getattr(result, "id", None)
    if ident:
        result.id = canonical(urllib.parse.urlparse(ident)).geturl()
87
88
def _normalize_text_fields(result: MainResult | LegacyResult):
    """Normalize the ``title`` and ``content`` fields of *result* in place.

    - A non-empty value that is not a ``str`` is logged and replaced by the
      string representation of *that field*.
    - Runs of spaces, tabs and newlines are collapsed into a single space and
      leading / trailing whitespace is stripped.
    - If ``content`` ends up identical to ``title`` it is set to an empty
      string.
    """

    # As soon we need LegacyResult not any longer, we can move this function to
    # method MainResult.normalize_result_fields

    # Actually, a type check should not be necessary if the engine is
    # implemented correctly. Historically, however, we have always had a type
    # check here.

    if result.title and not isinstance(result.title, str):
        log.debug("result: invalid type of field 'title': %s", str(result))
        # convert the field itself, not the whole result object
        result.title = str(result.title)
    if result.content and not isinstance(result.content, str):
        log.debug("result: invalid type of field 'content': %s", str(result))
        result.content = str(result.content)

    # normalize title and content
    if result.title:
        result.title = WHITESPACE_REGEX.sub(" ", result.title).strip()
    if result.content:
        result.content = WHITESPACE_REGEX.sub(" ", result.content).strip()
    if result.content == result.title:
        # avoid duplicate content between the content and title fields
        result.content = ""
113
114
def _filter_urls(result: Result | LegacyResult, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]):
    """Apply *filter_func* to every URL stored in *result* (in place).

    *filter_func* is called as ``filter_func(result, field_name, url)`` and its
    return value decides what happens to the URL:

    - ``True``: the URL is left unchanged
    - ``False``: the URL is dropped (plain fields are set to ``None``, items
      of the infobox lists are removed)
    - any other value: it replaces the URL (for items of the infobox lists a
      falsy replacement removes the item)

    Plain URL fields are passed with their own name, URLs of the infobox
    ``urls`` list with the name ``infobox_urls`` and the ``image.src`` values
    of the infobox ``attributes`` list with the name ``infobox_attributes``.
    Finally ``result.normalize_result_fields()`` is called.
    """
    # pylint: disable=too-many-branches, too-many-statements

    # As soon we need LegacyResult not any longer, we can move this function to
    # method Result.

    for name in ("url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"):
        current = getattr(result, name, "")
        if not current:
            continue

        verdict = filter_func(result, name, current)
        if verdict is True:
            # leave the field untouched
            continue

        if verdict is False:
            log.debug("filter_urls: drop field %s URL %s", name, current)
            replacement = None
        else:
            log.debug("filter_urls: modify field %s URL %s -> %s", name, current, verdict)
            replacement = verdict

        setattr(result, name, replacement)
        if name != "url":
            continue

        # sync parsed_url with the new value of field url
        if not replacement:
            result.parsed_url = None
        elif isinstance(replacement, str):
            result.parsed_url = urllib.parse.urlparse(replacement)

    # "urls": are from infobox
    #
    # As soon we have InfoboxResult, we can move this function to method
    # InfoboxResult.normalize_result_fields

    links: list[dict[str, str]] = getattr(result, "urls", [])

    if links:
        kept_links: list[dict[str, str]] = []

        for entry in links:
            current = entry.get("url")
            # items without a URL are kept without consulting the filter
            verdict = filter_func(result, "infobox_urls", current) if current else True

            if verdict is True:
                kept_links.append(entry)
            elif verdict is False:
                log.debug("filter_urls: remove URL from field 'urls' ('infobox_urls') URL %s", current)
            elif verdict:
                log.debug("filter_urls: modify URL from field 'urls' ('infobox_urls') URL %s -> %s", current, verdict)
                entry["url"] = verdict
                kept_links.append(entry)

        result.urls = kept_links

    # "attributes": are from infobox
    #
    # The infobox has additional subsections for attributes, urls and relatedTopics:

    attributes: list[dict[str, dict]] = getattr(result, "attributes", [])

    if attributes:
        kept_attributes: list[dict[str, dict]] = []

        for entry in attributes:
            current = entry.get("image", {}).get("src", "")
            # items without an image URL are kept without consulting the filter
            verdict = filter_func(result, "infobox_attributes", current) if current else True

            if verdict is True:
                kept_attributes.append(entry)
            elif verdict is False:
                log.debug("filter_urls: drop field 'image.src' ('infobox_attributes') URL %s", current)
            elif verdict:
                log.debug(
                    "filter_urls: modify 'image.src' ('infobox_attributes') URL %s -> %s",
                    current,
                    verdict,
                )
                entry["image"]["src"] = verdict
                kept_attributes.append(entry)

        result.attributes = kept_attributes

    result.normalize_result_fields()
217
218
219def _normalize_date_fields(result: MainResult | LegacyResult):
220
221 if result.publishedDate: # do not try to get a date from an empty string or a None type
222 try: # test if publishedDate >= 1900 (datetime module bug)
223 result.pubdate = result.publishedDate.strftime('%Y-%m-%d %H:%M:%S%z')
224 except ValueError:
225 result.publishedDate = None
226
227
228class Result(msgspec.Struct, kw_only=True):
229 """Base class of all result types :ref:`result types`."""
230
231 url: str | None = None
232 """A link related to this *result*"""
233
234 template: str = "default.html"
235 """Name of the template used to render the result.
236
237 By default :origin:`result_templates/default.html
238 <searx/templates/simple/result_templates/default.html>` is used.
239 """
240
241 engine: str | None = ""
242 """Name of the engine *this* result comes from. In case of *plugins* a
243 prefix ``plugin:`` is set, in case of *answerer* prefix ``answerer:`` is
244 set.
245
246 The field is optional and is initialized from the context if necessary.
247 """
248
249 parsed_url: urllib.parse.ParseResult | None = None
250 """:py:obj:`urllib.parse.ParseResult` of :py:obj:`Result.url`.
251
252 The field is optional and is initialized from the context if necessary.
253 """
254
256 """Normalize fields ``url`` and ``parsed_url``.
257
258 - If field ``url`` is set and field ``parsed_url`` is unset, init
259 ``parsed_url`` from field ``url``. The ``url`` field is initialized
260 with the resulting value in ``parsed_url``, if ``url`` and
261 ``parsed_url`` are not equal.
262
263 - ``example.com/path/`` and ``example.com/path`` are equivalent and are
264 normalized to ``example.com/path``.
265 """
267
def __post_init__(self):
    """No-op in the base class.

    NOTE(review): presumably the ``msgspec.Struct`` post-init hook, kept here
    so subclasses have a common method to extend -- confirm.
    """
    pass
270
def filter_urls(self, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]):
    """Filter and/or modify the URLs of *this* result by ``filter_func``.

    The filter function is called with three arguments: the
    :py:obj:`result object <Result>`, the name of the field (``str``) and
    the URL (``str``) stored in that field.

    It is applied to all fields that contain a URL.  In addition to the
    familiar ``url`` field, these are::

        ["url", "iframe_src", "audio_src", "img_src", "thumbnail_src", "thumbnail"]

    and the ``urls`` list of items of the infobox.

    The value returned by the filter function determines what happens to
    the URL:

    - ``True``: leave URL in field unchanged
    - ``False``: remove URL field from result (or remove entire result)
    - ``str``: modified URL to be used instead

    See :ref:`filter urls example`.

    """
    _filter_urls(self, filter_func)
297
def __hash__(self) -> int:
    """Generates a hash value that identifies *this* result.  The method can
    be adapted in the inheritance to compare results from different sources.

    If two result objects are not identical but have the same content, their
    hash values should also be identical.

    The hash value is used in contexts, e.g. when checking for equality to
    identify identical results from different sources (engines).

    The implementation in this base class returns ``id(self)``, i.e. it is
    purely identity based: two distinct objects never share a hash value.
    """
    return id(self)
310
def __eq__(self, other):
    """:py:obj:`Result` objects are equal if the hash values of the two
    objects are equal.  If needed, it is recommended to override
    :py:obj:`Result.__hash__`.

    ``hash(other)`` is evaluated for any *other*; comparing with an
    unhashable object therefore raises a ``TypeError``.
    """

    return hash(self) == hash(other)
317
318 # for legacy code where a result is treated as a Python dict
319
def __setitem__(self, field_name, value):
    """Dict-style assignment ``result[field_name] = value`` for legacy code;
    delegates to ``setattr``."""

    return setattr(self, field_name, value)
323
324 def __getitem__(self, field_name):
325
326 if field_name not in self.__struct_fields__:
327 raise KeyError(f"{field_name}")
328 return getattr(self, field_name)
329
def __iter__(self):
    """Iterate over the names in ``__struct_fields__`` (like iterating over
    the keys of a dict)."""

    return iter(self.__struct_fields__)
333
def as_dict(self):
    """Return a new ``dict`` that maps each name in ``__struct_fields__`` to
    the current value of the attribute of that name."""
    snapshot = {}
    for name in self.__struct_fields__:
        snapshot[name] = getattr(self, name)
    return snapshot
336
def defaults_from(self, other: Result):
    """Fields not set in *self* will be updated from the field values of the
    *other*.

    A field counts as *not set* if its value is falsy (``None``, ``""``,
    ``0``, empty container ..).  Fields that already have a value in *self*
    are never overwritten, and a field is only taken over if *other* has a
    truthy value for it.
    """
    for field_name in self.__struct_fields__:
        if getattr(self, field_name, False):
            # already set in self: keep it
            continue
        other_val = getattr(other, field_name, None)
        if other_val:
            setattr(self, field_name, other_val)
346
347
348class MainResult(Result): # pylint: disable=missing-class-docstring
349 """Base class of all result types displayed in :ref:`area main results`."""
350
351 title: str = ""
352 """Link title of the result item."""
353
354 content: str = ""
355 """Extract or description of the result item"""
356
357 img_src: str = ""
358 """URL of a image that is displayed in the result item."""
359
360 thumbnail: str = ""
361 """URL of a thumbnail that is displayed in the result item."""
362
363 publishedDate: datetime.datetime | None = None
364 """The date on which the object was published."""
365
366 pubdate: str = ""
367 """String representation of :py:obj:`MainResult.publishedDate`"""
368
369 length: time.struct_time | None = None
370 """Playing duration in seconds."""
371
372 views: str = ""
373 """View count in humanized number format."""
374
375 author: str = ""
376 """Author of the title."""
377
378 metadata: str = ""
379 """Miscellaneous metadata."""
380
381 priority: typing.Literal["", "high", "low"] = ""
382 """The priority can be set via :ref:`hostnames plugin`, for example."""
383
384 engines: set[str] = set()
385 """In a merged results list, the names of the engines that found this result
386 are listed in this field."""
387
388 # open_group and close_group should not manged in the Result
389 # class (we should drop it from here!)
390 open_group: bool = False
391 close_group: bool = False
392 positions: list[int] = []
393 score: float = 0
394 category: str = ""
395
396 def __hash__(self) -> int:
397 """Ordinary url-results are equal if their values for
398 :py:obj:`Result.template`, :py:obj:`Result.parsed_url` (without scheme)
399 and :py:obj:`MainResult.img_src` are equal.
400 """
401 if not self.parsed_url:
402 raise ValueError(f"missing a value in field 'parsed_url': {self}")
403
404 url = self.parsed_url
405 return hash(
406 f"{self.template}"
407 + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
408 + f"|{self.img_src}"
409 )
410
415 if self.engine:
416 self.engines.add(self.engine)
417
418
419class LegacyResult(dict):
420 """A wrapper around a legacy result item. The SearXNG core uses this class
421 for untyped dictionaries / to be downward compatible.
422
423 This class is needed until we have implemented an :py:obj:`Result` class for
424 each result type and the old usages in the codebase have been fully
425 migrated.
426
427 There is only one place where this class is used, in the
428 :py:obj:`searx.results.ResultContainer`.
429
430 .. attention::
431
432 Do not use this class in your own implementations!
433 """
434
435 UNSET = object()
436
437 # emulate field types from type class Result
438 url: str | None
439 template: str
440 engine: str
441 parsed_url: urllib.parse.ParseResult | None
442
443 # emulate field types from type class MainResult
444 title: str
445 content: str
446 img_src: str
447 thumbnail: str
448 priority: typing.Literal["", "high", "low"]
449 engines: set[str]
450 positions: list[int]
451 score: float
452 category: str
453 publishedDate: datetime.datetime | None = None
454 pubdate: str = ""
455
456 # infobox result
457 urls: list[dict[str, str]]
458 attributes: list[dict[str, str]]
459
def as_dict(self):
    """Return *this* object itself (it already is a ``dict``); no copy is
    made."""
    return self
462
463 def __init__(self, *args, **kwargs):
464
465 super().__init__(*args, **kwargs)
466
467 # emulate field types from type class Result
468 self["url"] = self.get("url")
469 self["template"] = self.get("template", "default.html")
470 self["engine"] = self.get("engine", "")
471 self["parsed_url"] = self.get("parsed_url")
472
473 # emulate field types from type class MainResult
474 self["title"] = self.get("title", "")
475 self["content"] = self.get("content", "")
476 self["img_src"] = self.get("img_src", "")
477 self["thumbnail"] = self.get("thumbnail", "")
478 self["priority"] = self.get("priority", "")
479 self["engines"] = self.get("engines", set())
480 self["positions"] = self.get("positions", "")
481 self["score"] = self.get("score", 0)
482 self["category"] = self.get("category", "")
483
484 if "infobox" in self:
485 self["urls"] = self.get("urls", [])
486 self["attributes"] = self.get("attributes", [])
487
488 # Legacy types that have already been ported to a type ..
489
490 if "answer" in self:
491 warnings.warn(
492 f"engine {self.engine} is using deprecated `dict` for answers"
493 f" / use a class from searx.result_types.answer",
494 DeprecationWarning,
495 )
496 self.template = "answer/legacy.html"
497
498 if self.template == "keyvalue.html":
499 warnings.warn(
500 f"engine {self.engine} is using deprecated `dict` for key/value results"
501 f" / use a class from searx.result_types",
502 DeprecationWarning,
503 )
504
505 def __getattr__(self, name: str, default=UNSET) -> typing.Any:
506 if default == self.UNSET and name not in self:
507 raise AttributeError(f"LegacyResult object has no field named: {name}")
508 return self[name]
509
def __setattr__(self, name: str, val):
    """Attribute-style assignment: ``obj.name = val`` stores *val* under the
    key *name* of the dict."""
    self[name] = val
512
513 def __hash__(self) -> int: # type: ignore
514
515 if "answer" in self:
516 # deprecated ..
517 return hash(self["answer"])
518
519 if self.template == "images.html":
520 # image results are equal if their values for template, the url and
521 # the img_src are equal.
522 return hash(f"{self.template}|{self.url}|{self.img_src}")
523
524 if not any(cls in self for cls in ["suggestion", "correction", "infobox", "number_of_results", "engine_data"]):
525 # Ordinary url-results are equal if their values for template,
526 # parsed_url (without schema) and img_src` are equal.
527
528 # Code copied from with MainResult.__hash__:
529 if not self.parsed_url:
530 raise ValueError(f"missing a value in field 'parsed_url': {self}")
531
532 url = self.parsed_url
533 return hash(
534 f"{self.template}"
535 + f"|{url.netloc}|{url.path}|{url.params}|{url.query}|{url.fragment}"
536 + f"|{self.img_src}"
537 )
538
539 return id(self)
540
def __eq__(self, other):
    """Two objects are equal if their hash values are equal; ``hash(other)``
    is evaluated for any *other*."""

    return hash(self) == hash(other)
544
def __repr__(self) -> str:
    """Representation of the parent class, prefixed by ``LegacyResult: ``."""

    return f"LegacyResult: {super().__repr__()}"
548
553 if self.engine:
554 self.engines.add(self.engine)
555
def defaults_from(self, other: LegacyResult):
    """Items missing or falsy in *self* are taken over from *other*; items
    that already have a truthy value in *self* are left untouched."""
    for key, fallback in other.items():
        if self.get(key):
            continue
        self[key] = fallback
560
def filter_urls(self, filter_func: Callable[[Result | LegacyResult, str, str], str | bool]):
    """See :py:obj:`Result.filter_urls`

    Delegates to ``_filter_urls`` with *this* object and ``filter_func``.
    """
    _filter_urls(self, filter_func=filter_func)
defaults_from(self, LegacyResult other)
Definition _base.py:556
typing.Any __getattr__(self, str name, default=UNSET)
Definition _base.py:505
__setattr__(self, str name, val)
Definition _base.py:510
__init__(self, *args, **kwargs)
Definition _base.py:463
filter_urls(self, Callable[[Result|LegacyResult, str, str], str|bool] filter_func)
Definition _base.py:561
__setitem__(self, field_name, value)
Definition _base.py:320
defaults_from(self, Result other)
Definition _base.py:337
__getitem__(self, field_name)
Definition _base.py:324
filter_urls(self, Callable[[Result|LegacyResult, str, str], str|bool] filter_func)
Definition _base.py:271
_normalize_url_fields(Result|LegacyResult result)
Definition _base.py:41
_normalize_text_fields(MainResult|LegacyResult result)
Definition _base.py:89
_normalize_date_fields(MainResult|LegacyResult result)
Definition _base.py:219
_filter_urls(Result|LegacyResult result, Callable[[Result|LegacyResult, str, str], str|bool] filter_func)
Definition _base.py:115