.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
cache.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Implementations for caching favicons.
3
4:py:obj:`FaviconCacheConfig`:
5 Configuration of the favicon cache
6
7:py:obj:`FaviconCache`:
8 Abstract base class for the implementation of a favicon cache.
9
10:py:obj:`FaviconCacheSQLite`:
11 Favicon cache that manages the favicon BLOBs in a SQLite DB.
12
13:py:obj:`FaviconCacheNull`:
14 Fallback solution if the configured cache cannot be used for system reasons.
15
16----
17
18"""
19
20import typing as t
21
22import os
23import abc
24import dataclasses
25import hashlib
26import logging
27import sqlite3
28import tempfile
29import time
30import typer
31
32import msgspec
33
34from searx import sqlitedb
35from searx import logger
36from searx.utils import humanize_bytes, humanize_number
37
# Module-wide cache instance.  Only declared here; the value is assigned by
# init().
CACHE: "FaviconCache"
# Marker used instead of real icon data / a sha256 value when no favicon data
# is available for an entry.
FALLBACK_ICON = b"FALLBACK_ICON"

# Child logger of the imported ``searx.logger`` (rebinds the module-level name).
logger = logger.getChild('favicons.cache')
# Typer application on which the CLI commands of this module are registered.
app = typer.Typer()
43
44
@app.command()
def state():
    """show state of the cache"""
    # NOTE: the docstring above doubles as the CLI help text, keep it short.
    stats = CACHE.state()
    print(stats.report())
49
50
@app.command()
def maintenance(force: bool = True, debug: bool = False):
    """perform maintenance of the cache"""
    # NOTE: the docstring above doubles as the CLI help text, keep it short.
    root_logger = logging.getLogger()
    if not debug:
        # Silence the root logger's handlers and print only the plain messages
        # of this module's logger.
        root_logger.handlers = []
        plain_handler = logging.StreamHandler()
        plain_handler.setFormatter(logging.Formatter("%(message)s"))
        logger.addHandler(plain_handler)
        logger.setLevel(logging.DEBUG)
    else:
        root_logger.setLevel(logging.DEBUG)

    # Snapshot the cache before and after the maintenance run; the difference
    # is what the run has removed.
    before = CACHE.state()
    CACHE.maintenance(force=force)
    after = CACHE.state()

    print("The cache has been reduced by:")
    summary = (before - after).report("\n- {descr}: {val}")
    print(summary.lstrip("\n"))
70
71
def init(cfg: "FaviconCacheConfig"):
    """Create the module-wide ``CACHE`` instance selected by ``cfg.db_type``.

    ``mem``:
      An in-memory cache is installed (an error is logged, since this type is
      not meant for production).

    ``sqlite``:
      The SQLite cache is installed, unless the SQLite library is too old; in
      that case a :py:obj:`FaviconCacheNull` is installed instead.

    :raises NotImplementedError: if ``cfg.db_type`` is neither ``sqlite`` nor
        ``mem``.
    """

    global CACHE  # pylint: disable=global-statement

    db_type = cfg.db_type

    if db_type == "mem":
        logger.error("Favicons are cached in memory, don't use this in production!")
        CACHE = FaviconCacheMEM(cfg)
        return

    if db_type != "sqlite":
        raise NotImplementedError(f"favicons db_type '{cfg.db_type}' is unknown")

    # sqlite_version_info is a three-item tuple, so (3, 35, 0) already compares
    # greater than (3, 35) and passes this check.
    if sqlite3.sqlite_version_info <= (3, 35):
        logger.critical(
            "Disable favicon caching completely: SQLite library (%s) is too old! (require >= 3.35)",
            sqlite3.sqlite_version,
        )
        CACHE = FaviconCacheNull(cfg)
        return

    CACHE = FaviconCacheSQLite(cfg)
90
91
@t.final
class FaviconCacheConfig(msgspec.Struct):  # pylint: disable=too-few-public-methods
    """Settings of the favicon cache."""

    db_type: t.Literal["sqlite", "mem"] = "sqlite"
    """Backend used to store the favicons:

    ``sqlite``:
      :py:obj:`.cache.FaviconCacheSQLite`

    ``mem``:
      :py:obj:`.cache.FaviconCacheMEM` (not recommended)
    """

    db_url: str = f"{tempfile.gettempdir()}{os.sep}faviconcache.db"
    """Location of the SQLite DB: the path to the database file."""

    HOLD_TIME: int = 30 * 24 * 60 * 60  # 30 days
    """Time (default, in sec.) a BLOB is held in the cache before it is
    removed."""

    LIMIT_TOTAL_BYTES: int = 50 * 1024 * 1024  # 50 MB
    """Upper bound (default, in bytes) for the sum of all BLOBs in the cache.
    Note: the bound is only enforced when maintenance runs, where the oldest
    BLOBs are deleted; between two maintenance runs the cache may exceed it.
    If the maintenance period is *too long* or maintenance is switched off
    completely, the cache grows uncontrollably."""

    BLOB_MAX_BYTES: int = 20 * 1024  # 20 KB
    """Largest size in bytes a favicon may have to be stored in the cache.  A
    bigger favicon is not cached and has to be requested by the client via the
    proxy."""

    MAINTENANCE_PERIOD: int = 60 * 60  # 1 hour
    """Interval in seconds between two maintenance runs / applies when
    :py:obj:`MAINTENANCE_MODE` is set to ``auto``."""

    MAINTENANCE_MODE: t.Literal["auto", "off"] = "auto"
    """How maintenance is triggered:

    ``auto``:
      Maintenance is carried out automatically as part of the maintenance
      intervals (:py:obj:`MAINTENANCE_PERIOD`); no external process is required.

    ``off``:
      Maintenance is switched off and must be carried out by an external process
      if required.
    """
139
140
@dataclasses.dataclass
class FaviconCacheStats:
    """Dataclass which provides information on the status of the cache.

    Each counter is ``None`` when the cache implementation does not provide a
    value for it.
    """

    favicons: int | None = None
    bytes: int | None = None
    domains: int | None = None
    resolvers: int | None = None

    # One entry per counter: (attribute name, human readable description,
    # callable that converts the value to a string for the report).
    field_descr: tuple[tuple[str, str, t.Callable[[int, int], str] | type], ...] = (
        ("favicons", "number of favicons in cache", humanize_number),
        ("bytes", "total size (approx. bytes) of cache", humanize_bytes),
        ("domains", "total number of domains in cache", humanize_number),
        ("resolvers", "number of resolvers", str),
    )

    def __sub__(self, other: "FaviconCacheStats") -> "FaviconCacheStats":
        """Return the per-counter difference ``self - other``.

        A counter that is ``None`` on either side is left unset in the result.

        :raises TypeError: if ``other`` is not an instance of this class.
        """
        if not isinstance(other, self.__class__):
            raise TypeError(f"unsupported operand type(s) for -: '{self.__class__}' and '{type(other)}'")
        kwargs = {}
        for field, _, _ in self.field_descr:
            self_val, other_val = getattr(self, field), getattr(other, field)
            if None in (self_val, other_val):
                continue
            if isinstance(self_val, int):
                kwargs[field] = self_val - other_val
            else:
                # non-numeric values can't be subtracted, keep the left one
                kwargs[field] = self_val
        return self.__class__(**kwargs)  # type: ignore

    def report(self, fmt: str = "{descr}: {val}\n") -> str:
        """Return a text report with one ``fmt`` formatted entry per counter.

        ``fmt`` is a :py:obj:`str.format` template that may use the fields
        ``descr`` and ``val``.  Counters without a value are shown as ``--``.
        """
        s: list[str] = []
        for field, descr, cast in self.field_descr:
            val: str | None = getattr(self, field)
            if val is None:
                val = "--"
            else:
                val = cast(val)  # type: ignore
            s.append(fmt.format(descr=descr, val=val))  # pyright: ignore[reportUnknownArgumentType]
        return "".join(s)
181
182
class FaviconCache(abc.ABC):
    """Interface every favicon cache implementation has to provide."""

    @abc.abstractmethod
    def __init__(self, cfg: FaviconCacheConfig):
        """Build the cache instance from the configuration ``cfg``."""

    @abc.abstractmethod
    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Look up the entry registered for ``(resolver, authority)``.

        Returns the tuple ``(data, mime)`` of the entry or ``None`` if there is
        no entry in the cache."""

    @abc.abstractmethod
    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register ``data`` and its ``mime`` type in the cache.  If ``data``
        is ``None``, the :py:obj:`FALLBACK_ICON` is registered in the cache."""

    @abc.abstractmethod
    def state(self) -> FaviconCacheStats:
        """Return a :py:obj:`FaviconCacheStats` (key/values) describing the
        current state of the cache."""

    @abc.abstractmethod
    def maintenance(self, force: bool = False):
        """Perform the maintenance tasks of the cache."""
209
210
@t.final
class FaviconCacheNull(FaviconCache):
    """A dummy favicon cache that caches nothing / a fallback solution.  The
    NullCache is used when more efficient caches such as the
    :py:obj:`FaviconCacheSQLite` cannot be used because, for example, the SQLite
    library is only available in an old version and does not meet the
    requirements."""

    def __init__(self, cfg: FaviconCacheConfig):
        """The configuration is accepted for interface compatibility only."""
        return None

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Always returns ``None``: nothing is ever found in this cache."""
        return None

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Stores nothing and always returns ``False``."""
        return False

    def state(self):
        """Returns stats of an empty cache (zero favicons)."""
        return FaviconCacheStats(favicons=0)

    def maintenance(self, force: bool = False):
        """Nothing to maintain."""
233
234
@t.final
class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache):  # pyright: ignore[reportUnsafeMultipleInheritance]
    """Favicon cache that manages the favicon BLOBs in a SQLite DB.  The DB
    model in the SQLite DB is implemented using the abstract class
    :py:obj:`sqlitedb.SQLiteAppl`.

    For introspection of the DB, jump into developer environment and run command
    to show cache state::

        $ ./manage pyenv.cmd bash --norc --noprofile
        (py3) python -m searx.favicons cache state

    The following configurations are required / supported:

    - :py:obj:`FaviconCacheConfig.db_url`
    - :py:obj:`FaviconCacheConfig.HOLD_TIME`
    - :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`
    - :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`
    - :py:obj:`FaviconCacheConfig.MAINTENANCE_PERIOD`
    - :py:obj:`FaviconCacheConfig.MAINTENANCE_MODE`
    """

    DB_SCHEMA = 1

    DDL_BLOBS = """\
CREATE TABLE IF NOT EXISTS blobs (
  sha256 TEXT,
  bytes_c INTEGER,
  mime TEXT NOT NULL,
  data BLOB NOT NULL,
  PRIMARY KEY (sha256))"""

    """Table to store BLOB objects by their sha256 hash values."""

    DDL_BLOB_MAP = """\
CREATE TABLE IF NOT EXISTS blob_map (
  m_time INTEGER DEFAULT (strftime('%s', 'now')), -- last modified (unix epoch) time in sec.
  sha256 TEXT,
  resolver TEXT,
  authority TEXT,
  PRIMARY KEY (resolver, authority))"""

    """Table to map from (resolver, authority) to sha256 hash values."""

    DDL_CREATE_TABLES = {
        "blobs": DDL_BLOBS,
        "blob_map": DDL_BLOB_MAP,
    }

    SQL_DROP_LEFTOVER_BLOBS = (
        "DELETE FROM blobs WHERE sha256 IN ("
        " SELECT b.sha256"
        " FROM blobs b"
        " LEFT JOIN blob_map bm"
        " ON b.sha256 = bm.sha256"
        " WHERE bm.sha256 IS NULL)"
    )
    """Delete blobs.sha256 (BLOBs) no longer in blob_map.sha256."""

    SQL_ITER_BLOBS_SHA256_BYTES_C = (
        "SELECT b.sha256, b.bytes_c FROM blobs b"
        " JOIN blob_map bm "
        " ON b.sha256 = bm.sha256"
        " ORDER BY bm.m_time ASC"
    )
    """Iterate (sha256, bytes_c) of the BLOBs, oldest mapping first.  A BLOB
    shows up once per ``blob_map`` row that references it."""

    SQL_INSERT_BLOBS = (
        "INSERT INTO blobs (sha256, bytes_c, mime, data) VALUES (?, ?, ?, ?)"
        " ON CONFLICT (sha256) DO NOTHING"
    )  # fmt: skip

    SQL_INSERT_BLOB_MAP = (
        "INSERT INTO blob_map (sha256, resolver, authority) VALUES (?, ?, ?)"
        " ON CONFLICT DO UPDATE "
        " SET sha256=excluded.sha256, m_time=strftime('%s', 'now')"
    )

    def __init__(self, cfg: FaviconCacheConfig):
        """An instance of the favicon cache is built up from the configuration."""

        if cfg.db_url == ":memory:":
            logger.critical("don't use SQLite DB in :memory: in production!!")
        super().__init__(cfg.db_url)
        self.cfg = cfg

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Return ``(data, mime)`` registered for ``(resolver, authority)`` or
        ``None`` if there is no entry.  For an entry that was registered with
        the :py:obj:`FALLBACK_ICON` marker, ``(None, None)`` is returned."""

        sql = "SELECT sha256 FROM blob_map WHERE resolver = ? AND authority = ?"
        res = self.DB.execute(sql, (resolver, authority)).fetchone()
        if res is None:
            return None

        data, mime = (None, None)
        sha256 = res[0]
        if sha256 == FALLBACK_ICON:
            return data, mime

        sql = "SELECT data, mime FROM blobs WHERE sha256 = ?"
        res = self.DB.execute(sql, (sha256,)).fetchone()
        if res is not None:
            data, mime = res
        return data, mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register ``data`` / ``mime`` for ``(resolver, authority)``.

        If ``data`` is ``None`` only the mapping to the :py:obj:`FALLBACK_ICON`
        marker is stored.  Returns ``False`` (nothing stored) when ``data`` is
        given without a ``mime`` type or when ``data`` is larger than
        :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`, otherwise ``True``.

        In maintenance mode ``auto`` a due maintenance run is triggered first.
        """

        if self.cfg.MAINTENANCE_MODE == "auto" and int(time.time()) > self.next_maintenance_time:
            # Should automatic maintenance be moved to a new thread?
            self.maintenance()

        if data is not None and mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        bytes_c = len(data or b"")
        if bytes_c > self.cfg.BLOB_MAX_BYTES:
            logger.info(
                "favicon of resolver: %s / authority: %s to big to cache (bytes: %s) ",
                resolver,
                authority,
                bytes_c,
            )
            return False

        if data is None:
            sha256 = FALLBACK_ICON
        else:
            sha256 = hashlib.sha256(data).hexdigest()

        # hint: the with context of the connection object closes the transaction
        # but not the DB connection.  The connection has to be closed by the
        # caller of self.connect() -- the try/finally makes sure this also
        # happens when one of the statements raises.
        conn = self.connect()
        try:
            with conn:
                if sha256 != FALLBACK_ICON:
                    conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data))
                conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority))
        finally:
            conn.close()

        return True

    @property
    def next_maintenance_time(self) -> int:
        """Returns (unix epoch) time of the next maintenance."""

        return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE")

    def maintenance(self, force: bool = False):
        """Drop outdated entries and shrink the cache to its size limit.

        Unless ``force`` is set, nothing is done as long as the next
        maintenance time has not been reached.  A run

        1. deletes ``blob_map`` rows older than
           :py:obj:`FaviconCacheConfig.HOLD_TIME` and the BLOBs that are no
           longer referenced,
        2. deletes the oldest BLOBs (and their mappings) until the total size
           is within :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES` and
        3. truncates the WAL.
        """

        # Prevent parallel DB maintenance cycles from other DB connections
        # (e.g. in multi thread or process environments).

        if not force and int(time.time()) < self.next_maintenance_time:
            logger.debug("no maintenance required yet, next maintenance interval is in the future")
            return
        self.properties.set("LAST_MAINTENANCE", "")  # hint: this (also) sets the m_time of the property!

        # Do maintenance tasks.  This can take a little more time; to avoid
        # DB locks, establish a new DB connection.  The connection is closed
        # in any case, even if one of the statements raises.

        conn = self.connect()
        try:
            with conn:

                # drop items not in HOLD time
                res = conn.execute(
                    "DELETE FROM blob_map"
                    " WHERE cast(m_time as integer) < cast(strftime('%s', 'now') as integer) - ?",
                    (self.cfg.HOLD_TIME,),
                )
                logger.debug("dropped %s obsolete blob_map items from db", res.rowcount)
                res = conn.execute(self.SQL_DROP_LEFTOVER_BLOBS)
                logger.debug("dropped %s obsolete BLOBS from db", res.rowcount)

                # drop old items to be in LIMIT_TOTAL_BYTES
                total_bytes = conn.execute("SELECT SUM(bytes_c) FROM blobs").fetchone()[0] or 0
                if total_bytes > self.cfg.LIMIT_TOTAL_BYTES:

                    excess = total_bytes - self.cfg.LIMIT_TOTAL_BYTES
                    freed = 0
                    sha_list: list[str] = []
                    seen: set[str] = set()
                    for sha256, bytes_c in conn.execute(self.SQL_ITER_BLOBS_SHA256_BYTES_C):
                        # A BLOB referenced by several blob_map rows is yielded
                        # several times, but deleting it frees its bytes only
                        # once.
                        if sha256 in seen:
                            continue
                        seen.add(sha256)
                        sha_list.append(sha256)
                        freed += bytes_c
                        if freed > excess:
                            break
                    if sha_list:
                        params = [(sha256,) for sha256 in sha_list]
                        conn.executemany("DELETE FROM blobs WHERE sha256 = ?", params)
                        conn.executemany("DELETE FROM blob_map WHERE sha256 = ?", params)
                        logger.debug("dropped %s blobs with total size of %s bytes", len(sha_list), freed)

            # Vacuuming the WALs
            # https://www.theunterminatedstring.com/sqlite-vacuuming/
            #
            # Issued after the transaction context above has been left, so the
            # checkpoint is not requested from within an open write transaction.

            conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        finally:
            conn.close()

    def _query_val(self, sql: str, default: t.Any = None):
        """Return the first column of the first row of ``sql``; ``default`` if
        there is no row or the value is NULL."""
        val = self.DB.execute(sql).fetchone()
        if val is not None:
            val = val[0]
        if val is None:
            val = default
        return val

    def state(self) -> FaviconCacheStats:
        """Return the counters (favicons, bytes, domains, resolvers) of the DB."""
        return FaviconCacheStats(
            favicons=self._query_val("SELECT count(*) FROM blobs", 0),
            bytes=self._query_val("SELECT SUM(bytes_c) FROM blobs", 0),
            domains=self._query_val("SELECT count(*) FROM (SELECT authority FROM blob_map GROUP BY authority)", 0),
            resolvers=self._query_val("SELECT count(*) FROM (SELECT resolver FROM blob_map GROUP BY resolver)", 0),
        )
444
445
@t.final
class FaviconCacheMEM(FaviconCache):
    """Favicon cache in process' memory.  It's just a POC that stores the
    favicons in the memory of the process.

    .. attention::

       Don't use it in production, it will blow up your memory!!

    """

    def __init__(self, cfg: FaviconCacheConfig):
        """An instance of the favicon cache is built up from the configuration."""

        self.cfg = cfg
        # sha256 hex digest --> data
        self._data: dict[str, t.Any] = {}
        # "<resolver>:<authority>" --> (sha256 hex digest, mime)
        self._sha_mime: dict[str, tuple[str, str | None]] = {}

    def __call__(self, resolver: str, authority: str) -> None | tuple[bytes | None, str | None]:
        """Return ``(data, mime)`` registered for ``(resolver, authority)`` or
        ``None`` if there is no entry.  ``data`` is ``None`` for an entry that
        was registered with the :py:obj:`FALLBACK_ICON`."""

        sha, mime = self._sha_mime.get(f"{resolver}:{authority}", (None, None))
        if sha is None:
            return None
        data = self._data.get(sha)
        if data == FALLBACK_ICON:
            data = None
        return data, mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register ``data`` / ``mime`` for ``(resolver, authority)``.

        If ``data`` is ``None`` the :py:obj:`FALLBACK_ICON` is registered
        (without a mime type).  Returns ``False`` (nothing stored) when ``data``
        is given without a ``mime`` type, otherwise ``True``."""

        if data is None:
            data = FALLBACK_ICON
            mime = None

        elif mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        digest = hashlib.sha256(data).hexdigest()
        self._data[digest] = data
        self._sha_mime[f"{resolver}:{authority}"] = (digest, mime)
        return True

    def state(self):
        """Return stats with the number of stored data objects."""
        return FaviconCacheStats(favicons=len(self._data))

    def maintenance(self, force: bool = False):
        """No maintenance is implemented for the in-memory cache."""
None|tuple[bytes|None, str|None] __call__(self, str resolver, str authority)
Definition cache.py:463
__init__(self, FaviconCacheConfig cfg)
Definition cache.py:457
maintenance(self, bool force=False)
Definition cache.py:495
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:473
None|tuple[None|bytes, None|str] __call__(self, str resolver, str authority)
Definition cache.py:222
__init__(self, FaviconCacheConfig cfg)
Definition cache.py:219
maintenance(self, bool force=False)
Definition cache.py:231
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:225
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:338
None|tuple[None|bytes, None|str] __call__(self, str resolver, str authority)
Definition cache.py:320
maintenance(self, bool force=False)
Definition cache.py:381
__init__(self, FaviconCacheConfig cfg)
Definition cache.py:312
_query_val(self, str sql, t.Any default=None)
Definition cache.py:429
"FaviconCacheStats" __sub__(self, "FaviconCacheStats" other)
Definition cache.py:157
report(self, str fmt="{descr}: {val}\n")
Definition cache.py:171
__init__(self, FaviconCacheConfig cfg)
Definition cache.py:187
None|tuple[None|bytes, None|str] __call__(self, str resolver, str authority)
Definition cache.py:191
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:197
maintenance(self, bool force=False)
Definition cache.py:207
sqlite3.Connection connect(self)
Definition sqlitedb.py:196
SQLiteProperties properties
Definition sqlitedb.py:157
sqlite3.Connection DB(self)
Definition sqlitedb.py:242
maintenance(bool force=True, bool debug=False)
Definition cache.py:52