.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
cache.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Implementations for caching favicons.
3
4:py:obj:`FaviconCacheConfig`:
5 Configuration of the favicon cache
6
7:py:obj:`FaviconCache`:
8 Abstract base class for the implementation of a favicon cache.
9
10:py:obj:`FaviconCacheSQLite`:
11 Favicon cache that manages the favicon BLOBs in a SQLite DB.
12
13:py:obj:`FaviconCacheNull`:
14 Fallback solution if the configured cache cannot be used for system reasons.
15
16----
17
18"""
19
20from __future__ import annotations
21from typing import Literal
22
23import os
24import abc
25import dataclasses
26import hashlib
27import logging
28import sqlite3
29import tempfile
30import time
31import typer
32
33import msgspec
34
35from searx import sqlitedb
36from searx import logger
37from searx.utils import humanize_bytes, humanize_number
38
# The active favicon cache.  Only annotated here; the value is bound by init().
CACHE: "FaviconCache"

# Marker used in place of a sha256 digest when no favicon data is available.
FALLBACK_ICON = b"FALLBACK_ICON"

# Module logger, a child of the imported searx logger.
logger = logger.getChild("favicons.cache")

# Typer application on which the CLI commands of this module are registered.
app = typer.Typer()
44
45
@app.command()
def state():
    """show state of the cache"""
    # Ask the global cache for its statistics and print the rendered report.
    stats = CACHE.state()
    print(stats.report())
50
51
@app.command()
def maintenance(force: bool = True, debug: bool = False):
    """perform maintenance of the cache"""
    root_logger = logging.getLogger()
    if not debug:
        # Quiet mode: drop the handlers of the root logger and emit the log
        # records of this module as bare messages via a stream handler.
        root_logger.handlers = []
        stream = logging.StreamHandler()
        stream.setFormatter(logging.Formatter("%(message)s"))
        logger.addHandler(stream)
        logger.setLevel(logging.DEBUG)
    else:
        root_logger.setLevel(logging.DEBUG)

    # Take a snapshot of the statistics before and after the maintenance run,
    # the difference of both is what has been removed from the cache.
    before = CACHE.state()
    CACHE.maintenance(force=force)
    after = CACHE.state()

    print("The cache has been reduced by:")
    print((before - after).report("\n- {descr}: {val}").lstrip("\n"))
71
72
def init(cfg: "FaviconCacheConfig"):
    """Bind the global ``CACHE`` to the cache implementation selected by
    ``cfg.db_type``.

    ``mem``:
      :py:obj:`FaviconCacheMEM` (an error is logged, not for production).

    ``sqlite``:
      :py:obj:`FaviconCacheSQLite`, or :py:obj:`FaviconCacheNull` if the SQLite
      library is too old (a critical message is logged).

    :raises NotImplementedError: if ``cfg.db_type`` is none of the above.
    """

    global CACHE  # pylint: disable=global-statement

    if cfg.db_type == "mem":
        logger.error("Favicons are cached in memory, don't use this in production!")
        CACHE = FaviconCacheMEM(cfg)
        return

    if cfg.db_type != "sqlite":
        raise NotImplementedError(f"favicons db_type '{cfg.db_type}' is unknown")

    if sqlite3.sqlite_version_info <= (3, 35):
        logger.critical(
            "Disable favicon caching completely: SQLite library (%s) is too old! (require >= 3.35)",
            sqlite3.sqlite_version,
        )
        CACHE = FaviconCacheNull(cfg)
        return

    CACHE = FaviconCacheSQLite(cfg)
91
92
class FaviconCacheConfig(msgspec.Struct):  # pylint: disable=too-few-public-methods
    """Settings of the favicon cache."""

    db_type: Literal["sqlite", "mem"] = "sqlite"
    """Selects the cache implementation:

    ``sqlite``:
      :py:obj:`.cache.FaviconCacheSQLite`

    ``mem``:
      :py:obj:`.cache.FaviconCacheMEM` (not recommended)
    """

    db_url: str = f"{tempfile.gettempdir()}{os.sep}faviconcache.db"
    """Path to the SQLite database file (default: ``faviconcache.db`` in the
    temp folder of the system)."""

    HOLD_TIME: int = 30 * 24 * 60 * 60  # 30 days
    """Time in seconds (default 30 days) a BLOB is held in the cache before it
    is removed."""

    LIMIT_TOTAL_BYTES: int = 50 * 1024 * 1024  # 50 MB
    """Upper limit in bytes (default 50 MB) for the sum of all BLOBs in the
    cache.  Note: the limit is only enforced by the maintenance, which deletes
    the oldest BLOBs; in between two maintenance intervals the limit can be
    exceeded.  If the maintenance period is *too long* or maintenance is
    switched off completely, the cache grows uncontrollably."""

    BLOB_MAX_BYTES: int = 20 * 1024  # 20 KB
    """Maximum size in bytes (default 20 KB) of a favicon that is stored in the
    cache.  Larger favicons are not saved in the cache and must be requested by
    the client via the proxy."""

    MAINTENANCE_PERIOD: int = 60 * 60
    """Period of the maintenance in seconds / takes effect when
    :py:obj:`MAINTENANCE_MODE` is set to ``auto``."""

    MAINTENANCE_MODE: Literal["auto", "off"] = "auto"
    """Mode of the maintenance:

    ``auto``:
      Maintenance runs automatically within the maintenance intervals
      (:py:obj:`MAINTENANCE_PERIOD`); no external process is required.

    ``off``:
      Maintenance is switched off and, if required, has to be carried out by
      an external process.
    """
139
140
@dataclasses.dataclass
class FaviconCacheStats:
    """Dataclass which provides information on the status of the cache.

    Every counter is optional, ``None`` means that the value is not available
    (e.g. not provided by the cache implementation).
    """

    favicons: int | None = None
    bytes: int | None = None
    domains: int | None = None
    resolvers: int | None = None

    # Not annotated, therefore a plain class attribute and not a dataclass
    # field.  Each item is: (field name, description, formatter of the value).
    field_descr = (
        ("favicons", "number of favicons in cache", humanize_number),
        ("bytes", "total size (approx. bytes) of cache", humanize_bytes),
        ("domains", "total number of domains in cache", humanize_number),
        ("resolvers", "number of resolvers", str),
    )

    def __sub__(self, other) -> FaviconCacheStats:
        """Returns the field-wise difference ``self - other``.

        Fields that are ``None`` in one of the operands are left out and
        therefore ``None`` in the result.

        :raises TypeError: if ``other`` is not an instance of this class.
        """
        if not isinstance(other, self.__class__):
            raise TypeError(f"unsupported operand type(s) for -: '{self.__class__}' and '{type(other)}'")
        kwargs = {}
        for field, _, _ in self.field_descr:
            self_val, other_val = getattr(self, field), getattr(other, field)
            if None in (self_val, other_val):
                continue
            if isinstance(self_val, int):
                kwargs[field] = self_val - other_val
            else:
                kwargs[field] = self_val
        return self.__class__(**kwargs)

    def report(self, fmt: str = "{descr}: {val}\n") -> str:
        """Returns a text with one entry per field of :py:obj:`field_descr`.

        :param fmt: format string of one entry, the placeholders ``{descr}``
          and ``{val}`` are replaced by the description and the formatted
          value of the field (``--`` if the value is ``None``).
        """
        s = []
        for field, descr, cast in self.field_descr:
            val = getattr(self, field)
            val = "--" if val is None else cast(val)
            s.append(fmt.format(descr=descr, val=val))
        return "".join(s)
181
182
class FaviconCache(abc.ABC):
    """Interface (abstract base class) every favicon cache has to implement."""

    @abc.abstractmethod
    def __init__(self, cfg: FaviconCacheConfig):
        """The favicon cache is instantiated from its configuration ``cfg``."""

    @abc.abstractmethod
    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Lookup of the cache entry registered for ``(resolver, authority)``.

        Returns ``None`` if there is no entry in the cache, otherwise the
        tuple ``(data, mime)`` that has been registered."""

    @abc.abstractmethod
    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register ``data`` and its ``mime`` type in the cache.  If ``data``
        is ``None``, the :py:obj:`FALLBACK_ICON` is registered in the cache."""

    @abc.abstractmethod
    def state(self) -> FaviconCacheStats:
        """Returns the state of the cache as :py:obj:`FaviconCacheStats`
        (key/values)."""

    @abc.abstractmethod
    def maintenance(self, force=False):
        """Carries out the maintenance of the cache."""
209
210
class FaviconCacheNull(FaviconCache):
    """Fallback cache that stores nothing.  The NullCache is used when more
    efficient caches such as the :py:obj:`FaviconCacheSQLite` cannot be used
    because, for example, the SQLite library is only available in an old
    version and does not meet the requirements."""

    def __init__(self, cfg: FaviconCacheConfig):
        """The configuration is accepted but not used, there is nothing to
        set up."""

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        # there are no entries in this cache / always a cache miss
        return None

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        # nothing is stored
        return False

    def state(self):
        return FaviconCacheStats(favicons=0)

    def maintenance(self, force=False):
        # nothing to maintain
        return None
232
233
class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache):
    """Favicon cache that manages the favicon BLOBs in a SQLite DB.  The DB
    model in the SQLite DB is implemented using the abstract class
    :py:obj:`sqlitedb.SQLiteAppl`.

    For introspection of the DB, jump into developer environment and run command
    to show cache state::

        $ ./manage pyenv.cmd bash --norc --noprofile
        (py3) python -m searx.favicons cache state

    The following configurations are required / supported:

    - :py:obj:`FaviconCacheConfig.db_url`
    - :py:obj:`FaviconCacheConfig.HOLD_TIME`
    - :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`
    - :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`
    - :py:obj:`FaviconCacheConfig.MAINTENANCE_PERIOD`
    - :py:obj:`FaviconCacheConfig.MAINTENANCE_MODE`
    """

    DB_SCHEMA = 1

    DDL_BLOBS = """\
CREATE TABLE IF NOT EXISTS blobs (
  sha256 TEXT,
  bytes_c INTEGER,
  mime TEXT NOT NULL,
  data BLOB NOT NULL,
  PRIMARY KEY (sha256))"""

    """Table to store BLOB objects by their sha256 hash values."""

    DDL_BLOB_MAP = """\
CREATE TABLE IF NOT EXISTS blob_map (
  m_time INTEGER DEFAULT (strftime('%s', 'now')), -- last modified (unix epoch) time in sec.
  sha256 TEXT,
  resolver TEXT,
  authority TEXT,
  PRIMARY KEY (resolver, authority))"""

    """Table to map from (resolver, authority) to sha256 hash values."""

    DDL_CREATE_TABLES = {
        "blobs": DDL_BLOBS,
        "blob_map": DDL_BLOB_MAP,
    }

    SQL_DROP_LEFTOVER_BLOBS = (
        "DELETE FROM blobs WHERE sha256 IN ("
        " SELECT b.sha256"
        " FROM blobs b"
        " LEFT JOIN blob_map bm"
        " ON b.sha256 = bm.sha256"
        " WHERE bm.sha256 IS NULL)"
    )
    """Delete blobs.sha256 (BLOBs) no longer in blob_map.sha256."""

    SQL_ITER_BLOBS_SHA256_BYTES_C = (
        "SELECT b.sha256, b.bytes_c FROM blobs b"
        " JOIN blob_map bm "
        " ON b.sha256 = bm.sha256"
        " ORDER BY bm.m_time ASC"
    )
    """Iterate ``(sha256, bytes_c)`` of the mapped BLOBs, oldest mapping first.
    A BLOB that is mapped more than once shows up once per mapping."""

    SQL_INSERT_BLOBS = (
        "INSERT INTO blobs (sha256, bytes_c, mime, data) VALUES (?, ?, ?, ?)"
        " ON CONFLICT (sha256) DO NOTHING"
    )  # fmt: skip

    SQL_INSERT_BLOB_MAP = (
        "INSERT INTO blob_map (sha256, resolver, authority) VALUES (?, ?, ?)"
        " ON CONFLICT DO UPDATE "
        " SET sha256=excluded.sha256, m_time=strftime('%s', 'now')"
    )

    def __init__(self, cfg: FaviconCacheConfig):
        """An instance of the favicon cache is build up from the configuration."""

        if cfg.db_url == ":memory:":
            logger.critical("don't use SQLite DB in :memory: in production!!")
        super().__init__(cfg.db_url)
        self.cfg = cfg

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Returns ``None`` if ``(resolver, authority)`` is not mapped in the
        cache, otherwise the tuple ``(data, mime)``.  The tuple is ``(None,
        None)`` if the :py:obj:`FALLBACK_ICON` is registered or if there is no
        BLOB for the mapped hash value."""

        sql = "SELECT sha256 FROM blob_map WHERE resolver = ? AND authority = ?"
        res = self.DB.execute(sql, (resolver, authority)).fetchone()
        if res is None:
            return None

        data, mime = (None, None)
        sha256 = res[0]
        if sha256 == FALLBACK_ICON:
            return data, mime

        sql = "SELECT data, mime FROM blobs WHERE sha256 = ?"
        res = self.DB.execute(sql, (sha256,)).fetchone()
        if res is not None:
            data, mime = res
        return data, mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Register ``data`` and ``mime`` for ``(resolver, authority)``.  If
        ``data`` is ``None``, the :py:obj:`FALLBACK_ICON` is mapped and no BLOB
        is stored.

        Returns ``False`` (nothing is stored) if there is ``data`` without a
        ``mime`` type or if ``data`` is larger than
        :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`, otherwise ``True``.

        In maintenance mode ``auto`` the maintenance is triggered from here
        when the next maintenance time has passed.
        """

        if self.cfg.MAINTENANCE_MODE == "auto" and int(time.time()) > self.next_maintenance_time:
            # Should automatic maintenance be moved to a new thread?
            self.maintenance()

        if data is not None and mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        bytes_c = len(data or b"")
        if bytes_c > self.cfg.BLOB_MAX_BYTES:
            logger.info(
                "favicon of resolver: %s / authority: %s to big to cache (bytes: %s) ",
                resolver,
                authority,
                bytes_c,
            )
            return False

        if data is None:
            sha256 = FALLBACK_ICON
        else:
            sha256 = hashlib.sha256(data).hexdigest()

        # hint: the with context of the connection object closes the transaction
        # but not the DB connection.  The connection has to be closed by the
        # caller of self.connect() / the finally clause does this also when one
        # of the statements fails.
        conn = self.connect()
        try:
            with conn:
                if sha256 != FALLBACK_ICON:
                    conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data))
                conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority))
        finally:
            conn.close()

        return True

    @property
    def next_maintenance_time(self) -> int:
        """Returns (unix epoch) time of the next maintenance."""

        return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE")

    def maintenance(self, force=False):
        """Performs maintenance on the cache:

        1. drop the mappings older than :py:obj:`FaviconCacheConfig.HOLD_TIME`
           and the BLOBs that are no longer mapped,
        2. if the total size still exceeds
           :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`, drop the oldest
           BLOBs (and their mappings) until the excess is removed,
        3. checkpoint (truncate) the WAL.

        Without ``force`` nothing is done as long as the next maintenance time
        is in the future.
        """

        # Prevent parallel DB maintenance cycles from other DB connections
        # (e.g. in multi thread or process environments).

        if not force and int(time.time()) < self.next_maintenance_time:
            logger.debug("no maintenance required yet, next maintenance interval is in the future")
            return
        self.properties.set("LAST_MAINTENANCE", "")  # hint: this (also) sets the m_time of the property!

        # Do maintenance tasks.  This can take a little more time; to avoid
        # DB locks, establish a new DB connection.  The connection is closed in
        # the finally clause, also when one of the statements fails.

        conn = self.connect()
        try:
            with conn:

                # drop items not in HOLD time
                res = conn.execute(
                    "DELETE FROM blob_map"
                    " WHERE cast(m_time as integer) < cast(strftime('%s', 'now') as integer) - ?",
                    (self.cfg.HOLD_TIME,),
                )
                logger.debug("dropped %s obsolete blob_map items from db", res.rowcount)
                res = conn.execute(self.SQL_DROP_LEFTOVER_BLOBS)
                logger.debug("dropped %s obsolete BLOBS from db", res.rowcount)

                # drop old items to be in LIMIT_TOTAL_BYTES
                total_bytes = conn.execute("SELECT SUM(bytes_c) FROM blobs").fetchone()[0] or 0
                if total_bytes > self.cfg.LIMIT_TOTAL_BYTES:

                    excess = total_bytes - self.cfg.LIMIT_TOTAL_BYTES
                    dropped_bytes = 0
                    sha_list = []
                    seen = set()
                    for sha256, bytes_c in conn.execute(self.SQL_ITER_BLOBS_SHA256_BYTES_C):
                        # A BLOB mapped by more than one (resolver, authority)
                        # is listed once per mapping, count its size only once.
                        if sha256 in seen:
                            continue
                        seen.add(sha256)
                        sha_list.append(sha256)
                        dropped_bytes += bytes_c
                        if dropped_bytes > excess:
                            break
                    if sha_list:
                        sql_args = [(sha256,) for sha256 in sha_list]
                        conn.executemany("DELETE FROM blobs WHERE sha256 = ?", sql_args)
                        conn.executemany("DELETE FROM blob_map WHERE sha256 = ?", sql_args)
                        logger.debug("dropped %s blobs with total size of %s bytes", len(sha_list), dropped_bytes)

            # Vacuuming the WALs
            # https://www.theunterminatedstring.com/sqlite-vacuuming/

            conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        finally:
            conn.close()

    def _query_val(self, sql, default=None):
        """Returns the first column of the first row of the ``sql`` query or
        ``default`` if there is no row or the value is ``NULL``."""
        val = self.DB.execute(sql).fetchone()
        if val is not None:
            val = val[0]
        if val is None:
            val = default
        return val

    def state(self) -> FaviconCacheStats:
        """Returns the statistics of the cache, counted in the tables ``blobs``
        and ``blob_map``."""
        return FaviconCacheStats(
            favicons=self._query_val("SELECT count(*) FROM blobs", 0),
            bytes=self._query_val("SELECT SUM(bytes_c) FROM blobs", 0),
            domains=self._query_val("SELECT count(*) FROM (SELECT authority FROM blob_map GROUP BY authority)", 0),
            resolvers=self._query_val("SELECT count(*) FROM (SELECT resolver FROM blob_map GROUP BY resolver)", 0),
        )
442
443
class FaviconCacheMEM(FaviconCache):
    """Favicon cache in the memory of the process.  It's just a POC that keeps
    the favicons in dictionaries of the instance.

    .. attention::

       Don't use it in production, it will blow up your memory!!

    """

    def __init__(self, cfg):

        self.cfg = cfg
        # sha256 digest --> data (bytes)
        self._data = {}
        # "<resolver>:<authority>" --> (sha256 digest, mime)
        self._sha_mime = {}

    def __call__(self, resolver: str, authority: str) -> None | tuple[bytes | None, str | None]:

        entry = self._sha_mime.get(f"{resolver}:{authority}")
        if entry is None:
            return None

        digest, mime = entry
        blob = self._data.get(digest)
        # the fallback marker stands for "no favicon data"
        return (None if blob == FALLBACK_ICON else blob), mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:

        if data is None:
            # no favicon data: register the fallback marker without mime-type
            data, mime = FALLBACK_ICON, None

        elif mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        key = f"{resolver}:{authority}"
        digest = hashlib.sha256(data).hexdigest()
        self._data[digest] = data
        self._sha_mime[key] = (digest, mime)
        return True

    def state(self):
        return FaviconCacheStats(favicons=len(self._data))

    def maintenance(self, force=False):
        # there is no maintenance of the in-memory cache
        return None
None|tuple[bytes|None, str|None] __call__(self, str resolver, str authority)
Definition cache.py:460
maintenance(self, force=False)
Definition cache.py:492
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:470
None|tuple[None|bytes, None|str] __call__(self, str resolver, str authority)
Definition cache.py:221
maintenance(self, force=False)
Definition cache.py:230
__init__(self, FaviconCacheConfig cfg)
Definition cache.py:218
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:224
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:336
None|tuple[None|bytes, None|str] __call__(self, str resolver, str authority)
Definition cache.py:318
_query_val(self, sql, default=None)
Definition cache.py:427
__init__(self, FaviconCacheConfig cfg)
Definition cache.py:310
FaviconCacheStats __sub__(self, other)
Definition cache.py:157
report(self, str fmt="{descr}: {val}\n")
Definition cache.py:171
maintenance(self, force=False)
Definition cache.py:207
__init__(self, FaviconCacheConfig cfg)
Definition cache.py:187
None|tuple[None|bytes, None|str] __call__(self, str resolver, str authority)
Definition cache.py:191
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:197
sqlite3.Connection connect(self)
Definition sqlitedb.py:194
sqlite3.Connection DB(self)
Definition sqlitedb.py:240
maintenance(bool force=True, bool debug=False)
Definition cache.py:53