.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
cache.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Implementations for caching favicons.
3
4:py:obj:`FaviconCacheConfig`:
5 Configuration of the favicon cache
6
7:py:obj:`FaviconCache`:
8 Abstract base class for the implementation of a favicon cache.
9
10:py:obj:`FaviconCacheSQLite`:
11 Favicon cache that manages the favicon BLOBs in a SQLite DB.
12
13:py:obj:`FaviconCacheNull`:
14 Fallback solution if the configured cache cannot be used for system reasons.
15
16----
17
18"""
19
20from __future__ import annotations
21from typing import Literal
22
23import os
24import abc
25import dataclasses
26import hashlib
27import logging
28import sqlite3
29import tempfile
30import time
31import typer
32
33import msgspec
34
35from searx import sqlitedb
36from searx import logger
37from searx.utils import humanize_bytes, humanize_number
38
# Global cache instance.  Only the annotation is given here; the name is bound
# by init().
CACHE: "FaviconCache"

# Marker used in place of a sha256 digest / BLOB when a resolver has no favicon
# for an authority.
FALLBACK_ICON = b"FALLBACK_ICON"

logger = logger.getChild('favicons.cache')

# Typer application collecting the command line entry points of this module.
app = typer.Typer()
44
45
@app.command()
def state():
    """show state of the cache"""
    # NOTE: the docstring is presumably used by typer as the help text of the
    # command, therefore it is kept short and unchanged.
    print(CACHE.state().report())
50
51
@app.command()
def maintenance(force: bool = True, debug: bool = False):
    """perform maintenance of the cache"""
    # NOTE: the docstring is presumably used by typer as the help text of the
    # command, therefore it is kept short and unchanged.
    root_log = logging.getLogger()
    if debug:
        # verbose: lower the level of the root logger to DEBUG
        root_log.setLevel(logging.DEBUG)
    else:
        # quiet: remove the handlers of the root logger and emit only the
        # plain messages of this module's logger
        root_log.handlers = []
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)

    # state before minus state after == what has been removed by maintenance
    state_t0 = CACHE.state()
    CACHE.maintenance(force=force)
    state_t1 = CACHE.state()
    state_delta = state_t0 - state_t1
    print("The cache has been reduced by:")
    print(state_delta.report("\n- {descr}: {val}").lstrip("\n"))
71
72
def init(cfg: "FaviconCacheConfig"):
    """Initialization of a global ``CACHE``

    The implementation is selected by ``cfg.db_type``:

    ``sqlite``:
      :py:obj:`FaviconCacheSQLite`, or :py:obj:`FaviconCacheNull` if the SQLite
      library is older than 3.35.

    ``mem``:
      :py:obj:`FaviconCacheMEM` (an error is logged, not for production).

    :raises NotImplementedError: if ``cfg.db_type`` is none of the above.
    """

    global CACHE  # pylint: disable=global-statement
    if cfg.db_type == "sqlite":
        # sqlite_version_info is a 3-tuple, e.g. (3, 35, 0) is not < (3, 35)
        if sqlite3.sqlite_version_info < (3, 35):
            logger.critical(
                "Disable favicon caching completely: SQLite library (%s) is too old! (require >= 3.35)",
                sqlite3.sqlite_version,
            )
            CACHE = FaviconCacheNull(cfg)
        else:
            CACHE = FaviconCacheSQLite(cfg)
    elif cfg.db_type == "mem":
        logger.error("Favicons are cached in memory, don't use this in production!")
        CACHE = FaviconCacheMEM(cfg)
    else:
        raise NotImplementedError(f"favicons db_type '{cfg.db_type}' is unknown")
91
92
class FaviconCacheConfig(msgspec.Struct):  # pylint: disable=too-few-public-methods
    """Configuration of the favicon cache."""

    db_type: Literal["sqlite", "mem"] = "sqlite"
    """Type of the database:

    ``sqlite``:
      :py:obj:`.cache.FaviconCacheSQLite`

    ``mem``:
      :py:obj:`.cache.FaviconCacheMEM` (not recommended)
    """

    db_url: str = tempfile.gettempdir() + os.sep + "faviconcache.db"
    """URL of the SQLite DB, the path to the database file."""

    HOLD_TIME: int = 60 * 60 * 24 * 30  # 30 days
    """Hold time (default in sec.), after which a BLOB is removed from the cache."""

    LIMIT_TOTAL_BYTES: int = 1024 * 1024 * 50  # 50 MB
    """Maximum of bytes (default) stored in the cache of all blobs.  Note: The
    limit is only reached at each maintenance interval after which the oldest
    BLOBs are deleted; the limit is exceeded during the maintenance period.  If
    the maintenance period is *too long* or maintenance is switched off
    completely, the cache grows uncontrollably."""

    BLOB_MAX_BYTES: int = 1024 * 20  # 20 KB
    """The maximum BLOB size in bytes that a favicon may have so that it can be
    saved in the cache.  If the favicon is larger, it is not saved in the cache
    and must be requested by the client via the proxy."""

    MAINTENANCE_PERIOD: int = 60 * 60
    """Maintenance period in seconds / when :py:obj:`MAINTENANCE_MODE` is set to
    ``auto``."""

    MAINTENANCE_MODE: Literal["auto", "off"] = "auto"
    """Type of maintenance mode

    ``auto``:
      Maintenance is carried out automatically as part of the maintenance
      intervals (:py:obj:`MAINTENANCE_PERIOD`); no external process is required.

    ``off``:
      Maintenance is switched off and must be carried out by an external process
      if required.
    """
139
140
@dataclasses.dataclass
class FaviconCacheStats:
    """Dataclass which provides information on the status of the cache.

    A value of ``None`` means that the cache implementation does not provide
    the respective figure.
    """

    favicons: int | None = None
    bytes: int | None = None
    domains: int | None = None
    resolvers: int | None = None

    # (field name, description used in the report, function to format the value)
    field_descr = (
        ("favicons", "number of favicons in cache", humanize_number),
        ("bytes", "total size (approx. bytes) of cache", humanize_bytes),
        ("domains", "total number of domains in cache", humanize_number),
        ("resolvers", "number of resolvers", str),
    )

    def __sub__(self, other) -> FaviconCacheStats:
        """Returns a new instance with the field-wise difference ``self -
        other``.  Fields that are ``None`` on either side stay ``None`` in the
        result.

        :raises TypeError: if ``other`` is not an instance of this class.
        """
        if not isinstance(other, self.__class__):
            raise TypeError(f"unsupported operand type(s) for -: '{self.__class__}' and '{type(other)}'")
        kwargs = {}
        for field, _, _ in self.field_descr:
            self_val, other_val = getattr(self, field), getattr(other, field)
            if None in (self_val, other_val):
                # difference is undefined, field keeps its default (None)
                continue
            if isinstance(self_val, int):
                kwargs[field] = self_val - other_val
            else:
                kwargs[field] = self_val
        return self.__class__(**kwargs)

    def report(self, fmt: str = "{descr}: {val}\n"):
        """Returns a string in which each field is rendered by ``fmt`` (with
        the placeholders ``{descr}`` and ``{val}``).  Fields without a value
        are shown as ``--``."""
        s = []
        for field, descr, cast in self.field_descr:
            val = getattr(self, field)
            if val is None:
                val = "--"
            else:
                val = cast(val)
            s.append(fmt.format(descr=descr, val=val))
        return "".join(s)
181
182
class FaviconCache(abc.ABC):
    """Abstract base class for the implementation of a favicon cache."""

    @abc.abstractmethod
    def __init__(self, cfg: FaviconCacheConfig):
        """An instance of the favicon cache is build up from the configuration."""

    @abc.abstractmethod
    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Returns ``None`` or the tuple of ``(data, mime)`` that has been
        registered in the cache.  The ``None`` indicates that there was no entry
        in the cache."""

    @abc.abstractmethod
    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Set data and mime-type in the cache.  If data is None, the
        :py:obj:`FALLBACK_ICON` is registered in the cache."""

    @abc.abstractmethod
    def state(self) -> FaviconCacheStats:
        """Returns a :py:obj:`FaviconCacheStats` (key/values) with information
        on the state of the cache."""

    @abc.abstractmethod
    def maintenance(self, force=False):
        """Performs maintenance on the cache"""
209
210
class FaviconCacheNull(FaviconCache):
    """A dummy favicon cache that caches nothing / a fallback solution.  The
    NullCache is used when more efficient caches such as the
    :py:obj:`FaviconCacheSQLite` cannot be used because, for example, the SQLite
    library is only available in an old version and does not meet the
    requirements."""

    def __init__(self, cfg: FaviconCacheConfig):
        # the configuration is not needed, nothing is cached
        return None

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        # always a cache miss
        return None

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        # nothing is stored
        return False

    def state(self):
        return FaviconCacheStats(favicons=0)

    def maintenance(self, force=False):
        pass
232
233
class FaviconCacheSQLite(sqlitedb.SQLiteAppl, FaviconCache):
    """Favicon cache that manages the favicon BLOBs in a SQLite DB.  The DB
    model in the SQLite DB is implemented using the abstract class
    :py:obj:`sqlitedb.SQLiteAppl`.

    The following configurations are required / supported:

    - :py:obj:`FaviconCacheConfig.db_url`
    - :py:obj:`FaviconCacheConfig.HOLD_TIME`
    - :py:obj:`FaviconCacheConfig.LIMIT_TOTAL_BYTES`
    - :py:obj:`FaviconCacheConfig.BLOB_MAX_BYTES`
    - :py:obj:`FaviconCacheConfig.MAINTENANCE_PERIOD`
    - :py:obj:`FaviconCacheConfig.MAINTENANCE_MODE`
    """

    DB_SCHEMA = 1

    DDL_BLOBS = """\
CREATE TABLE IF NOT EXISTS blobs (
  sha256     TEXT,
  bytes_c    INTEGER,
  mime       TEXT NOT NULL,
  data       BLOB NOT NULL,
  PRIMARY KEY (sha256))"""

    """Table to store BLOB objects by their sha256 hash values."""

    DDL_BLOB_MAP = """\
CREATE TABLE IF NOT EXISTS blob_map (
    m_time     INTEGER DEFAULT (strftime('%s', 'now')),  -- last modified (unix epoch) time in sec.
    sha256     TEXT,
    resolver   TEXT,
    authority  TEXT,
    PRIMARY KEY (resolver, authority))"""

    """Table to map from (resolver, authority) to sha256 hash values."""

    DDL_CREATE_TABLES = {
        "blobs": DDL_BLOBS,
        "blob_map": DDL_BLOB_MAP,
    }

    SQL_DROP_LEFTOVER_BLOBS = (
        "DELETE FROM blobs WHERE sha256 IN ("
        " SELECT b.sha256"
        "   FROM blobs b"
        "   LEFT JOIN blob_map bm"
        "     ON b.sha256 = bm.sha256"
        "  WHERE bm.sha256 IS NULL)"
    )
    """Delete blobs.sha256 (BLOBs) no longer in blob_map.sha256."""

    SQL_ITER_BLOBS_SHA256_BYTES_C = (
        "SELECT b.sha256, b.bytes_c FROM blobs b"
        "  JOIN blob_map bm "
        "    ON b.sha256 = bm.sha256"
        " ORDER BY bm.m_time ASC"
    )
    # Iterates (sha256, bytes_c) of the mapped BLOBs, least recently modified
    # mapping first.

    SQL_INSERT_BLOBS = (
        "INSERT INTO blobs (sha256, bytes_c, mime, data) VALUES (?, ?, ?, ?)"
        "    ON CONFLICT (sha256) DO NOTHING"
    )  # fmt: skip

    SQL_INSERT_BLOB_MAP = (
        "INSERT INTO blob_map (sha256, resolver, authority) VALUES (?, ?, ?)"
        "    ON CONFLICT DO UPDATE "
        "   SET sha256=excluded.sha256, m_time=strftime('%s', 'now')"
    )

    def __init__(self, cfg: FaviconCacheConfig):
        """An instance of the favicon cache is build up from the configuration."""

        if cfg.db_url == ":memory:":
            logger.critical("don't use SQLite DB in :memory: in production!!")
        super().__init__(cfg.db_url)
        self.cfg = cfg

    def __call__(self, resolver: str, authority: str) -> None | tuple[None | bytes, None | str]:
        """Returns ``None`` if there is no entry for ``(resolver, authority)``,
        otherwise the tuple ``(data, mime)``.  The tuple is ``(None, None)`` if
        the :py:obj:`FALLBACK_ICON` has been registered or the BLOB is no longer
        in the DB."""

        sql = "SELECT sha256 FROM blob_map WHERE resolver = ? AND authority = ?"
        res = self.DB.execute(sql, (resolver, authority)).fetchone()
        if res is None:
            return None

        data, mime = (None, None)
        sha256 = res[0]
        if sha256 == FALLBACK_ICON:
            # there is no BLOB for the fallback marker
            return data, mime

        sql = "SELECT data, mime FROM blobs WHERE sha256 = ?"
        res = self.DB.execute(sql, (sha256,)).fetchone()
        if res is not None:
            data, mime = res
        return data, mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Registers ``data`` / ``mime`` for ``(resolver, authority)``.  If
        ``data`` is ``None`` the :py:obj:`FALLBACK_ICON` is registered.

        Returns ``False`` (nothing is stored) if ``data`` comes without a
        mime-type or if ``data`` is larger than ``BLOB_MAX_BYTES``, otherwise
        ``True``."""

        if self.cfg.MAINTENANCE_MODE == "auto" and int(time.time()) > self.next_maintenance_time:
            # Should automatic maintenance be moved to a new thread?
            self.maintenance()

        if data is not None and mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        bytes_c = len(data or b"")
        if bytes_c > self.cfg.BLOB_MAX_BYTES:
            # the arguments are passed to the logger, the message is only
            # formatted if the record is really emitted
            logger.info(
                "favicon of resolver: %s / authority: %s to big to cache (bytes: %s) ",
                resolver,
                authority,
                bytes_c,
            )
            return False

        if data is None:
            sha256 = FALLBACK_ICON
        else:
            sha256 = hashlib.sha256(data).hexdigest()

        with self.connect() as conn:
            if sha256 != FALLBACK_ICON:
                # the fallback marker has no BLOB, it exists only in blob_map
                conn.execute(self.SQL_INSERT_BLOBS, (sha256, bytes_c, mime, data))
            conn.execute(self.SQL_INSERT_BLOB_MAP, (sha256, resolver, authority))

        return True

    @property
    def next_maintenance_time(self) -> int:
        """Returns (unix epoch) time of the next maintenance."""

        return self.cfg.MAINTENANCE_PERIOD + self.properties.m_time("LAST_MAINTENANCE")

    def maintenance(self, force=False):
        """Drops the mappings that are older than ``HOLD_TIME`` and the BLOBs
        that are no longer mapped.  If the total size of the BLOBs still
        exceeds ``LIMIT_TOTAL_BYTES``, the BLOBs of the oldest mappings are
        dropped.  Without ``force`` nothing is done before
        :py:obj:`next_maintenance_time` is reached."""

        # Prevent parallel DB maintenance cycles from other DB connections
        # (e.g. in multi thread or process environments).

        if not force and int(time.time()) < self.next_maintenance_time:
            logger.debug("no maintenance required yet, next maintenance interval is in the future")
            return
        self.properties.set("LAST_MAINTENANCE", "")  # hint: this (also) sets the m_time of the property!

        # do maintenance tasks

        with self.connect() as conn:

            # drop items not in HOLD time
            res = conn.execute(
                "DELETE FROM blob_map"
                " WHERE cast(m_time as integer) < cast(strftime('%s', 'now') as integer) - ?",
                (self.cfg.HOLD_TIME,),
            )
            logger.debug("dropped %s obsolete blob_map items from db", res.rowcount)
            res = conn.execute(self.SQL_DROP_LEFTOVER_BLOBS)
            logger.debug("dropped %s obsolete BLOBS from db", res.rowcount)

            # drop old items to be in LIMIT_TOTAL_BYTES
            total_bytes = conn.execute("SELECT SUM(bytes_c) FROM blobs").fetchone()[0] or 0
            if total_bytes > self.cfg.LIMIT_TOTAL_BYTES:

                x = total_bytes - self.cfg.LIMIT_TOTAL_BYTES  # bytes to free
                c = 0  # bytes collected so far
                sha_list = []
                for row in conn.execute(self.SQL_ITER_BLOBS_SHA256_BYTES_C):
                    sha256, bytes_c = row
                    sha_list.append(sha256)
                    c += bytes_c
                    if c > x:
                        break
                if sha_list:
                    # The values are sha256 hex digests read from the blobs
                    # table (written by set()), so building the IN-list as a
                    # string is safe and not limited by the max. number of
                    # SQL variables.
                    conn.execute("DELETE FROM blobs WHERE sha256 IN ('%s')" % "','".join(sha_list))
                    conn.execute("DELETE FROM blob_map WHERE sha256 IN ('%s')" % "','".join(sha_list))
                    logger.debug("dropped %s blobs with total size of %s bytes", len(sha_list), c)

    def _query_val(self, sql, default=None):
        """Returns the first column of the first row of the ``sql`` query or
        ``default`` if there is no row or the value is ``NULL``."""
        val = self.DB.execute(sql).fetchone()
        if val is not None:
            val = val[0]
        if val is None:
            val = default
        return val

    def state(self) -> FaviconCacheStats:
        return FaviconCacheStats(
            favicons=self._query_val("SELECT count(*) FROM blobs", 0),
            bytes=self._query_val("SELECT SUM(bytes_c) FROM blobs", 0),
            domains=self._query_val("SELECT count(*) FROM (SELECT authority FROM blob_map GROUP BY authority)", 0),
            resolvers=self._query_val("SELECT count(*) FROM (SELECT resolver FROM blob_map GROUP BY resolver)", 0),
        )
425
426
class FaviconCacheMEM(FaviconCache):
    """Favicon cache in process' memory.  It's just a POC that stores the
    favicons in the memory of the process.

    .. attention::

       Don't use it in production, it will blow up your memory!!

    """

    def __init__(self, cfg):

        self.cfg = cfg
        # sha256 hex digest --> data (BLOB)
        self._data = {}
        # "<resolver>:<authority>" --> (sha256 hex digest, mime)
        self._sha_mime = {}

    def __call__(self, resolver: str, authority: str) -> None | tuple[bytes | None, str | None]:
        """Returns ``None`` if nothing is registered for ``(resolver,
        authority)``, otherwise the tuple ``(data, mime)``; ``data`` is ``None``
        if the :py:obj:`FALLBACK_ICON` has been registered."""

        sha, mime = self._sha_mime.get(f"{resolver}:{authority}", (None, None))
        if sha is None:
            return None
        data = self._data.get(sha)
        if data == FALLBACK_ICON:
            data = None
        return data, mime

    def set(self, resolver: str, authority: str, mime: str | None, data: bytes | None) -> bool:
        """Registers ``data`` / ``mime`` for ``(resolver, authority)``.  If
        ``data`` is ``None`` the :py:obj:`FALLBACK_ICON` is registered.  Returns
        ``False`` (nothing is stored) if ``data`` comes without a mime-type."""

        if data is None:
            data = FALLBACK_ICON
            mime = None

        elif mime is None:
            logger.error(
                "favicon resolver %s tries to cache mime-type None for authority %s",
                resolver,
                authority,
            )
            return False

        digest = hashlib.sha256(data).hexdigest()
        self._data[digest] = data
        self._sha_mime[f"{resolver}:{authority}"] = (digest, mime)
        return True

    def state(self):
        return FaviconCacheStats(favicons=len(self._data))

    def maintenance(self, force=False):
        # nothing is ever dropped from the memory cache
        pass
None|tuple[bytes|None, str|None] __call__(self, str resolver, str authority)
Definition cache.py:443
maintenance(self, force=False)
Definition cache.py:475
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:453
None|tuple[None|bytes, None|str] __call__(self, str resolver, str authority)
Definition cache.py:221
maintenance(self, force=False)
Definition cache.py:230
__init__(self, FaviconCacheConfig cfg)
Definition cache.py:218
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:224
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:330
None|tuple[None|bytes, None|str] __call__(self, str resolver, str authority)
Definition cache.py:312
_query_val(self, sql, default=None)
Definition cache.py:410
__init__(self, FaviconCacheConfig cfg)
Definition cache.py:304
FaviconCacheStats __sub__(self, other)
Definition cache.py:157
report(self, str fmt="{descr}: {val}\n")
Definition cache.py:171
maintenance(self, force=False)
Definition cache.py:207
__init__(self, FaviconCacheConfig cfg)
Definition cache.py:187
None|tuple[None|bytes, None|str] __call__(self, str resolver, str authority)
Definition cache.py:191
bool set(self, str resolver, str authority, str|None mime, bytes|None data)
Definition cache.py:197
sqlite3.Connection connect(self)
Definition sqlitedb.py:116
sqlite3.Connection DB(self)
Definition sqlitedb.py:156
maintenance(bool force=True, bool debug=False)
Definition cache.py:53