# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Z-Library`_ (abbreviated as z-lib, formerly BookFinder) is a shadow library
project for file-sharing access to scholarly journal articles, academic texts
and general-interest books. It began as a mirror of Library Genesis, from which
most of its books originate.

.. _Z-Library: https://zlibrary-global.se/

Configuration
=============

The engine has the following additional settings:

- :py:obj:`zlib_year_from`
- :py:obj:`zlib_year_to`
- :py:obj:`zlib_ext`

With these options a SearXNG maintainer is able to configure **additional**
engines for specific searches in Z-Library, for example an engine that
searches only for EPUB files published from 2010 to 2020:

.. code:: yaml

  - name: z-library 2010s epub
    engine: zlibrary
    shortcut: zlib2010s
    zlib_year_from: '2010'
    zlib_year_to: '2020'
    zlib_ext: 'EPUB'
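
Each of these settings is optional and defaults to an empty string, i.e. no
filter.  A maintainer could therefore also configure, for example, an engine
restricted to PDF files only (the name and shortcut below are purely
illustrative):

.. code:: yaml

  - name: z-library pdf
    engine: zlibrary
    shortcut: zlibpdf
    zlib_ext: 'PDF'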

Implementations
===============

"""

import typing as t
from datetime import datetime
from urllib.parse import quote
from lxml import html
from flask_babel import gettext  # pyright: ignore[reportUnknownVariableType]

from searx.utils import extract_text, eval_xpath, eval_xpath_list, ElementType
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS
from searx.exceptions import SearxException
from searx.result_types import EngineResults

if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams

about: dict[str, t.Any] = {
    "website": "https://zlibrary-global.se",
    "wikidata_id": "Q104863992",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

categories: list[str] = ["files", "books"]
paging: bool = True
base_url: str = "https://zlibrary-global.se"

zlib_year_from: str = ""
"""Lower bound for filtering z-library's results by publication year, e.g. '2010'.
"""

zlib_year_to: str = ""
"""Upper bound for filtering z-library's results by publication year, e.g. '2020'.
"""

zlib_ext: str = ""
"""Filter z-library's results by a file extension.  Common filters are, for
example, ``PDF`` and ``EPUB``.
"""

i18n_language = gettext("Language")
i18n_book_rating = gettext("Book rating")
i18n_file_quality = gettext("File quality")


def setup(engine_settings: dict[str, t.Any]) -> bool:  # pylint: disable=unused-argument
    """Check the engine's settings."""
    traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"])

    if zlib_ext and zlib_ext not in traits.custom["ext"]:
        raise ValueError(f"invalid setting ext: {zlib_ext}")
    if zlib_year_from and zlib_year_from not in traits.custom["year_from"]:
        raise ValueError(f"invalid setting year_from: {zlib_year_from}")
    if zlib_year_to and zlib_year_to not in traits.custom["year_to"]:
        raise ValueError(f"invalid setting year_to: {zlib_year_to}")
    return True


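# Note: the module level ``traits`` used in :py:obj:`request` is not defined
# in this file.  SearXNG's engine loader injects an :py:obj:`EngineTraits`
# instance (built from :py:obj:`searx.data.ENGINE_TRAITS`) into the engine
# module at load time.  As an illustrative sketch (the real lists are the
# ones collected by :py:obj:`fetch_traits`), ``traits.custom``, which
# :py:obj:`setup` validates the ``zlib_*`` settings against, looks roughly
# like:
#
#   traits.custom = {
#       "ext": ["", "PDF", "EPUB", ...],
#       "year_from": ["", "2023", "2022", ...],
#       "year_to": ["", "2023", "2022", ...],
#   }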
def request(query: str, params: "OnlineParams") -> None:
    lang: str | None = traits.get_language(params["searxng_locale"], traits.all_locale)
    search_url: str = (
        base_url
        + "/s/{search_query}/?page={pageno}"
        + "&yearFrom={zlib_year_from}"
        + "&yearTo={zlib_year_to}"
        + "&languages[]={lang}"
        + "&extensions[]={zlib_ext}"
    )
    params["url"] = search_url.format(
        search_query=quote(query),
        pageno=params["pageno"],
        lang=lang,
        zlib_year_from=zlib_year_from,
        zlib_year_to=zlib_year_to,
        zlib_ext=zlib_ext,
    )
    params["verify"] = False
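
# For illustration only: with the default (empty) filter settings and a
# resolved language value of e.g. "English", a query "machine learning" on
# page 2 would be formatted into a URL roughly of this shape:
#
#   https://zlibrary-global.se/s/machine%20learning/?page=2&yearFrom=&yearTo=&languages[]=English&extensions[]=
#
# The actual ``languages[]`` value is whatever ``traits.get_language()``
# returns for the request's locale; unset filters are sent as empty query
# parameters.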


def response(resp: "SXNG_Response") -> EngineResults:
    res = EngineResults()
    dom = html.fromstring(resp.text)

    if domain_is_seized(dom):
        raise SearxException(f"zlibrary domain is seized: {base_url}")

    for item in dom.xpath('//div[@id="searchResultBox"]//div[contains(@class, "resItemBox")]'):
        kwargs = _parse_result(item)
        res.add(res.types.Paper(**kwargs))

    return res


def domain_is_seized(dom: ElementType):
    return bool(dom.xpath('//title') and "seized" in dom.xpath('//title')[0].text.lower())


def _text(item: ElementType, selector: str) -> str | None:
    return extract_text(eval_xpath(item, selector))


def _parse_result(item: ElementType) -> dict[str, t.Any]:

    author_elements = eval_xpath_list(item, './/div[@class="authors"]//a[@itemprop="author"]')

    result = {
        "url": base_url + item.xpath('(.//a[starts-with(@href, "/book/")])[1]/@href')[0],
        "title": _text(item, './/*[@itemprop="name"]'),
        "authors": [extract_text(author) for author in author_elements],
        "publisher": _text(item, './/a[@title="Publisher"]'),
        "type": _text(item, './/div[contains(@class, "property__file")]//div[contains(@class, "property_value")]'),
    }

    thumbnail = _text(item, './/img[contains(@class, "cover")]/@data-src')
    if thumbnail and not thumbnail.startswith('/'):
        result["thumbnail"] = thumbnail

    year = _text(item, './/div[contains(@class, "property_year")]//div[contains(@class, "property_value")]')
    if year:
        result["publishedDate"] = datetime.strptime(year, '%Y')

    content: list[str] = []
    language = _text(item, './/div[contains(@class, "property_language")]//div[contains(@class, "property_value")]')
    if language:
        content.append(f"{i18n_language}: {language.capitalize()}")
    book_rating = _text(item, './/span[contains(@class, "book-rating-interest-score")]')
    if book_rating and float(book_rating):
        content.append(f"{i18n_book_rating}: {book_rating}")
    file_quality = _text(item, './/span[contains(@class, "book-rating-quality-score")]')
    if file_quality and float(file_quality):
        content.append(f"{i18n_file_quality}: {file_quality}")
    result["content"] = " | ".join(content)

    return result
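
# For illustration, a hypothetical parsed item (the ``dict`` handed to
# ``res.types.Paper(**kwargs)`` in ``response()``) might look like this;
# ``thumbnail`` and ``publishedDate`` are only set when present in the markup:
#
#   {
#       "url": "https://zlibrary-global.se/book/123456/abcdef",
#       "title": "Some Title",
#       "authors": ["Jane Doe"],
#       "publisher": "Some Publisher",
#       "type": "EPUB",
#       "publishedDate": datetime(2015, 1, 1),
#       "content": "Language: English | Book rating: 5.0 | File quality: 5.0",
#   }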


def fetch_traits(engine_traits: EngineTraits) -> None:
    """Fetch languages and other search arguments from zlibrary's search form."""
    # pylint: disable=import-outside-toplevel, too-many-branches, too-many-statements

    import babel
    import babel.core
    import httpx

    from searx.network import get  # see https://github.com/searxng/searxng/issues/762
    from searx.locales import language_tag

    def _use_old_values():
        # don't change anything, re-use the existing values
        engine_traits.all_locale = ENGINE_TRAITS["z-library"]["all_locale"]
        engine_traits.custom = ENGINE_TRAITS["z-library"]["custom"]
        engine_traits.languages = ENGINE_TRAITS["z-library"]["languages"]

    try:
        resp = get(base_url, verify=False)
    except (SearxException, httpx.HTTPError) as exc:
        print(f"ERROR: zlibrary domain '{base_url}' is seized?")
        print(f"  --> {exc}")
        _use_old_values()
        return

    if not resp.ok:
        raise RuntimeError("Response from zlibrary's search page is not OK.")
    dom = html.fromstring(resp.text)

    if domain_is_seized(dom):
        print(f"ERROR: zlibrary domain is seized: {base_url}")
        _use_old_values()
        return

    engine_traits.all_locale = ""
    engine_traits.custom["ext"] = []

    l: list[str]
    # years_from
    l = []
    for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearFrom']/option"):
        l.append(year.get("value") or "")
    engine_traits.custom["year_from"] = l

    # years_to
    l = []
    for year in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_yearTo']/option"):
        l.append(year.get("value") or "")
    engine_traits.custom["year_to"] = l

    # ext (file extensions)
    l = []
    for ext in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_extensions']/option"):
        l.append(ext.get("value") or "")
    engine_traits.custom["ext"] = l

    # Handle languages
    # Z-library uses English names for languages, so we need to map them to their respective locales
    language_name_locale_map: dict[str, babel.Locale] = {}
    for locale in babel.core.localedata.locale_identifiers():
        # Create a Locale object for the current locale
        loc = babel.Locale.parse(locale)
        if loc.english_name is None:
            continue
        language_name_locale_map[loc.english_name.lower()] = loc

    for x in eval_xpath_list(dom, "//div[@id='advSearch-noJS']//select[@id='sf_languages']/option"):
        eng_lang = x.get("value")
        if eng_lang is None:
            continue
        try:
            locale = language_name_locale_map[eng_lang.lower()]
        except KeyError:
            # silently ignore unknown languages
            # print("ERROR: %s is unknown by babel" % (eng_lang))
            continue
        sxng_lang = language_tag(locale)
        conflict = engine_traits.languages.get(sxng_lang)
        if conflict:
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
            continue
        engine_traits.languages[sxng_lang] = eng_lang
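
# After a successful fetch, ``engine_traits`` ends up with (shapes shown as an
# illustrative sketch, not the real values):
#
#   engine_traits.all_locale  # "" (an empty string, i.e. no language filter)
#   engine_traits.custom      # {"ext": [...], "year_from": [...], "year_to": [...]}
#   engine_traits.languages   # e.g. {"en": "English", "de": "German", ...}
#
# These values end up in ``searx.data.ENGINE_TRAITS["z-library"]``, the data
# that :py:obj:`setup` and :py:obj:`request` rely on at runtime.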