annas_archive.py
# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Anna's Archive`_ is a free non-profit online shadow library metasearch
engine providing access to a variety of book resources (also via IPFS), created
by a team of anonymous archivists (AnnaArchivist_).

.. _Anna's Archive: https://annas-archive.org/
.. _AnnaArchivist: https://annas-software.org/AnnaArchivist/annas-archive

Configuration
=============

The engine has the following additional settings:

- :py:obj:`aa_content`
- :py:obj:`aa_ext`
- :py:obj:`aa_sort`

With these options a SearXNG maintainer is able to configure **additional**
engines for specific searches in Anna's Archive.  For example, an engine that
searches only for the *newest* articles and journals (PDF), reachable via the
shortcut ``!aaa <search-term>``:

.. code:: yaml

  - name: annas articles
    engine: annas_archive
    categories: ["general", "articles"]
    shortcut: aaa
    aa_content: "magazine"
    aa_ext: "pdf"
    aa_sort: "newest"

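The options above end up as URL parameters of Anna's search request (see
:py:obj:`request`).  With the example settings, a query ``foo`` on the first
result page is requested roughly as follows (a ``lang`` parameter is added
when the selected SearXNG locale maps to one of Anna's languages)::

    https://annas-archive.org/search?content=magazine&ext=pdf&sort=newest&q=foo&page=1
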
Implementations
===============

"""
import typing as t

from urllib.parse import urlencode
from lxml import html
from lxml.etree import ElementBase

from searx.utils import extract_text, eval_xpath, eval_xpath_getindex, eval_xpath_list
from searx.enginelib.traits import EngineTraits
from searx.data import ENGINE_TRAITS
from searx.exceptions import SearxEngineXPathException

from searx.result_types import EngineResults

if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams

# about
about: dict[str, t.Any] = {
    "website": "https://annas-archive.org/",
    "wikidata_id": "Q115288326",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# engine dependent config
categories = ["files", "books"]
paging: bool = True

# search-url
base_url: str = "https://annas-archive.org"
aa_content: str = ""
"""Anna's search form field **Content** / possible values::

    book_fiction, book_unknown, book_nonfiction,
    book_comic, magazine, standards_document

To not filter, use an empty string (default).
"""
aa_sort: str = ""
"""Sort Anna's results, possible values::

    newest, oldest, largest, smallest

To sort by *most relevant* use an empty string (default)."""

aa_ext: str = ""
"""Filter Anna's results by a file extension.  Common filters are for example
``pdf`` and ``epub``.

.. note::

   Anna's Archive is a beta release: filtering results by file extension does
   not really work on Anna's Archive.

"""


def setup(engine_settings: dict[str, t.Any]) -> bool:  # pylint: disable=unused-argument
    """Check the engine's settings: the values of :py:obj:`aa_content`,
    :py:obj:`aa_sort` and :py:obj:`aa_ext` are validated against the engine's
    traits."""
    traits = EngineTraits(**ENGINE_TRAITS["annas archive"])

    if aa_content and aa_content not in traits.custom["content"]:
        raise ValueError(f"invalid setting content: {aa_content}")

    if aa_sort and aa_sort not in traits.custom["sort"]:
        raise ValueError(f"invalid setting sort: {aa_sort}")

    if aa_ext and aa_ext not in traits.custom["ext"]:
        raise ValueError(f"invalid setting ext: {aa_ext}")

    return True


def request(query: str, params: "OnlineParams") -> None:
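    """Assemble Anna's search URL from :py:obj:`base_url`, the query, the
    configured :py:obj:`aa_content`, :py:obj:`aa_ext` and :py:obj:`aa_sort`
    values and the selected locale."""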
    lang = traits.get_language(params["searxng_locale"], traits.all_locale)
    args = {
        "lang": lang,
        "content": aa_content,
        "ext": aa_ext,
        "sort": aa_sort,
        "q": query,
        "page": params["pageno"],
    }
    # filter out None and empty values
    filtered_args = dict((k, v) for k, v in args.items() if v)
    params["url"] = f"{base_url}/search?{urlencode(filtered_args)}"


def response(resp: "SXNG_Response") -> EngineResults:
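    """Parse Anna's HTML result list into an :py:obj:`EngineResults` container
    of ``Paper`` items; items whose fields can't be extracted are skipped."""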
    res = EngineResults()
    dom = html.fromstring(resp.text)

    # The rendering of the WEB page is strange; positions of Anna's result page
    # are enclosed in SGML comments.  These comments are *uncommented* by some
    # JS code, see query of class ".js-scroll-hidden" in Anna's HTML template:
    # https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html

    for item in eval_xpath_list(dom, "//main//div[contains(@class, 'js-aarecord-list-outer')]/div"):
        try:
            kwargs: dict[str, t.Any] = _get_result(item)
        except SearxEngineXPathException:
            continue
        res.add(res.types.Paper(**kwargs))
    return res


def _get_result(item: ElementBase) -> dict[str, t.Any]:
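    """Map one item of Anna's result list to the fields of a ``Paper`` result
    (URL, title, authors, publisher, content and thumbnail)."""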
    return {
        "url": base_url + eval_xpath_getindex(item, "./a/@href", 0),
        "title": extract_text(eval_xpath(item, "./div//a[starts-with(@href, '/md5')]")),
        "authors": [extract_text(eval_xpath_getindex(item, ".//a[starts-with(@href, '/search')]", 0))],
        "publisher": extract_text(
            eval_xpath_getindex(item, ".//a[starts-with(@href, '/search')]", 1, default=None), allow_none=True
        ),
        "content": extract_text(eval_xpath(item, ".//div[contains(@class, 'relative')]")),
        "thumbnail": extract_text(eval_xpath_getindex(item, ".//img/@src", 0, default=None), allow_none=True),
    }


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and other search arguments from Anna's search form."""
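    # NOTE: this function is not called at query time; it is run by SearXNG's
    # offline traits update tooling (presumably the update_engine_traits.py
    # script) to regenerate the persisted data behind ENGINE_TRAITS.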
    # pylint: disable=import-outside-toplevel

    import babel
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762
    from searx.locales import language_tag

    engine_traits.all_locale = ""
    engine_traits.custom["content"] = []
    engine_traits.custom["ext"] = []
    engine_traits.custom["sort"] = []

    resp = get(base_url + "/search")
    if not resp.ok:
        raise RuntimeError("Response from Anna's search page is not OK.")
    dom = html.fromstring(resp.text)

    # supported language codes

    lang_map: dict[str, str] = {}
    for x in eval_xpath_list(dom, "//form//input[@name='lang']"):
        eng_lang = x.get("value")
        if eng_lang in ("", "_empty", "nl-BE", "und") or eng_lang.startswith("anti__"):
            continue
        try:
            locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep="-")
        except babel.UnknownLocaleError:
            # silently ignore unknown languages
            # print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
            continue
        sxng_lang = language_tag(locale)
        conflict = engine_traits.languages.get(sxng_lang)
        if conflict:
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
            continue
        engine_traits.languages[sxng_lang] = eng_lang

    for x in eval_xpath_list(dom, "//form//input[@name='content']"):
        if not x.get("value").startswith("anti__"):
            engine_traits.custom["content"].append(x.get("value"))

    for x in eval_xpath_list(dom, "//form//input[@name='ext']"):
        if not x.get("value").startswith("anti__"):
            engine_traits.custom["ext"].append(x.get("value"))

    for x in eval_xpath_list(dom, "//form//select[@name='sort']//option"):
        engine_traits.custom["sort"].append(x.get("value"))

    # sort these traits before they are persisted, for a better (stable) diff
    engine_traits.custom["content"].sort()
    engine_traits.custom["ext"].sort()
    engine_traits.custom["sort"].sort()