openalex.py
# SPDX-License-Identifier: AGPL-3.0-or-later
"""The OpenAlex engine integrates the `OpenAlex`_ Works API to return scientific
paper results using the :ref:`result_types.paper` class. It is an "online" JSON
engine that uses the official public API and does not require an API key.

.. _OpenAlex: https://openalex.org
.. _OpenAlex API overview: https://docs.openalex.org/how-to-use-the-api/api-overview

Key features
------------

- Uses the official Works endpoint (JSON)
- Paging support via ``page`` and ``per-page``
- Relevance sorting (``sort=relevance_score:desc``)
- Language filter support (maps the SearXNG language to ``filter=language:<iso2>``)
- Maps fields commonly used in scholarly results: title, authors, abstract
  (reconstructed from the inverted index; see the example below), journal/venue,
  publisher, DOI, tags (concepts), PDF/HTML links, pages, volume, issue,
  published date, and a short citations comment
- Supports the OpenAlex "polite pool" by adding a ``mailto`` parameter

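
As an illustration (hypothetical values), the Works API delivers an abstract such
as ``"Neural networks learn useful representations"`` in inverted form, mapping
each token to the positions at which it occurs; the engine rebuilds the plain
text by placing the tokens back at those positions:

.. code:: python

   {"Neural": [0], "networks": [1], "learn": [2], "useful": [3], "representations": [4]}
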

Configuration
=============

Minimal example for :origin:`settings.yml <searx/settings.yml>`:

.. code:: yaml

  - name: openalex
    engine: openalex
    shortcut: oa
    categories: science, scientific publications
    timeout: 5.0
    # Recommended by OpenAlex: join the polite pool with an email address
    mailto: "[email protected]"

Notes
-----

- The ``mailto`` key is optional but recommended by OpenAlex for better service.
- Language is inherited from the user's UI language; when it is not ``all``, the
  engine adds ``filter=language:<iso2>`` (e.g. ``language:fr``; see the example
  request below). If OpenAlex has few results for that language, you may see
  fewer items.
- Results typically include a main link. When the primary landing page from
  OpenAlex is a DOI resolver, the engine will use that stable link. When an open
  access link is available, it is exposed via the ``PDF`` and/or ``HTML`` links
  in the result footer.

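
For illustration, a query issued from a French UI with ``mailto`` configured
produces a request URL like the following (query and email address are
placeholders; values are URL-encoded by ``urlencode``):

.. code:: python

   # built by request() below; shown here as a plain string
   "https://api.openalex.org/works?search=carbon+nanotubes&page=1&per-page=10"
   "&sort=relevance_score%3Adesc&filter=language%3Afr&mailto=you%40example.org"
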

What is returned
================

Each result uses the :ref:`result_types.paper` class and may include:

- ``title`` and ``content`` (abstract; reconstructed from the inverted index)
- ``authors`` (display names)
- ``journal`` (host venue display name) and ``publisher``
- ``doi`` (normalized to the plain DOI, without the ``https://doi.org/`` prefix;
  see the example below)
- ``tags`` (OpenAlex concept display names)
- ``pdf_url`` (open access PDF if available) and ``html_url`` (landing page)
- ``publishedDate`` (parsed from ``publication_date``)
- ``pages``, ``volume``, ``number`` (issue)
- ``type`` and a brief ``comments`` string with the citation count

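
For example, a Works record whose ``doi`` field holds the resolver URL form is
exposed with the bare identifier (hypothetical DOI):

.. code:: python

   # hypothetical DOI, normalized by the module's _doi_to_plain() helper
   _doi_to_plain("https://doi.org/10.1234/openalex.example")
   # -> "10.1234/openalex.example"
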

Rate limits & polite pool
=========================

OpenAlex offers a free public API with generous daily limits. For extra courtesy
and improved service quality, include a contact email in each request via
``mailto``. You can set it directly in the engine configuration as shown above.
See: `OpenAlex API overview`_.


Troubleshooting
===============

- Few or no results in a non-English UI language:
  Ensure the selected language has sufficient coverage at OpenAlex, or set the
  UI language to English and retry.
- Preference changes fail while testing locally:
  Make sure your ``server.secret_key`` and ``server.base_url`` are set in your
  instance settings so signed cookies work; see :ref:`settings server`.


Implementation
==============

"""

import typing as t

from datetime import datetime
from urllib.parse import urlencode
from searx.result_types import EngineResults

if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams

# about
about = {
    "website": "https://openalex.org/",
    "wikidata_id": "Q110718454",
    "official_api_documentation": "https://docs.openalex.org/how-to-use-the-api/api-overview",
    "use_official_api": True,
    "require_api_key": False,
    "results": "JSON",
}


# engine dependent config
categories = ["science", "scientific publications"]
paging = True
search_url = "https://api.openalex.org/works"

# Optional: include your email for OpenAlex polite pool. Can be set from settings.yml
# engines: - name: openalex; engine: openalex; mailto: "[email protected]"
mailto = ""


def request(query: str, params: "OnlineParams") -> None:
    # Build OpenAlex query using search parameter and paging
    args = {
        "search": query,
        "page": params["pageno"],
        # keep result size moderate; OpenAlex default is 25
        "per-page": 10,
        # relevance sorting works only with `search`
        "sort": "relevance_score:desc",
    }

    # Language filter (expects ISO639-1 like 'fr', 'en')
    language = params.get("language")
    filters: list[str] = []
    if isinstance(language, str) and language != "all":
        iso2 = language.split("-")[0].split("_")[0]
        if len(iso2) == 2:
            filters.append(f"language:{iso2}")

    if filters:
        args["filter"] = ",".join(filters)

    # include mailto if configured for polite pool (engine module setting)
    if isinstance(mailto, str) and mailto != "":
        args["mailto"] = mailto

    params["url"] = f"{search_url}?{urlencode(args)}"


def response(resp: "SXNG_Response") -> EngineResults:
    data = resp.json()
    res = EngineResults()

    for item in data.get("results", []):
        url, html_url, pdf_url = _extract_links(item)
        title: str = item.get("title", "")
        content: str = _reconstruct_abstract(item.get("abstract_inverted_index")) or ""
        authors = _extract_authors(item)
        journal, publisher, pages, volume, number, published_date = _extract_biblio(item)
        doi = _doi_to_plain(item.get("doi"))
        tags = _extract_tags(item)
        comments = _extract_comments(item)

        res.add(
            res.types.Paper(
                url=url,
                title=title,
                content=content,
                journal=journal,
                publisher=publisher,
                doi=doi,
                tags=tags,
                authors=authors,
                pdf_url=pdf_url,
                html_url=html_url,
                publishedDate=published_date,
                pages=pages,
                volume=volume,
                number=number,
                type=item.get("type"),
                comments=comments,
            )
        )

    return res


def _stringify_pages(biblio: dict[str, t.Any]) -> str:
    first_page = biblio.get("first_page")
    last_page = biblio.get("last_page")
    if first_page and last_page:
        return f"{first_page}-{last_page}"
    if first_page:
        return str(first_page)
    if last_page:
        return str(last_page)
    return ""


def _parse_date(value: str | None) -> datetime | None:
    if not value:
        return None
    # OpenAlex may return YYYY, YYYY-MM or YYYY-MM-DD
    for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None


def _doi_to_plain(doi_value: str | None) -> str:
    if not doi_value:
        return ""
    # OpenAlex `doi` field is commonly a full URL like https://doi.org/10.1234/abcd
    return doi_value.removeprefix("https://doi.org/")


def _reconstruct_abstract(
    abstract_inverted_index: dict[str, list[int]] | None,
) -> str | None:
    # The abstract is returned as an inverted index {token: [positions...]}
    # Reconstruct by placing tokens at their positions and joining with spaces.
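    # e.g. {"Hello": [0], "wide": [1], "world": [2]} -> "Hello wide world"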
    if not abstract_inverted_index:
        return None
    position_to_token: dict[int, str] = {}
    max_index = -1
    for token, positions in abstract_inverted_index.items():
        for pos in positions:
            position_to_token[pos] = token
            max_index = max(max_index, pos)
    if max_index < 0:
        return None
    ordered_tokens = [position_to_token.get(i, "") for i in range(0, max_index + 1)]
    # collapse multiple empty tokens
    text = " ".join(t for t in ordered_tokens if t != "")
    return text if text != "" else None


def _extract_links(item: dict[str, t.Any]) -> tuple[str, str, str]:
    primary_location: dict[str, str] = item.get("primary_location", {})
    open_access: dict[str, str] = item.get("open_access", {})

    landing_page_url: str = primary_location.get("landing_page_url") or ""
    work_url: str = item.get("id", "")

    url: str = landing_page_url or work_url
    html_url: str = landing_page_url
    pdf_url: str = primary_location.get("pdf_url") or open_access.get("oa_url") or ""

    return url, html_url, pdf_url


def _extract_authors(item: dict[str, t.Any]) -> list[str]:
    authors: list[str] = []
    for auth in item.get("authorships", []):
        if not auth:
            continue
        author_obj = auth.get("author", {})
        display_name = author_obj.get("display_name")
        if isinstance(display_name, str) and display_name != "":
            authors.append(display_name)
    return authors


def _extract_tags(item: dict[str, t.Any]) -> list[str]:
    tags: list[str] = []
    for c in item.get("concepts", []):
        name = (c or {}).get("display_name")
        if isinstance(name, str) and name != "":
            tags.append(name)
    return tags


def _extract_biblio(
    item: dict[str, t.Any],
) -> tuple[str, str, str, str, str, datetime | None]:
    host_venue: dict[str, str] = item.get("host_venue", {})
    biblio: dict[str, str] = item.get("biblio", {})

    journal: str = host_venue.get("display_name", "")
    publisher: str = host_venue.get("publisher", "")
    pages: str = _stringify_pages(biblio)
    volume = biblio.get("volume", "")
    number = biblio.get("issue", "")
    published_date = _parse_date(item.get("publication_date"))
    return journal, publisher, pages, volume, number, published_date


def _extract_comments(item: dict[str, t.Any]) -> str:
    cited_by_count = item.get("cited_by_count")
    if isinstance(cited_by_count, int):
        return f"{cited_by_count} citations"
    return ""