.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
semantic_scholar.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""`Semantic Scholar`_ provides free, AI-driven search and discovery tools, and
3open resources for the global research community. `Semantic Scholar`_ indexes
4over 200 million academic papers sourced from publisher partnerships, data
5providers, and web crawls.
6
7.. _Semantic Scholar: https://www.semanticscholar.org/about
8
9Configuration
10=============
11
12To use this engine, add the following entry to your engines list in
13``settings.yml``:
14
15.. code:: yaml
16
17 - name: semantic scholar
18 engine: semantic_scholar
19 shortcut: se
20
21Implementations
22===============
23
24"""
25
26import typing as t
27
28from datetime import datetime
29from lxml import html
30from flask_babel import gettext # pyright: ignore[reportUnknownVariableType]
31
32from searx.network import get
33from searx.utils import eval_xpath_getindex, html_to_text
34from searx.enginelib import EngineCache
35from searx.result_types import EngineResults
36
37if t.TYPE_CHECKING:
38 from searx.extended_types import SXNG_Response
39 from searx.search.processors import OnlineParams
40
# Engine metadata displayed in SearXNG's preferences / about section.
about = {
    "website": "https://www.semanticscholar.org/",
    "wikidata_id": "Q22908627",
    "official_api_documentation": "https://api.semanticscholar.org/",
    "use_official_api": True,
    "require_api_key": False,
    "results": "JSON",
}

# Categories under which this engine's results are listed.
categories = ["science", "scientific publications"]
# The search endpoint supports paging (see ``page`` in :py:obj:`request`).
paging = True
# JSON endpoint used by the semanticscholar.org web UI
# (requests mimic the web app via the ``X-S2-Client`` header).
search_url = "https://www.semanticscholar.org/api/1/search"
base_url = "https://www.semanticscholar.org"

# Initialized in :py:obj:`setup`; used to cache the UI version string.
CACHE: EngineCache
"""Persistent (SQLite) key/value cache that deletes its values after ``expire``
seconds."""
58
59
def setup(engine_settings: dict[str, t.Any]) -> bool:
    """Initialize the engine: create the persistent :py:obj:`CACHE` keyed by
    the engine's configured name.  Always returns ``True``."""
    global CACHE  # pylint: disable=global-statement
    engine_name = engine_settings["name"]
    CACHE = EngineCache(engine_name)
    return True
64
65
def get_ui_version() -> str:
    """Return Semantic Scholar's UI version string (sent back in the
    ``X-S2-UI-Version`` request header).

    The value is scraped from a ``<meta>`` tag on the start page and kept in
    :py:obj:`CACHE` for five minutes.

    :raises RuntimeError: when the start page can't be fetched or the
        ``s2-ui-version`` meta tag is missing.
    """
    cached: str = CACHE.get("X-S2-UI-Version")
    if cached:
        return cached

    resp = get(base_url)
    if not resp.ok:
        raise RuntimeError("Can't determine Semantic Scholar UI version")

    dom = html.fromstring(resp.text)
    ui_version: str = eval_xpath_getindex(dom, "//meta[@name='s2-ui-version']/@content", 0)
    if not ui_version:
        raise RuntimeError("Can't determine Semantic Scholar UI version")

    # hold the cached value for 5min
    CACHE.set("X-S2-UI-Version", value=ui_version, expire=300)
    logger.debug("X-S2-UI-Version: %s", ui_version)
    return ui_version
81
82
def request(query: str, params: "OnlineParams") -> None:
    """Assemble a POST request against Semantic Scholar's (undocumented)
    search API, mimicking the web UI client."""
    headers = params["headers"]
    headers["Content-Type"] = "application/json"
    # the endpoint rejects requests without a matching UI version header
    headers["X-S2-UI-Version"] = get_ui_version()
    headers["X-S2-Client"] = "webapp-browser"

    params["url"] = search_url
    params["method"] = "POST"
    params["json"] = {
        "queryString": query,
        "page": params["pageno"],
        "pageSize": 10,
        "sort": "relevance",
        "getQuerySuggestions": False,
        "authors": [],
        "coAuthors": [],
        "venues": [],
        "performTitleMatch": True,
    }
104
105
def response(resp: "SXNG_Response") -> EngineResults:
    """Parse the JSON answer of Semantic Scholar's search API into
    :py:obj:`Paper <searx.result_types.Paper>` results."""
    res = EngineResults()
    json_data = resp.json()

    for result in json_data["results"]:
        # pick a URL: primary paper link, first plain link, first alternate
        # link, and finally Semantic Scholar's own paper page as fallback
        url: str = result.get("primaryPaperLink", {}).get("url")
        if not url and result.get("links"):
            url = result.get("links")[0]
        if not url:
            alternatePaperLinks = result.get("alternatePaperLinks")
            if alternatePaperLinks:
                url = alternatePaperLinks[0].get("url")
        if not url:
            url = base_url + "/paper/%s" % result["id"]

        # publication date is expected as ISO ``YYYY-MM-DD``; an unparsable
        # value is dropped instead of aborting the whole result page
        publishedDate: datetime | None = None
        if "pubDate" in result:
            try:
                publishedDate = datetime.strptime(result["pubDate"], "%Y-%m-%d")
            except ValueError:
                publishedDate = None

        # authors — each entry appears to be a list whose first element
        # carries the name (NOTE(review): confirm against a live response)
        authors: list[str] = [author[0]["name"] for author in result.get("authors", [])]

        # pick for the first alternate link, but not from the crawler
        pdf_url: str = ""
        for doc in result.get("alternatePaperLinks", []):
            if doc["linkType"] not in ("crawler", "doi"):
                pdf_url = doc["url"]
                break

        # comments
        comments: str = ""
        if "citationStats" in result:
            comments = gettext(
                "{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}"
            ).format(
                numCitations=result["citationStats"]["numCitations"],
                firstCitationVelocityYear=result["citationStats"]["firstCitationVelocityYear"],
                lastCitationVelocityYear=result["citationStats"]["lastCitationVelocityYear"],
            )

        res.add(
            res.types.Paper(
                title=result["title"]["text"],
                url=url,
                content=html_to_text(result["paperAbstract"]["text"]),
                journal=result.get("venue", {}).get("text") or result.get("journal", {}).get("name"),
                doi=result.get("doiInfo", {}).get("doi"),
                tags=result.get("fieldsOfStudy"),
                authors=authors,
                pdf_url=pdf_url,
                publishedDate=publishedDate,
                comments=comments,
            )
        )

    return res
bool setup(dict[str, t.Any] engine_settings)
None request(str query, "OnlineParams" params)
EngineResults response("SXNG_Response" resp)