.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
google_scholar.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Google Scholar is a freely accessible web search engine that indexes the full
3text or metadata of scholarly literature across an array of publishing formats
4and disciplines.
5
Compared to other Google services the Scholar engine has a simple GET REST-API
and there is no ``async`` API. Even though the API is slightly vintage,
we can make use of the :ref:`google API` to assemble the arguments of the GET
request.
10
11Configuration
12=============
13
14.. code:: yaml
15
16 - name: google scholar
17 engine: google_scholar
18 shortcut: gos
19
20Implementations
21===============
22
23"""
24
25import typing as t
26
27from urllib.parse import urlencode
28from datetime import datetime
29from lxml import html
30
31from searx.utils import (
32 eval_xpath,
33 eval_xpath_getindex,
34 eval_xpath_list,
35 extract_text,
36 ElementType,
37)
38
39from searx.exceptions import SearxEngineCaptchaException
40
41from searx.engines.google import fetch_traits # pylint: disable=unused-import
42from searx.engines.google import (
43 get_google_info,
44 time_range_dict,
45)
46
47from searx.result_types import EngineResults
48
49if t.TYPE_CHECKING:
50 from searx.extended_types import SXNG_Response
51 from searx.search.processors import OnlineParams
52
53about = {
54 "website": "https://scholar.google.com",
55 "wikidata_id": "Q494817",
56 "official_api_documentation": "https://developers.google.com/custom-search",
57 "use_official_api": False,
58 "require_api_key": False,
59 "results": "HTML",
60}
61
62# engine dependent config
63categories = ["science", "scientific publications"]
64paging = True
65max_page = 50
66"""`Google max 50 pages`_
67
68.. _Google max 50 pages: https://github.com/searxng/searxng/issues/2982
69"""
70language_support = True
71time_range_support = True
72safesearch = False
73send_accept_language_header = True
74
75
76def request(query: str, params: "OnlineParams") -> None:
77 """Google-Scholar search request"""
78
79 google_info = get_google_info(params, traits)
80 # subdomain is: scholar.google.xy
81 google_info["subdomain"] = google_info["subdomain"].replace("www.", "scholar.")
82
83 args = {
84 "q": query,
85 **google_info["params"],
86 "start": (params["pageno"] - 1) * 10,
87 "as_sdt": "2007", # include patents / to disable set "0,5"
88 "as_vis": "0", # include citations / to disable set "1"
89 }
90 args.update(time_range_args(params))
91
92 params["url"] = "https://" + google_info["subdomain"] + "/scholar?" + urlencode(args)
93 params["cookies"] = google_info["cookies"]
94 params["headers"].update(google_info["headers"])
95
96
97def response(resp: "SXNG_Response") -> EngineResults: # pylint: disable=too-many-locals
98 """Parse response from Google Scholar"""
99
100 res = EngineResults()
101 dom = html.fromstring(resp.text)
102 detect_google_captcha(dom)
103
104 # parse results
105 for result in eval_xpath_list(dom, "//div[@data-rp]"):
106
107 title = extract_text(eval_xpath(result, ".//h3[1]//a"))
108 if not title:
109 # this is a [ZITATION] block
110 continue
111
112 pub_type: str = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']")) or ""
113 if pub_type:
114 pub_type = pub_type[1:-1].lower()
115
116 url: str = eval_xpath_getindex(result, ".//h3[1]//a/@href", 0)
117 content: str = extract_text(eval_xpath(result, ".//div[@class='gs_rs']")) or ""
118 authors, journal, publisher, publishedDate = parse_gs_a(
119 extract_text(eval_xpath(result, ".//div[@class='gs_a']"))
120 )
121 if publisher in url:
122 publisher = ""
123
124 # cited by
125 comments: str = (
126 extract_text(eval_xpath(result, ".//div[@class='gs_fl']/a[starts-with(@href,'/scholar?cites=')]")) or ""
127 )
128
129 # link to the html or pdf document
130 html_url: str = ""
131 pdf_url: str = ""
132 doc_url = eval_xpath_getindex(result, ".//div[@class='gs_or_ggsm']/a/@href", 0, default=None)
133 doc_type = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']"))
134 if doc_type == "[PDF]":
135 pdf_url = doc_url
136 else:
137 html_url = doc_url
138
139 res.add(
140 res.types.Paper(
141 type=pub_type,
142 url=url,
143 title=title,
144 authors=authors,
145 publisher=publisher,
146 journal=journal,
147 publishedDate=publishedDate,
148 content=content,
149 comments=comments,
150 html_url=html_url,
151 pdf_url=pdf_url,
152 )
153 )
154
155 # parse suggestion
156 for suggestion in eval_xpath(dom, "//div[contains(@class, 'gs_qsuggest_wrap')]//li//a"):
157 res.add(res.types.LegacyResult(suggestion=extract_text(suggestion)))
158
159 for correction in eval_xpath(dom, "//div[@class='gs_r gs_pda']/a"):
160 res.add(res.types.LegacyResult(correction=extract_text(correction)))
161 return res
162
163
164def time_range_args(params: "OnlineParams") -> dict[str, int]:
165 """Returns a dictionary with a time range arguments based on
166 ``params["time_range"]``.
167
168 Google Scholar supports a detailed search by year. Searching by *last
169 month* or *last week* (as offered by SearXNG) is uncommon for scientific
170 publications and is not supported by Google Scholar.
171
172 To limit the result list when the users selects a range, all the SearXNG
173 ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range
174 is set an empty dictionary of arguments is returned.
175
176 Example; when user selects a time range and we find ourselves in the year
177 2025 (current year minus one):
178
179 .. code:: python
180
181 { "as_ylo" : 2024 }
182
183 """
184 ret_val: dict[str, int] = {}
185 if params["time_range"] in time_range_dict:
186 ret_val["as_ylo"] = datetime.now().year - 1
187 return ret_val
188
189
190def detect_google_captcha(dom: ElementType):
191 """In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is
192 not redirected to ``sorry.google.com``.
193 """
194 if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
195 raise SearxEngineCaptchaException()
196
197
198def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]:
199 """Parse the text written in green.
200
201 Possible formats:
202 * "{authors} - {journal}, {year} - {publisher}"
203 * "{authors} - {year} - {publisher}"
204 * "{authors} - {publisher}"
205 """
206 if text is None or text == "":
207 return [], "", "", None
208
209 s_text = text.split(" - ")
210 authors: list[str] = s_text[0].split(", ")
211 publisher: str = s_text[-1]
212 if len(s_text) != 3:
213 return authors, "", publisher, None
214
215 # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
216 # get journal and year
217 journal_year = s_text[1].split(", ")
218 # journal is optional and may contains some coma
219 if len(journal_year) > 1:
220 journal: str = ", ".join(journal_year[0:-1])
221 if journal == "…":
222 journal = ""
223 else:
224 journal = ""
225 # year
226 year = journal_year[-1]
227 try:
228 publishedDate = datetime.strptime(year.strip(), "%Y")
229 except ValueError:
230 publishedDate = None
231 return authors, journal, publisher, publishedDate
None request(str query, "OnlineParams" params)
dict[str, int] time_range_args("OnlineParams" params)
EngineResults response("SXNG_Response" resp)