SearXNG Developer Documentation: online.py
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Processor used for ``online`` engines."""

__all__ = ["OnlineProcessor", "OnlineParams"]

import typing as t

from timeit import default_timer
import asyncio
import ssl
import httpx

import searx.network
from searx.utils import gen_useragent
from searx.exceptions import (
    SearxEngineAccessDeniedException,
    SearxEngineCaptchaException,
    SearxEngineTooManyRequestsException,
)
from searx.metrics.error_recorder import count_error
from .abstract import EngineProcessor, RequestParams

if t.TYPE_CHECKING:
    from searx.search.models import SearchQuery
    from searx.results import ResultContainer
    from searx.result_types import EngineResults


class HTTPParams(t.TypedDict):
    """HTTP request parameters"""

    method: t.Literal["GET", "POST"]
    """HTTP request method."""

    headers: dict[str, str]
    """HTTP header information."""

    data: dict[str, str]
    """Sending `form encoded data`_.

    .. _form encoded data:
       https://www.python-httpx.org/quickstart/#sending-form-encoded-data
    """

    json: dict[str, t.Any]
    """Sending `JSON encoded data`_.

    .. _JSON encoded data:
       https://www.python-httpx.org/quickstart/#sending-json-encoded-data
    """

    content: bytes
    """Sending `binary request data`_.

    .. _binary request data:
       https://www.python-httpx.org/quickstart/#sending-binary-request-data
    """

    url: str
    """Requested url."""

    cookies: dict[str, str]
    """HTTP cookies."""

    allow_redirects: bool
    """Follow redirects"""

    max_redirects: int
    """Maximum redirects, hard limit."""

    soft_max_redirects: int
    """Maximum redirects, soft limit. Record an error but don't stop the engine."""

    verify: None | t.Literal[False] | str  # not sure str really works
    """If not ``None``, it overrides the verify value defined in the network. Use
    ``False`` to accept any server certificate and use a path to a file to specify
    a server certificate."""

    auth: str | None
    """An authentication to use when sending requests."""

    raise_for_httperror: bool
    """Raise an exception if the `HTTP response status code`_ is ``>= 300``.

    .. _HTTP response status code:
       https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status
    """


class OnlineParams(HTTPParams, RequestParams):
    """Request parameters of an ``online`` engine."""


def default_request_params() -> HTTPParams:
    """Default request parameters for ``online`` engines."""
    return {
        "method": "GET",
        "headers": {},
        "data": {},
        "json": {},
        "content": b"",
        "url": "",
        "cookies": {},
        "allow_redirects": False,
        "max_redirects": 0,
        "soft_max_redirects": 0,
        "auth": None,
        "verify": None,
        "raise_for_httperror": True,
    }
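
# Illustration: an engine module's ``request()`` hook typically overrides a few
# of these defaults before the request is sent; the endpoint and the values
# below are hypothetical.
#
#   params = default_request_params()
#   params["url"] = "https://example.org/api?q=banana"
#   params["method"] = "POST"
#   params["data"] = {"q": "banana"}
#   params["headers"]["Accept"] = "application/json"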


class OnlineProcessor(EngineProcessor):
    """Processor class for ``online`` engines."""

    engine_type: str = "online"

    def init_engine(self) -> bool:
        """This method is called in a thread; before the base method is
        called, the network must be set up for the ``online`` engines."""
        self.init_network_in_thread(start_time=default_timer(), timeout_limit=self.engine.timeout)
        return super().init_engine()

    def init_network_in_thread(self, start_time: float, timeout_limit: float):
        # set timeout for all HTTP requests
        searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time)
        # reset the HTTP total time
        searx.network.reset_time_for_thread()
        # set the network
        searx.network.set_context_network_name(self.engine.name)

    def get_params(self, search_query: "SearchQuery", engine_category: str) -> OnlineParams | None:
        """Returns a dictionary with the :ref:`request params <engine request
        online>` (:py:obj:`OnlineParams`). If the search condition is not
        supported by the engine, ``None`` is returned."""

        base_params: RequestParams | None = super().get_params(search_query, engine_category)
        if base_params is None:
            return base_params

        params: OnlineParams = {**default_request_params(), **base_params}

        headers = params["headers"]

        # add a user agent
        headers["User-Agent"] = gen_useragent()

        # add Accept-Language header
        if self.engine.send_accept_language_header and search_query.locale:
            ac_lang = search_query.locale.language
            if search_query.locale.territory:
                ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
                    search_query.locale.language,
                    search_query.locale.territory,
                    search_query.locale.language,
                )
            headers["Accept-Language"] = ac_lang

        self.logger.debug("HTTP Accept-Language: %s", headers.get("Accept-Language", ""))
        return params

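    # Illustration of the Accept-Language value built above: for a locale with
    # language "fr" and territory "CH" the header is
    #
    #   Accept-Language: fr-CH,fr;q=0.9,*;q=0.5
    #
    # i.e. the regional variant is preferred, the bare language is the first
    # fallback, and any other language is accepted with low priority.
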
    def _send_http_request(self, params: OnlineParams):

        # create a dictionary which contains all information about the request
        request_args: dict[str, t.Any] = {
            "headers": params["headers"],
            "cookies": params["cookies"],
            "auth": params["auth"],
        }

        verify = params.get("verify")
        if verify is not None:
            request_args["verify"] = verify

        # max_redirects
        max_redirects = params.get("max_redirects")
        if max_redirects:
            request_args["max_redirects"] = max_redirects

        # allow_redirects
        if "allow_redirects" in params:
            request_args["allow_redirects"] = params["allow_redirects"]

        # soft_max_redirects
        soft_max_redirects: int = params.get("soft_max_redirects", max_redirects or 0)

        # raise_for_status
        request_args["raise_for_httperror"] = params.get("raise_for_httperror", True)

        # specific type of request (GET or POST)
        if params["method"] == "GET":
            req = searx.network.get
        else:
            req = searx.network.post
            if params["data"]:
                request_args["data"] = params["data"]
            if params["json"]:
                request_args["json"] = params["json"]
            if params["content"]:
                request_args["content"] = params["content"]

        # send the request
        response = req(params["url"], **request_args)

        # check soft limit of the redirect count
        if len(response.history) > soft_max_redirects:
            # unexpected redirect: record an error
            # but the engine might still return valid results.
            status_code = str(response.status_code or "")
            reason = response.reason_phrase or ""
            hostname = response.url.host
            count_error(
                self.engine.name,
                "{} redirects, maximum: {}".format(len(response.history), soft_max_redirects),
                (status_code, reason, hostname),
                secondary=True,
            )

        return response

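    # Illustration: for a POST request with form data, the dispatch above is
    # roughly equivalent to the following call (endpoint hypothetical):
    #
    #   searx.network.post(
    #       "https://example.org/search",          # params["url"]
    #       headers=params["headers"],
    #       cookies=params["cookies"],
    #       auth=params["auth"],
    #       data=params["data"],
    #       raise_for_httperror=params["raise_for_httperror"],
    #   )
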
    def _search_basic(self, query: str, params: OnlineParams) -> "EngineResults|None":
        # update request parameters depending on the
        # search engine (contained in the engines folder)
        self.engine.request(query, params)

        # ignore empty urls
        if not params["url"]:
            return None

        # send request
        response = self._send_http_request(params)

        # parse the response
        response.search_params = params
        return self.engine.response(response)

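    # Illustration of the engine hooks used by ``_search_basic()``: an engine
    # module in ``searx/engines/`` defines ``request()`` and ``response()``
    # roughly like this (URL hypothetical):
    #
    #   def request(query, params):
    #       params["url"] = "https://example.org/?q=" + query
    #
    #   def response(resp) -> EngineResults:
    #       ...  # parse resp.text / resp.json() into results
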
    def search(  # pyright: ignore[reportIncompatibleMethodOverride]
        self,
        query: str,
        params: OnlineParams,
        result_container: "ResultContainer",
        start_time: float,
        timeout_limit: float,
    ):
        self.init_network_in_thread(start_time, timeout_limit)

        try:
            # send requests and parse the results
            search_results = self._search_basic(query, params)
            self.extend_container(result_container, start_time, search_results)
        except ssl.SSLError as e:
            # SSL error (e.g. certificate verification failure)
            self.handle_exception(result_container, e, suspend=True)
            self.logger.error("SSLError {}, verify={}".format(e, searx.network.get_network(self.engine.name).verify))
        except (httpx.TimeoutException, asyncio.TimeoutError) as e:
            # requests timeout (connect or read)
            self.handle_exception(result_container, e, suspend=True)
            self.logger.error(
                "HTTP requests timeout (search duration: {0} s, timeout: {1} s): {2}".format(
                    default_timer() - start_time, timeout_limit, e.__class__.__name__
                )
            )
        except (httpx.HTTPError, httpx.StreamError) as e:
            # other requests exception
            self.handle_exception(result_container, e, suspend=True)
            self.logger.exception(
                "requests exception (search duration: {0} s, timeout: {1} s): {2}".format(
                    default_timer() - start_time, timeout_limit, e
                )
            )
        except (
            SearxEngineCaptchaException,
            SearxEngineTooManyRequestsException,
            SearxEngineAccessDeniedException,
        ) as e:
            self.handle_exception(result_container, e, suspend=True)
            self.logger.exception(e.message)
        except Exception as e:  # pylint: disable=broad-except
            self.handle_exception(result_container, e)
            self.logger.exception("exception: {0}".format(e))
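
# Illustration of how a search run drives this processor; the category name
# and timeout below are hypothetical, and the processor instance is assumed
# to be already initialized via ``init_engine()``:
#
#   params = processor.get_params(search_query, "general")
#   if params is not None:
#       processor.search(search_query.query, params, result_container,
#                        start_time=default_timer(), timeout_limit=3.0)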