.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
baidu.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Baidu_
3
4.. _Baidu: https://www.baidu.com
5"""
6
7# There exists a https://github.com/ohblue/baidu-serp-api/ project,
8# but we don't use it here (maybe we can learn from it).
9
10from urllib.parse import urlencode
11from datetime import datetime
12from html import unescape
13import time
14import json
15
16from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
17from searx.utils import html_to_text
18
# Static metadata describing this engine.
about = {
    # Home page of the search service.
    'website': 'https://www.baidu.com',
    'wikidata_id': 'Q14772',
    # No official API documentation is referenced and no official API is used.
    'official_api_documentation': None,
    'use_official_api': False,
    'require_api_key': False,
    # Format of the results returned by the queried endpoints.
    'results': 'JSON',
    'language': 'zh',
}
28
# Paging: every request asks for ``results_per_page`` results.
paging = True
categories = []
results_per_page = 10

# Baidu service to query: one of 'general', 'images' or 'it' (checked in init()).
baidu_category = 'general'

# Supported time ranges, mapped to their length in seconds.
time_range_support = True
time_range_dict = {
    "day": 86400,
    "week": 7 * 86400,
    "month": 30 * 86400,
    "year": 365 * 86400,
}
37
38
def init(_):
    """Validate the module-level ``baidu_category`` setting.

    The single argument is ignored.

    Raises:
        SearxEngineAPIException: if ``baidu_category`` is not one of
            'general', 'images' or 'it'.
    """
    supported = ('general', 'images', 'it')
    if baidu_category in supported:
        return
    raise SearxEngineAPIException(f"Unsupported category: {baidu_category}")
42
43
def request(query, params):
    """Build the Baidu request URL for ``query`` and store it in ``params``.

    Args:
        query: the search terms.
        params: request parameters; ``params["pageno"]`` is read and the
            optional ``params["time_range"]`` is honoured when it is a key of
            ``time_range_dict``.

    Returns:
        The same ``params`` mapping with ``"url"`` set and
        ``"allow_redirects"`` set to ``False``.
    """
    pageno = params["pageno"]
    offset = (pageno - 1) * results_per_page

    # (endpoint, query-string arguments) for every supported category
    endpoints = {
        'general': (
            'https://www.baidu.com/s',
            {
                "wd": query,
                "rn": results_per_page,
                "pn": offset,
                "tn": "json",
            },
        ),
        'images': (
            'https://image.baidu.com/search/acjson',
            {
                "word": query,
                "rn": results_per_page,
                "pn": offset,
                "tn": "resultjson_com",
            },
        ),
        'it': (
            'https://kaifa.baidu.com/rest/v1/search',
            {
                "wd": query,
                "pageSize": results_per_page,
                "pageNum": pageno,
                "paramList": f"page_num={pageno},page_size={results_per_page}",
                "position": 0,
            },
        ),
    }
    base_url, args = endpoints[baidu_category]

    time_range = params.get("time_range")
    if time_range in time_range_dict:
        # the range is sent as a [since, until] pair of Unix timestamps
        until = int(time.time())
        since = until - time_range_dict[time_range]

        if baidu_category == 'general':
            args["gpc"] = f"stf={since},{until}|stftype=1"
        elif baidu_category == 'it':
            args["paramList"] += f",timestamp_range={since}-{until}"

    params["url"] = f"{base_url}?{urlencode(args)}"
    params["allow_redirects"] = False
    return params
94
95
def response(resp):
    """Decode a Baidu HTTP response and hand it to the category parser.

    Args:
        resp: response object; its ``headers`` mapping and ``text`` attribute
            are read.

    Returns:
        Whatever the parser selected by ``baidu_category`` returns
        (``parse_general``, ``parse_images`` or ``parse_it``).

    Raises:
        SearxEngineCaptchaException: if the ``Location`` header points to
            Baidu's captcha page.
    """
    # Detect Baidu Captcha, it will redirect to wappass.baidu.com
    if 'wappass.baidu.com/static/captcha' in resp.headers.get('Location', ''):
        raise SearxEngineCaptchaException()

    text = resp.text
    if baidu_category == 'images':
        # baidu's JSON encoder escapes / and ' characters as \/ and \'
        # (\' is not a valid JSON escape), so undo that before decoding
        text = text.replace(r"\/", "/").replace(r"\'", "'")
    # strict=False: tolerate control characters inside JSON strings
    data = json.loads(text, strict=False)
    parsers = {'general': parse_general, 'images': parse_images, 'it': parse_it}

    return parsers[baidu_category](data)
109
110
def parse_general(data):
    """Convert the JSON of a general web search into a list of result dicts.

    Args:
        data: decoded JSON; the entries are read from ``data["feed"]["entry"]``.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``content`` and
        ``publishedDate``.  Entries without a title or URL are skipped.

    Raises:
        SearxEngineAPIException: if ``data["feed"]["entry"]`` is missing or
            empty.
    """
    entries = data.get("feed", {}).get("entry")
    if not entries:
        raise SearxEngineAPIException("Invalid response")

    results = []
    for entry in entries:
        if not entry.get("title") or not entry.get("url"):
            continue

        published_date = None
        if entry.get("time"):
            try:
                published_date = datetime.fromtimestamp(entry["time"])
            except (ValueError, TypeError, OverflowError, OSError):
                # malformed or out-of-range timestamp: keep the result, drop the date
                published_date = None

        # title and content sometimes contain HTML entities
        # such as &amp; &#39; &quot; etc...
        title = unescape(entry["title"])
        content = unescape(entry.get("abs", ""))

        results.append(
            {
                "title": title,
                "url": entry["url"],
                "content": content,
                "publishedDate": published_date,
            }
        )
    return results
140
141
def parse_images(data):
    """Convert the JSON of an image search into a list of image result dicts.

    Args:
        data: decoded JSON; the items are read from ``data["data"]``.

    Returns:
        A list of dicts using the ``images.html`` template.  An empty list is
        returned when ``data`` has no (or an empty) ``"data"`` entry.
    """
    results = []
    for item in data.get("data") or []:
        if not item:
            # the last item in the JSON list is empty, the JSON string ends with "}, {}]"
            continue

        # "replaceUrl" may be missing or an empty list; fall back to an empty mapping
        replace_urls = item.get("replaceUrl") or [{}]
        replace_url = replace_urls[0]

        width = item.get("width")
        height = item.get("height")

        published_date = None
        img_date = item.get("bdImgnewsDate")
        if img_date:
            try:
                published_date = datetime.strptime(img_date, "%Y-%m-%d %H:%M")
            except (ValueError, TypeError):
                # unexpected date format: keep the image, drop the date
                published_date = None

        results.append(
            {
                "template": "images.html",
                "url": replace_url.get("FromURL"),
                "thumbnail_src": item.get("thumbURL"),
                "img_src": replace_url.get("ObjURL"),
                # pass a string even when the page title is missing
                "title": html_to_text(item.get("fromPageTitle") or ""),
                "source": item.get("fromURLHost"),
                "resolution": f"{width} x {height}",
                "img_format": item.get("type"),
                "filesize": item.get("filesize"),
                "publishedDate": published_date,
            }
        )
    return results
171
172
def parse_it(data):
    """Convert the JSON of a developer ('it') search into a list of result dicts.

    Args:
        data: decoded JSON; the entries are read from
            ``data["data"]["documents"]["data"]``.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``content``.
        Entries whose ``techDocDigest`` lacks a title or URL are skipped; a
        missing summary becomes an empty string.

    Raises:
        SearxEngineAPIException: if the list of entries is missing or empty.
    """
    documents = data.get("data", {}).get("documents", {}).get("data")
    if not documents:
        raise SearxEngineAPIException("Invalid response")

    results = []
    for entry in documents:
        digest = entry.get("techDocDigest") or {}
        if not digest.get("title") or not digest.get("url"):
            # one malformed entry must not abort the whole result page
            continue
        results.append(
            {
                'title': digest["title"],
                'url': digest["url"],
                'content': digest.get("summary", ""),
            }
        )
    return results
request(query, params)
Definition baidu.py:44
parse_general(data)
Definition baidu.py:111