.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
baidu.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Baidu_
3
4.. _Baidu: https://www.baidu.com
5"""
6
# There exists a https://github.com/ohblue/baidu-serp-api/
# but we don't use it here (maybe we can learn from it).
9
10from urllib.parse import urlencode
11from datetime import datetime
12from html import unescape
13import time
14import json
15
16from searx.exceptions import SearxEngineAPIException
17from searx.utils import html_to_text
18
# Engine metadata displayed by SearXNG.
about = {
    "website": "https://www.baidu.com",
    "wikidata_id": "Q14772",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
    "language": "zh",
}

# Engine traits.
paging = True
categories = []
# Number of results requested per page (used for both "rn" and offset math).
results_per_page = 10

# Which Baidu service to query: 'general', 'images' or 'it' (checked in init()).
baidu_category = 'general'

time_range_support = True
# Supported time ranges, expressed as a look-back window in seconds.
time_range_dict = {"day": 86400, "week": 604800, "month": 2592000, "year": 31536000}
37
38
def init(_):
    """Engine setup hook: reject any unsupported ``baidu_category`` value."""
    supported = ('general', 'images', 'it')
    if baidu_category not in supported:
        raise SearxEngineAPIException(f"Unsupported category: {baidu_category}")
42
43
def request(query, params):
    """Build the request URL for the configured Baidu service.

    Selects endpoint and query arguments per ``baidu_category``, applies the
    optional time-range filter, and stores the final URL in ``params["url"]``.
    """
    pageno = params["pageno"]
    offset = (pageno - 1) * results_per_page

    if baidu_category == 'general':
        endpoint = 'https://www.baidu.com/s'
        args = {
            "wd": query,
            "rn": results_per_page,
            "pn": offset,
            "tn": "json",
        }
    elif baidu_category == 'images':
        endpoint = 'https://image.baidu.com/search/acjson'
        args = {
            "word": query,
            "rn": results_per_page,
            "pn": offset,
            "tn": "resultjson_com",
        }
    else:  # 'it' (init() guarantees one of the three categories)
        endpoint = 'https://kaifa.baidu.com/rest/v1/search'
        args = {
            "wd": query,
            "pageSize": results_per_page,
            "pageNum": pageno,
            "paramList": f"page_num={pageno},page_size={results_per_page}",
            "position": 0,
        }

    time_range = params.get("time_range")
    if time_range in time_range_dict:
        now = int(time.time())
        past = now - time_range_dict[time_range]

        # Each service encodes the time window differently; 'images' has none.
        if baidu_category == 'general':
            args["gpc"] = f"stf={past},{now}|stftype=1"
        elif baidu_category == 'it':
            args["paramList"] += f",timestamp_range={past}-{now}"

    params["url"] = f"{endpoint}?{urlencode(args)}"
    return params
93
94
def response(resp):
    """Decode the JSON payload and dispatch to the category-specific parser."""
    raw = resp.text
    if baidu_category == 'images':
        # Baidu's JSON encoder wrongly escapes / and ' as \/ and \' — undo that
        # before decoding.
        raw = raw.replace(r"\/", "/").replace(r"\'", "'")
    payload = json.loads(raw, strict=False)

    handlers = {'general': parse_general, 'images': parse_images, 'it': parse_it}
    return handlers[baidu_category](payload)
105
106
108 results = []
109 if not data.get("feed", {}).get("entry"):
110 raise SearxEngineAPIException("Invalid response")
111
112 for entry in data["feed"]["entry"]:
113 if not entry.get("title") or not entry.get("url"):
114 continue
115
116 published_date = None
117 if entry.get("time"):
118 try:
119 published_date = datetime.fromtimestamp(entry["time"])
120 except (ValueError, TypeError):
121 published_date = None
122
123 # title and content sometimes containing characters such as & ' " etc...
124 title = unescape(entry["title"])
125 content = unescape(entry.get("abs", ""))
126
127 results.append(
128 {
129 "title": title,
130 "url": entry["url"],
131 "content": content,
132 "publishedDate": published_date,
133 }
134 )
135 return results
136
137
def parse_images(data):
    """Parse results of Baidu's image search.

    :param data: decoded JSON payload; items live under the ``data`` key.
    :returns: list of ``images.html`` template result dicts.
    """
    results = []
    for item in data.get("data") or []:
        if not item:
            # the last item in the JSON list is empty, the JSON string ends with "}, {}]"
            continue
        # "replaceUrl" may be missing OR present as an empty list — guard both,
        # the original `.get("replaceUrl", [{}])[0]` raised IndexError on [].
        replace_url = (item.get("replaceUrl") or [{}])[0]
        width = item.get("width")
        height = item.get("height")
        img_date = item.get("bdImgnewsDate")
        publishedDate = None
        if img_date:
            try:
                publishedDate = datetime.strptime(img_date, "%Y-%m-%d %H:%M")
            except ValueError:
                # malformed date string — drop the date rather than the result
                publishedDate = None
        results.append(
            {
                "template": "images.html",
                "url": replace_url.get("FromURL"),
                "thumbnail_src": item.get("thumbURL"),
                "img_src": replace_url.get("ObjURL"),
                "title": html_to_text(item.get("fromPageTitle")),
                "source": item.get("fromURLHost"),
                "resolution": f"{width} x {height}",
                "img_format": item.get("type"),
                "filesize": item.get("filesize"),
                "publishedDate": publishedDate,
            }
        )
    return results
167
168
def parse_it(data):
    """Parse results of Baidu's IT/developer search (kaifa.baidu.com)."""
    documents = data.get("data", {}).get("documents", {}).get("data")
    if not documents:
        raise SearxEngineAPIException("Invalid response")

    return [
        {
            'title': doc["techDocDigest"]["title"],
            'url': doc["techDocDigest"]["url"],
            'content': doc["techDocDigest"]["summary"],
        }
        for doc in documents
    ]
request(query, params)
Definition baidu.py:44
parse_general(data)
Definition baidu.py:107