.oO SearXNG Developer Documentation Oo.
Loading...
Searching...
No Matches
yahoo.py
Go to the documentation of this file.
1# SPDX-License-Identifier: AGPL-3.0-or-later
2"""Yahoo Search (Web)
3
4Languages are supported by mapping the language to a domain. If domain is not
5found in :py:obj:`lang2domain` URL ``<lang>.search.yahoo.com`` is used.
6
7"""
8
9from urllib.parse import (
10 unquote,
11 urlencode,
12)
13from lxml import html
14
15from searx.utils import (
16 eval_xpath_getindex,
17 eval_xpath_list,
18 extract_text,
19 html_to_text,
20)
21
# Engine metadata ("about" section shown for this engine).
about = dict(
    website='https://search.yahoo.com/',
    wikidata_id=None,
    official_api_documentation='https://developer.yahoo.com/api/',
    use_official_api=False,
    require_api_key=False,
    results='HTML',
)

# Engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = True
# send_accept_language_header = True

# Time range name -> value of the ``btf`` URL parameter.
time_range_dict = dict(day='d', week='w', month='m')
# Safesearch level -> value of the ``vm`` field in the ``sB`` cookie.
safesearch_dict = {
    0: 'p',
    1: 'i',
    2: 'r',
}
40
region2domain = dict(
    (
        ("CO", "co.search.yahoo.com"),  # Colombia
        ("TH", "th.search.yahoo.com"),  # Thailand
        ("VE", "ve.search.yahoo.com"),  # Venezuela
        ("CL", "cl.search.yahoo.com"),  # Chile
        ("HK", "hk.search.yahoo.com"),  # Hong Kong
        ("PE", "pe.search.yahoo.com"),  # Peru
        ("CA", "ca.search.yahoo.com"),  # Canada
        ("DE", "de.search.yahoo.com"),  # Germany
        ("FR", "fr.search.yahoo.com"),  # France
        ("TW", "tw.search.yahoo.com"),  # Taiwan
        ("GB", "uk.search.yahoo.com"),  # United Kingdom
        ("UK", "uk.search.yahoo.com"),  # alias of GB
        ("BR", "br.search.yahoo.com"),  # Brazil
        ("IN", "in.search.yahoo.com"),  # India
        ("ES", "espanol.search.yahoo.com"),  # Espanol
        ("PH", "ph.search.yahoo.com"),  # Philippines
        ("AR", "ar.search.yahoo.com"),  # Argentina
        ("MX", "mx.search.yahoo.com"),  # Mexico
        ("SG", "sg.search.yahoo.com"),  # Singapore
    )
)
"""Map a region code (second part of the language tag) to a Yahoo domain"""
63
lang2domain = {
    # the two Chinese variants have dedicated domains
    'zh_chs': 'hk.search.yahoo.com',
    'zh_cht': 'tw.search.yahoo.com',
    # every language below is served from the default domain
    **dict.fromkeys(
        ('any', 'en', 'bg', 'cs', 'da', 'el', 'et', 'he', 'hr', 'ja', 'ko', 'sk', 'sl'),
        'search.yahoo.com',
    ),
}
"""Map a Yahoo language code to the domain used for that language"""
82
# Maps a SearXNG language code to the Yahoo language code.  Most codes are
# passed through unchanged; only "all" and the Chinese variants are renamed.
yahoo_languages = {
    "all": "any",
    **{
        code: code
        for code in (
            "ar",  # Arabic
            "bg",  # Bulgarian
            "cs",  # Czech
            "da",  # Danish
            "de",  # German
            "el",  # Greek
            "en",  # English
            "es",  # Spanish
            "et",  # Estonian
            "fi",  # Finnish
            "fr",  # French
            "he",  # Hebrew
            "hr",  # Croatian
            "hu",  # Hungarian
            "it",  # Italian
            "ja",  # Japanese
            "ko",  # Korean
            "lt",  # Lithuanian
            "lv",  # Latvian
            "nl",  # Dutch
            "no",  # Norwegian
            "pl",  # Polish
            "pt",  # Portuguese
            "ro",  # Romanian
            "ru",  # Russian
            "sk",  # Slovak
            "sl",  # Slovenian
            "sv",  # Swedish
            "th",  # Thai
            "tr",  # Turkish
        )
    },
    # Chinese (Simplified)
    "zh": "zh_chs",
    "zh_Hans": "zh_chs",
    'zh-CN': "zh_chs",
    # Chinese (Traditional)
    "zh_Hant": "zh_cht",
    "zh-HK": "zh_cht",
    'zh-TW': "zh_cht",
}
122
123
def build_sb_cookie(cookie_params):
    """Serialize cookie parameters into the value of the ``sB`` cookie.

    Each item is rendered as ``key=value`` and the items are joined with
    ``&`` in the iteration order of the dictionary.  No escaping is applied.

    :param cookie_params: Dictionary of cookie parameters
    :type cookie_params: dict
    :returns: Formatted cookie string (empty string for an empty dict)
    :rtype: str

    Example:
        >>> cookie_params = {'v': '1', 'vm': 'p', 'fl': '1', 'vl': 'lang_fr'}
        >>> build_sb_cookie(cookie_params)
        'v=1&vm=p&fl=1&vl=lang_fr'
    """

    return "&".join(f"{name}={val}" for name, val in cookie_params.items())
143
144
def request(query, params):
    """Build Yahoo search request.

    The function mutates ``params`` in place and returns nothing:

    - ``params['url']`` is set to the search URL on the selected domain,
    - ``params['domain']`` is set to the selected domain (read back by
      :py:obj:`response` to choose its XPath selectors),
    - ``params['cookies']['sB']`` is set to the filter cookie (safesearch and
      search language).

    :param query: Search query string
    :param params: Request parameters; the keys ``language``, ``time_range``,
        ``pageno``, ``safesearch`` and ``cookies`` are read.
    """

    # Language tag is "<lang>" or "<lang>-<region>"; region is None if absent.
    language_tag = params["language"]
    lang_code, region = (language_tag.split("-") + [None])[:2]

    # Look up the full tag first: yahoo_languages has region specific entries
    # ('zh-CN', 'zh-HK', 'zh-TW') that can never match once the tag is split.
    # Without this, 'zh-TW' would be mapped to Simplified Chinese.
    lang = yahoo_languages.get(language_tag)
    if lang is None:
        lang = yahoo_languages.get(lang_code, "any")

    # Build URL parameters
    # - p (str): Search query string
    # - btf (str): Time filter, maps to values like 'd' (day), 'w' (week), 'm' (month)
    # - iscqry (str): Empty string, necessary for results to appear properly on first page
    # - b (int): Search offset for pagination
    # - pz (str): Amount of results expected for the page
    url_params = {'p': query}

    btf = time_range_dict.get(params['time_range'])
    if btf:
        url_params['btf'] = btf

    pageno = params['pageno']
    if pageno == 1:
        url_params['iscqry'] = ''
    elif pageno >= 2:
        # 1-based index of the first result of the page, 7 results per page:
        # page 2 -> 8, page 3 -> 15, page 4 -> 22, ...
        url_params['b'] = (pageno - 1) * 7 + 1
        url_params['pz'] = 7
        url_params['bct'] = 0
        url_params['xargs'] = 0

    # Build sB cookie (for filters)
    # - vm (str): SafeSearch filter, maps to values like 'p' (None), 'i' (Moderate), 'r' (Strict)
    # - fl (bool): Indicates if a search language is used or not
    # - vl (str): The search language to use (e.g. lang_fr)
    sbcookie_params = {
        'v': 1,
        'vm': safesearch_dict[params['safesearch']],
        'fl': 1,
        'vl': f'lang_{lang}',
        'pn': 10,
        'rw': 'new',
        'userset': 1,
    }
    params['cookies']['sB'] = build_sb_cookie(sbcookie_params)

    # Search region/language: a known region wins, then the language mapping,
    # and finally the generic "<lang>.search.yahoo.com" domain.
    domain = region2domain.get(region)
    if not domain:
        domain = lang2domain.get(lang, f'{lang}.search.yahoo.com')
    logger.debug('domain selected: %s', domain)
    logger.debug('cookies: %s', params["cookies"])

    params['url'] = f'https://{domain}/search?{urlencode(url_params)}'
    params['domain'] = domain
195
196
def parse_url(url_string):
    """Remove the Yahoo-specific tracking wrapper from a result URL.

    A wrapped URL carries the percent-encoded target after a ``/RU=`` marker
    and is followed by ``/RK`` and / or ``/RS`` segments.  The target is cut
    out and percent-decoded.

    :param url_string: URL as found in the result page
    :type url_string: str
    :returns: The decoded target URL, or ``url_string`` unchanged when no
        embedded target can be located.
    :rtype: str
    """

    # First 'http' after the '/RU=' marker.  Without a marker find() returns
    # -1, so the search starts at index 0 and a plain URL yields start == 0.
    start = url_string.find('http', url_string.find('/RU=') + 1)

    # start == 0: the URL is not wrapped; start == -1: no 'http' at all.
    # In both cases there is nothing to extract (slicing from -1 would
    # return a bogus, usually empty, string).
    if start <= 0:
        return url_string

    # Only end markers located after the embedded URL can terminate it.
    end_positions = [
        pos for pos in (url_string.rfind(marker) for marker in ('/RS', '/RK')) if pos > start
    ]
    if not end_positions:
        return url_string

    return unquote(url_string[start : min(end_positions)])
214
215
def response(resp):
    """Parse the Yahoo result page.

    Returns a list of dictionaries: one ``url`` / ``title`` / ``content`` item
    per result container that has a link, followed by one ``suggestion`` item
    per link found in the "AlsoTry" block.
    """

    dom = html.fromstring(resp.text)

    # The XPath selectors depend on the domain the request was sent to.
    if resp.search_params['domain'] == "search.yahoo.com":
        url_xpath = './/div[contains(@class,"compTitle")]/a/@href'
        title_xpath = './/div[contains(@class,"compTitle")]/a/h3/span'
    else:
        url_xpath = './/div[contains(@class,"compTitle")]/h3/a/@href'
        title_xpath = './/h3//a/@aria-label'

    results = []

    # parse results
    for item in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'):
        href = eval_xpath_getindex(item, url_xpath, 0, default=None)
        if href is None:
            # container without a link is not a search result
            continue

        title_node = eval_xpath_getindex(item, title_xpath, 0, default='')
        title_text: str = extract_text(title_node)
        content_node = eval_xpath_getindex(item, './/div[contains(@class, "compText")]', 0, default='')
        content_text: str = extract_text(content_node, allow_none=True)

        entry = {
            'url': parse_url(href),
            # title sometimes contains HTML tags / see
            # https://github.com/searxng/searxng/issues/3790
            'title': " ".join(html_to_text(title_text).strip().split()),
            'content': " ".join(html_to_text(content_text).strip().split()),
        }
        results.append(entry)

    # append suggestions
    results.extend(
        {'suggestion': extract_text(link)}
        for link in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]//table//a')
    )

    return results
parse_url(url_string)
Definition yahoo.py:197
request(query, params)
Definition yahoo.py:145
build_sb_cookie(cookie_params)
Definition yahoo.py:124