44 if result.url
and not result.parsed_url:
45 if not isinstance(result.url, str):
46 log.debug(
'result: invalid URL: %s', str(result))
48 result.parsed_url =
None
50 result.parsed_url = urllib.parse.urlparse(result.url)
53 result.parsed_url = result.parsed_url._replace(
55 scheme=result.parsed_url.scheme
or "http",
56 path=result.parsed_url.path,
58 result.url = result.parsed_url.geturl()
60 if isinstance(result, LegacyResult)
and getattr(result,
"infobox",
None):
64 infobox_urls: list[dict[str, str]] = getattr(result,
"urls", [])
65 for item
in infobox_urls:
66 _url = item.get(
"url")
69 _url = urllib.parse.urlparse(_url)
70 item[
"url"] = _url._replace(
71 scheme=_url.scheme
or "http",
76 infobox_id: str |
None = getattr(result,
"id",
None)
78 _url = urllib.parse.urlparse(infobox_id)
79 result.id = _url._replace(
80 scheme=_url.scheme
or "http",
95 if result.title
and not isinstance(result.title, str):
96 log.debug(
"result: invalid type of field 'title': %s", str(result))
97 result.title = str(result)
98 if result.content
and not isinstance(result.content, str):
99 log.debug(
"result: invalid type of field 'content': %s", str(result))
100 result.content = str(result)
104 result.title = WHITESPACE_REGEX.sub(
" ", result.title).strip()
106 result.content = WHITESPACE_REGEX.sub(
" ", result.content).strip()
107 if result.content == result.title:
113 result:
"Result | LegacyResult", filter_func:
"Callable[[Result | LegacyResult, str, str], str | bool]"
120 url_fields = [
"url",
"iframe_src",
"audio_src",
"img_src",
"thumbnail_src",
"thumbnail"]
124 for field_name
in url_fields:
125 url_src = getattr(result, field_name,
"")
129 new_url = filter_func(result, field_name, url_src)
131 if isinstance(new_url, bool):
135 log.debug(
"filter_urls: drop field %s URL %s", field_name, url_src)
138 log.debug(
"filter_urls: modify field %s URL %s -> %s", field_name, url_src, new_url)
140 setattr(result, field_name, new_url)
141 if field_name ==
"url":
144 result.parsed_url =
None
145 elif isinstance(new_url, str):
146 result.parsed_url = urllib.parse.urlparse(new_url)
153 infobox_urls: list[dict[str, str]] = getattr(result,
"urls", [])
157 new_infobox_urls: list[dict[str, str]] = []
159 for item
in infobox_urls:
160 url_src = item.get(
"url",
"")
162 new_infobox_urls.append(item)
165 new_url = filter_func(result,
"infobox_urls", url_src)
166 if isinstance(new_url, bool):
168 new_infobox_urls.append(item)
171 log.debug(
"filter_urls: remove URL from field 'urls' ('infobox_urls') URL %s", url_src)
174 log.debug(
"filter_urls: modify URL from field 'urls' ('infobox_urls') URL %s -> %s", url_src, new_url)
175 item[
"url"] = new_url
176 new_infobox_urls.append(item)
178 setattr(result,
"urls", new_infobox_urls)
184 infobox_attributes: list[dict[str, t.Any]] = getattr(result,
"attributes", [])
186 if infobox_attributes:
188 new_infobox_attributes: list[dict[str, str | list[dict[str, str]]]] = []
190 for item
in infobox_attributes:
191 image: dict[str, str] = item.get(
"image", {})
192 url_src = image.get(
"src",
"")
194 new_infobox_attributes.append(item)
197 new_url = filter_func(result,
"infobox_attributes", url_src)
198 if isinstance(new_url, bool):
200 new_infobox_attributes.append(item)
203 log.debug(
"filter_urls: drop field 'image.src' ('infobox_attributes') URL %s", url_src)
208 "filter_urls: modify 'image.src' ('infobox_attributes') URL %s -> %s",
212 item[
"image"][
"src"] = new_url
213 new_infobox_attributes.append(item)
215 setattr(result,
"attributes", new_infobox_attributes)
217 result.normalize_result_fields()
normalize_result_fields(self)