.oO SearXNG Developer Documentation Oo.
public_domain_image_archive.py
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Public domain image archive"""

from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
from json import dumps

from searx.network import get
from searx.utils import extr
from searx.exceptions import SearxEngineAccessDeniedException, SearxEngineException

THUMBNAIL_SUFFIX = "?fit=max&h=360&w=360"
"""
Example thumbnail urls (from requests & html):
- https://the-public-domain-review.imgix.net
  /shop/nov-2023-prints-00043.jpg
  ?fit=max&h=360&w=360
- https://the-public-domain-review.imgix.net
  /collections/the-history-of-four-footed-beasts-and-serpents-1658/
  8616383182_5740fa7851_o.jpg
  ?fit=max&h=360&w=360

Example full image urls (from html):
- https://the-public-domain-review.imgix.net/shop/
  nov-2023-prints-00043.jpg
  ?fit=clip&w=970&h=800&auto=format,compress
- https://the-public-domain-review.imgix.net/collections/
  the-history-of-four-footed-beasts-and-serpents-1658/8616383182_5740fa7851_o.jpg
  ?fit=clip&w=310&h=800&auto=format,compress

The thumbnail url from the request is cleaned to obtain the full image link;
THUMBNAIL_SUFFIX is then appended to the cleaned url, replacing the original
thumbnail parameters.
"""

# about
about = {
    "website": 'https://pdimagearchive.org',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'JSON',
}

pdia_base_url = 'https://pdimagearchive.org'
pdia_config_start = "/_astro/InfiniteSearch."
pdia_config_end = ".js"
categories = ['images']
page_size = 20
paging = True


__CACHED_API_URL = None

def _clean_url(url):
    parsed = urlparse(url)
    query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']]

    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, urlencode(query), parsed.fragment))
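# Illustration of the cleaning above (values are hypothetical): the imgix
# tracking parameters 'ixid' and 's' are dropped, sizing parameters survive:
#   _clean_url("https://the-public-domain-review.imgix.net/shop/x.jpg"
#              "?fit=max&h=360&w=360&ixid=M3w&s=abc")
#   == "https://the-public-domain-review.imgix.net/shop/x.jpg?fit=max&h=360&w=360"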


def _get_algolia_api_url():
    global __CACHED_API_URL  # pylint:disable=global-statement

    if __CACHED_API_URL:
        return __CACHED_API_URL

    # fake request to extract api url
    resp = get(f"{pdia_base_url}/search/?q=")
    if resp.status_code != 200:
        raise LookupError("Failed to fetch config location (and as such the API url) for PDImageArchive")
    pdia_config_filepart = extr(resp.text, pdia_config_start, pdia_config_end)
    pdia_config_url = pdia_base_url + pdia_config_start + pdia_config_filepart + pdia_config_end

    resp = get(pdia_config_url)
    if resp.status_code != 200:
        raise LookupError("Failed to obtain AWS api url for PDImageArchive")

    api_url = extr(resp.text, 'const r="', '"', default=None)

    if api_url is None:
        raise LookupError("Couldn't obtain AWS api url for PDImageArchive")

    __CACHED_API_URL = api_url
    return api_url
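# Sketch of the two-step extraction above. The page and config contents shown
# here are assumptions inferred from the extr() markers used in this module:
#   search page HTML: ...src="/_astro/InfiniteSearch.<hash>.js"...
#     -> extr() yields "<hash>", which gives the config script url
#   config script:    ...const r="<api url>"...
#     -> extr() yields the api url that request() will POST to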


def _clear_cached_api_url():
    global __CACHED_API_URL  # pylint:disable=global-statement

    __CACHED_API_URL = None

def request(query, params):
    params['url'] = _get_algolia_api_url()
    params['method'] = 'POST'

    request_data = {
        'page': params['pageno'] - 1,
        'query': query,
        'hitsPerPage': page_size,
        'indexName': 'prod_all-images',
    }
    params['headers'] = {'Content-Type': 'application/json'}
    params['data'] = dumps(request_data)

    # http errors are handled manually to be able to reset the api url
    params['raise_for_httperror'] = False
    return params
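# Resulting request, sketched with illustrative values (the endpoint comes
# from _get_algolia_api_url() and is not fixed here):
#   POST <api_url>
#   Content-Type: application/json
#   {"page": 0, "query": "rembrandt", "hitsPerPage": 20, "indexName": "prod_all-images"}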


def response(resp):
    results = []
    json_data = resp.json()

    if resp.status_code == 403:
        _clear_cached_api_url()
        raise SearxEngineAccessDeniedException()

    if resp.status_code != 200:
        raise SearxEngineException()

    if 'results' not in json_data:
        return []

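    # Shape of a single hit, inferred from the fields read below (all values
    # illustrative): {"objectID": "...", "title": "...", "artist": "...",
    # "displayYear": "...", "thumbnail": "https://...imgix.net/....jpg?...",
    # "themes": "...", "encompassingWork": "..."}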
    for result in json_data['results'][0]['hits']:
        content = []

        if result.get("themes"):
            content.append("Themes: " + result['themes'])

        if result.get("encompassingWork"):
            content.append("Encompassing work: " + result['encompassingWork'])

        base_image_url = result['thumbnail'].split("?")[0]

        results.append(
            {
                'template': 'images.html',
                'url': _clean_url(f"{about['website']}/images/{result['objectID']}"),
                'img_src': _clean_url(base_image_url),
                'thumbnail_src': _clean_url(base_image_url + THUMBNAIL_SUFFIX),
                'title': f"{result['title'].strip()} by {result['artist']} {result.get('displayYear', '')}",
                'content': "\n".join(content),
            }
        )

    return results