2"""This module implements the Wikidata engine. Some implementations are shared
with the :ref:`wikipedia engine`.
9from urllib.parse
import urlencode, unquote
12from dateutil.parser
import isoparse
13from babel.dates
import format_datetime, format_date, format_time, get_datetime_format
17from searx.utils import searxng_useragent, get_string_replaces_function
18from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
20 fetch_wikimedia_traits,
27 "website":
'https://wikidata.org/',
28 "wikidata_id":
'Q2013',
29 "official_api_documentation":
'https://query.wikidata.org/',
30 "use_official_api":
True,
31 "require_api_key":
False,
display_type = ["infobox"]
"""A list of display types composed from ``infobox`` and ``list``.  ``list``
adds a hit to the result list, ``infobox`` shows a hit in the info box.  Either
value may be set on its own, or both together."""
# URL the SPARQL search queries are sent to (via GET or POST).
SPARQL_ENDPOINT_URL = 'https://query.wikidata.org/sparql'
# Same query service with the ``explain`` flag set; used to fetch the raw
# explanation of a query instead of its results.
SPARQL_EXPLAIN_URL = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain'
44WIKIDATA_PROPERTIES = {
45 'P434':
'MusicBrainz',
46 'P435':
'MusicBrainz',
47 'P436':
'MusicBrainz',
48 'P966':
'MusicBrainz',
68SELECT ?item ?itemLabel ?itemDescription ?lat ?long %SELECT%
71 SERVICE wikibase:mwapi {
72 bd:serviceParam wikibase:endpoint "www.wikidata.org";
73 wikibase:api "EntitySearch";
75 mwapi:search "%QUERY%";
76 mwapi:language "%LANGUAGE%".
77 ?item wikibase:apiOutputItem mwapi:item.
79 hint:Prior hint:runFirst "true".
83 SERVICE wikibase:label {
84 bd:serviceParam wikibase:language "%LANGUAGE%,en".
85 ?item rdfs:label ?itemLabel .
86 ?item schema:description ?itemDescription .
91GROUP BY ?item ?itemLabel ?itemDescription ?lat ?long %GROUP_BY%
95QUERY_PROPERTY_NAMES =
"""
100 WHERE { ?item wdt:P279* wd:Q12132 }
102 VALUES ?item { %ATTRIBUTES% }
104 OPTIONAL { ?item rdfs:label ?name. }
# Entity URLs that must never produce a result: bindings whose ``item`` is one
# of these URLs are dropped while the SPARQL response is parsed.
DUMMY_ENTITY_URLS = {
    "http://www.wikidata.org/entity/" + wid
    for wid in ("Q4115189", "Q13406268", "Q15397819", "Q17339402")
}
117sparql_string_escape = get_string_replaces_function(
# Built from the single mapping ``'http:'`` -> ``'https:'``.
# NOTE(review): presumably a callable that applies this replacement to a
# string -- confirm against ``searx.utils.get_string_replaces_function``.
replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
137 return {
'Accept':
'application/sparql-results+json',
'User-Agent': searxng_useragent()}
141 name = WIKIDATA_PROPERTIES.get(entity_id)
143 name = WIKIDATA_PROPERTIES.get((entity_id, language))
145 name = WIKIDATA_PROPERTIES.get((entity_id, language.split(
'-')[0]))
147 name = WIKIDATA_PROPERTIES.get((entity_id,
'en'))
156 http_response = get(SPARQL_ENDPOINT_URL +
'?' + urlencode({
'query': query}), headers=
get_headers(), **kwargs)
159 http_response = post(SPARQL_ENDPOINT_URL, data={
'query': query}, headers=
get_headers(), **kwargs)
160 if http_response.status_code != 200:
161 logger.debug(
'SPARQL endpoint error %s', http_response.content.decode())
162 logger.debug(
'request time %s', str(http_response.elapsed))
163 http_response.raise_for_status()
164 return loads(http_response.content.decode())
169 eng_tag, _wiki_netloc = get_wiki_params(params[
'searxng_locale'], traits)
170 query, attributes =
get_query(query, eng_tag)
171 logger.debug(
"request --> language %s // len(attributes): %s", eng_tag, len(attributes))
173 params[
'method'] =
'POST'
174 params[
'url'] = SPARQL_ENDPOINT_URL
175 params[
'data'] = {
'query': query}
177 params[
'language'] = eng_tag
178 params[
'attributes'] = attributes
186 jsonresponse = loads(resp.content.decode())
188 language = resp.search_params[
'language']
189 attributes = resp.search_params[
'attributes']
190 logger.debug(
"request --> language %s // len(attributes): %s", language, len(attributes))
192 seen_entities = set()
193 for result
in jsonresponse.get(
'results', {}).get(
'bindings', []):
194 attribute_result = {key: value[
'value']
for key, value
in result.items()}
195 entity_url = attribute_result[
'item']
196 if entity_url
not in seen_entities
and entity_url
not in DUMMY_ENTITY_URLS:
197 seen_entities.add(entity_url)
198 results +=
get_results(attribute_result, attributes, language)
200 logger.debug(
'The SPARQL request returns duplicate entities: %s', str(attribute_result))
205_IMG_SRC_DEFAULT_URL_PREFIX =
"https://commons.wikimedia.org/wiki/Special:FilePath/"
206_IMG_SRC_NEW_URL_PREFIX =
"https://upload.wikimedia.org/wikipedia/commons/thumb/"
210 """Get Thumbnail image from wikimedia commons
212 Images from commons.wikimedia.org are (HTTP) redirected to
213 upload.wikimedia.org. The redirected URL can be calculated by this
216 - https://stackoverflow.com/a/33691240
219 logger.debug(
'get_thumbnail(): %s', img_src)
220 if not img_src
is None and _IMG_SRC_DEFAULT_URL_PREFIX
in img_src.split()[0]:
221 img_src_name = unquote(img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX,
"").split(
"?", 1)[0].replace(
"%20",
"_"))
222 img_src_name_first = img_src_name
223 img_src_name_second = img_src_name
225 if ".svg" in img_src_name.split()[0]:
226 img_src_name_second = img_src_name +
".png"
228 img_src_size = img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX,
"").split(
"?", 1)[1]
229 img_src_size = img_src_size[img_src_size.index(
"=") + 1 : img_src_size.index(
"&")]
230 img_src_name_md5 = md5(img_src_name.encode(
"utf-8")).hexdigest()
232 _IMG_SRC_NEW_URL_PREFIX
233 + img_src_name_md5[0]
235 + img_src_name_md5[0:2]
241 + img_src_name_second
243 logger.debug(
'get_thumbnail() redirected: %s', img_src)
251 infobox_title = attribute_result.get(
'itemLabel')
252 infobox_id = attribute_result[
'item']
253 infobox_id_lang =
None
255 infobox_attributes = []
256 infobox_content = attribute_result.get(
'itemDescription', [])
260 for attribute
in attributes:
261 value = attribute.get_str(attribute_result, language)
262 if value
is not None and value !=
'':
263 attribute_type = type(attribute)
265 if attribute_type
in (WDURLAttribute, WDArticle):
268 for url
in value.split(
', '):
269 infobox_urls.append({
'title': attribute.get_label(language),
'url': url, **attribute.kwargs})
271 if "list" in display_type
and (attribute.kwargs.get(
'official')
or attribute_type == WDArticle):
272 results.append({
'title': infobox_title,
'url': url,
"content": infobox_content})
276 if attribute_type == WDArticle
and (
277 (attribute.language ==
'en' and infobox_id_lang
is None)
or attribute.language !=
'en'
279 infobox_id_lang = attribute.language
281 elif attribute_type == WDImageAttribute:
285 if attribute.priority > img_src_priority:
287 img_src_priority = attribute.priority
288 elif attribute_type == WDGeoAttribute:
293 area = attribute_result.get(
'P2046')
294 osm_zoom = area_to_osm_zoom(area)
if area
else 19
295 url = attribute.get_geo_url(attribute_result, osm_zoom=osm_zoom)
297 infobox_urls.append({
'title': attribute.get_label(language),
'url': url,
'entity': attribute.name})
299 infobox_attributes.append(
300 {
'label': attribute.get_label(language),
'value': value,
'entity': attribute.name}
307 infobox_urls.append({
'title':
'Wikidata',
'url': attribute_result[
'item']})
310 "list" in display_type
312 and len(infobox_attributes) == 0
313 and len(infobox_urls) == 1
314 and len(infobox_content) == 0
316 results.append({
'url': infobox_urls[0][
'url'],
'title': infobox_title,
'content': infobox_content})
317 elif "infobox" in display_type:
320 'infobox': infobox_title,
322 'content': infobox_content,
324 'urls': infobox_urls,
325 'attributes': infobox_attributes,
333 select = [a.get_select()
for a
in attributes]
334 where = list(filter(
lambda s: len(s) > 0, [a.get_where()
for a
in attributes]))
335 wikibase_label = list(filter(
lambda s: len(s) > 0, [a.get_wikibase_label()
for a
in attributes]))
336 group_by = list(filter(
lambda s: len(s) > 0, [a.get_group_by()
for a
in attributes]))
339 .replace(
'%SELECT%',
' '.join(select))
340 .replace(
'%WHERE%',
'\n '.join(where))
341 .replace(
'%WIKIBASE_LABELS%',
'\n '.join(wikibase_label))
342 .replace(
'%GROUP_BY%',
' '.join(group_by))
343 .replace(
'%LANGUAGE%', language)
345 return query, attributes
355 def add_amount(name):
361 def add_url(name, url_id=None, url_path_prefix=None, **kwargs):
362 attributes.append(
WDURLAttribute(name, url_id, url_path_prefix, kwargs))
364 def add_image(name, url_id=None, priority=1):
456 add_url(
'P856', official=
True)
458 if not language.startswith(
'en'):
463 add_url(
'P434', url_id=
'musicbrainz_artist')
464 add_url(
'P435', url_id=
'musicbrainz_work')
465 add_url(
'P436', url_id=
'musicbrainz_release_group')
466 add_url(
'P966', url_id=
'musicbrainz_label')
467 add_url(
'P345', url_id=
'imdb_id')
468 add_url(
'P2397', url_id=
'youtube_channel')
469 add_url(
'P1651', url_id=
'youtube_video')
470 add_url(
'P2002', url_id=
'twitter_profile')
471 add_url(
'P2013', url_id=
'facebook_profile')
472 add_url(
'P2003', url_id=
'instagram_profile')
475 add_url(
'P4033', url_path_prefix=
'/@')
476 add_url(
'P11947', url_path_prefix=
'/c/')
477 add_url(
'P12622', url_path_prefix=
'/c/')
483 add_image(
'P15', priority=1, url_id=
'wikimedia_image')
484 add_image(
'P242', priority=2, url_id=
'wikimedia_image')
485 add_image(
'P154', priority=3, url_id=
'wikimedia_image')
486 add_image(
'P18', priority=4, url_id=
'wikimedia_image')
487 add_image(
'P41', priority=5, url_id=
'wikimedia_image')
488 add_image(
'P2716', priority=6, url_id=
'wikimedia_image')
489 add_image(
'P2910', priority=7, url_id=
'wikimedia_image')
495 __slots__ = (
'name',)
501 return '(group_concat(distinct ?{name};separator=", ") as ?{name}s)'.replace(
'{name}', self.
name)
507 return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace(
'{name}', self.
name)
516 return result.get(self.
name +
's')
519 return '<' + str(type(self).__name__) +
':' + self.
name +
'>'
524 return '?{name} ?{name}Unit'.replace(
'{name}', self.
name)
527 return """ OPTIONAL { ?item p:{name} ?{name}Node .
528 ?{name}Node rdf:type wikibase:BestRank ; ps:{name} ?{name} .
529 OPTIONAL { ?{name}Node psv:{name}/wikibase:quantityUnit ?{name}Unit. } }""".replace(
537 value = result.get(self.
name)
538 unit = result.get(self.
name +
"Unit")
540 unit = unit.replace(
'http://www.wikidata.org/entity/',
'')
547 __slots__ =
'language',
'kwargs'
556 return "Wikipedia ({language})".replace(
'{language}', self.
language)
559 return "?article{language} ?articleName{language}".replace(
'{language}', self.
language)
562 return """OPTIONAL { ?article{language} schema:about ?item ;
563 schema:inLanguage "{language}" ;
564 schema:isPartOf <https://{language}.wikipedia.org/> ;
565 schema:name ?articleName{language} . }""".replace(
573 key =
'article{language}'.replace(
'{language}', self.
language)
574 return result.get(key)
579 return '(group_concat(distinct ?{name}Label;separator=", ") as ?{name}Labels)'.replace(
'{name}', self.
name)
582 return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace(
'{name}', self.
name)
585 return "?{name} rdfs:label ?{name}Label .".replace(
'{name}', self.
name)
588 return result.get(self.
name +
'Labels')
593 HTTP_WIKIMEDIA_IMAGE =
'http://commons.wikimedia.org/wiki/Special:FilePath/'
595 __slots__ =
'url_id',
'url_path_prefix',
'kwargs'
597 def __init__(self, name, url_id=None, url_path_prefix=None, kwargs=None):
599 :param url_id: ID matching one key in ``external_urls.json`` for
600 converting IDs to full URLs.
:param url_path_prefix: Path prefix if the values are of format
    ``account@domain``.  If provided, values are rewritten to
    ``https://<domain><url_path_prefix><account>``.  For example::
606 WDURLAttribute('P4033', url_path_prefix='/@')
608 Adds Property `P4033 <https://www.wikidata.org/wiki/Property:P4033>`_
609 to the wikidata query. This field might return for example
610 ``libreoffice@fosstodon.org`` and the URL built from this is then:
612 - account: ``libreoffice``
613 - domain: ``fosstodon.org``
614 - result url: https://fosstodon.org/@libreoffice
623 value = result.get(self.
name +
's')
627 value = value.split(
',')[0]
630 if value.startswith(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE):
631 value = value[len(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE) :]
632 url_id =
'wikimedia_image'
633 return get_external_url(url_id, value)
636 [account, domain] = [x.strip(
"@ ")
for x
in value.rsplit(
'@', 1)]
637 return f
"https://{domain}{self.url_path_prefix}{account}"
644 return "OpenStreetMap"
647 return "?{name}Lat ?{name}Long".replace(
'{name}', self.
name)
650 return """OPTIONAL { ?item p:{name}/psv:{name} [
651 wikibase:geoLatitude ?{name}Lat ;
652 wikibase:geoLongitude ?{name}Long ] }""".replace(
660 latitude = result.get(self.
name +
'Lat')
661 longitude = result.get(self.
name +
'Long')
662 if latitude
and longitude:
663 return latitude +
' ' + longitude
667 latitude = result.get(self.
name +
'Lat')
668 longitude = result.get(self.
name +
'Long')
669 if latitude
and longitude:
670 return get_earth_coordinates_url(latitude, longitude, osm_zoom)
676 __slots__ = (
'priority',)
678 def __init__(self, name, url_id=None, priority=100):
685 return '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'.replace(
'{name}', self.
name)
692 return """OPTIONAL { ?item p:{name}/psv:{name} [
693 wikibase:timeValue ?{name} ;
694 wikibase:timePrecision ?{name}timePrecision ;
695 wikibase:timeTimezone ?{name}timeZone ;
696 wikibase:timeCalendarModel ?{name}timeCalendar ] . }
697 hint:Prior hint:rangeSafe true;""".replace(
715 timestamp = isoparse(value)
716 return format_date(timestamp, format=
'yyyy', locale=locale)
720 timestamp = isoparse(value)
721 return format_date(timestamp, format=
'MMMM y', locale=locale)
725 timestamp = isoparse(value)
726 return format_date(timestamp, format=
'full', locale=locale)
729 timestamp = isoparse(value)
732 get_datetime_format(format, locale=locale)
734 .replace(
'{0}', format_time(timestamp,
'full', tzinfo=
None, locale=locale))
735 .replace(
'{1}', format_date(timestamp,
'short', locale=locale))
740 return format_datetime(isoparse(value), format=
'full', locale=locale)
743 '0': (
'format_8', 1000000000),
744 '1': (
'format_8', 100000000),
745 '2': (
'format_8', 10000000),
746 '3': (
'format_8', 1000000),
747 '4': (
'format_8', 100000),
748 '5': (
'format_8', 10000),
749 '6': (
'format_8', 1000),
750 '7': (
'format_8', 100),
751 '8': (
'format_8', 10),
752 '9': (
'format_9', 1),
753 '10': (
'format_10', 1),
754 '11': (
'format_11', 0),
755 '12': (
'format_13', 0),
756 '13': (
'format_13', 0),
757 '14': (
'format_14', 0),
761 value = result.get(self.
name)
762 if value ==
'' or value
is None:
764 precision = result.get(self.
name +
'timePrecision')
765 date_format = WDDateAttribute.DATE_FORMAT.get(precision)
766 if date_format
is not None:
767 format_method = getattr(self, date_format[0])
768 precision = date_format[1]
772 if value.startswith(
'-'):
776 return format_method(value, language)
784 http_response = get(SPARQL_EXPLAIN_URL +
'&' + urlencode({
'query': query}), headers=
get_headers())
786 http_response = post(SPARQL_EXPLAIN_URL, data={
'query': query}, headers=
get_headers())
787 http_response.raise_for_status()
788 return http_response.content
793 for k, v
in WIKIDATA_UNITS.items():
794 WIKIDATA_PROPERTIES[k] = v[
'symbol']
797 wikidata_property_names = []
799 if type(attribute)
in (WDAttribute, WDAmountAttribute, WDURLAttribute, WDDateAttribute, WDLabelAttribute):
800 if attribute.name
not in WIKIDATA_PROPERTIES:
801 wikidata_property_names.append(
"wd:" + attribute.name)
802 query = QUERY_PROPERTY_NAMES.replace(
'%ATTRIBUTES%',
" ".join(wikidata_property_names))
804 for result
in jsonresponse.get(
'results', {}).get(
'bindings', {}):
805 name = result[
'name'][
'value']
806 lang = result[
'name'][
'xml:lang']
807 entity_id = result[
'item'][
'value'].replace(
'http://www.wikidata.org/entity/',
'')
808 WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()
812 """Uses languages evaluated from :py:obj:`wikipedia.fetch_wikimedia_traits
813 <searx.engines.wikipedia.fetch_wikimedia_traits>` and removes
815 - ``traits.custom['wiki_netloc']``: wikidata does not have net-locations for
816 the languages and the list of all
818 - ``traits.custom['WIKIPEDIA_LANGUAGES']``: not used in the wikipedia engine
822 fetch_wikimedia_traits(engine_traits)
823 engine_traits.custom[
'wiki_netloc'] = {}
824 engine_traits.custom[
'WIKIPEDIA_LANGUAGES'] = []
get_str(self, result, language)
__init__(self, language, kwargs=None)
get_label(self, language)
get_str(self, result, language)
get_str(self, result, language)
get_label(self, language)
format_10(self, value, locale)
get_str(self, result, language)
format_8(self, value, locale)
format_9(self, value, locale)
format_13(self, value, locale)
format_14(self, value, locale)
format_11(self, value, locale)
get_geo_url(self, result, osm_zoom=19)
get_str(self, result, language)
get_label(self, language)
__init__(self, name, url_id=None, priority=100)
get_str(self, result, language)
__init__(self, name, url_id=None, url_path_prefix=None, kwargs=None)
get_str(self, result, language)
get_results(attribute_result, attributes, language)
get_query(query, language)
send_wikidata_query(query, method='GET', **kwargs)
debug_explain_wikidata_query(query, method='GET')
fetch_traits(EngineTraits engine_traits)
init(engine_settings=None)
get_label_for_entity(entity_id, language)