Coverage for oc_ds_converter / oc_idmanager / wikidata.py: 77%
100 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
1# SPDX-FileCopyrightText: 2023-2024 Arcangelo Massari <arcangelo.massari@unibo.it>
2# SPDX-FileCopyrightText: 2026 Marta Soricetti <marta.soricetti@unibo.it>
3#
4# SPDX-License-Identifier: ISC
7from json import loads
8from re import match, sub
9from time import sleep
10from urllib.parse import quote, unquote
12from oc_ds_converter.oc_idmanager.base import IdentifierManager
13from requests import ReadTimeout, get
14from requests.exceptions import ConnectionError
15from typing import Type, Optional
16from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
17from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager
class WikidataManager(IdentifierManager):
    """Identifier manager for Wikidata entity identifiers (Q-IDs).

    Identifiers are normalised to the form ``Q<digits>`` (optionally prefixed
    with ``wikidata:``), checked syntactically, and, when the API service is
    enabled, looked up through the ``Special:EntityData`` endpoint of
    wikidata.org. Validation outcomes are recorded in a storage manager so
    that an identifier already seen is not checked again.
    """

    def __init__(self, use_api_service = True, storage_manager:Optional[StorageManager] = None):
        """Wikidata manager constructor.

        Args:
            use_api_service: when false, ``exists`` never contacts Wikidata
                and reports every identifier as existing.
            storage_manager: store used to cache validation results; an
                ``InMemoryStorageManager`` is created when omitted.
        """
        super().__init__()
        self._api = "https://www.wikidata.org/wiki/Special:EntityData/"
        self._use_api_service = use_api_service
        if storage_manager is None:
            self.storage_manager = InMemoryStorageManager()
        else:
            self.storage_manager = storage_manager
        self._p = "wikidata:"

    def validated_as_id(self, id_string) -> Optional[bool]:
        """Return the stored validity of ``id_string`` without validating it.

        Returns:
            The boolean held by the storage manager for ``id_string``, or
            ``None`` when the stored value is not a boolean.
        """
        stored = self.storage_manager.get_value(id_string)
        return stored if isinstance(stored, bool) else None

    def is_valid(self, wikidata_id, get_extra_info=False):
        """Tell whether ``wikidata_id`` is a well-formed, existing Wikidata ID.

        The identifier is normalised (with prefix) first. A boolean already
        held by the storage manager is returned as is; otherwise the
        identifier must pass ``syntax_ok`` and ``exists``, and the outcome is
        written back to the storage manager.

        Args:
            wikidata_id: the identifier to check, with or without prefix.
            get_extra_info: when true, return ``(valid, info_dict)`` instead
                of the bare boolean.

        Returns:
            ``bool``, or ``(bool, dict)`` when ``get_extra_info`` is true.
        """
        wikidata_id = self.normalise(wikidata_id, include_prefix=True)

        if wikidata_id is None:
            if get_extra_info:
                return False, {"id": wikidata_id, "valid": False}
            return False

        cached = self.storage_manager.get_value(wikidata_id)
        if isinstance(cached, bool):
            if get_extra_info:
                return cached, {"id": wikidata_id, "valid": cached}
            return cached

        if get_extra_info:
            # Check the syntax before the existence lookup, exactly as the
            # plain path below does: a malformed identifier needs no lookup,
            # and the info dictionary that is returned and stored must not
            # claim "valid": True while the verdict is False.
            if self.syntax_ok(wikidata_id):
                validity, info = self.exists(wikidata_id, get_extra_info=True)
            else:
                validity, info = False, {"valid": False}
            self.storage_manager.set_full_value(wikidata_id, info)
            return validity, info

        validity = self.syntax_ok(wikidata_id) and self.exists(wikidata_id)
        self.storage_manager.set_value(wikidata_id, validity)
        return validity

    def normalise(self, id_string, include_prefix=False) -> Optional[str]:
        """Reduce ``id_string`` to its canonical Q-ID form.

        The ``wikidata:`` prefix is dropped if present, percent-escapes are
        decoded, the text is upper-cased and every character other than
        ``Q`` and the digits 0-9 is removed.

        Args:
            id_string: the raw identifier.
            include_prefix: when true, prepend ``wikidata:`` to the result.

        Returns:
            The normalised identifier, or ``None`` when ``id_string`` cannot
            be processed (for instance because it is not a string).
        """
        try:
            if id_string.startswith(self._p):
                wikidata_string = id_string[len(self._p):]
            else:
                wikidata_string = id_string
            # Whitelist filter: whitespace, NUL bytes and any other stray
            # character are all dropped here, so no further cleanup is needed.
            wikidata_string = sub("[^Q0-9]", "", unquote(wikidata_string).upper())
        except Exception:
            # Best effort: any processing error means "not normalisable".
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            return None
        return (self._p if include_prefix else "") + wikidata_string

    def syntax_ok(self, id_string) -> bool:
        """Check that ``id_string`` looks like ``wikidata:Q<positive integer>``.

        The prefix is added when missing. Leading zeros after ``Q`` are
        rejected.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        # \Z instead of $: "$" would also accept a trailing newline.
        return bool(match(r"^wikidata:Q[1-9]\d*\Z", id_string))

    def exists(self, wikidata_id_full, get_extra_info=False, allow_extra_api=None):
        """Check on Wikidata whether the entity ``wikidata_id_full`` exists.

        When the API service is disabled the identifier is reported as
        existing without any request. Otherwise up to three requests are
        attempted: a 200 response is positive only if the returned JSON
        holds an entity keyed by, and identified as, the normalised ID; a
        4xx response is negative; read timeouts, connection errors and any
        other status trigger a new attempt. When all attempts are used up
        the identifier is reported as not existing.

        Args:
            wikidata_id_full: the identifier, with or without prefix.
            get_extra_info: when true, return ``(exists, {"valid": exists})``
                instead of the bare boolean.
            allow_extra_api: not used by this implementation.

        Returns:
            ``bool``, or ``(bool, dict)`` when ``get_extra_info`` is true.
        """
        def outcome(flag):
            # Shape the answer according to the requested return style.
            return (flag, {"valid": flag}) if get_extra_info else flag

        if not self._use_api_service:
            return outcome(True)

        wikidata_id = self.normalise(wikidata_id_full)
        if wikidata_id is None:
            return outcome(False)

        for _ in range(3):
            try:
                # self._headers is not set in this class; presumably it is
                # provided by IdentifierManager -- TODO confirm.
                r = get(self._api + quote(wikidata_id), headers=self._headers, timeout=30)
            except ReadTimeout:
                # Do nothing, just try again
                continue
            except ConnectionError:
                # Sleep 5 seconds, then try again
                sleep(5)
                continue

            if r.status_code == 200:
                r.encoding = "utf-8"
                json_res = loads(r.text)
                try:
                    found = json_res['entities'][wikidata_id]['id'] == wikidata_id
                except KeyError:
                    # The payload does not describe the requested entity.
                    found = False
                return outcome(found)

            if 400 <= r.status_code < 500:
                return outcome(False)
            # Any other status: fall through and use another attempt.

        return outcome(False)

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Build the extra-information dictionary for an API response.

        Placeholder: the arguments are currently ignored and the result is
        always ``{"valid": True}``.
        """
        # to be implemented
        return {"valid": True}