Coverage for oc_ds_converter / oc_idmanager / wikidata.py: 77%

100 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023-2024 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# SPDX-FileCopyrightText: 2026 Marta Soricetti <marta.soricetti@unibo.it> 

3# 

4# SPDX-License-Identifier: ISC 

5 

6 

7from json import loads 

8from re import match, sub 

9from time import sleep 

10from urllib.parse import quote, unquote 

11 

12from oc_ds_converter.oc_idmanager.base import IdentifierManager 

13from requests import ReadTimeout, get 

14from requests.exceptions import ConnectionError 

15from typing import Type, Optional 

16from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager 

17from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager 

18 

19 

class WikidataManager(IdentifierManager):
    """Identifier manager for Wikidata entity identifiers (Q-IDs).

    Identifiers are handled either bare (``Q42``) or with the ``wikidata:``
    prefix (``wikidata:Q42``). Validation results are cached in the
    configured storage manager.
    """

    def __init__(self, use_api_service=True, storage_manager: Optional[StorageManager] = None):
        """Wikidata manager constructor.

        Args:
            use_api_service: when true, ``exists`` queries the Wikidata
                ``Special:EntityData`` endpoint; when false, ``exists``
                reports every identifier as existing without any request.
            storage_manager: store used to cache validation results. An
                ``InMemoryStorageManager`` is created when omitted.
        """
        super().__init__()
        self._api = "https://www.wikidata.org/wiki/Special:EntityData/"
        self._use_api_service = use_api_service
        if storage_manager is None:
            self.storage_manager = InMemoryStorageManager()
        else:
            self.storage_manager = storage_manager
        self._p = "wikidata:"

    def validated_as_id(self, id_string):
        """Return the cached validity of ``id_string``.

        Returns:
            The boolean stored for ``id_string`` in the storage manager, or
            ``None`` when the stored value is not a boolean.
        """
        cached = self.storage_manager.get_value(id_string)
        return cached if isinstance(cached, bool) else None

    def is_valid(self, wikidata_id, get_extra_info=False):
        """Check that a Wikidata id is well-formed and exists.

        The id is normalised (with prefix) first. A boolean already cached
        in the storage manager is returned as is; otherwise the syntax is
        checked and, only if it is correct, ``exists`` is consulted. The
        outcome is then written to the storage manager.

        Args:
            wikidata_id: the identifier, with or without the prefix.
            get_extra_info: when true, return a ``(bool, dict)`` tuple
                instead of a bare boolean.

        Returns:
            ``bool``, or ``(bool, dict)`` when ``get_extra_info`` is true.
            The dictionary always carries a ``"valid"`` key that agrees
            with the boolean.
        """
        wikidata_id = self.normalise(wikidata_id, include_prefix=True)

        if wikidata_id is None:
            if get_extra_info:
                return False, {"id": None, "valid": False}
            return False

        cached = self.storage_manager.get_value(wikidata_id)
        if isinstance(cached, bool):
            if get_extra_info:
                return cached, {"id": wikidata_id, "valid": cached}
            return cached

        if get_extra_info:
            # Check the syntax first so that a malformed id neither triggers
            # a request nor gets stored with a "valid" flag that contradicts
            # the returned boolean.
            if self.syntax_ok(wikidata_id):
                validity, info = self.exists(wikidata_id, get_extra_info=True)
            else:
                validity, info = False, {"valid": False}
            self.storage_manager.set_full_value(wikidata_id, info)
            return validity, info

        validity = self.syntax_ok(wikidata_id) and self.exists(wikidata_id)
        self.storage_manager.set_value(wikidata_id, validity)
        return validity

    def normalise(self, id_string, include_prefix=False):
        """Return the canonical form of a Wikidata id.

        The prefix is stripped if present, percent-escapes are decoded, the
        text is upper-cased and every character other than ``Q`` and the
        ASCII digits is removed.

        Args:
            id_string: the identifier to normalise.
            include_prefix: when true, prepend ``wikidata:`` to the result.

        Returns:
            The normalised string, or ``None`` when ``id_string`` is not a
            string-like value that can be processed.
        """
        try:
            if id_string.startswith(self._p):
                wikidata_string = id_string[len(self._p):]
            else:
                wikidata_string = id_string
            # Keeping only Q and 0-9 also removes NUL bytes and whitespace,
            # so no further clean-up pass is needed.
            wikidata_string = sub("[^Q0-9]", "", unquote(wikidata_string).upper())
        except (AttributeError, TypeError):
            # Non-string input (None, bytes, numbers, ...) cannot be a Q-ID.
            return None
        prefix = self._p if include_prefix else ""
        return f"{prefix}{wikidata_string}"

    def syntax_ok(self, id_string):
        """Return True when ``id_string`` has the form ``wikidata:Q<number>``.

        The prefix is added when missing. The number must be made of ASCII
        digits and must not start with ``0``.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        # \Z (not $) so a trailing newline is rejected; [0-9] (not \d) so
        # non-ASCII Unicode digits are rejected.
        return bool(match(r"^wikidata:Q[1-9][0-9]*\Z", id_string))

    def exists(self, wikidata_id_full, get_extra_info=False, allow_extra_api=None):
        """Check whether the entity is known to Wikidata.

        When the API service is disabled the id is reported as existing
        without any request being made.

        Args:
            wikidata_id_full: the identifier, with or without the prefix.
            get_extra_info: when true, return ``(bool, {"valid": bool})``
                instead of a bare boolean.
            allow_extra_api: accepted but not used by this manager.

        Returns:
            ``bool``, or ``(bool, dict)`` when ``get_extra_info`` is true.
        """
        if self._use_api_service:
            wikidata_id = self.normalise(wikidata_id_full)
            found = wikidata_id is not None and self._lookup_entity(wikidata_id)
        else:
            found = True

        if get_extra_info:
            return found, {"valid": found}
        return found

    def _lookup_entity(self, wikidata_id, attempts=3):
        """Query the entity data endpoint, retrying on transient failures.

        Returns True only when a 200 response contains an entity whose id
        equals ``wikidata_id``. A 4xx response, or a 200 response without
        that entity, yields False immediately. Read timeouts, connection
        errors, unparseable bodies and other status codes are retried up to
        ``attempts`` times in total, after which False is returned.
        """
        for attempt in range(attempts):
            try:
                # self._headers is not set in this class; presumably it is
                # provided by IdentifierManager.
                r = get(self._api + quote(wikidata_id), headers=self._headers, timeout=30)
            except ReadTimeout:
                # Do nothing, just try again
                continue
            except ConnectionError:
                # Sleep 5 seconds before retrying; pointless after the last try
                if attempt < attempts - 1:
                    sleep(5)
                continue

            if r.status_code == 200:
                r.encoding = "utf-8"
                try:
                    json_res = loads(r.text)
                except ValueError:
                    # Body is not valid JSON: treat as transient and retry.
                    continue
                try:
                    return json_res["entities"][wikidata_id]["id"] == wikidata_id
                except (KeyError, TypeError):
                    # Entity missing from the payload, or unexpected shape.
                    return False
            if 400 <= r.status_code < 500:
                return False
        return False

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Return the extra information extracted from an API response.

        Not implemented yet: the arguments are ignored and the result is
        always ``{"valid": True}``.
        """
        # to be implemented
        return {"valid": True}