Coverage for oc_ds_converter / oc_idmanager / viaf.py: 75%

110 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it> 

2# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

3# SPDX-FileCopyrightText: 2024 Ivan Heibi <ivan.heibi2@unibo.it> 

4# SPDX-FileCopyrightText: 2026 Marta Soricetti <marta.soricetti@unibo.it> 

5# 

6# SPDX-License-Identifier: ISC 

7 

8 

9from json import loads 

10from re import match, sub 

11from time import sleep 

12from urllib.parse import quote, unquote 

13 

14from oc_ds_converter.oc_idmanager.base import IdentifierManager 

15from requests import ReadTimeout, get 

16from requests.exceptions import ConnectionError 

17 

18from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager 

19from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager 

20from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager 

21 

22from typing import Type, Optional 

23 

24 

class ViafManager(IdentifierManager):
    """Identifier manager for VIAF identifiers.

    Identifiers are handled as strings of digits, optionally carrying the
    ``viaf:`` prefix. Validation outcomes are read from and written to the
    configured storage manager so that an identifier already checked is not
    looked up again.
    """

    def __init__(self, use_api_service: bool = True, storage_manager: StorageManager | None = None, testing: bool = True) -> None:
        """VIAF manager constructor.

        Args:
            use_api_service: when True, ``exists`` queries the VIAF HTTP
                service; when False, ``exists`` reports every identifier as
                existing without any network access.
            storage_manager: storage used to cache validation results. When
                None, a ``RedisStorageManager(testing=testing)`` is created.
            testing: forwarded to ``RedisStorageManager`` only when no
                storage manager is supplied.
        """
        super().__init__()
        self._use_api_service = use_api_service
        if storage_manager is None:
            self.storage_manager = RedisStorageManager(testing=testing)
        else:
            self.storage_manager = storage_manager

        self._api = "http://viaf.org/viaf/"
        self._headers = {"Accept": "application/json"}
        self._p = "viaf:"

    def validated_as_id(self, id_string):
        """Return the stored validity of ``id_string``, if any.

        The identifier is passed to the storage manager exactly as given
        (it is not normalised here).

        Returns:
            The stored boolean, or None when the storage holds no boolean
            value for this identifier.
        """
        stored_validity = self.storage_manager.get_value(id_string)
        return stored_validity if isinstance(stored_validity, bool) else None

    def is_valid(self, viaf_id, get_extra_info=False):
        """Tell whether ``viaf_id`` is a syntactically correct, existing VIAF id.

        The identifier is normalised (with prefix) and looked up in the
        storage first; only on a miss are ``syntax_ok`` and ``exists``
        consulted, and the outcome is then written back to the storage.

        Args:
            viaf_id: the identifier to check, with or without ``viaf:``.
            get_extra_info: when True and the storage has no boolean for the
                identifier, return ``(validity, info_dict)`` instead of a
                bare boolean.

        Returns:
            ``False`` when the identifier cannot be normalised; the stored
            boolean on a storage hit (a bare boolean even when
            ``get_extra_info`` is True); otherwise a boolean, or a
            ``(bool, dict)`` pair when ``get_extra_info`` is True.
        """
        viaf = self.normalise(viaf_id, include_prefix=True)
        if not viaf:
            return False

        stored_validity = self.storage_manager.get_value(viaf)
        if isinstance(stored_validity, bool):
            return stored_validity

        if get_extra_info:
            found, info = self.exists(viaf, get_extra_info=True)
            self.storage_manager.set_full_value(viaf, info)
            return (found and self.syntax_ok(viaf)), info

        # The syntax check comes first so that malformed identifiers never
        # trigger a remote lookup.
        validity_check = self.syntax_ok(viaf) and self.exists(viaf)
        self.storage_manager.set_value(viaf, validity_check)
        return validity_check

    def normalise(self, id_string, include_prefix=False):
        """Reduce ``id_string`` to its digits, optionally re-adding ``viaf:``.

        A leading ``viaf:`` prefix is dropped, the remainder is
        percent-decoded and every character other than ``0``-``9`` is
        removed.

        Args:
            id_string: the raw identifier.
            include_prefix: when True, the result starts with ``viaf:``.

        Returns:
            The normalised string (possibly with no digits at all), or None
            when ``id_string`` is not a string-like value that can be
            processed.
        """
        try:
            if id_string.startswith(self._p):
                raw = id_string[len(self._p):]
            else:
                raw = id_string
            digits = sub(r"[^0-9]", "", unquote(raw))
        except (AttributeError, TypeError):
            # Non-string input (None, numbers, bytes, ...) cannot be a VIAF id.
            return None
        prefix = self._p if include_prefix else ""
        return f"{prefix}{digits}"

    def syntax_ok(self, id_string):
        """Check ``id_string`` against the VIAF pattern.

        The ``viaf:`` prefix is added when missing; the identifier is
        accepted when it consists of a non-zero digit followed by 1 to 21
        further digits.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(match(r"^viaf:[1-9]\d{1,21}$", id_string))

    def exists(self, viaf_id_full, get_extra_info=False, allow_extra_api=None):
        """Check whether the identifier is known to the VIAF service.

        When the API service is disabled the identifier is reported as
        existing without any request. Otherwise the normalised identifier is
        requested from the VIAF endpoint, with up to three attempts in case
        of read timeouts, connection errors, or status codes that are
        neither 200 nor 4xx.

        Args:
            viaf_id_full: the identifier, with or without ``viaf:``.
            get_extra_info: when True, return ``(bool, dict)`` where the dict
                carries at least the key ``"id"``.
            allow_extra_api: accepted for interface compatibility; not used.

        Returns:
            A boolean, or a ``(bool, dict)`` pair when ``get_extra_info`` is
            True. The dict contains ``"valid"`` whenever the service gave a
            definite answer or the identifier could not be normalised; it
            holds only ``"id"`` when the API is disabled or every attempt
            failed.
        """
        if not self._use_api_service:
            if get_extra_info:
                return True, {"id": viaf_id_full}
            return True

        viaf_id = self.normalise(viaf_id_full)
        extra_info_result = {"id": viaf_id}

        def outcome(valid, record=True):
            # Shape the answer according to get_extra_info.
            if not get_extra_info:
                return valid
            if record:
                extra_info_result["valid"] = valid
            return valid, extra_info_result

        # None means the input could not be normalised; an empty string means
        # it held no digits. Neither can identify a record, so no request is
        # made.
        if not viaf_id:
            return outcome(False)

        tentative = 3
        while tentative:
            tentative -= 1
            try:
                r = get(self._api + quote(viaf_id), headers=self._headers, timeout=30)
                if r.status_code == 200:
                    r.encoding = "utf-8"
                    json_res = loads(r.text)
                    cluster = json_res.get('ns1:VIAFCluster')
                    if not cluster:
                        # A successful response without a cluster is a
                        # definite answer; asking again would not change it.
                        return outcome(False)
                    try:
                        result = str(cluster['ns1:viafID']) == str(viaf_id)
                    except KeyError:
                        result = False
                    return outcome(result)
                if 400 <= r.status_code < 500:
                    return outcome(False)
                # Any other status: fall through and try again.
            except ReadTimeout:
                # Do nothing, just try again
                pass
            except ConnectionError:
                # Sleep 5 seconds, then try again (no point in waiting once
                # the attempts are used up).
                if tentative:
                    sleep(5)

        # Every attempt failed without a definite answer: report False but do
        # not mark the extra info as validated.
        return outcome(False, record=False)

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Build the extra-information dictionary for an API response.

        Not implemented yet: the arguments are ignored and the result is
        always ``{"valid": True}``.
        """
        # to be implemented
        return {"valid": True}

152 result["valid"] = True 

153 # to be implemented 

154 return result