Coverage for oc_ds_converter / oc_idmanager / viaf.py: 75%
110 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
1# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it>
2# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
3# SPDX-FileCopyrightText: 2024 Ivan Heibi <ivan.heibi2@unibo.it>
4# SPDX-FileCopyrightText: 2026 Marta Soricetti <marta.soricetti@unibo.it>
5#
6# SPDX-License-Identifier: ISC
9from json import loads
10from re import match, sub
11from time import sleep
12from urllib.parse import quote, unquote
14from oc_ds_converter.oc_idmanager.base import IdentifierManager
15from requests import ReadTimeout, get
16from requests.exceptions import ConnectionError
18from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager
19from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
20from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager
22from typing import Type, Optional
class ViafManager(IdentifierManager):
    """Identifier manager for VIAF identifiers.

    Provides normalisation, syntactic validation and (optionally) an
    existence check against the VIAF web API. Validation outcomes are
    cached through a storage manager.
    """

    def __init__(self, use_api_service: bool = True, storage_manager: StorageManager | None = None, testing: bool = True) -> None:
        """VIAF manager constructor.

        Args:
            use_api_service: When False, ``exists`` never contacts the VIAF
                API and optimistically reports identifiers as existing.
            storage_manager: Cache used to store validation results. When
                None, a ``RedisStorageManager`` is created.
            testing: Forwarded to ``RedisStorageManager`` when the default
                storage manager is created; ignored otherwise.
        """
        super().__init__()
        self._use_api_service = use_api_service
        if storage_manager is None:
            self.storage_manager = RedisStorageManager(testing=testing)
        else:
            self.storage_manager = storage_manager
        self._api = "http://viaf.org/viaf/"
        self._headers = {"Accept": "application/json"}
        self._p = "viaf:"

    def validated_as_id(self, id_string):
        """Return the cached validity of ``id_string``, if any.

        Args:
            id_string: Identifier key, looked up in the storage manager
                exactly as given (no normalisation is applied here).

        Returns:
            The cached boolean, or None when the storage manager holds no
            boolean for this key.
        """
        cached = self.storage_manager.get_value(id_string)
        return cached if isinstance(cached, bool) else None

    def is_valid(self, viaf_id, get_extra_info=False):
        """Check whether ``viaf_id`` is a valid VIAF identifier.

        The cache is consulted first; on a miss the identifier is checked
        with ``syntax_ok`` and ``exists`` and the outcome is stored.

        Args:
            viaf_id: Identifier, with or without the ``viaf:`` prefix.
            get_extra_info: When True and the value is not cached, also
                return the info dictionary produced by ``exists``.

        Returns:
            ``False`` when the identifier cannot be normalised. The cached
            boolean on a cache hit (a bare bool even when
            ``get_extra_info`` is True). Otherwise a bool, or a
            ``(bool, dict)`` tuple when ``get_extra_info`` is True.
        """
        viaf = self.normalise(viaf_id, include_prefix=True)
        if not viaf:
            return False

        cached = self.storage_manager.get_value(viaf)
        if isinstance(cached, bool):
            return cached

        if get_extra_info:
            found, info = self.exists(viaf, get_extra_info=True)
            self.storage_manager.set_full_value(viaf, info)
            return (found and self.syntax_ok(viaf)), info

        validity_check = self.syntax_ok(viaf) and self.exists(viaf)
        self.storage_manager.set_value(viaf, validity_check)
        return validity_check

    def normalise(self, id_string, include_prefix=False):
        """Reduce ``id_string`` to its digits, optionally prefixed.

        The ``viaf:`` prefix (if present) is removed, percent-escapes are
        decoded, and every character other than ASCII ``0``-``9`` is
        dropped.

        Args:
            id_string: Raw identifier string.
            include_prefix: When True, prepend ``viaf:`` to the result.

        Returns:
            The normalised identifier (possibly empty, or just the prefix,
            when the input holds no digits), or None when ``id_string`` is
            not a string.
        """
        try:
            if id_string.startswith(self._p):
                viaf_string = id_string[len(self._p):]
            else:
                viaf_string = id_string
            # Only ASCII digits survive this filter, so no further
            # stripping of whitespace or NUL characters is required.
            digits = sub("[^0-9]", "", unquote(viaf_string))
        except (AttributeError, TypeError):
            # Non-string input (None, bytes, numbers, ...) is not a VIAF id.
            return None
        prefix = self._p if include_prefix else ""
        return f"{prefix}{digits}"

    def syntax_ok(self, id_string):
        """Tell whether ``id_string`` has the shape of a VIAF identifier.

        Args:
            id_string: Identifier, with or without the ``viaf:`` prefix.

        Returns:
            True when, after adding the prefix if missing, the string is
            ``viaf:`` followed by 2 to 22 ASCII digits, the first of which
            is not zero.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        # \Z (rather than $) rejects a trailing newline; [0-9] (rather
        # than \d) rejects non-ASCII digits, matching normalise().
        return bool(match(r"^viaf:[1-9][0-9]{1,21}\Z", id_string))

    @staticmethod
    def _cluster_matches(json_res, viaf_id):
        """Return True when the API payload describes the cluster ``viaf_id``."""
        cluster = json_res.get('ns1:VIAFCluster') if isinstance(json_res, dict) else None
        if not cluster:
            return False
        try:
            return str(cluster['ns1:viafID']) == str(viaf_id)
        except (KeyError, TypeError):
            # Cluster without an id, or not a mapping at all.
            return False

    def exists(self, viaf_id_full, get_extra_info=False, allow_extra_api=None):
        """Check through the VIAF API whether an identifier exists.

        Up to three requests are attempted. A 200 response is decisive:
        the identifier exists only if the returned cluster carries the
        same id. A 4xx response is decisive too (not found). Read
        timeouts, connection errors (followed by a 5 second pause) and
        any other status code cause a retry.

        Args:
            viaf_id_full: Identifier, with or without the ``viaf:`` prefix.
            get_extra_info: When True, return a ``(bool, dict)`` tuple
                instead of a bare bool. The dict always has an ``"id"``
                key; ``"valid"`` is added whenever the API (or the
                normalisation step) gave a decisive answer.
            allow_extra_api: Unused; kept for interface compatibility.

        Returns:
            True when the API service is disabled (nothing is checked).
            Otherwise the outcome described above; False when every
            attempt failed.

        Raises:
            ValueError: If a 200 response body is not valid JSON
                (``json.JSONDecodeError`` raised by ``loads``).
        """
        use_api = self._use_api_service
        viaf_id = self.normalise(viaf_id_full) if use_api else viaf_id_full
        extra_info_result = {"id": viaf_id}

        def answer(valid, decisive=True):
            # Shape the return value according to get_extra_info.
            if not get_extra_info:
                return valid
            if decisive:
                extra_info_result["valid"] = valid
            return valid, extra_info_result

        if not use_api:
            # Without the API nothing can be verified: assume existence.
            return answer(True, decisive=False)

        if not viaf_id:
            # None (not a string) or "" (no digits): nothing to look up,
            # and an empty id would only query the bare API root.
            return answer(False)

        url = self._api + quote(viaf_id)
        for _ in range(3):
            try:
                r = get(url, headers=self._headers, timeout=30)
            except ReadTimeout:
                # Do nothing, just try again
                continue
            except ConnectionError:
                # Sleep 5 seconds, then try again
                sleep(5)
                continue

            if r.status_code == 200:
                r.encoding = "utf-8"
                # A 200 answer is final in both return modes, including
                # the case where the payload carries no cluster.
                return answer(self._cluster_matches(loads(r.text), viaf_id))
            if 400 <= r.status_code < 500:
                return answer(False)
            # Any other status (e.g. 5xx): fall through and retry.

        # Every attempt failed without a decisive answer.
        return answer(False, decisive=False)

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Build the extra-info dictionary for an API response.

        Not implemented yet: the arguments are ignored and a dictionary
        marking the identifier as valid is returned.

        Args:
            api_response: Unused.
            choose_api: Unused.
            info_dict: Unused.

        Returns:
            ``{"valid": True}``.
        """
        # to be implemented
        return {"valid": True}