Coverage for oc_ds_converter / oc_idmanager / doi.py: 90%
147 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-12 21:23 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-12 21:23 +0000
1# SPDX-FileCopyrightText: 2023-2024 Arianna Moretti <arianna.moretti4@unibo.it>
2# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
3# SPDX-FileCopyrightText: 2024 Ivan Heibi <ivan.heibi2@unibo.it>
4#
5# SPDX-License-Identifier: ISC
8from __future__ import annotations
10import re
11from re import match, sub
12from urllib.parse import quote, unquote
14from oc_ds_converter.metadata_manager import MetadataManager
15from oc_ds_converter.oc_idmanager.base import IdentifierManager
16from oc_ds_converter.oc_idmanager.isbn import ISBNManager
17from oc_ds_converter.oc_idmanager.issn import ISSNManager
18from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager
19from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
20from oc_ds_converter.oc_idmanager.orcid import ORCIDManager
21from oc_ds_converter.oc_idmanager.support import call_api
class DOIManager(IdentifierManager):
    """Identifier manager for DOIs.

    The manager normalises DOI strings, checks their syntax, optionally
    verifies their existence through the doi.org handle API, and stores
    validation outcomes in a storage manager. When a DOI fails validation
    and the API service is enabled, a repair is attempted by stripping
    well-known trailing noise from the identifier.
    """

    # (pattern, replacement) pairs applied in order by ``attempt_repair``
    # after the trailing-noise suffix has been removed.
    _REPAIR_SUBSTITUTIONS = (
        (r"\\", ""),  # stray backslashes
        ("__", "_"),  # doubled underscores
        (r"\.\.", "."),  # doubled dots
        ("<.*?>.*?</.*?>", ""),  # paired markup tags together with their content
        ("<.*?/>", ""),  # self-closing markup tags
    )

    def __init__(
        self,
        use_api_service: bool = True,
        storage_manager: StorageManager | None = None,
        testing: bool = True,
        orcid_doi_filepath: str = "",
    ) -> None:
        """Set up endpoints, helper managers and the repair patterns.

        Args:
            use_api_service: when False no remote lookup is performed.
            storage_manager: storage used for validation outcomes; when
                None a ``RedisStorageManager(testing=testing)`` is created.
            testing: forwarded to ``RedisStorageManager`` only when no
                storage manager is supplied.
            orcid_doi_filepath: forwarded to ``MetadataManager`` when extra
                metadata is requested from a registration agency.
        """
        super().__init__()
        self._orcid_doi_filepath = orcid_doi_filepath
        if storage_manager is None:
            self.storage_manager = RedisStorageManager(testing=testing)
        else:
            self.storage_manager = storage_manager

        # Existence endpoint plus one ``_api_<agency>`` attribute per
        # registration agency. An empty string means that no metadata
        # endpoint is configured here for that agency.
        self._api = "https://doi.org/api/handles/"
        self._api_airiti = ""
        self._api_cnki = ""
        self._api_crossref = "https://api.crossref.org/works/"
        self._api_datacite = "https://api.datacite.org/dois/"
        self._api_istic = ""
        self._api_jalc = "https://api.japanlinkcenter.org/dois/"
        self._api_kisti = ""
        self._api_medra = "https://api.medra.org/metadata/"
        self._api_op = ""
        self._api_public = ""
        self._api_unknown = "https://doi.org/ra/"
        self._use_api_service = use_api_service
        self._p = "doi:"
        self._issnm = ISSNManager()
        self._isbnm = ISBNManager()
        self._om = ORCIDManager()

        # Trailing fragments that are stripped when repairing a DOI.
        # NOTE(review): inside a character class ``|`` is a literal pipe, so
        # classes such as ``[\.|\(|,|;]`` also match "|". The patterns are
        # kept as they are to preserve the existing repair behaviour.
        suffix_dcsupplemental = r"\/-\/DCSUPPLEMENTAL"
        suffix_suppinfo = r"SUPPINF[0|O](\.)?"
        suffix_pmid1 = r"[\.|\(|,|;]?PMID:\d+.*?"
        suffix_pmid2 = r"[\.|\(|,|;]?PMCID:PMC\d+.*?"
        suffix_epub = r"[\(|\[]EPUBAHEADOFPRINT[\)\]]"
        suffix_published_online = r"[\.|\(|,|;]?ARTICLEPUBLISHEDONLINE.*?\d{4}"
        suffix_http = r"[\.|\(|,|;]*HTTP:\/\/.*?"
        suffix_subcontent = r"\/(META|ABSTRACT|FULL|EPDF|PDF|SUMMARY)([>|\)](LAST)?ACCESSED\d+)?"
        suffix_accessed = r"[>|\)](LAST)?ACCESSED\d+"
        suffix_sagepub = r"[\.|\(|,|;]?[A-Z]*\.?SAGEPUB.*?"
        suffix_dotted_line = r"\.{5}.*?"
        suffix_delimiters = r"[\.|,|<|&|\(|;]+"
        suffix_doi_mark = r"\[DOI\].*?"
        suffix_year = r"\(\d{4}\)?"
        suffix_query = r"\?.*?=.*?"
        suffix_hash = r"#.*?"

        self.suffix_regex_lst = [
            suffix_dcsupplemental,
            suffix_suppinfo,
            suffix_pmid1,
            suffix_pmid2,
            suffix_epub,
            suffix_published_online,
            suffix_http,
            suffix_subcontent,
            suffix_accessed,
            suffix_sagepub,
            suffix_dotted_line,
            suffix_delimiters,
            suffix_doi_mark,
            suffix_year,
            suffix_query,
            suffix_hash,
        ]
        # Group 1 captures everything that precedes the trailing noise.
        self.suffix_regex = r"(.*?)(?:" + "|".join(self.suffix_regex_lst) + r")$"

    def validated_as_id(self, id_string: str) -> bool | None:
        """Return the boolean stored for ``id_string``, if there is one.

        The identifier is looked up exactly as given (no normalisation).

        Returns:
            The stored value when it is a bool, otherwise None.
        """
        stored_value = self.storage_manager.get_value(id_string)
        if isinstance(stored_value, bool):
            return stored_value
        return None

    def is_valid(
        self, id_string: str, get_extra_info: bool = False
    ) -> bool | tuple[bool, dict[str, str | bool]]:
        """Validate a DOI, using the storage manager as a cache.

        The DOI is valid when its syntax is correct and ``exists`` confirms
        it. If that fails and the API service is enabled, a repaired variant
        is tried and, when found, the repaired DOI is the one stored.

        Args:
            id_string: DOI to validate, with or without the ``doi:`` prefix.
            get_extra_info: when True, return ``(valid, info)`` instead of a
                bare boolean.

        Returns:
            A bare bool when ``get_extra_info`` is False. A bare bool is also
            returned, regardless of ``get_extra_info``, when the identifier
            cannot be normalised (False) or a boolean is already stored for
            it. Otherwise a ``(valid, info)`` tuple.
        """
        doi = self.normalise(id_string, include_prefix=True)
        if doi is None:
            return False
        stored_value = self.storage_manager.get_value(doi)
        if isinstance(stored_value, bool):
            return stored_value

        # Strip only the leading prefix: a blanket str.replace would also
        # remove any "doi:" occurring inside the identifier itself.
        bare_doi = doi[len(self._p):]

        if get_extra_info:
            result = self.exists(doi, get_extra_info=True)
            valid_bool, extra_info = result  # type: ignore[misc]
            self.storage_manager.set_full_value(doi, extra_info)
            valid = valid_bool and self.syntax_ok(doi)
            if not valid and self._use_api_service:
                repaired = self.attempt_repair(bare_doi)
                if repaired:
                    repaired_doi = self._p + repaired
                    repaired_result = self.exists(repaired_doi, get_extra_info=True)
                    repaired_valid, repaired_info = repaired_result  # type: ignore[misc]
                    self.storage_manager.set_full_value(repaired_doi, repaired_info)
                    return repaired_valid, repaired_info
            return valid, extra_info

        validity_check = self.syntax_ok(doi) and bool(self.exists(doi))
        if not validity_check and self._use_api_service:
            repaired = self.attempt_repair(bare_doi)
            if repaired:
                # attempt_repair only returns identifiers confirmed by exists().
                self.storage_manager.set_value(self._p + repaired, True)
                return True
        self.storage_manager.set_value(doi, validity_check)
        return validity_check

    def normalise(self, id_string: str, include_prefix: bool = False) -> str | None:
        """Return the canonical lowercase form of a DOI.

        Everything before the first ``10.`` is discarded, percent-escapes are
        decoded, and whitespace and NUL characters are removed.

        Args:
            id_string: raw DOI, possibly embedded in a URL or prefixed.
            include_prefix: when True, prepend ``doi:`` to the result.

        Returns:
            The normalised DOI, or None when ``id_string`` contains no ``10.``.
        """
        if "10." not in id_string:
            return None
        decoded = unquote(id_string[id_string.index("10."):])
        doi = sub(r"\0+", "", sub(r"\s+", "", decoded))
        if not doi:
            return None
        doi = doi.lower().strip()
        return f"{self._p}{doi}" if include_prefix else doi

    def attempt_repair(self, doi: str) -> str | None:
        """Try to recover a resolvable DOI from a noisy one.

        Trailing noise matched by ``self.suffix_regex`` is removed, then the
        clean-up substitutions are applied. The candidate is accepted only if
        it differs from the input and ``exists`` confirms it.

        Args:
            doi: DOI without the ``doi:`` prefix.

        Returns:
            The repaired DOI, or None when the API service is disabled, the
            clean-up changed nothing, or the candidate does not exist.
        """
        if not self._use_api_service:
            return None
        candidate = doi
        suffix_match = re.search(self.suffix_regex, candidate, re.IGNORECASE)
        if suffix_match:
            candidate = suffix_match.group(1)
        for pattern, replacement in self._REPAIR_SUBSTITUTIONS:
            candidate = re.sub(pattern, replacement, candidate)
        if candidate != doi and self.exists(candidate):
            return candidate
        return None

    def syntax_ok(self, id_string: str) -> bool:
        """Tell whether ``id_string`` is syntactically a DOI.

        The ``doi:`` prefix is added when missing. The check is
        case-insensitive and requires ``10.<registrant>/<suffix>`` with no
        whitespace in the suffix.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(
            match(
                r"^doi:10\.(\d{4,9}|[^\s/]+(\.[^\s/]+)*)/[^\s]+$",
                id_string,
                re.IGNORECASE,
            )
        )

    def exists(
        self,
        id_string: str,
        get_extra_info: bool = False,
        allow_extra_api: str | None = None,
    ) -> bool | tuple[bool, dict[str, str | bool]]:
        """Check whether a DOI is registered, via the doi.org handle API.

        Args:
            id_string: DOI to look up.
            get_extra_info: when True, return ``(valid, info)`` where ``info``
                holds at least the keys ``id``, ``valid`` and ``ra``.
            allow_extra_api: name of a registration agency (suffix of one of
                the ``_api_<agency>`` attributes) to query for additional
                metadata once the DOI has been confirmed.

        Returns:
            True when the API service is disabled (nothing can be checked).
            Otherwise True only when the handle API returns a JSON object
            whose ``responseCode`` equals 1. With ``get_extra_info`` the
            boolean is paired with the information dictionary.
        """
        if not self._use_api_service:
            # No remote lookup possible: the identifier is reported as-is.
            return self._exists_result(True, id_string or "", get_extra_info)

        doi = self.normalise(id_string)
        if not doi:
            return self._exists_result(False, "", get_extra_info)

        # NOTE(review): self._headers is not set in this class; presumably it
        # is provided by IdentifierManager.
        json_res = call_api(url=self._api + quote(doi), headers=self._headers)
        if not (json_res and isinstance(json_res, dict)):
            return self._exists_result(False, doi, get_extra_info)

        # .get: a payload without "responseCode" counts as "not found"
        # instead of raising KeyError.
        valid_bool = json_res.get("responseCode") == 1
        if not get_extra_info:
            return valid_bool

        extra_info = self._unknown_ra_info(doi, valid_bool)
        if valid_bool and allow_extra_api:
            metadata = self._fetch_extra_metadata(doi, allow_extra_api)
            if metadata:
                extra_info.update(metadata)
        return valid_bool, extra_info

    @staticmethod
    def _unknown_ra_info(doi: str, valid: bool) -> dict[str, str | bool]:
        """Build the baseline information dictionary for ``doi``."""
        return {"id": doi, "valid": valid, "ra": "unknown"}

    def _exists_result(
        self, valid: bool, doi: str, get_extra_info: bool
    ) -> bool | tuple[bool, dict[str, str | bool]]:
        """Shape an ``exists`` outcome according to ``get_extra_info``."""
        if get_extra_info:
            return valid, self._unknown_ra_info(doi, valid)
        return valid

    def _fetch_extra_metadata(self, doi: str, agency: str):
        """Query the agency endpoint for ``doi`` and extract its metadata.

        Returns None when no endpoint is configured for ``agency`` or when
        the agency does not answer with a non-empty dictionary.
        """
        endpoint = getattr(self, f"_api_{agency}", "")
        if not endpoint:
            # Unknown agency, or an agency whose endpoint is the empty string:
            # there is nothing meaningful to call.
            return None
        r_format = "xml" if agency == "medra" else "json"
        agency_response = call_api(
            url=endpoint + quote(doi),
            headers=self._headers,
            r_format=r_format,
        )
        if not (agency_response and isinstance(agency_response, dict)):
            return None
        # Metadata must be extracted from the agency response, not from the
        # handle-API payload that only confirms the DOI's existence.
        metadata_manager = MetadataManager(
            agency, agency_response, self._orcid_doi_filepath
        )
        return metadata_manager.extract_metadata()