Coverage for oc_ds_converter / oc_idmanager / doi.py: 92%
157 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
# SPDX-FileCopyrightText: 2023-2024 Arianna Moretti <arianna.moretti4@unibo.it>
# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
# SPDX-FileCopyrightText: 2024 Ivan Heibi <ivan.heibi2@unibo.it>
#
# SPDX-License-Identifier: ISC
8from __future__ import annotations
10import re
11from re import match, sub
12from urllib.parse import quote, unquote
14from oc_ds_converter.metadata_manager import MetadataManager
15from oc_ds_converter.oc_idmanager.base import IdentifierManager
16from oc_ds_converter.oc_idmanager.isbn import ISBNManager
17from oc_ds_converter.oc_idmanager.issn import ISSNManager
18from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager
19from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
20from oc_ds_converter.oc_idmanager.orcid import ORCIDManager
21from oc_ds_converter.oc_idmanager.support import call_api
class DOIManager(IdentifierManager):
    """Identifier manager for DOIs.

    Provides normalisation (``normalise``), a syntactic check (``syntax_ok``),
    an existence check against the DOI Handle API (``exists``) and a combined,
    cached validation (``is_valid``).
    """

    def __init__(
        self,
        use_api_service: bool = True,
        storage_manager: StorageManager | None = None,
        testing: bool = True,
        orcid_doi_filepath: str = "",
    ) -> None:
        """DOI manager constructor.

        Args:
            use_api_service: when False, ``exists`` and ``attempt_repair``
                perform no HTTP request.
            storage_manager: storage used to cache validation results; when
                None a ``RedisStorageManager(testing=testing)`` is created.
            testing: forwarded to ``RedisStorageManager``; only used when
                ``storage_manager`` is None.
            orcid_doi_filepath: forwarded to ``MetadataManager`` by ``exists``.
        """
        super().__init__()
        self._orcid_doi_filepath = orcid_doi_filepath
        if storage_manager is None:
            self.storage_manager = RedisStorageManager(testing=testing)
        else:
            self.storage_manager = storage_manager

        # API endpoints. ``exists`` looks them up as ``_api_<allow_extra_api>``;
        # several of them are empty strings.
        self._api = "https://doi.org/api/handles/"
        self._api_airiti = ""
        self._api_cnki = ""
        self._api_crossref = "https://api.crossref.org/works/"
        self._api_datacite = "https://api.datacite.org/dois/"
        self._api_istic = ""
        self._api_jalc = "https://api.japanlinkcenter.org/dois/"
        self._api_kisti = ""
        self._api_medra = "https://api.medra.org/metadata/"
        self._api_op = ""
        self._api_public = ""
        self._api_unknown = "https://doi.org/ra/"
        self._use_api_service = use_api_service
        self._p = "doi:"
        self._issnm = ISSNManager()
        self._isbnm = ISBNManager()
        self._om = ORCIDManager()

        # ISC License (ISC)
        # ==================================
        # Clean-up patterns used by ``normalise``. They are written in upper
        # case and applied with ``re.IGNORECASE``.
        # NOTE: inside a character class ``|`` is a literal, so classes such as
        # ``[0|O]`` also match a pipe character. The patterns are kept exactly
        # as they are so that the cleaning behaviour does not change.
        prefix_dx = r"HTTP:\/\/DX\.D[0|O]I\.[0|O]RG\/"
        prefix_doi = r"HTTPS:\/\/D[0|O]I\.[0|O]RG\/"
        suffix_dcsupplemental = r"\/-\/DCSUPPLEMENTAL"
        suffix_suppinfo = r"SUPPINF[0|O](\.)?"
        suffix_pmid1 = r"[\.|\(|,|;]?PMID:\d+.*?"
        suffix_pmid2 = r"[\.|\(|,|;]?PMCID:PMC\d+.*?"
        suffix_epub = r"[\(|\[]EPUBAHEADOFPRINT[\)\]]"
        suffix_published_online = r"[\.|\(|,|;]?ARTICLEPUBLISHEDONLINE.*?\d{4}"
        suffix_http = r"[\.|\(|,|;]*HTTP:\/\/.*?"
        suffix_subcontent = r"\/(META|ABSTRACT|FULL|EPDF|PDF|SUMMARY)([>|\)](LAST)?ACCESSED\d+)?"
        suffix_accessed = r"[>|\)](LAST)?ACCESSED\d+"
        suffix_sagepub = r"[\.|\(|,|;]?[A-Z]*\.?SAGEPUB.*?"
        suffix_dotted_line = r"\.{5}.*?"
        suffix_delimiters = r"[\.|,|<|&|\(|;]+"
        suffix_doi_mark = r"\[DOI\].*?"
        suffix_year = r"\(\d{4}\)?"
        suffix_query = r"\?.*?=.*?"
        suffix_hash = r"#.*?"

        self.suffix_regex_lst = [
            suffix_dcsupplemental,
            suffix_suppinfo,
            suffix_pmid1,
            suffix_pmid2,
            suffix_epub,
            suffix_published_online,
            suffix_http,
            suffix_subcontent,
            suffix_accessed,
            suffix_sagepub,
            suffix_dotted_line,
            suffix_delimiters,
            suffix_doi_mark,
            suffix_year,
            suffix_query,
            suffix_hash,
        ]
        self.prefix_regex_lst = [prefix_dx, prefix_doi]
        # Group 1 of both expressions is the part of the string that is kept.
        self.prefix_regex = r"(.*?)(?:\.)?(?:" + "|".join(self.prefix_regex_lst) + r")(.*)"
        self.suffix_regex = r"(.*?)(?:" + "|".join(self.suffix_regex_lst) + r")$"

    def validated_as_id(self, id_string: str) -> bool | None:
        """Return the boolean stored for ``id_string``, or None.

        None is returned whenever the storage manager does not hold a ``bool``
        for the given key. No normalisation is applied to ``id_string``.
        """
        stored_value = self.storage_manager.get_value(id_string)
        return stored_value if isinstance(stored_value, bool) else None

    def is_valid(
        self, id_string: str, get_extra_info: bool = False
    ) -> bool | tuple[bool, dict[str, str | bool]]:
        """Validate a DOI, using and updating the storage manager.

        The identifier is normalised with the ``doi:`` prefix; if a boolean is
        already stored for it, that boolean is returned as is (also when
        ``get_extra_info`` is True). Otherwise the DOI is checked with
        ``exists`` and ``syntax_ok``; when that fails and the API service is
        enabled, ``attempt_repair`` is tried and, on success, the result for
        the repaired DOI is stored and returned instead.

        Args:
            id_string: the DOI to validate, with or without prefix.
            get_extra_info: when True, return ``(valid, info_dict)`` for
                identifiers that were not already stored.

        Returns:
            False when the string cannot be normalised; otherwise a bool, or a
            ``(bool, dict)`` tuple as described above.
        """
        doi = self.normalise(id_string, include_prefix=True)
        if doi is None:
            return False

        stored_value = self.storage_manager.get_value(doi)
        if isinstance(stored_value, bool):
            return stored_value

        if get_extra_info:
            found, extra_info = self.exists(doi, get_extra_info=True)  # type: ignore[misc]
            self.storage_manager.set_full_value(doi, extra_info)
            valid = found and self.syntax_ok(doi)
            if not valid and self._use_api_service:
                repaired_doi = self._repaired_id(doi)
                if repaired_doi:
                    repaired_valid, repaired_info = self.exists(  # type: ignore[misc]
                        repaired_doi, get_extra_info=True
                    )
                    self.storage_manager.set_full_value(repaired_doi, repaired_info)
                    return repaired_valid, repaired_info
            return valid, extra_info

        valid = self.syntax_ok(doi) and bool(self.exists(doi))
        if not valid and self._use_api_service:
            repaired_doi = self._repaired_id(doi)
            if repaired_doi:
                # Only the repaired identifier is stored in this case.
                self.storage_manager.set_value(repaired_doi, True)
                return True
        self.storage_manager.set_value(doi, valid)
        return valid

    def _repaired_id(self, doi: str) -> str | None:
        """Return the prefixed DOI produced by ``attempt_repair``, or None."""
        # Remove only the leading prefix: ``str.replace`` would also delete any
        # later occurrence of "doi:" inside the identifier itself.
        bare_doi = doi[len(self._p):] if doi.startswith(self._p) else doi
        repaired = self.attempt_repair(bare_doi)
        return self._p + repaired if repaired else None

    def base_normalise(self, id_string: str) -> str | None:
        """Return the lower-cased text starting at the first ``10.``.

        The text is URL-unquoted, and whitespace and NUL characters are
        removed. None is returned when ``10.`` does not occur in the input.
        """
        start = id_string.find("10.")
        if start < 0:
            return None
        cleaned = sub(r"\s+", "", unquote(id_string[start:]))
        cleaned = sub(r"\0+", "", cleaned)
        return cleaned.lower().strip() if cleaned else None

    def normalise(self, id_string: str, include_prefix: bool = False) -> str | None:
        """Normalise a DOI and strip known junk around it.

        After ``base_normalise``, the text preceding an embedded
        ``http://dx.doi.org/`` or ``https://doi.org/`` URL is kept (group 1 of
        ``prefix_regex``), then a trailing match of ``suffix_regex`` is removed.

        Args:
            id_string: the raw DOI string.
            include_prefix: when True, the result starts with ``doi:``.

        Returns:
            The normalised DOI, or None when ``base_normalise`` yields nothing.
        """
        normalized = self.base_normalise(id_string)
        if not normalized:
            return None
        tmp_doi = normalized.replace(" ", "")
        prefix_match = re.search(self.prefix_regex, tmp_doi, re.IGNORECASE)
        if prefix_match:
            tmp_doi = prefix_match.group(1)
        suffix_match = re.search(self.suffix_regex, tmp_doi, re.IGNORECASE)
        if suffix_match:
            tmp_doi = suffix_match.group(1)
        prefix = self._p if include_prefix else ""
        return f"{prefix}{tmp_doi.lower().strip()}"

    def attempt_repair(self, doi: str) -> str | None:
        """Try simple textual repairs on ``doi`` and check the result exists.

        Backslashes are removed, ``__`` becomes ``_``, ``..`` becomes ``.``,
        and markup of the form ``<x>...</x>`` or ``<x/>`` is removed.

        Returns:
            The repaired DOI when it differs from the input and ``exists``
            reports it as existing; None otherwise, and always None when the
            API service is disabled.
        """
        if not self._use_api_service:
            return None
        repairs = (
            (r"\\", ""),
            ("__", "_"),
            (r"\.\.", "."),
            ("<.*?>.*?</.*?>", ""),
            ("<.*?/>", ""),
        )
        tmp_doi = doi
        for pattern, replacement in repairs:
            tmp_doi = re.sub(pattern, replacement, tmp_doi)
        if tmp_doi != doi and self.exists(tmp_doi):
            return tmp_doi
        return None

    def syntax_ok(self, id_string: str) -> bool:
        """Tell whether ``id_string`` (prefix optional) looks like a DOI."""
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(
            match(
                r"^doi:10\.(\d{4,9}|[^\s/]+(\.[^\s/]+)*)/[^\s]+$",
                id_string,
                re.IGNORECASE,
            )
        )

    @staticmethod
    def _exists_result(
        valid: bool, doi: str | None, get_extra_info: bool
    ) -> bool | tuple[bool, dict[str, str | bool]]:
        """Build the return value of ``exists`` when no metadata is available."""
        if get_extra_info:
            return valid, {"id": doi if doi else "", "valid": valid, "ra": "unknown"}
        return valid

    def exists(
        self,
        id_string: str,
        get_extra_info: bool = False,
        allow_extra_api: str | None = None,
    ) -> bool | tuple[bool, dict[str, str | bool]]:
        """Check whether a DOI is registered, via ``https://doi.org/api/handles/``.

        Args:
            id_string: the DOI to look up; it is normalised (without prefix)
                before the request.
            get_extra_info: when True, return ``(valid, info)`` where ``info``
                holds at least the keys ``id``, ``valid`` and ``ra``.
            allow_extra_api: suffix of one of the ``_api_<name>`` attributes;
                when given and the DOI exists, that endpoint is queried and
                the metadata extracted by ``MetadataManager`` is merged into
                ``info``.

        Returns:
            True (with the input as ``id``) when the API service is disabled;
            False when the DOI cannot be normalised or the Handle API gives
            no usable answer; otherwise whether ``responseCode`` equals 1.

        Raises:
            AttributeError: when ``allow_extra_api`` does not name an existing
                ``_api_<name>`` attribute.
        """
        if not self._use_api_service:
            # Nothing can be verified offline: the identifier is reported as
            # existing and is returned as received.
            return self._exists_result(True, id_string, get_extra_info)

        doi = self.normalise(id_string)
        if not doi:
            return self._exists_result(False, "", get_extra_info)

        json_res = call_api(url=self._api + quote(doi), headers=self._headers)
        if not (json_res and isinstance(json_res, dict)):
            return self._exists_result(False, doi, get_extra_info)

        # ``.get`` so that a response lacking the key counts as "not valid"
        # instead of raising KeyError.
        valid_bool = json_res.get("responseCode") == 1
        if not get_extra_info:
            return valid_bool

        extra_info: dict[str, str | bool] = {
            "id": doi,
            "valid": valid_bool,
            "ra": "unknown",
        }
        if valid_bool and allow_extra_api:
            r_format = "xml" if allow_extra_api == "medra" else "json"
            extra_api_result = call_api(
                url=getattr(self, f"_api_{allow_extra_api}") + quote(doi),
                headers=self._headers,
                r_format=r_format,
            )
            if extra_api_result and isinstance(extra_api_result, dict):
                # The metadata must come from the registration-agency payload
                # just fetched, not from the Handle API response above.
                metadata_manager = MetadataManager(
                    allow_extra_api, extra_api_result, self._orcid_doi_filepath
                )
                metadata = metadata_manager.extract_metadata()
                if metadata:
                    extra_info.update(metadata)
        return valid_bool, extra_info