Coverage for oc_ds_converter / oc_idmanager / doi.py: 92%

157 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023-2024 Arianna Moretti <arianna.moretti4@unibo.it> 

2# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

3# SPDX-FileCopyrightText: 2024 Ivan Heibi <ivan.heibi2@unibo.it> 

4# 

5# SPDX-License-Identifier: ISC 

6 

7 

8from __future__ import annotations 

9 

10import re 

11from re import match, sub 

12from urllib.parse import quote, unquote 

13 

14from oc_ds_converter.metadata_manager import MetadataManager 

15from oc_ds_converter.oc_idmanager.base import IdentifierManager 

16from oc_ds_converter.oc_idmanager.isbn import ISBNManager 

17from oc_ds_converter.oc_idmanager.issn import ISSNManager 

18from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager 

19from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager 

20from oc_ds_converter.oc_idmanager.orcid import ORCIDManager 

21from oc_ds_converter.oc_idmanager.support import call_api 

22 

23 

class DOIManager(IdentifierManager):
    """Identifier manager for DOIs.

    Provides normalisation, a syntactic check, an existence check against
    the ``https://doi.org/api/handles/`` endpoint and a small repair step for
    malformed DOIs. Boolean validation results are read from and written to
    ``self.storage_manager``.
    """

    def __init__(
        self,
        use_api_service: bool = True,
        storage_manager: StorageManager | None = None,
        testing: bool = True,
        orcid_doi_filepath: str = "",
    ) -> None:
        """Build a DOI manager.

        Args:
            use_api_service: when False, ``exists`` never calls a remote API
                and ``attempt_repair`` always returns None.
            storage_manager: storage used to cache validation results; when
                None a ``RedisStorageManager(testing=testing)`` is created.
            testing: forwarded to ``RedisStorageManager`` only when no
                ``storage_manager`` is supplied.
            orcid_doi_filepath: forwarded to ``MetadataManager`` when extra
                metadata is requested in ``exists``.
        """
        super().__init__()
        self._orcid_doi_filepath = orcid_doi_filepath
        if storage_manager is None:
            self.storage_manager = RedisStorageManager(testing=testing)
        else:
            self.storage_manager = storage_manager

        # Base URLs, one per registration agency. ``exists`` looks them up by
        # name with ``getattr(self, f"_api_{allow_extra_api}")``; several are
        # empty strings.
        self._api = "https://doi.org/api/handles/"
        self._api_airiti = ""
        self._api_cnki = ""
        self._api_crossref = "https://api.crossref.org/works/"
        self._api_datacite = "https://api.datacite.org/dois/"
        self._api_istic = ""
        self._api_jalc = "https://api.japanlinkcenter.org/dois/"
        self._api_kisti = ""
        self._api_medra = "https://api.medra.org/metadata/"
        self._api_op = ""
        self._api_public = ""
        self._api_unknown = "https://doi.org/ra/"
        self._use_api_service = use_api_service
        self._p = "doi:"
        self._issnm = ISSNManager()
        self._isbnm = ISBNManager()
        self._om = ORCIDManager()

        # ISC License (ISC)
        # ==================================
        # NOTE(review): the notice above sits directly before the cleanup
        # patterns below and presumably refers to them; confirm provenance.

        # Patterns are written in upper case; ``normalise`` applies them with
        # re.IGNORECASE on an already lower-cased string.
        prefix_dx = r"HTTP:\/\/DX\.D[0|O]I\.[0|O]RG\/"
        prefix_doi = r"HTTPS:\/\/D[0|O]I\.[0|O]RG\/"
        suffix_dcsupplemental = r"\/-\/DCSUPPLEMENTAL"
        suffix_suppinfo = r"SUPPINF[0|O](\.)?"
        suffix_pmid1 = r"[\.|\(|,|;]?PMID:\d+.*?"
        suffix_pmid2 = r"[\.|\(|,|;]?PMCID:PMC\d+.*?"
        suffix_epub = r"[\(|\[]EPUBAHEADOFPRINT[\)\]]"
        suffix_published_online = r"[\.|\(|,|;]?ARTICLEPUBLISHEDONLINE.*?\d{4}"
        suffix_http = r"[\.|\(|,|;]*HTTP:\/\/.*?"
        suffix_subcontent = r"\/(META|ABSTRACT|FULL|EPDF|PDF|SUMMARY)([>|\)](LAST)?ACCESSED\d+)?"
        suffix_accessed = r"[>|\)](LAST)?ACCESSED\d+"
        suffix_sagepub = r"[\.|\(|,|;]?[A-Z]*\.?SAGEPUB.*?"
        suffix_dotted_line = r"\.{5}.*?"
        suffix_delimiters = r"[\.|,|<|&|\(|;]+"
        suffix_doi_mark = r"\[DOI\].*?"
        suffix_year = r"\(\d{4}\)?"
        suffix_query = r"\?.*?=.*?"
        suffix_hash = r"#.*?"

        self.suffix_regex_lst = [
            suffix_dcsupplemental,
            suffix_suppinfo,
            suffix_pmid1,
            suffix_pmid2,
            suffix_epub,
            suffix_published_online,
            suffix_http,
            suffix_subcontent,
            suffix_accessed,
            suffix_sagepub,
            suffix_dotted_line,
            suffix_delimiters,
            suffix_doi_mark,
            suffix_year,
            suffix_query,
            suffix_hash,
        ]
        self.prefix_regex_lst = [prefix_dx, prefix_doi]
        # Group 1 of each combined pattern is the part of the string kept by
        # ``normalise``.
        self.prefix_regex = r"(.*?)(?:\.)?(?:" + "|".join(self.prefix_regex_lst) + r")(.*)"
        self.suffix_regex = r"(.*?)(?:" + "|".join(self.suffix_regex_lst) + r")$"

    def validated_as_id(self, id_string: str) -> bool | None:
        """Return the boolean stored for ``id_string``, or None if there is none.

        The identifier is looked up as given (no normalisation is applied).
        """
        stored_value = self.storage_manager.get_value(id_string)
        if isinstance(stored_value, bool):
            return stored_value
        return None

    def is_valid(
        self, id_string: str, get_extra_info: bool = False
    ) -> bool | tuple[bool, dict[str, str | bool]]:
        """Tell whether ``id_string`` is a valid DOI.

        The DOI is normalised (with the ``doi:`` prefix) and looked up in the
        storage manager first; a stored boolean is returned as a bare bool,
        even when ``get_extra_info`` is True. Otherwise validity requires
        both ``syntax_ok`` and ``exists``. When that fails and the API
        service is enabled, ``attempt_repair`` is tried on the DOI.

        Args:
            id_string: the DOI, in any form ``normalise`` accepts.
            get_extra_info: when True (and no boolean is stored), return a
                ``(valid, info)`` tuple instead of a bare bool.

        Returns:
            False when the string cannot be normalised; otherwise a bool, or
            a ``(bool, dict)`` tuple as described above.
        """
        doi = self.normalise(id_string, include_prefix=True)
        if doi is None:
            return False

        stored_value = self.storage_manager.get_value(doi)
        if isinstance(stored_value, bool):
            return stored_value

        if get_extra_info:
            result = self.exists(doi, get_extra_info=True)
            valid_bool, extra_info = result  # type: ignore[misc]
            self.storage_manager.set_full_value(doi, extra_info)
            valid = valid_bool and self.syntax_ok(doi)
            if not valid and self._use_api_service:
                repaired = self.attempt_repair(doi.replace(self._p, ""))
                if repaired:
                    # The repaired DOI is stored and reported in place of the
                    # original one.
                    repaired_doi = self._p + repaired
                    repaired_result = self.exists(repaired_doi, get_extra_info=True)
                    repaired_valid, repaired_info = repaired_result  # type: ignore[misc]
                    self.storage_manager.set_full_value(repaired_doi, repaired_info)
                    return repaired_valid, repaired_info
            return valid, extra_info

        validity_check = self.syntax_ok(doi) and bool(self.exists(doi))
        if not validity_check and self._use_api_service:
            repaired = self.attempt_repair(doi.replace(self._p, ""))
            if repaired:
                # Only the repaired DOI is stored on this path; nothing is
                # written for the original string.
                repaired_doi = self._p + repaired
                self.storage_manager.set_value(repaired_doi, True)
                return True
        self.storage_manager.set_value(doi, validity_check)
        return validity_check

    def base_normalise(self, id_string: str) -> str | None:
        """Return the lower-cased DOI candidate found in ``id_string``.

        Everything before the first ``"10."`` is dropped; the rest is
        URL-unquoted and stripped of whitespace and NUL characters.

        Returns:
            The cleaned string, or None when ``"10."`` does not occur.
        """
        if "10." not in id_string:
            return None
        candidate = unquote(id_string[id_string.index("10."):])
        candidate = sub(r"\s+", "", candidate)
        candidate = sub(r"\0+", "", candidate)
        return candidate.lower().strip() if candidate else None

    def normalise(self, id_string: str, include_prefix: bool = False) -> str | None:
        """Normalise a DOI, removing trailing URLs and known noise suffixes.

        Args:
            id_string: the raw DOI string.
            include_prefix: when True the result starts with ``doi:``.

        Returns:
            The normalised DOI, or None when ``base_normalise`` yields
            nothing.
        """
        normalized = self.base_normalise(id_string)
        if not normalized:
            return None
        tmp_doi = normalized.replace(" ", "")
        # Keep only what precedes an embedded doi.org / dx.doi.org URL.
        prefix_match = re.search(self.prefix_regex, tmp_doi, re.IGNORECASE)
        if prefix_match:
            tmp_doi = prefix_match.group(1)
        # Drop one trailing noise suffix (PMID, "accessed", delimiters, ...).
        suffix_match = re.search(self.suffix_regex, tmp_doi, re.IGNORECASE)
        if suffix_match:
            tmp_doi = suffix_match.group(1)
        prefix = self._p if include_prefix else ""
        return f"{prefix}{tmp_doi.lower().strip()}"

    def attempt_repair(self, doi: str) -> str | None:
        """Try to turn a malformed DOI into one that ``exists`` accepts.

        Removes backslashes, collapses ``__`` and ``..`` once each, and
        strips markup-like ``<...>...</...>`` and ``<.../>`` fragments.

        Args:
            doi: the DOI to repair (callers in this class pass it without
                the ``doi:`` prefix).

        Returns:
            The repaired DOI when the cleanup changed the string and
            ``exists`` is truthy for the result; None otherwise, including
            whenever the API service is disabled.
        """
        if not self._use_api_service:
            return None
        tmp_doi = doi
        tmp_doi = re.sub("\\\\", "", tmp_doi)
        tmp_doi = re.sub("__", "_", tmp_doi)
        tmp_doi = re.sub("\\.\\.", ".", tmp_doi)
        tmp_doi = re.sub("<.*?>.*?</.*?>", "", tmp_doi)
        tmp_doi = re.sub("<.*?/>", "", tmp_doi)
        if tmp_doi != doi and self.exists(tmp_doi):
            return tmp_doi
        return None

    def syntax_ok(self, id_string: str) -> bool:
        """Check ``id_string`` against the DOI pattern.

        The ``doi:`` prefix is added first when missing. The match is
        case-insensitive.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(
            match(
                r"^doi:10\.(\d{4,9}|[^\s/]+(\.[^\s/]+)*)/[^\s]+$",
                id_string,
                re.IGNORECASE,
            )
        )

    def exists(
        self,
        id_string: str,
        get_extra_info: bool = False,
        allow_extra_api: str | None = None,
    ) -> bool | tuple[bool, dict[str, str | bool]]:
        """Check whether a DOI is registered, optionally returning metadata.

        With the API service disabled no request is made and the DOI is
        reported as existing. Otherwise the DOI is normalised and looked up
        at ``self._api``; a dict response whose ``responseCode`` equals 1
        means the DOI exists. An empty or non-dict response, or a dict
        without ``responseCode``, means it does not.

        Args:
            id_string: the DOI to check.
            get_extra_info: when True return ``(exists, info)`` where
                ``info`` has at least the keys ``id``, ``valid`` and ``ra``.
            allow_extra_api: name of a registration agency (the suffix of one
                of the ``_api_<name>`` attributes). When set and the DOI
                exists, that API is queried and any metadata extracted by
                ``MetadataManager`` is merged into ``info``.

        Returns:
            A bool, or a ``(bool, dict)`` tuple when ``get_extra_info`` is
            True.

        Raises:
            AttributeError: if ``allow_extra_api`` names no ``_api_<name>``
                attribute.
        """
        if not self._use_api_service:
            # Offline mode: nothing can be checked, the DOI is taken as is.
            if get_extra_info:
                return True, {
                    "id": id_string if id_string else "",
                    "valid": True,
                    "ra": "unknown",
                }
            return True

        doi = self.normalise(id_string)
        if not doi:
            if get_extra_info:
                return False, {"id": "", "valid": False, "ra": "unknown"}
            return False

        # NOTE(review): self._headers is not assigned in this class;
        # presumably it is set by IdentifierManager.__init__ -- confirm.
        json_res = call_api(url=self._api + quote(doi), headers=self._headers)
        if not (json_res and isinstance(json_res, dict)):
            # No usable answer from the handle API counts as "not found".
            if get_extra_info:
                return False, {"id": doi, "valid": False, "ra": "unknown"}
            return False

        # .get() so that a dict lacking "responseCode" is treated as
        # "not found" instead of raising KeyError.
        valid_bool = json_res.get("responseCode") == 1
        if not get_extra_info:
            return valid_bool

        extra_info: dict[str, str | bool] = {
            "id": doi,
            "valid": valid_bool,
            "ra": "unknown",
        }
        if valid_bool and allow_extra_api:
            r_format = "xml" if allow_extra_api == "medra" else "json"
            extra_api_result = call_api(
                url=getattr(self, f"_api_{allow_extra_api}") + quote(doi),
                headers=self._headers,
                r_format=r_format,
            )
            if extra_api_result and isinstance(extra_api_result, dict):
                # Metadata comes from the registration agency response, not
                # from the handle API response fetched above.
                metadata_manager = MetadataManager(
                    allow_extra_api, extra_api_result, self._orcid_doi_filepath
                )
                metadata = metadata_manager.extract_metadata()
                if metadata:
                    extra_info.update(metadata)
        return valid_bool, extra_info