Coverage for oc_ds_converter / oc_idmanager / doi.py: 90%

147 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-12 21:23 +0000

1# SPDX-FileCopyrightText: 2023-2024 Arianna Moretti <arianna.moretti4@unibo.it> 

2# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

3# SPDX-FileCopyrightText: 2024 Ivan Heibi <ivan.heibi2@unibo.it> 

4# 

5# SPDX-License-Identifier: ISC 

6 

7 

8from __future__ import annotations 

9 

10import re 

11from re import match, sub 

12from urllib.parse import quote, unquote 

13 

14from oc_ds_converter.metadata_manager import MetadataManager 

15from oc_ds_converter.oc_idmanager.base import IdentifierManager 

16from oc_ds_converter.oc_idmanager.isbn import ISBNManager 

17from oc_ds_converter.oc_idmanager.issn import ISSNManager 

18from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager 

19from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager 

20from oc_ds_converter.oc_idmanager.orcid import ORCIDManager 

21from oc_ds_converter.oc_idmanager.support import call_api 

22 

23 

class DOIManager(IdentifierManager):
    """Identifier manager for DOI (Digital Object Identifier) strings.

    The manager normalises raw DOI strings, checks them against a syntactic
    pattern, optionally verifies their existence through the doi.org Handle
    API, tries to repair DOIs that carry spurious trailing text, and caches
    validation outcomes in a storage manager.
    """

    def __init__(
        self,
        use_api_service: bool = True,
        storage_manager: StorageManager | None = None,
        testing: bool = True,
        orcid_doi_filepath: str = "",
    ) -> None:
        """Set up API endpoints, helper managers and the repair patterns.

        Args:
            use_api_service: When ``False`` no HTTP request is issued:
                ``exists`` then reports every DOI as existing and
                ``attempt_repair`` always returns ``None``.
            storage_manager: Cache for validation results. When ``None`` a
                ``RedisStorageManager(testing=testing)`` is created.
            testing: Forwarded to ``RedisStorageManager``; only used when
                ``storage_manager`` is ``None``.
            orcid_doi_filepath: Handed to ``MetadataManager`` when extra
                metadata is requested from a registration agency.
        """
        super().__init__()
        self._orcid_doi_filepath = orcid_doi_filepath
        if storage_manager is None:
            self.storage_manager = RedisStorageManager(testing=testing)
        else:
            self.storage_manager = storage_manager

        # Existence endpoint plus one ``_api_<agency>`` attribute per
        # registration agency; ``exists`` resolves the latter by name through
        # ``getattr``. Agencies whose value is "" have no endpoint set here.
        self._api = "https://doi.org/api/handles/"
        self._api_airiti = ""
        self._api_cnki = ""
        self._api_crossref = "https://api.crossref.org/works/"
        self._api_datacite = "https://api.datacite.org/dois/"
        self._api_istic = ""
        self._api_jalc = "https://api.japanlinkcenter.org/dois/"
        self._api_kisti = ""
        self._api_medra = "https://api.medra.org/metadata/"
        self._api_op = ""
        self._api_public = ""
        self._api_unknown = "https://doi.org/ra/"
        self._use_api_service = use_api_service
        self._p = "doi:"
        self._issnm = ISSNManager()
        self._isbnm = ISBNManager()
        self._om = ORCIDManager()

        # Trailing fragments that ``attempt_repair`` strips from a DOI.
        # NOTE(review): several character classes below contain a literal
        # "|" (it is not an alternation inside [...]), so a pipe character is
        # matched as well -- confirm this is intended.
        suffix_dcsupplemental = r"\/-\/DCSUPPLEMENTAL"
        suffix_suppinfo = r"SUPPINF[0|O](\.)?"
        suffix_pmid1 = r"[\.|\(|,|;]?PMID:\d+.*?"
        suffix_pmid2 = r"[\.|\(|,|;]?PMCID:PMC\d+.*?"
        suffix_epub = r"[\(|\[]EPUBAHEADOFPRINT[\)\]]"
        suffix_published_online = r"[\.|\(|,|;]?ARTICLEPUBLISHEDONLINE.*?\d{4}"
        suffix_http = r"[\.|\(|,|;]*HTTP:\/\/.*?"
        suffix_subcontent = r"\/(META|ABSTRACT|FULL|EPDF|PDF|SUMMARY)([>|\)](LAST)?ACCESSED\d+)?"
        suffix_accessed = r"[>|\)](LAST)?ACCESSED\d+"
        suffix_sagepub = r"[\.|\(|,|;]?[A-Z]*\.?SAGEPUB.*?"
        suffix_dotted_line = r"\.{5}.*?"
        suffix_delimiters = r"[\.|,|<|&|\(|;]+"
        suffix_doi_mark = r"\[DOI\].*?"
        suffix_year = r"\(\d{4}\)?"
        suffix_query = r"\?.*?=.*?"
        suffix_hash = r"#.*?"

        self.suffix_regex_lst = [
            suffix_dcsupplemental,
            suffix_suppinfo,
            suffix_pmid1,
            suffix_pmid2,
            suffix_epub,
            suffix_published_online,
            suffix_http,
            suffix_subcontent,
            suffix_accessed,
            suffix_sagepub,
            suffix_dotted_line,
            suffix_delimiters,
            suffix_doi_mark,
            suffix_year,
            suffix_query,
            suffix_hash,
        ]
        # Group 1 lazily captures everything that precedes a trailing suffix.
        self.suffix_regex = r"(.*?)(?:" + "|".join(self.suffix_regex_lst) + r")$"

    def validated_as_id(self, id_string: str) -> bool | None:
        """Return the validity stored for ``id_string``, if any.

        Args:
            id_string: Key looked up verbatim in the storage manager (it is
                not normalised here).

        Returns:
            The stored boolean, or ``None`` when the storage manager holds no
            boolean for that key.
        """
        cached_validity = self.storage_manager.get_value(id_string)
        if isinstance(cached_validity, bool):
            return cached_validity
        return None

    def is_valid(
        self, id_string: str, get_extra_info: bool = False
    ) -> bool | tuple[bool, dict[str, str | bool]]:
        """Validate a DOI, consulting and updating the storage manager.

        Args:
            id_string: Raw DOI string, with or without the ``doi:`` prefix.
            get_extra_info: When ``True`` and no cached result is available,
                return ``(validity, info_dict)`` instead of a bare boolean.

        Returns:
            ``False`` when ``id_string`` cannot be normalised, or the cached
            boolean when one is stored (both as bare booleans, regardless of
            ``get_extra_info``). Otherwise the freshly computed validity,
            paired with the info dictionary when ``get_extra_info`` is set.
            If the DOI is not valid as given but a repaired variant exists,
            the result describes the repaired DOI.
        """
        doi = self.normalise(id_string, include_prefix=True)
        if doi is None:
            return False
        cached_validity = self.storage_manager.get_value(doi)
        if isinstance(cached_validity, bool):
            return cached_validity

        # ``normalise(include_prefix=True)`` always prepends the prefix, so
        # slice it off. ``str.replace`` would also delete any "doi:" that is
        # part of the DOI itself and hand a corrupted string to the repair.
        bare_doi = doi[len(self._p):]

        if get_extra_info:
            result = self.exists(doi, get_extra_info=True)
            valid_bool, extra_info = result  # type: ignore[misc]
            self.storage_manager.set_full_value(doi, extra_info)
            valid = valid_bool and self.syntax_ok(doi)
            if not valid and self._use_api_service:
                repaired = self.attempt_repair(bare_doi)
                if repaired:
                    repaired_doi = self._p + repaired
                    repaired_result = self.exists(repaired_doi, get_extra_info=True)
                    repaired_valid, repaired_info = repaired_result  # type: ignore[misc]
                    self.storage_manager.set_full_value(repaired_doi, repaired_info)
                    return repaired_valid, repaired_info
            return valid, extra_info

        validity_check = self.syntax_ok(doi) and bool(self.exists(doi))
        if not validity_check and self._use_api_service:
            repaired = self.attempt_repair(bare_doi)
            if repaired:
                # Only the repaired DOI is cached; the original one is not.
                self.storage_manager.set_value(self._p + repaired, True)
                return True
        self.storage_manager.set_value(doi, validity_check)
        return validity_check

    def normalise(self, id_string: str, include_prefix: bool = False) -> str | None:
        """Return the canonical lowercase form of a DOI.

        Everything before the first ``10.`` is discarded; the remainder is
        percent-decoded, stripped of whitespace and NUL characters, and
        lowercased.

        Args:
            id_string: Raw string containing a DOI.
            include_prefix: When ``True`` the result starts with ``doi:``.

        Returns:
            The normalised DOI, or ``None`` when ``id_string`` contains no
            ``10.`` or nothing is left after cleaning.
        """
        if "10." not in id_string:
            return None
        candidate = unquote(id_string[id_string.index("10."):])
        candidate = sub(r"\s+", "", candidate)
        doi = sub(r"\0+", "", candidate)
        if not doi:
            return None
        doi = doi.lower().strip()
        return f"{self._p}{doi}" if include_prefix else doi

    def attempt_repair(self, doi: str) -> str | None:
        """Try to turn a malformed DOI into one that exists.

        A known trailing suffix is removed, then backslashes, doubled
        underscores, doubled dots and markup tags are cleaned up. The result
        is accepted only if it differs from the input and ``exists`` confirms
        it.

        Args:
            doi: DOI without the ``doi:`` prefix.

        Returns:
            The repaired DOI, or ``None`` when the API service is disabled,
            nothing changed, or the cleaned DOI does not exist.
        """
        if not self._use_api_service:
            return None
        tmp_doi = doi
        suffix_match = re.search(self.suffix_regex, tmp_doi, re.IGNORECASE)
        if suffix_match:
            tmp_doi = suffix_match.group(1)
        tmp_doi = re.sub("\\\\", "", tmp_doi)
        tmp_doi = re.sub("__", "_", tmp_doi)
        tmp_doi = re.sub("\\.\\.", ".", tmp_doi)
        tmp_doi = re.sub("<.*?>.*?</.*?>", "", tmp_doi)
        tmp_doi = re.sub("<.*?/>", "", tmp_doi)
        if tmp_doi != doi and self.exists(tmp_doi):
            return tmp_doi
        return None

    def syntax_ok(self, id_string: str) -> bool:
        """Check whether ``id_string`` matches the DOI syntax pattern.

        The ``doi:`` prefix is added when missing (case-sensitive check)
        before the pattern is applied case-insensitively.

        Args:
            id_string: DOI with or without the ``doi:`` prefix.

        Returns:
            ``True`` when the string matches the pattern, ``False`` otherwise.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(
            match(
                r"^doi:10\.(\d{4,9}|[^\s/]+(\.[^\s/]+)*)/[^\s]+$",
                id_string,
                re.IGNORECASE,
            )
        )

    def exists(
        self,
        id_string: str,
        get_extra_info: bool = False,
        allow_extra_api: str | None = None,
    ) -> bool | tuple[bool, dict[str, str | bool]]:
        """Check whether a DOI is registered, via the doi.org Handle API.

        Args:
            id_string: DOI with or without the ``doi:`` prefix.
            get_extra_info: When ``True`` return ``(exists, info_dict)``
                where ``info_dict`` holds at least ``id``, ``valid`` and
                ``ra``.
            allow_extra_api: Name of a registration agency (the suffix of an
                ``_api_<agency>`` attribute) to query for metadata once the
                DOI is confirmed. ``None`` skips the extra request.

        Returns:
            A boolean, or a ``(boolean, dict)`` tuple when ``get_extra_info``
            is set. With the API service disabled the DOI is reported as
            existing without any request. A DOI that cannot be normalised,
            or an API response that is empty or not a dictionary, yields
            ``False``.

        Raises:
            AttributeError: If ``allow_extra_api`` names an agency for which
                no ``_api_<agency>`` attribute is defined.
        """
        valid_bool = True
        doi: str | None = id_string
        if self._use_api_service:
            doi = self.normalise(id_string)
            if not doi:
                if get_extra_info:
                    return False, {"id": "", "valid": False, "ra": "unknown"}
                return False

            # ``_headers`` is not assigned in this class; presumably it is
            # provided by ``IdentifierManager.__init__`` -- verify.
            json_res = call_api(url=self._api + quote(doi), headers=self._headers)
            if json_res and isinstance(json_res, dict):
                # ``.get`` keeps a payload without "responseCode" from raising
                # KeyError; such a payload is simply treated as "not found".
                valid_bool = json_res.get("responseCode") == 1
                if not get_extra_info:
                    return valid_bool

                extra_info: dict[str, str | bool] = {
                    "id": doi,
                    "valid": valid_bool,
                    "ra": "unknown",
                }
                if allow_extra_api is None:
                    return valid_bool, extra_info
                if valid_bool is True and allow_extra_api:
                    r_format = "xml" if allow_extra_api == "medra" else "json"
                    extra_api_result = call_api(
                        url=getattr(self, f"_api_{allow_extra_api}") + quote(doi),
                        headers=self._headers,
                        r_format=r_format,
                    )
                    if extra_api_result and isinstance(extra_api_result, dict):
                        # NOTE(review): the Handle response (json_res), not
                        # extra_api_result, is handed to MetadataManager;
                        # extra_api_result only gates this branch -- confirm
                        # this is intended.
                        metadata_manager = MetadataManager(
                            allow_extra_api, json_res, self._orcid_doi_filepath
                        )
                        metadata = metadata_manager.extract_metadata()
                        if metadata:
                            extra_info.update(metadata)
                        return valid_bool, extra_info
                return valid_bool, {
                    "id": doi,
                    "valid": valid_bool,
                    "ra": "unknown",
                }

            # Empty or non-dictionary response: existence is not confirmed.
            valid_bool = False

        if get_extra_info:
            return valid_bool, {
                "id": doi if doi else "",
                "valid": valid_bool,
                "ra": "unknown",
            }
        return valid_bool

217 return valid_bool