Coverage for test / crossref_processing_test.py: 100%

595 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it> 

2# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

3# SPDX-FileCopyrightText: 2025 Arianna Moretti <arianna.moretti4@unibo.it> 

4# 

5# SPDX-License-Identifier: ISC 

6 

7import json 

8import os 

9import unittest 

10 

11from oc_ds_converter.crossref.crossref_processing import CrossrefProcessing 

12from oc_ds_converter.datasource.orcid_index import PublishersRedis 

13from oc_ds_converter.lib.csvmanager import CSVManager 

14from oc_ds_converter.lib.jsonmanager import load_json 

15 

# Fixture locations, all relative to the repository root.
TEST_DIR = os.path.join("test", "crossref_processing")
# Crossref dump read by the cited-entity tests.
JSON_FILE = os.path.join(TEST_DIR, "0.json")
TMP_SUPPORT_MATERIAL = os.path.join(TEST_DIR, "tmp_support")
# Passed as ``orcid_index`` when building a CrossrefProcessing instance.
IOD = os.path.join(TEST_DIR, "iod")
# Crossref dump loaded by test_csv_creator.
DATA = os.path.join(TEST_DIR, "40228.json")
# Passed as ``publishers_filepath`` when building a CrossrefProcessing instance.
PUBLISHERS_MAPPING = os.path.join(TEST_DIR, "publishers.csv")

22 

23 

24 

25class TestCrossrefProcessing(unittest.TestCase): 

def test_extract_all_ids_cited(self):
    """Check the ids that ``extract_all_ids`` returns for the ``JSON_FILE`` fixture.

    Every item of the fixture is passed to ``extract_all_ids`` with ``False``
    as second argument. The first element of the result obtained for the
    last item processed is compared, as a set, with the expected DOIs.
    """
    c_processing = CrossrefProcessing()
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    with open(JSON_FILE, encoding="utf8") as f:
        items = json.load(f)['items']

    br = None
    for entity_dict in items:
        br = c_processing.extract_all_ids(entity_dict, False)[0]
    # An empty fixture would otherwise surface as an unrelated error below.
    self.assertIsNotNone(br, "the fixture contains no items")

    expected_br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }
    self.assertEqual(expected_br, set(br))

36 

def test_extract_all_ids_cited_redis(self):
    """Same check as ``test_extract_all_ids_cited`` on a ``testing=True`` processor.

    Every item of ``JSON_FILE`` is passed to ``extract_all_ids`` with ``False``
    as second argument. The first element of the result obtained for the
    last item processed is compared, as a set, with the expected DOIs.
    """
    c_processing = CrossrefProcessing(testing=True)
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    with open(JSON_FILE, encoding="utf8") as f:
        items = json.load(f)['items']

    br = None
    for entity_dict in items:
        br = c_processing.extract_all_ids(entity_dict, False)[0]
    # An empty fixture would otherwise surface as an unrelated error below.
    self.assertIsNotNone(br, "the fixture contains no items")

    expected_br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }
    self.assertEqual(expected_br, set(br))

54 

def test_get_redis_validity_list(self):
    """``get_redis_validity_list`` is expected to return ``[]`` for these DOIs.

    The test adds nothing to redis beforehand, so none of the ids is
    expected to be reported back.
    """
    c_processing = CrossrefProcessing()
    # Registered up front so the storage is removed even if the assertion fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }
    self.assertEqual(c_processing.get_redis_validity_list(br, "br"), [])

69 

def test_get_redis_validity_list_redis(self):
    """On a ``testing=True`` processor, both id kinds are expected to yield ``[]``.

    The test adds nothing to redis beforehand, so neither the "br" nor the
    "ra" lookup is expected to report any id back.
    """
    c_processing = CrossrefProcessing(testing=True)
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    # Both lookups run before any assertion, as in the original flow.
    br_valid_list = c_processing.get_redis_validity_list(br, "br")
    ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")

    self.assertEqual(br_valid_list, [])
    self.assertEqual(ra_valid_list, [])

88 

def test_get_redis_validity_dict_w_fakeredis_db_values_sqlite(self):
    """Only the ids previously added to redis are expected in the validity list.

    One DOI is added to ``BR_redis`` and one ORCID to ``RA_redis``; the
    lookups on larger id sets are expected to return exactly those two ids.
    The processor is built with default arguments.
    """
    known_br = 'doi:10.2105/ajph.2006.101626'
    known_ra = 'orcid:0000-0002-8090-6886'

    c_processing = CrossrefProcessing()
    try:
        c_processing.BR_redis.sadd(known_br, "omid:1")
        c_processing.RA_redis.sadd(known_ra, "omid:2")

        br = {known_br, 'doi:10.1001/jama.299.12.1471', 'doi:10.1177/003335490812300219'}
        ra = {known_ra, 'orcid:0000-0002-6491-0754'}

        br_valid_list = c_processing.get_redis_validity_list(br, "br")
        ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")

        self.assertEqual(br_valid_list, [known_br])
        self.assertEqual(ra_valid_list, [known_ra])
    finally:
        # Always undo the side effects, in the same order as before, so a
        # failing assertion cannot leave keys behind for other tests.
        c_processing.storage_manager.delete_storage()
        c_processing.BR_redis.delete(known_br)
        c_processing.RA_redis.delete(known_ra)

109 

def test_get_redis_validity_dict_w_fakeredis_db_values_redis(self):
    """Only the ids previously added to redis are expected in the validity list.

    One DOI is added to ``BR_redis`` and one ORCID to ``RA_redis``; the
    lookups on larger id sets are expected to return exactly those two ids.
    The processor is built with ``testing=True``.
    """
    known_br = 'doi:10.2105/ajph.2006.101626'
    known_ra = 'orcid:0000-0002-8090-6886'

    c_processing = CrossrefProcessing(testing=True)
    try:
        c_processing.BR_redis.sadd(known_br, "omid:1")
        c_processing.RA_redis.sadd(known_ra, "omid:2")

        br = {known_br, 'doi:10.1001/jama.299.12.1471', 'doi:10.1177/003335490812300219'}
        ra = {known_ra, 'orcid:0000-0002-6491-0754'}

        br_valid_list = c_processing.get_redis_validity_list(br, "br")
        ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")

        self.assertEqual(br_valid_list, [known_br])
        self.assertEqual(ra_valid_list, [known_ra])
    finally:
        # Always undo the side effects, in the same order as before, so a
        # failing assertion cannot leave keys behind for other tests.
        c_processing.storage_manager.delete_storage()
        c_processing.BR_redis.delete(known_br)
        c_processing.RA_redis.delete(known_ra)

130 

def test_validated_as_default(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value:
    string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested
    - With a processor built with default arguments, on an id this test never stored:
      only the ``None`` outcome is exercised here
    """
    c_processing = CrossrefProcessing()
    # Registered up front so the storage is removed even if the assertion fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    validate_as_none = c_processing.validated_as(
        {"schema": "doi", "identifier": "doi:10.1001/10-v4n2-hsf10003"}
    )
    self.assertIsNone(validate_as_none)

146 

def test_validated_as_default_redis(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value:
    string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested
    - With a processor built with ``testing=True``, on an id this test never stored:
      only the ``None`` outcome is exercised here
    """
    c_processing = CrossrefProcessing(testing=True)
    # Registered up front so the storage is removed even if the assertion fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    validate_as_none = c_processing.validated_as(
        {"schema": "doi", "identifier": "doi:10.1001/10-v4n2-hsf10003"}
    )
    self.assertIsNone(validate_as_none)

161 

def test_validated_as_redis_with_preexistent_data(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value:
    string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested
    - With a ``testing=True`` processor whose DOI manager storage is pre-populated by the test
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing_redis = CrossrefProcessing(testing=True)
    doi_storage = c_processing_redis.doi_m.storage_manager
    # Registered before writing so the stored values never outlive a failed run.
    self.addCleanup(doi_storage.delete_storage)

    # Values are set directly on the DOIManager's storage manager.
    doi_storage.set_value(valid_doi_in_db["identifier"], True)
    doi_storage.set_value(invalid_doi_in_db["identifier"], False)

    validated_as_True = c_processing_redis.validated_as(valid_doi_in_db)
    validated_as_False = c_processing_redis.validated_as(invalid_doi_in_db)
    not_validated = c_processing_redis.validated_as(valid_doi_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)

189 

def test_validated_as_inmemory(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value:
    string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested
    - With a ``testing=True`` processor whose DOI manager storage is pre-populated by the test

    NOTE(review): the test name refers to an in-memory storage manager, but
    the processor is built exactly as in the redis tests of this class.
    Confirm which storage backend this test is meant to cover.
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing = CrossrefProcessing(testing=True)
    doi_storage = c_processing.doi_m.storage_manager
    # Registered before writing so the stored values never outlive a failed run.
    self.addCleanup(doi_storage.delete_storage)

    doi_storage.set_value(valid_doi_in_db["identifier"], True)
    doi_storage.set_value(invalid_doi_in_db["identifier"], False)

    validated_as_True = c_processing.validated_as(valid_doi_in_db)
    validated_as_False = c_processing.validated_as(invalid_doi_in_db)
    not_validated = c_processing.validated_as(valid_doi_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)

217 

def test_validated_as_redis(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value:
    string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested
    - With a ``testing=True`` processor, for ids stored by the test
    - With a ``testing=True`` processor, for an id that was never stored
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing_redis = CrossrefProcessing(testing=True)
    # Registered before writing so the storage is cleared even on failure.
    # NOTE(review): values are written through ``doi_m.storage_manager`` but
    # the cleanup targets the processor's own ``storage_manager``; presumably
    # they are the same object - confirm.
    self.addCleanup(c_processing_redis.storage_manager.delete_storage)

    # Values are set directly on the DOIManager's storage manager.
    c_processing_redis.doi_m.storage_manager.set_value(valid_doi_in_db["identifier"], True)
    c_processing_redis.doi_m.storage_manager.set_value(invalid_doi_in_db["identifier"], False)

    validated_as_True = c_processing_redis.validated_as(valid_doi_in_db)
    validated_as_False = c_processing_redis.validated_as(invalid_doi_in_db)
    not_validated = c_processing_redis.validated_as(valid_doi_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)

245 

def test_get_id_manager(self):
    """Look up the ISSN manager both by prefixed id and by bare schema name.

    ``get_id_manager`` receives either an identifier carrying its prefix
    (``issn:0003-987X``) or just the schema string (``issn``), together with
    the processor's ``venue_id_man_dict``. Both lookups must return a manager,
    and each returned manager must accept the sample ISSN as valid.
    """
    processor = CrossrefProcessing()
    managers_by_schema = processor.venue_id_man_dict

    prefixed_issn = "issn:0003-987X"
    schema_only = "issn"
    manager_from_id = processor.get_id_manager(prefixed_issn, managers_by_schema)
    manager_from_schema = processor.get_id_manager(schema_only, managers_by_schema)

    # Both lookups must resolve before the managers are exercised.
    assert manager_from_id is not None
    assert manager_from_schema is not None
    self.assertTrue(manager_from_id.is_valid(prefixed_issn))
    self.assertTrue(manager_from_schema.is_valid(prefixed_issn))

265 

def test_csv_creator(self):
    """Run ``csv_creator`` over every item of ``DATA`` and spot-check the rows.

    The DOIs of the fixture are prefetched into the ORCID index first.
    Falsy results of ``csv_creator`` are discarded; 11 rows are expected,
    three known ids must be among them, and the book-chapter row is checked
    for its type and publisher.
    """
    processor = CrossrefProcessing(orcid_index=IOD, publishers_filepath=None)
    data = load_json(DATA, None)  # type: ignore[arg-type]
    assert data is not None

    processor.prefetch_doi_orcid_index(
        [item.get("DOI") for item in data['items'] if item.get("DOI")]
    )

    candidate_rows = (processor.csv_creator(item) for item in data['items'])
    output = [row for row in candidate_rows if row]

    self.assertEqual(len(output), 11)
    output_ids = [row['id'] for row in output]
    for expected_id in (
        'doi:10.47886/9789251092637.ch7',
        'doi:10.9799/ksfan.2012.25.1.069',
        'doi:10.9799/ksfan.2012.25.1.105',
    ):
        self.assertIn(expected_id, output_ids)

    chapter_row = next(row for row in output if row['id'] == 'doi:10.47886/9789251092637.ch7')
    self.assertEqual(chapter_row['type'], 'book chapter')
    self.assertEqual(chapter_row['publisher'], 'American Fisheries Society [crossref:460]')

285 

def test_csv_creator_cited(self):
    """Run ``csv_creator`` on every DOI-bearing reference of ``JSON_FILE``.

    The processor is built with ``citing=False``. For each reference that
    carries a DOI, the expected row holds only the prefixed DOI in ``id``
    and empty strings in every other column.
    """
    processor = CrossrefProcessing(orcid_index=IOD, publishers_filepath=None, citing=False)
    with open(JSON_FILE, encoding="utf8") as f:
        result = json.load(f)

    output = []
    for item in result['items']:
        # Entities without citations are skipped.
        if not item.get("reference"):
            continue
        for reference_dict in (x for x in item["reference"] if x.get("DOI")):
            row = processor.csv_creator(reference_dict)
            if row:
                output.append(row)

    expected_ids = [
        'doi:10.2105/ajph.2006.101626',
        'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219',
        'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21',
        'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf',
        'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c',
        'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d',
        'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6',
        'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2',
        'doi:10.2105/ajph.2009.162677',
    ]
    expected_output = [
        {'id': doi, 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '',
         'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}
        for doi in expected_ids
    ]
    self.assertEqual(output, expected_output)

318 

def test_get_pages(self):
    """A plain numeric page range is expected to come back unchanged."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(processor.get_crossref_pages({'page': '469-476'}), '469-476')

326 

def test_get_pages_right_letter(self):
    """A single lettered page (``G22``) is expected as the range ``G22-G22``."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(processor.get_crossref_pages({'page': 'G22'}), 'G22-G22')

334 

def test_get_pages_wrong_letter(self):
    """For ``583b-584`` the expected result is ``583-584``, without the letter."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(processor.get_crossref_pages({'page': '583b-584'}), '583-584')

342 

def test_get_pages_roman_letters(self):
    """A range written in roman numerals (``iv-l``) is expected back unchanged."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(processor.get_crossref_pages({'page': 'iv-l'}), 'iv-l')

350 

def test_get_pages_non_roman_letters(self):
    """Letters that are not roman numerals (``kj-hh``) are expected to give ``''``."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(processor.get_crossref_pages({'page': 'kj-hh'}), '')

358 

def test_load_publishers_mapping(self):
    """``load_publishers_mapping`` must turn ``PUBLISHERS_MAPPING`` into the expected dict.

    The expected mapping is keyed by member id; each value holds the
    publisher ``name`` and the set of its DOI ``prefixes``.
    """
    mapping = CrossrefProcessing.load_publishers_mapping(publishers_filepath=PUBLISHERS_MAPPING)

    # (member id, publisher name, DOI prefixes)
    expected_rows = (
        ('1', 'Annals of Family Medicine', {'10.1370'}),
        ('2', 'American Association of Petroleum Geologists AAPG/Datapages', {'10.15530', '10.1306'}),
        ('3', 'American Association of Physics Teachers (AAPT)', {'10.1119'}),
        ('6', 'American College of Medical Physics (ACMP)', {'10.1120'}),
        ('9', 'Allen Press', {'10.1043'}),
        ('10', 'American Medical Association (AMA)', {'10.1001'}),
        ('11', 'American Economic Association', {'10.1257'}),
        ('460', 'American Fisheries Society', {'10.1577', '10.47886'}),
    )
    expected_mapping = {
        member: {'name': name, 'prefixes': prefixes}
        for member, name, prefixes in expected_rows
    }
    self.assertEqual(mapping, expected_mapping)

372 

def test_get_publisher_name(self):
    """The item's ``member`` is present in the publishers mapping.

    The expected name is the mapped publisher followed by its Crossref
    member id in square brackets.
    """
    doi = '10.47886/9789251092637.ch7'
    item = {
        'publisher': 'American Fisheries Society',
        'DOI': doi,
        'prefix': '10.47886',
        'member': '460',
    }
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )

385 

def test_get_publisher_name_no_member(self):
    """The item has no ``member``, but its DOI prefix is in the publishers mapping.

    The expected name still carries the Crossref member id in square brackets.
    """
    doi = '10.47886/9789251092637.ch7'
    item = {
        'publisher': 'American Fisheries Society',
        'DOI': doi,
        'prefix': '10.47886',
    }
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )

397 

def test_get_publisher_name_redis_by_member(self):
    """Publisher lookup through ``PublishersRedis`` when the item carries ``member``.

    A publisher entry for member ``460`` is stored in a ``testing=True``
    ``PublishersRedis`` and injected into the processor, which is built
    with ``use_redis_publishers=True`` and no publishers file.
    """
    redis_index = PublishersRedis(testing=True)
    redis_index.set_publisher("460", "American Fisheries Society", {"10.47886"})

    doi = '10.47886/9789251092637.ch7'
    item = {
        'publisher': 'American Fisheries Society',
        'DOI': doi,
        'prefix': '10.47886',
        'member': '460',
    }
    processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    # Swap in the index populated above.
    processor._publishers_redis = redis_index

    self.assertEqual(
        processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )

416 

def test_get_publisher_name_redis_by_prefix(self):
    """Publisher lookup through ``PublishersRedis`` when the item has no ``member``.

    A publisher entry with prefix ``10.47886`` is stored in a
    ``testing=True`` ``PublishersRedis`` and injected into the processor;
    the item only provides ``prefix``, and the member id is still expected
    in the resulting name.
    """
    redis_index = PublishersRedis(testing=True)
    redis_index.set_publisher("460", "American Fisheries Society", {"10.47886"})

    doi = '10.47886/9789251092637.ch7'
    item = {
        'publisher': 'American Fisheries Society',
        'DOI': doi,
        'prefix': '10.47886',
    }
    processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    # Swap in the index populated above.
    processor._publishers_redis = redis_index

    self.assertEqual(
        processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )

434 

def test_get_publisher_name_redis_not_found(self):
    """With nothing stored in ``PublishersRedis``, the item's own publisher is expected.

    No publisher is registered by this test, so the name from the item is
    expected back without any member id suffix.
    """
    redis_index = PublishersRedis(testing=True)

    doi = '10.9999/unknown'
    item = {
        'publisher': 'Unknown Publisher',
        'DOI': doi,
        'prefix': '10.9999',
    }
    processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    # Swap in the index created above, to which this test added nothing.
    processor._publishers_redis = redis_index

    self.assertEqual(processor.get_publisher_name(doi, item), 'Unknown Publisher')

451 

def test_get_venue_name(self):
    """Without identifiers in the item, the venue name is the container title alone."""
    item = {'container-title': ['Cerebrospinal Fluid [Working Title]']}

    # Output row with every column empty except the type.
    columns = ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page',
               'type', 'publisher', 'editor')
    row = dict.fromkeys(columns, '')
    row['type'] = 'journal article'

    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_venue_name(item, row),
        'Cerebrospinal Fluid [Working Title]',
    )

460 

def test_get_venue_name_with_ISSN(self):
    """With ISSNs in the item, they are expected after the title in square brackets."""
    item = {
        "container-title": ["Disaster Medicine and Public Health Preparedness"],
        "ISSN": ["1935-7893", "1938-744X"],
    }

    # Output row with every column empty except the type.
    columns = ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page',
               'type', 'publisher', 'editor')
    row = dict.fromkeys(columns, '')
    row['type'] = 'journal article'

    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_venue_name(item, row),
        'Disaster Medicine and Public Health Preparedness [issn:1935-7893 issn:1938-744X]',
    )

471 

def test_find_crossref_orcid(self):
    """Test that, given in input a string representing an ORCID, the method returns:
    - the prefixed ORCID (``orcid:...``) if it is valid
    - an empty string if it is not valid
    The procedure is tested with:
    - a valid ORCID
    - an invalid ORCID
    - a non-string input
    - an ORCID whose validity (False, then True) was stored beforehand
    """
    test_doi = "10.1234/test123"

    c_processing = CrossrefProcessing()
    try:
        # Valid ORCID
        out = c_processing.find_crossref_orcid('0000-0003-4082-1500', test_doi)
        self.assertEqual(out, "orcid:0000-0003-4082-1500")

        # Invalid ORCID
        out_invalid_id = c_processing.find_crossref_orcid('5500-0001-9759-3938', test_doi)
        self.assertEqual(out_invalid_id, "")

        # Non-string input
        out_non_string = c_processing.find_crossref_orcid(None, test_doi)
        self.assertEqual(out_non_string, "")
    finally:
        # Runs even if an assertion above fails.
        c_processing.orcid_m.storage_manager.delete_storage()

    # Store a validity for the id first, to check that the api check is
    # avoided if the info is already in storage: the outcome must follow the
    # stored value, first False and then True.
    stored_scenarios = (
        (False, ""),
        (True, "orcid:0000-0001-9759-3938"),
    )
    for stored_validity, expected in stored_scenarios:
        c_processing = CrossrefProcessing(testing=True)
        try:
            c_processing.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", stored_validity)
            out = c_processing.find_crossref_orcid('0000-0001-9759-3938', test_doi)
            self.assertEqual(out, expected)
        finally:
            # Each scenario starts from, and leaves behind, an empty storage.
            c_processing.orcid_m.storage_manager.delete_storage()

522 

def test_report_series_venue_id(self):
    """For a ``report-series`` item with a container title, the ISSN goes with the venue.

    The expected row keeps only the DOI in ``id`` and appends the ISSN, in
    square brackets, to the venue name.
    """
    items = {'items': [{
        'DOI': '10.1007/978-3-030-00668-6_8',
        'container-title': ["troitel'stvo: nauka i obrazovanie [Construction: Science and Education]"],
        'ISSN': '2305-5502',
        'type': 'report-series'
    }]}
    # Only one processor is built: an earlier instance created with
    # ``orcid_index=IOD`` was overwritten before any use.
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    output = [crossref_processor.csv_creator(item) for item in items['items']]
    expected_output = [{
        'id': 'doi:10.1007/978-3-030-00668-6_8',
        'title': '',
        'author': '',
        'pub_date': '',
        'venue': "troitel'stvo: nauka i obrazovanie [Construction: Science and Education] [issn:2305-5502]",
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'report series',
        'publisher': '',
        'editor': '',
    }]
    self.assertEqual(output, expected_output)

537 

def test_report_series_br_id(self):
    """For a ``report-series`` item without container title, the ISSN goes with the item id.

    The expected row lists the DOI and the ISSN together in ``id`` and
    leaves ``venue`` empty.
    """
    items = {'items': [{
        'DOI': '10.1007/978-3-030-00668-6_8',
        'container-title': [],
        'ISSN': '2305-5502',
        'type': 'report-series'
    }]}
    # Only one processor is built: an earlier instance created with
    # ``orcid_index=IOD`` was overwritten before any use.
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    output = [crossref_processor.csv_creator(item) for item in items['items']]
    expected_output = [{
        'id': 'doi:10.1007/978-3-030-00668-6_8 issn:2305-5502',
        'title': '',
        'author': '',
        'pub_date': '',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'report series',
        'publisher': '',
        'editor': '',
    }]
    self.assertEqual(output, expected_output)

552 

def test_get_agents_strings_list(self):
    """Four authors, of which only ``Choi, Mi-Kyeong`` is expected to get an ORCID.

    The DOI is prefetched into the ORCID index before the agent strings are
    built; the strings are expected in input order as ``family, given``.
    """
    # (given name, family name)
    names = (
        ('Myung-Hee', 'Kim'),
        ('Jin-Seon', 'Seo'),
        ('Mi-Kyeong', 'Choi'),
        ('Eun-Young', 'Kim'),
    )
    authors_list = [
        {'given': given, 'family': family, 'affiliation': [], "role": "author"}
        for given, family in names
    ]
    doi = '10.9799/ksfan.2012.25.1.105'

    processor = CrossrefProcessing(IOD)
    processor.prefetch_doi_orcid_index([doi])
    authors_strings_list, _ = processor.get_agents_strings_list(doi, authors_list)

    self.assertEqual(
        authors_strings_list,
        ['Kim, Myung-Hee', 'Seo, Jin-Seon', 'Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]',
         'Kim, Eun-Young'],
    )

587 

def test_get_agents_strings_list_same_family(self):
    """Two authors share the family name and the initials of the given name.

    Only ``Choi, Mi-Kyeong`` is expected to receive the ORCID; ``Choi, Mi-Hong``
    is expected without one.
    """
    # (given name, family name)
    names = (
        ('Mi-Kyeong', 'Choi'),
        ('Mi-Hong', 'Choi'),
    )
    authors_list = [
        {'given': given, 'family': family, 'affiliation': [], "role": "author"}
        for given, family in names
    ]
    doi = '10.9799/ksfan.2012.25.1.105'

    processor = CrossrefProcessing(IOD)
    processor.prefetch_doi_orcid_index([doi])
    authors_strings_list, _ = processor.get_agents_strings_list(doi, authors_list)

    self.assertEqual(
        authors_strings_list,
        ['Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]', 'Choi, Mi-Hong'],
    )

610 

def test_get_agents_strings_list_homonyms(self):
    """Two authors have the same family name and the same given name.

    Neither of the two identical names is expected to carry an ORCID.
    The ORCID index is not prefetched in this test.
    """
    authors_list = [
        {'given': 'Mi-Kyeong', 'family': 'Choi', 'affiliation': [], "role": "author"}
        for _ in range(2)
    ]

    processor = CrossrefProcessing(IOD)
    authors_strings_list, _ = processor.get_agents_strings_list(
        '10.9799/ksfan.2012.25.1.105', authors_list
    )

    self.assertEqual(authors_strings_list, ['Choi, Mi-Kyeong', 'Choi, Mi-Kyeong'])

632 

def test_get_agents_strings_list_inverted_names(self):
    """The given name of one author equals the family name of the other.

    No ORCID tag is expected on either resulting string.
    """
    names = [('Choi', 'Mi-Kyeong'), ('Mi-Hong', 'Choi')]
    agents = [
        {'given': given, 'family': family, 'affiliation': [], "role": "author"}
        for given, family in names
    ]
    processor = CrossrefProcessing(IOD)
    authors, _ = processor.get_agents_strings_list('10.9799/ksfan.2012.25.1.105', agents)
    self.assertEqual(authors, ['Mi-Kyeong, Choi', 'Choi, Mi-Hong'])

654 

def test_get_agents_strings_list_overlapping_surnames(self):
    """The surname of one agent ('M') is contained in the surname of another ('Malek').

    'Malek, Sri Nurestri Abdul' appears both as author and as editor and is
    expected to receive the ORCID tag in both output lists.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    # (given, family, sequence, role)
    people = [
        ("Puvaneswari", "Paravamsivam", "first", "author"),
        ("Chua Kek", "Heng", "additional", "author"),
        ("Sri Nurestri Abdul", "Malek", "additional", "author"),
        ("Vikineswary", "Sabaratnam", "additional", "author"),
        ("Ravishankar Ram", "M", "additional", "author"),
        ("Sri Nurestri Abdul", "Malek", "additional", "editor"),
        ("Umah Rani", "Kuppusamy", "additional", "author"),
    ]
    agents = [
        {
            "given": given,
            "family": family,
            "sequence": sequence,
            "affiliation": [],
            "role": role,
        }
        for given, family, sequence, role in people
    ]

    # Replace the processor's ORCID index with an in-memory CSVManager.
    index = CSVManager()
    index.data = {'doi:10.9799/ksfan.2012.25.1.105': {'Malek, Sri Nurestri Abdul [0000-0001-6278-8559]'}}
    processor = CrossrefProcessing(None)
    processor.orcid_index = index
    processor.prefetch_doi_orcid_index([doi])

    result = processor.get_agents_strings_list(doi, agents)

    expected_authors = [
        'Paravamsivam, Puvaneswari',
        'Heng, Chua Kek',
        'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]',
        'Sabaratnam, Vikineswary',
        'M, Ravishankar Ram',
        'Kuppusamy, Umah Rani',
    ]
    expected_editors = ['Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]']
    authors, editors = result
    self.assertEqual((authors, editors), (expected_authors, expected_editors))

717 

def test_id_worker(self):
    """id_worker is fed a plain string (ISSN) and a list (ISBN) and fills the given output lists."""
    issns, isbns = [], []
    CrossrefProcessing.id_worker('ISSN 1050-124X', issns, CrossrefProcessing.issn_worker)
    CrossrefProcessing.id_worker(['978-1-56619-909-4'], isbns, CrossrefProcessing.isbn_worker)
    self.assertEqual((issns, isbns), (['issn:1050-124X'], ['isbn:9781566199094']))

728 

def test_to_validated_id_list(self):
    """Exercise to_validated_id_list on four cases, each with a fresh processor."""
    # CASE_1: valid DOI is returned as is
    processor = CrossrefProcessing(testing=True)
    valid_doi = {'id': 'doi:10.13039/100005522', 'schema': 'doi'}
    self.assertEqual(processor.to_validated_id_list(valid_doi), ['doi:10.13039/100005522'])
    processor.doi_m.storage_manager.delete_storage()

    # CASE_2: invalid DOI yields an empty list
    processor = CrossrefProcessing(testing=True)
    invalid_doi = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    self.assertEqual(processor.to_validated_id_list(invalid_doi), [])

    # CASE_3: valid ORCID is returned as is
    processor = CrossrefProcessing(testing=True)
    valid_orcid = {'id': 'orcid:0000-0003-4082-1500', 'schema': 'orcid'}
    self.assertEqual(processor.to_validated_id_list(valid_orcid), ['orcid:0000-0003-4082-1500'])
    processor.orcid_m.storage_manager.delete_storage()

    # CASE_4: the same invalid DOI is accepted once it is listed in
    # processor._redis_values_br, and it is recorded as True in the temporary DOI storage
    processor = CrossrefProcessing(testing=True)
    redis_doi = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    processor._redis_values_br.append(redis_doi['id'])
    self.assertEqual(processor.to_validated_id_list(redis_doi), ['doi:10.1089/bsp.2008.002'])
    stored = processor.tmp_doi_m.storage_manager.get_value('doi:10.1089/bsp.2008.002')
    self.assertEqual(stored, True)
    processor.doi_m.storage_manager.delete_storage()

763 

764 

def test_to_validated_id_list_redis(self):
    """Exercise to_validated_id_list on four cases, each with a fresh processor.

    NOTE(review): the body is currently the same as test_to_validated_id_list;
    confirm whether a Redis-specific setup was intended here.
    """
    # CASE_1: valid DOI is returned as is
    processor = CrossrefProcessing(testing=True)
    valid_doi = {'id': 'doi:10.13039/100005522', 'schema': 'doi'}
    self.assertEqual(processor.to_validated_id_list(valid_doi), ['doi:10.13039/100005522'])
    processor.doi_m.storage_manager.delete_storage()

    # CASE_2: invalid DOI yields an empty list
    processor = CrossrefProcessing(testing=True)
    invalid_doi = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    self.assertEqual(processor.to_validated_id_list(invalid_doi), [])

    # CASE_3: valid ORCID is returned as is
    processor = CrossrefProcessing(testing=True)
    valid_orcid = {'id': 'orcid:0000-0003-4082-1500', 'schema': 'orcid'}
    self.assertEqual(processor.to_validated_id_list(valid_orcid), ['orcid:0000-0003-4082-1500'])
    processor.orcid_m.storage_manager.delete_storage()

    # CASE_4: the same invalid DOI is accepted once it is listed in
    # processor._redis_values_br, and it is recorded as True in the temporary DOI storage
    processor = CrossrefProcessing(testing=True)
    redis_doi = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    processor._redis_values_br.append(redis_doi['id'])
    self.assertEqual(processor.to_validated_id_list(redis_doi), ['doi:10.1089/bsp.2008.002'])
    stored = processor.tmp_doi_m.storage_manager.get_value('doi:10.1089/bsp.2008.002')
    self.assertEqual(stored, True)
    processor.doi_m.storage_manager.delete_storage()

799 

def test_find_crossref_orcid_with_index(self):
    """Test ORCID validation using ORCID index before API validation."""
    # Setup
    test_doi = "10.1234/test123"
    test_doi_prefixed = "doi:10.1234/test123"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    # Create CrossrefProcessing instance with ORCID index
    cp = CrossrefProcessing(testing=True)
    # Registered up front so the storage is removed even when an assertion below fails.
    self.addCleanup(lambda: cp.orcid_m.storage_manager.delete_storage())
    cp.orcid_index.add_value(test_doi_prefixed, f"{test_name} [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index([test_doi])

    # Test Case 1: ORCID found in index
    out_1 = cp.find_crossref_orcid(test_orcid, test_doi)
    self.assertEqual(out_1, f"orcid:{test_orcid}")
    # Verify it was added to temporary storage
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

    # Test Case 2: ORCID not in index but valid via API
    out_2 = cp.find_crossref_orcid("0000-0003-4082-1500", test_doi)
    self.assertEqual(out_2, "orcid:0000-0003-4082-1500")

    # Test Case 3: ORCID not in index and invalid
    out_3 = cp.find_crossref_orcid("0000-0000-0000-0000", test_doi)
    self.assertEqual(out_3, "")

832 

def test_find_crossref_orcid_api_disabled_not_in_index(self):
    """API OFF + empty index: a syntactically valid ORCID must NOT be resolved."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered up front so the storage is removed even when an assertion below fails.
    self.addCleanup(lambda: cp.orcid_m.storage_manager.delete_storage())
    test_doi = "10.9999/noindex"
    candidate = "0000-0003-4082-1500"  # syntactically valid

    out = cp.find_crossref_orcid(candidate, test_doi)
    self.assertEqual(out, "")
    # Must NOT be written to tmp storage
    self.assertIsNone(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{candidate}"))

845 

def test_find_crossref_orcid_api_disabled_from_index(self):
    """API OFF + present in DOI→ORCID index: must resolve and be saved in tmp storage."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered up front so the storage is removed even when an assertion below fails.
    self.addCleanup(lambda: cp.orcid_m.storage_manager.delete_storage())
    test_doi = "10.1234/test"
    test_doi_prefixed = "doi:10.1234/test"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    cp.orcid_index.add_value(test_doi_prefixed, f"{test_name} [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index([test_doi])

    out = cp.find_crossref_orcid(test_orcid, test_doi)
    self.assertEqual(out, f"orcid:{test_orcid}")
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

862 

def test_find_crossref_orcid_api_disabled_in_storage(self):
    """API OFF + ORCID already valid in persistent storage: must be accepted."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered up front so the storage is removed even when the assertion below fails.
    self.addCleanup(lambda: cp.orcid_m.storage_manager.delete_storage())
    oid = "orcid:0000-0003-4082-1500"
    cp.orcid_m.storage_manager.set_value(oid, True)  # mark valid
    # find_crossref_orcid is called with the bare ORCID (prefix stripped).
    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/any")
    self.assertEqual(out, oid)

871 

def test_find_crossref_orcid_api_disabled_from_redis_snapshot(self):
    """API OFF + empty index/storage, but ORCID present in Redis snapshot: accept and seed tmp storage."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered up front so the storage is removed even when an assertion below fails.
    self.addCleanup(lambda: cp.orcid_m.storage_manager.delete_storage())
    oid = "orcid:0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[oid])  # emulate per-chunk snapshot

    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/noindex")
    self.assertEqual(out, oid)
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(oid))

882 

def test_find_crossref_orcid_api_enabled_invalid_in_storage(self):
    """API ON + ORCID explicitly invalid in storage: reject immediately (no API/index)."""
    cp = CrossrefProcessing(use_orcid_api=True, testing=True)
    # Registered up front so the storage is removed even when the assertion below fails.
    self.addCleanup(lambda: cp.orcid_m.storage_manager.delete_storage())
    oid = "orcid:0000-0002-9286-2630"
    cp.orcid_m.storage_manager.set_value(oid, False)  # mark invalid
    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/anything")
    self.assertEqual(out, "")

891 

def test_find_crossref_orcid_api_enabled_from_redis_snapshot(self):
    """API ON + empty storage/index, but ORCID present in Redis snapshot: accept without API call."""
    cp = CrossrefProcessing(use_orcid_api=True, testing=True)
    # Registered up front so the storage is removed even when an assertion below fails.
    self.addCleanup(lambda: cp.orcid_m.storage_manager.delete_storage())
    oid = "orcid:0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[oid])

    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/noindex")
    self.assertEqual(out, oid)
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(oid))

902 

def test_get_agents_strings_list_api_disabled_no_index(self):
    """API OFF + empty index: ORCIDs provided in agent dict MUST NOT be appended to the author string."""
    agents_list = [
        {
            "given": "Jane",
            "family": "Doe",
            "role": "author",
            "ORCID": "https://orcid.org/0000-0003-4082-1500",  # present in metadata
        }
    ]
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered up front so the storage is removed even when an assertion below fails.
    self.addCleanup(lambda: cp.orcid_m.storage_manager.delete_storage())
    authors_strings, editors_strings = cp.get_agents_strings_list("10.9999/noindex", agents_list)
    self.assertEqual(authors_strings, ["Doe, Jane"])  # no [orcid:...] tag
    self.assertEqual(editors_strings, [])

918 

def test_get_agents_strings_list_api_disabled_index_requires_prefixed_doi(self):
    """
    API OFF + DOI→ORCID index populated with a prefixed DOI key (doi:...).
    The DOI passed to get_agents_strings_list has no prefix: the function must
    normalise it before querying the index, otherwise the ORCID is not found.
    """
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered up front so the storage is removed even when an assertion below fails.
    self.addCleanup(lambda: cp.orcid_m.storage_manager.delete_storage())

    # Index populated with a **prefixed** DOI
    doi_pref = "doi:10.1234/test-idx"
    test_orcid = "0000-0002-9999-8888"
    cp.orcid_index.add_value(doi_pref, f"Smith, John [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index(["10.1234/test-idx"])

    # Author without ORCID in the metadata; DOI passed **without prefix**
    agents = [{
        "given": "John",
        "family": "Smith",
        "role": "author"
    }]

    authors, editors = cp.get_agents_strings_list("10.1234/test-idx", agents)
    # Must resolve through the index and append the [orcid:...] tag
    self.assertEqual(authors, ["Smith, John [orcid:0000-0002-9999-8888]"])
    self.assertEqual(editors, [])

945 

def test_find_crossref_orcid_api_disabled_redis_snapshot_unprefixed_orcid(self):
    """
    API OFF + empty index + empty storage, but the Redis snapshot holds the ORCID **without prefix**.
    The function must recognise it (by normalising) and validate it.
    """
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered up front so the storage is removed even when an assertion below fails.
    self.addCleanup(lambda: cp.orcid_m.storage_manager.delete_storage())

    # Redis snapshot with an **unprefixed** ORCID
    raw_orcid = "0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[raw_orcid])

    out = cp.find_crossref_orcid(raw_orcid, "10.9999/noindex")
    self.assertEqual(out, f"orcid:{raw_orcid}")
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{raw_orcid}"))

961 

def test_update_redis_values_normalizes_inputs(self):
    """
    update_redis_values must always normalise:
      - DOI → with the 'doi:' prefix
      - ORCID → with the 'orcid:' prefix
    and drop entries that cannot be normalised.
    """
    cp = CrossrefProcessing(testing=True)
    # Registered up front so the storage is removed even when an assertion below fails.
    self.addCleanup(lambda: cp.storage_manager.delete_storage())

    cp.update_redis_values(
        br=["10.1001/jama.299.12.1471", "doi:10.2105/ajph.2006.101626", "xxx-bad"],
        ra=["0000-0002-1234-5678", "orcid:0000-0003-4082-1500", "bad-orcid"]
    )

    # All normalised (and the 'bad' ones discarded)
    self.assertIn("doi:10.1001/jama.299.12.1471", cp._redis_values_br)
    self.assertIn("doi:10.2105/ajph.2006.101626", cp._redis_values_br)
    self.assertNotIn("xxx-bad", cp._redis_values_br)

    self.assertIn("orcid:0000-0002-1234-5678", cp._redis_values_ra)
    self.assertIn("orcid:0000-0003-4082-1500", cp._redis_values_ra)
    self.assertNotIn("bad-orcid", cp._redis_values_ra)

985 

986 

def test_validated_as_with_storage_manager(storage_manager):
    """validated_as returns the stored flag for known DOIs and None for an unknown one.

    `storage_manager` is supplied by the caller (presumably a pytest fixture or
    parametrisation defined outside this chunk — TODO confirm).
    """
    def doi_entry(identifier):
        return {"identifier": identifier, "schema": "doi"}

    unknown_doi = doi_entry("doi:10.1001/2012.jama.10158")
    stored_valid_doi = doi_entry("doi:10.1001/2012.jama.10368")
    stored_invalid_doi = doi_entry("doi:10.1001/2012.jama.1036")

    processor = CrossrefProcessing(storage_manager=storage_manager, testing=True)
    # Seed the DOI storage with one valid and one invalid identifier.
    for entry, flag in ((stored_valid_doi, True), (stored_invalid_doi, False)):
        processor.doi_m.storage_manager.set_value(entry["identifier"], flag)

    assert processor.validated_as(stored_valid_doi) is True
    assert processor.validated_as(stored_invalid_doi) is False
    assert processor.validated_as(unknown_doi) is None

999 

1000 

class TestCrossrefProcessingWithMockedAPI(unittest.TestCase):
    """Integration tests using mocked Crossref API responses from conftest.py."""

    def _make_processor(self, **kwargs):
        """Return a CrossrefProcessing(testing=True, **kwargs) whose storage is deleted after the test.

        The deletion is registered with addCleanup, so it also runs when an
        assertion in the test fails (a trailing delete_storage() call would be
        skipped in that case).
        """
        processor = CrossrefProcessing(testing=True, **kwargs)
        # Attribute lookup is deferred to cleanup time, like a call at the end of the test.
        self.addCleanup(lambda: processor.storage_manager.delete_storage())
        return processor

    def test_csv_creator_nature_article(self):
        """Test with Nature article from mocked API (doi:10.1038/nature12373)."""
        item = {
            "DOI": "10.1038/nature12373",
            "type": "journal-article",
            "title": ["Nanometre-scale thermometry in a living cell"],
            "author": [
                {"given": "G.", "family": "Kucsko", "sequence": "first"},
                {"given": "P. C.", "family": "Maurer", "sequence": "additional"},
                {"given": "M. D.", "family": "Lukin", "sequence": "additional"}
            ],
            "container-title": ["Nature"],
            "volume": "500",
            "issue": "7460",
            "page": "54-58",
            "issued": {"date-parts": [[2013, 7, 31]]},
            "ISSN": ["0028-0836", "1476-4687"],
            "publisher": "Springer Science and Business Media LLC",
            "member": "297",
            "prefix": "10.1038"
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1038/nature12373',
            'title': 'Nanometre-scale thermometry in a living cell',
            'author': 'Kucsko, G.; Maurer, P. C.; Lukin, M. D.',
            'pub_date': '2013-7-31',
            'venue': 'Nature [issn:0028-0836 issn:1476-4687]',
            'volume': '500',
            'issue': '7460',
            'page': '54-58',
            'type': 'journal article',
            'publisher': 'Springer Science and Business Media LLC [crossref:297]',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_plos_with_orcid_url(self):
        """Test PLOS article with ORCID in URL format from mocked API."""
        item = {
            "DOI": "10.1371/journal.pone.0284601",
            "type": "journal-article",
            "title": ["Biochemical evaluation of vaccination in rats"],
            "author": [
                {"given": "Mahsa", "family": "Teymoorzadeh", "sequence": "first"},
                {"given": "Razieh", "family": "Yazdanparast", "sequence": "additional",
                 "ORCID": "https://orcid.org/0000-0003-0530-4305", "authenticated-orcid": True}
            ],
            "container-title": ["PLOS ONE"],
            "volume": "18",
            "issue": "5",
            "page": "e0284601",
            "issued": {"date-parts": [[2023, 5, 4]]},
            "ISSN": ["1932-6203"],
            "publisher": "Public Library of Science (PLoS)"
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1371/journal.pone.0284601',
            'title': 'Biochemical evaluation of vaccination in rats',
            'author': 'Teymoorzadeh, Mahsa; Yazdanparast, Razieh [orcid:0000-0003-0530-4305]',
            'pub_date': '2023-5-4',
            'venue': 'PLOS ONE [issn:1932-6203]',
            'volume': '18',
            'issue': '5',
            # The single page value 'e0284601' is expected as a start-end range.
            'page': 'e0284601-e0284601',
            'type': 'journal article',
            'publisher': 'Public Library of Science (PLoS)',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_book_chapter_multiple_containers(self):
        """Test book chapter with multiple container-titles from mocked API."""
        item = {
            "DOI": "10.1007/978-3-030-00668-6_8",
            "type": "book-chapter",
            "title": ["The SPAR Ontologies"],
            "author": [
                {"given": "Silvio", "family": "Peroni", "sequence": "first"},
                {"given": "David", "family": "Shotton", "sequence": "additional"}
            ],
            "container-title": ["Lecture Notes in Computer Science", "The Semantic Web – ISWC 2018"],
            "page": "119-136",
            "issued": {"date-parts": [[2018]]},
            "ISBN": ["9783030006679", "9783030006686"],
            "publisher": "Springer International Publishing"
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1007/978-3-030-00668-6_8',
            'title': 'The SPAR Ontologies',
            'author': 'Peroni, Silvio; Shotton, David',
            'pub_date': '2018',
            # Only the first container-title is expected in the venue.
            'venue': 'Lecture Notes in Computer Science [isbn:9783030006679 isbn:9783030006686]',
            'volume': '',
            'issue': '',
            'page': '119-136',
            'type': 'book chapter',
            'publisher': 'Springer International Publishing',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_date_parts_null(self):
        """Test handling of date-parts with null value: [[null]] from mocked API."""
        item = {
            "DOI": "10.1234/null-date",
            "type": "journal-article",
            "title": ["Article with null date"],
            "issued": {"date-parts": [[None]]}
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1234/null-date',
            'title': 'Article with null date',
            'author': '',
            'pub_date': '',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'journal article',
            'publisher': '',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_date_parts_empty(self):
        """Test handling of date-parts as empty list: [[]] from mocked API."""
        item = {
            "DOI": "10.1234/empty-date",
            "type": "journal-article",
            "title": ["Article with empty date-parts"],
            "issued": {"date-parts": [[]]}
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1234/empty-date',
            'title': 'Article with empty date-parts',
            'author': '',
            'pub_date': '',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'journal article',
            'publisher': '',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_date_parts_missing(self):
        """Test handling of issued without date-parts key from mocked API."""
        item = {
            "DOI": "10.1234/no-dateparts",
            "type": "journal-article",
            "title": ["Article without date-parts key"],
            "issued": {}
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1234/no-dateparts',
            'title': 'Article without date-parts key',
            'author': '',
            'pub_date': '',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'journal article',
            'publisher': '',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_html_in_title(self):
        """Test HTML markup in title is cleaned (from mocked API structure)."""
        item = {
            "DOI": "10.1234/html-title",
            "type": "journal-article",
            "title": ["A study of <i>Escherichia coli</i> in <b>biofilms</b>"],
            "issued": {"date-parts": [[2024, 1, 15]]}
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1234/html-title',
            'title': 'A study of Escherichia coli in biofilms',
            'author': '',
            'pub_date': '2024-1-15',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'journal article',
            'publisher': '',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_with_editor(self):
        """Test article with both author and editor from mocked API structure."""
        item = {
            "DOI": "10.1234/with-editor",
            "type": "edited-book",
            "title": ["Edited volume test"],
            "author": [{"given": "John", "family": "Doe", "sequence": "first"}],
            "editor": [{"given": "Jane", "family": "Smith", "sequence": "first"}],
            "issued": {"date-parts": [[2024, 6, 20]]}
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1234/with-editor',
            'title': 'Edited volume test',
            'author': 'Doe, John',
            'pub_date': '2024-6-20',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'edited book',
            'publisher': '',
            'editor': 'Smith, Jane'
        }
        self.assertEqual(row, expected)

    def test_csv_creator_no_inplace_modification(self):
        """Test that csv_creator does not modify the input item dict."""
        item = {
            "DOI": "10.1234/with-editor",
            "type": "edited-book",
            "title": ["Edited volume test"],
            "author": [{"given": "John", "family": "Doe", "sequence": "first"}],
            "editor": [{"given": "Jane", "family": "Smith", "sequence": "first"}],
            "issued": {"date-parts": [[2024, 6, 20]]}
        }
        # Independent copies of the agent dicts, compared against the input after the call.
        original_author = {"given": "John", "family": "Doe", "sequence": "first"}
        original_editor = {"given": "Jane", "family": "Smith", "sequence": "first"}

        processor = self._make_processor()
        processor.csv_creator(item)

        self.assertEqual(item['author'][0], original_author)
        self.assertEqual(item['editor'][0], original_editor)

    def test_csv_creator_member_as_string(self):
        """Test that member field as string (API format) is handled."""
        item = {
            "DOI": "10.1001/test.12345",
            "type": "journal-article",
            "title": ["Test"],
            "publisher": "American Medical Association (AMA)",
            "member": "10",
            "prefix": "10.1001",
            "issued": {"date-parts": [[2024]]}
        }
        processor = self._make_processor(publishers_filepath=PUBLISHERS_MAPPING)
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1001/test.12345',
            'title': 'Test',
            'author': '',
            'pub_date': '2024',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'journal article',
            'publisher': 'American Medical Association (AMA) [crossref:10]',
            'editor': ''
        }
        self.assertEqual(row, expected)

1306 

1307 

1308 

1309 

1310 

1311 

1312 

1313 

1314 

1315 

1316 

1317 

1318 

1319