Coverage for test / crossref_processing_test.py: 100%

605 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-12 21:23 +0000

1# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it> 

2# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

3# SPDX-FileCopyrightText: 2025 Arianna Moretti <arianna.moretti4@unibo.it> 

4# 

5# SPDX-License-Identifier: ISC 

6 

7import json 

8import os 

9import unittest 

10 

11from oc_ds_converter.crossref.crossref_processing import CrossrefProcessing 

12from oc_ds_converter.datasource.orcid_index import PublishersRedis 

13from oc_ds_converter.lib.csvmanager import CSVManager 

14from oc_ds_converter.lib.jsonmanager import load_json 

15 

# Root directory holding the fixtures used by this test module.
TEST_DIR = os.path.join("test", "crossref_processing")

# JSON fixtures read by the tests (via json.load / load_json).
JSON_FILE = os.path.join(TEST_DIR, "0.json")
DATA = os.path.join(TEST_DIR, "40228.json")

# Value passed as ``publishers_filepath`` to CrossrefProcessing.
PUBLISHERS_MAPPING = os.path.join(TEST_DIR, "publishers.csv")

# Value passed as ``orcid_index`` to CrossrefProcessing.
IOD = os.path.join(TEST_DIR, "iod")

# Support-material folder path under the fixture directory.
TMP_SUPPORT_MATERIAL = os.path.join(TEST_DIR, "tmp_support")

22 

23 

24 

25class TestCrossrefProcessing(unittest.TestCase): 

def test_extract_all_ids_cited(self):
    """Check the first element returned by ``extract_all_ids(item, False)``.

    For every item of the JSON fixture, the identifiers found at index 0 of
    the result must equal (as a set) the expected DOIs. The processor is
    built with default arguments and its storage is deleted at the end.
    """
    expected_dois = {
        'doi:10.2105/ajph.2006.101626',
        'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219',
        'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21',
        'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf',
        'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c',
        'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d',
        'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6',
        'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2',
        'doi:10.2105/ajph.2009.162677',
    }
    processor = CrossrefProcessing()
    with open(JSON_FILE, encoding="utf8") as fixture:
        payload = json.load(fixture)
    for record in payload['items']:
        found_br = processor.extract_all_ids(record, False)[0]
        self.assertEqual(expected_dois, set(found_br))
    processor.storage_manager.delete_storage()

36 

def test_extract_all_ids_cited_redis(self):
    """Same check as ``test_extract_all_ids_cited`` with ``testing=True``.

    For every item of the JSON fixture, index 0 of the result of
    ``extract_all_ids(item, False)`` must equal (as a set) the expected DOIs.
    """
    expected_dois = {
        'doi:10.2105/ajph.2006.101626',
        'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219',
        'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21',
        'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf',
        'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c',
        'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d',
        'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6',
        'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2',
        'doi:10.2105/ajph.2009.162677',
    }
    processor = CrossrefProcessing(testing=True)
    with open(JSON_FILE, encoding="utf8") as fixture:
        payload = json.load(fixture)
    for record in payload['items']:
        found_br = processor.extract_all_ids(record, False)[0]
        self.assertEqual(expected_dois, set(found_br))
    processor.storage_manager.delete_storage()

54 

def test_get_redis_validity_list(self):
    """``get_redis_validity_list`` returns an empty list for these DOIs.

    The processor is built with default arguments and nothing is seeded
    by this test before the lookup.
    """
    processor = CrossrefProcessing()
    doi_ids = {
        'doi:10.2105/ajph.2006.101626',
        'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219',
        'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21',
        'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf',
        'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c',
        'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d',
        'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6',
        'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2',
        'doi:10.2105/ajph.2009.162677',
    }
    found = processor.get_redis_validity_list(doi_ids, "br")
    self.assertEqual(found, [])
    processor.storage_manager.delete_storage()

69 

def test_get_redis_validity_list_redis(self):
    """With ``testing=True``, both the "br" and "ra" lookups return ``[]``.

    Nothing is seeded by this test before the lookups.
    """
    processor = CrossrefProcessing(testing=True)
    doi_ids = {
        'doi:10.2105/ajph.2006.101626',
        'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219',
        'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21',
        'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf',
        'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c',
        'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d',
        'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6',
        'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2',
        'doi:10.2105/ajph.2009.162677',
    }
    orcid_ids = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    # Both lookups run before either assertion, as in the original flow.
    found_br = processor.get_redis_validity_list(doi_ids, "br")
    found_ra = processor.get_redis_validity_list(orcid_ids, "ra")

    self.assertEqual(found_br, [])
    self.assertEqual(found_ra, [])
    processor.storage_manager.delete_storage()

88 

def test_get_redis_validity_dict_w_fakeredis_db_values_sqlite(self):
    """Only identifiers seeded in BR_redis / RA_redis are reported as valid.

    One DOI and one ORCID are added to the processor's ``BR_redis`` and
    ``RA_redis``; ``get_redis_validity_list`` must then return exactly those
    identifiers. The processor is built with default arguments.

    Cleanup is registered with ``addCleanup`` so that the seeded keys and
    the storage are removed even when an assertion fails; otherwise the
    leftover keys could leak into other tests.
    """
    seeded_doi = 'doi:10.2105/ajph.2006.101626'
    seeded_orcid = 'orcid:0000-0002-8090-6886'

    c_processing = CrossrefProcessing()
    # Cleanups run in LIFO order: delete_storage, then the BR key, then the
    # RA key -- the same order the original test used.
    self.addCleanup(c_processing.RA_redis.delete, seeded_orcid)
    self.addCleanup(c_processing.BR_redis.delete, seeded_doi)
    self.addCleanup(c_processing.storage_manager.delete_storage)

    c_processing.BR_redis.sadd(seeded_doi, "omid:1")
    c_processing.RA_redis.sadd(seeded_orcid, "omid:2")

    br = {seeded_doi, 'doi:10.1001/jama.299.12.1471',
          'doi:10.1177/003335490812300219'}
    ra = {seeded_orcid, 'orcid:0000-0002-6491-0754'}

    br_valid_list = c_processing.get_redis_validity_list(br, "br")
    ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")

    self.assertEqual(br_valid_list, [seeded_doi])
    self.assertEqual(ra_valid_list, [seeded_orcid])

109 

def test_get_redis_validity_dict_w_fakeredis_db_values_redis(self):
    """Only identifiers seeded in BR_redis / RA_redis are reported as valid.

    Same scenario as the ``_sqlite`` variant, but the processor is built
    with ``testing=True``.

    Cleanup is registered with ``addCleanup`` so that the seeded keys and
    the storage are removed even when an assertion fails; otherwise the
    leftover keys could leak into other tests.
    """
    seeded_doi = 'doi:10.2105/ajph.2006.101626'
    seeded_orcid = 'orcid:0000-0002-8090-6886'

    c_processing = CrossrefProcessing(testing=True)
    # Cleanups run in LIFO order: delete_storage, then the BR key, then the
    # RA key -- the same order the original test used.
    self.addCleanup(c_processing.RA_redis.delete, seeded_orcid)
    self.addCleanup(c_processing.BR_redis.delete, seeded_doi)
    self.addCleanup(c_processing.storage_manager.delete_storage)

    c_processing.BR_redis.sadd(seeded_doi, "omid:1")
    c_processing.RA_redis.sadd(seeded_orcid, "omid:2")

    br = {seeded_doi, 'doi:10.1001/jama.299.12.1471',
          'doi:10.1177/003335490812300219'}
    ra = {seeded_orcid, 'orcid:0000-0002-6491-0754'}

    br_valid_list = c_processing.get_redis_validity_list(br, "br")
    ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")

    self.assertEqual(br_valid_list, [seeded_doi])
    self.assertEqual(ra_valid_list, [seeded_orcid])

130 

def test_validated_as_default(self):
    """``validated_as`` returns None for an identifier never validated before.

    The method takes an ID dict with keys "schema" and "identifier" and is
    expected to return True / False for ids already recorded as valid /
    invalid, and None otherwise. Here only the None case is exercised, on a
    processor built with default arguments (default storage manager,
    no pre-existing values set by this test).
    """
    processor = CrossrefProcessing()
    unseen_id = {"schema": "doi", "identifier": "doi:10.1001/10-v4n2-hsf10003"}
    outcome = processor.validated_as(unseen_id)
    self.assertEqual(outcome, None)
    processor.storage_manager.delete_storage()

146 

def test_validated_as_default_redis(self):
    """``validated_as`` returns None for an identifier never validated before.

    Same check as ``test_validated_as_default`` but on a processor built
    with ``testing=True`` and no values set by this test beforehand.
    """
    processor = CrossrefProcessing(testing=True)
    unseen_id = {"schema": "doi", "identifier": "doi:10.1001/10-v4n2-hsf10003"}
    outcome = processor.validated_as(unseen_id)
    self.assertEqual(outcome, None)
    processor.storage_manager.delete_storage()

161 

def test_validated_as_redis_with_preexistent_data(self):
    """``validated_as`` reflects values already present in the DOI storage.

    Given an ID dict with keys "schema" and "identifier", the method must
    return:
    - True if the id was already recorded as valid
    - False if the id was already recorded as invalid
    - None if the id was never recorded

    The processor is built with ``testing=True`` and the values are written
    directly on the DOI manager's storage manager. Cleanup is registered
    with ``addCleanup`` so the seeded values are removed even when an
    assertion fails.
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing_redis = CrossrefProcessing(testing=True)
    doi_storage = c_processing_redis.doi_m.storage_manager
    self.addCleanup(doi_storage.delete_storage)

    # Seed the DOI manager's storage directly, bypassing any validation.
    doi_storage.set_value(valid_doi_in_db["identifier"], True)
    doi_storage.set_value(invalid_doi_in_db["identifier"], False)

    self.assertEqual(c_processing_redis.validated_as(valid_doi_in_db), True)
    self.assertEqual(c_processing_redis.validated_as(invalid_doi_in_db), False)
    self.assertEqual(c_processing_redis.validated_as(valid_doi_not_in_db), None)

189 

def test_validated_as_inmemory(self):
    """``validated_as`` reflects values already present in the DOI storage.

    Given an ID dict with keys "schema" and "identifier", the method must
    return:
    - True if the id was already recorded as valid
    - False if the id was already recorded as invalid
    - None if the id was never recorded

    NOTE(review): despite the test name, the processor is built with
    ``CrossrefProcessing(testing=True)``, exactly like the redis variants;
    no in-memory/JSON storage manager is selected explicitly here. Confirm
    whether a dedicated in-memory configuration was intended.

    Cleanup is registered with ``addCleanup`` so the seeded values are
    removed even when an assertion fails.
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing = CrossrefProcessing(testing=True)
    doi_storage = c_processing.doi_m.storage_manager
    self.addCleanup(doi_storage.delete_storage)

    # Seed the DOI manager's storage directly, bypassing any validation.
    doi_storage.set_value(valid_doi_in_db["identifier"], True)
    doi_storage.set_value(invalid_doi_in_db["identifier"], False)

    self.assertEqual(c_processing.validated_as(valid_doi_in_db), True)
    self.assertEqual(c_processing.validated_as(invalid_doi_in_db), False)
    self.assertEqual(c_processing.validated_as(valid_doi_not_in_db), None)

217 

def test_validated_as_redis(self):
    """``validated_as`` reflects values already present in the DOI storage.

    Given an ID dict with keys "schema" and "identifier", the method must
    return:
    - True if the id was already recorded as valid
    - False if the id was already recorded as invalid
    - None if the id was never recorded

    The processor is built with ``testing=True``; ids with and without a
    stored value are both exercised. Cleanup is registered with
    ``addCleanup`` so it also runs when an assertion fails.
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing_redis = CrossrefProcessing(testing=True)
    # NOTE(review): values are written on doi_m.storage_manager but the
    # cleanup targets the processor's own storage_manager, as in the
    # original test -- confirm both refer to the same storage.
    self.addCleanup(c_processing_redis.storage_manager.delete_storage)

    # Seed the DOI manager's storage directly, bypassing any validation.
    doi_storage = c_processing_redis.doi_m.storage_manager
    doi_storage.set_value(valid_doi_in_db["identifier"], True)
    doi_storage.set_value(invalid_doi_in_db["identifier"], False)

    self.assertEqual(c_processing_redis.validated_as(valid_doi_in_db), True)
    self.assertEqual(c_processing_redis.validated_as(invalid_doi_in_db), False)
    self.assertEqual(c_processing_redis.validated_as(valid_doi_not_in_db), None)

245 

def test_get_id_manager(self):
    """``get_id_manager`` resolves a manager from a schema name or a prefixed id.

    Both the bare schema string ('issn') and a prefixed identifier
    ('issn:0003-987X') are looked up in the processor's venue id-manager
    mapping; each returned manager must exist and accept the ISSN as valid.
    """
    processor = CrossrefProcessing()
    managers_by_schema = processor.venue_id_man_dict
    issn_id = "issn:0003-987X"

    manager_from_id = processor.get_id_manager(issn_id, managers_by_schema)
    manager_from_schema = processor.get_id_manager("issn", managers_by_schema)

    # The managers must be returned and must behave as ISSN validators.
    assert manager_from_id is not None
    assert manager_from_schema is not None
    self.assertTrue(manager_from_id.is_valid(issn_id))
    self.assertTrue(manager_from_schema.is_valid(issn_id))

265 

def test_csv_creator(self):
    """``csv_creator`` yields 11 non-empty rows for the DATA fixture.

    The DOIs of the fixture are passed to ``prefetch_doi_orcid_index`` first;
    then three expected ids must be present in the output and the row for
    the book chapter must carry the expected type and publisher.
    """
    processor = CrossrefProcessing(orcid_index=IOD, publishers_filepath=None)
    data = load_json(DATA, None)  # type: ignore[arg-type]
    assert data is not None
    processor.prefetch_doi_orcid_index(
        [entry.get("DOI") for entry in data['items'] if entry.get("DOI")]
    )

    # Keep only the truthy rows produced by csv_creator, in input order.
    rows = [
        row
        for row in (processor.csv_creator(entry) for entry in data['items'])
        if row
    ]
    self.assertEqual(len(rows), 11)

    produced_ids = [row['id'] for row in rows]
    for wanted_id in (
        'doi:10.47886/9789251092637.ch7',
        'doi:10.9799/ksfan.2012.25.1.069',
        'doi:10.9799/ksfan.2012.25.1.105',
    ):
        self.assertIn(wanted_id, produced_ids)

    chapter_row = next(row for row in rows if row['id'] == 'doi:10.47886/9789251092637.ch7')
    self.assertEqual(chapter_row['type'], 'book chapter')
    self.assertEqual(chapter_row['publisher'], 'American Fisheries Society [crossref:460]')

285 

def test_csv_creator_cited(self):
    """With ``citing=False``, each reference with a DOI becomes an id-only row.

    Every reference of the JSON fixture that has a "DOI" key is passed to
    ``csv_creator``; the truthy results, in order, must be rows whose only
    non-empty field is 'id'.
    """
    processor = CrossrefProcessing(orcid_index=IOD, publishers_filepath=None, citing=False)
    with open(JSON_FILE, encoding="utf8") as fixture:
        payload = json.load(fixture)

    rows = []
    for entry in payload['items']:
        references = entry.get("reference")
        if not references:
            # Entities without citations are skipped.
            continue
        for reference in [ref for ref in references if ref.get("DOI")]:
            row = processor.csv_creator(reference)
            if row:
                rows.append(row)

    expected_ids = [
        'doi:10.2105/ajph.2006.101626',
        'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219',
        'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21',
        'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf',
        'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c',
        'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d',
        'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6',
        'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2',
        'doi:10.2105/ajph.2009.162677',
    ]
    expected_rows = [
        {'id': doi, 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '',
         'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}
        for doi in expected_ids
    ]
    self.assertEqual(rows, expected_rows)

318 

def test_get_pages(self):
    """A plain numeric range is returned unchanged by ``get_crossref_pages``."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    result = processor.get_crossref_pages({'page': '469-476'})
    self.assertEqual(result, '469-476')

326 

def test_get_pages_right_letter(self):
    """A single page 'G22' is expanded to the range 'G22-G22'."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    result = processor.get_crossref_pages({'page': 'G22'})
    self.assertEqual(result, 'G22-G22')

334 

def test_get_pages_wrong_letter(self):
    """The input '583b-584' is expected to come back as '583-584'."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    result = processor.get_crossref_pages({'page': '583b-584'})
    self.assertEqual(result, '583-584')

342 

def test_get_pages_roman_letters(self):
    """The roman-numeral range 'iv-l' is returned unchanged."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    result = processor.get_crossref_pages({'page': 'iv-l'})
    self.assertEqual(result, 'iv-l')

350 

def test_get_pages_non_roman_letters(self):
    """The non-roman letter range 'kj-hh' yields an empty string."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    result = processor.get_crossref_pages({'page': 'kj-hh'})
    self.assertEqual(result, '')

358 

def test_load_publishers_mapping(self):
    """``load_publishers_mapping`` returns the expected mapping for the CSV fixture.

    The expected result maps an id string to a dict with the publisher
    'name' and the set of its DOI 'prefixes'.
    """
    expected_mapping = {
        '1': {'name': 'Annals of Family Medicine', 'prefixes': {'10.1370'}},
        '2': {
            'name': 'American Association of Petroleum Geologists AAPG/Datapages',
            'prefixes': {'10.15530', '10.1306'},
        },
        '3': {'name': 'American Association of Physics Teachers (AAPT)', 'prefixes': {'10.1119'}},
        '6': {'name': 'American College of Medical Physics (ACMP)', 'prefixes': {'10.1120'}},
        '9': {'name': 'Allen Press', 'prefixes': {'10.1043'}},
        '10': {'name': 'American Medical Association (AMA)', 'prefixes': {'10.1001'}},
        '11': {'name': 'American Economic Association', 'prefixes': {'10.1257'}},
        '460': {'name': 'American Fisheries Society', 'prefixes': {'10.1577', '10.47886'}},
    }
    loaded = CrossrefProcessing.load_publishers_mapping(publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(loaded, expected_mapping)

372 

def test_get_publisher_name(self):
    """The item's member is present in the publishers' mapping."""
    doi = '10.47886/9789251092637.ch7'
    record = {
        'publisher': 'American Fisheries Society',
        'DOI': '10.47886/9789251092637.ch7',
        'prefix': '10.47886',
        'member': '460',
    }
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_publisher_name(doi, record),
        'American Fisheries Society [crossref:460]',
    )

385 

def test_get_publisher_name_no_member(self):
    """The item has no member, but its DOI prefix is in the publishers' mapping."""
    doi = '10.47886/9789251092637.ch7'
    record = {
        'publisher': 'American Fisheries Society',
        'DOI': '10.47886/9789251092637.ch7',
        'prefix': '10.47886',
    }
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_publisher_name(doi, record),
        'American Fisheries Society [crossref:460]',
    )

397 

def test_get_publisher_name_redis_by_member(self):
    """Publisher resolved through ``PublishersRedis`` for an item carrying a member.

    Publisher "460" is registered on a ``PublishersRedis(testing=True)``
    which is then injected into the processor's ``_publishers_redis``.
    """
    redis_publishers = PublishersRedis(testing=True)
    redis_publishers.set_publisher("460", "American Fisheries Society", {"10.47886"})

    doi = '10.47886/9789251092637.ch7'
    record = {
        'publisher': 'American Fisheries Society',
        'DOI': '10.47886/9789251092637.ch7',
        'prefix': '10.47886',
        'member': '460',
    }
    processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    # Swap in the pre-populated publishers store.
    processor._publishers_redis = redis_publishers
    self.assertEqual(
        processor.get_publisher_name(doi, record),
        'American Fisheries Society [crossref:460]',
    )

416 

def test_get_publisher_name_redis_by_prefix(self):
    """Publisher resolved through ``PublishersRedis`` for an item without a member.

    Publisher "460" is registered with prefix "10.47886" on a
    ``PublishersRedis(testing=True)`` injected into the processor; the item
    only carries the DOI prefix.
    """
    redis_publishers = PublishersRedis(testing=True)
    redis_publishers.set_publisher("460", "American Fisheries Society", {"10.47886"})

    doi = '10.47886/9789251092637.ch7'
    record = {
        'publisher': 'American Fisheries Society',
        'DOI': '10.47886/9789251092637.ch7',
        'prefix': '10.47886',
    }
    processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    # Swap in the pre-populated publishers store.
    processor._publishers_redis = redis_publishers
    self.assertEqual(
        processor.get_publisher_name(doi, record),
        'American Fisheries Society [crossref:460]',
    )

434 

def test_get_publisher_name_redis_not_found(self):
    """With nothing registered by this test, the item's own publisher name is returned."""
    redis_publishers = PublishersRedis(testing=True)

    doi = '10.9999/unknown'
    record = {
        'publisher': 'Unknown Publisher',
        'DOI': '10.9999/unknown',
        'prefix': '10.9999',
    }
    processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    # Swap in the publishers store created above (no publisher set on it here).
    processor._publishers_redis = redis_publishers
    self.assertEqual(processor.get_publisher_name(doi, record), 'Unknown Publisher')

451 

def test_get_venue_name(self):
    """Without any venue id, the container title is returned as is."""
    record = {'container-title': ['Cerebrospinal Fluid [Working Title]']}
    # Row template with every field empty except the type.
    row = dict.fromkeys(
        ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page'), ''
    )
    row.update({'type': 'journal article', 'publisher': '', 'editor': ''})
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_venue_name(record, row),
        'Cerebrospinal Fluid [Working Title]',
    )

460 

def test_get_venue_name_with_ISSN(self):
    """The item's ISSNs are appended to the venue name in square brackets."""
    record = {
        "container-title": ["Disaster Medicine and Public Health Preparedness"],
        "ISSN": ["1935-7893", "1938-744X"],
    }
    # Row template with every field empty except the type.
    row = dict.fromkeys(
        ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page'), ''
    )
    row.update({'type': 'journal article', 'publisher': '', 'editor': ''})
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_venue_name(record, row),
        'Disaster Medicine and Public Health Preparedness [issn:1935-7893 issn:1938-744X]',
    )

471 

def test_find_crossref_orcid(self):
    """Exercise ``find_crossref_orcid`` with fresh and pre-stored ORCIDs.

    On a processor built with default arguments:
    - a valid ORCID is returned with the "orcid:" prefix
    - an invalid ORCID yields an empty string
    - a non-string input (None) yields an empty string

    On processors built with ``testing=True``, the value previously stored
    for the ORCID (False, then True) decides the result.
    """
    doi_under_test = "10.1234/test123"
    processor = CrossrefProcessing()

    # Valid ORCID
    self.assertEqual(
        processor.find_crossref_orcid('0000-0003-4082-1500', doi_under_test),
        "orcid:0000-0003-4082-1500",
    )
    # Invalid ORCID
    self.assertEqual(
        processor.find_crossref_orcid('5500-0001-9759-3938', doi_under_test),
        "",
    )
    # Non-string input
    self.assertEqual(processor.find_crossref_orcid(None, doi_under_test), "")
    processor.orcid_m.storage_manager.delete_storage()

    # The stored validity must be honoured: first the id is recorded as
    # invalid, then (on a new processor) as valid.
    for stored_validity, expected in (
        (False, ""),
        (True, "orcid:0000-0001-9759-3938"),
    ):
        processor = CrossrefProcessing(testing=True)
        processor.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", stored_validity)
        outcome = processor.find_crossref_orcid('0000-0001-9759-3938', doi_under_test)
        self.assertEqual(outcome, expected)
        processor.orcid_m.storage_manager.delete_storage()

522 

def test_report_series_venue_id(self):
    """For a report series with a container title, the ISSN is attached to the venue.

    The expected row keeps only the DOI in 'id' and carries the ISSN inside
    the 'venue' field. The processor is built once; the original test built
    an extra instance that was immediately overwritten and never used.
    """
    items = {'items': [{
        'DOI': '10.1007/978-3-030-00668-6_8',
        'container-title': ["troitel'stvo: nauka i obrazovanie [Construction: Science and Education]"],
        'ISSN': '2305-5502',
        'type': 'report-series'
    }]}
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    output = [crossref_processor.csv_creator(item) for item in items['items']]
    expected_output = [{
        'id': 'doi:10.1007/978-3-030-00668-6_8',
        'title': '',
        'author': '',
        'pub_date': '',
        'venue': "troitel'stvo: nauka i obrazovanie [Construction: Science and Education] [issn:2305-5502]",
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'report series',
        'publisher': '',
        'editor': '',
    }]
    self.assertEqual(output, expected_output)

537 

def test_report_series_br_id(self):
    """For a report series without a container title, the ISSN is attached to the id.

    The expected row has an empty 'venue' and an 'id' made of the DOI
    followed by the ISSN. The processor is built once; the original test
    built an extra instance that was immediately overwritten and never used.
    """
    items = {'items': [{
        'DOI': '10.1007/978-3-030-00668-6_8',
        'container-title': [],
        'ISSN': '2305-5502',
        'type': 'report-series'
    }]}
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    output = [crossref_processor.csv_creator(item) for item in items['items']]
    expected_output = [{
        'id': 'doi:10.1007/978-3-030-00668-6_8 issn:2305-5502',
        'title': '',
        'author': '',
        'pub_date': '',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'report series',
        'publisher': '',
        'editor': '',
    }]
    self.assertEqual(output, expected_output)

552 

def test_get_agents_strings_list(self):
    """Four authors are rendered as 'Family, Given'; one gets an ORCID.

    The DOI is passed to ``prefetch_doi_orcid_index`` before calling
    ``get_agents_strings_list``; only 'Choi, Mi-Kyeong' is expected to be
    enriched with an ORCID.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    authors = [
        {'given': given, 'family': family, 'affiliation': [], "role": "author"}
        for given, family in (
            ('Myung-Hee', 'Kim'),
            ('Jin-Seon', 'Seo'),
            ('Mi-Kyeong', 'Choi'),
            ('Eun-Young', 'Kim'),
        )
    ]
    processor = CrossrefProcessing(IOD)
    processor.prefetch_doi_orcid_index([doi])
    rendered, _ = processor.get_agents_strings_list(doi, authors)
    self.assertEqual(
        rendered,
        [
            'Kim, Myung-Hee',
            'Seo, Jin-Seon',
            'Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]',
            'Kim, Eun-Young',
        ],
    )

587 

def test_get_agents_strings_list_same_family(self):
    """Two authors share the family name and the given-name initials.

    Only 'Choi, Mi-Kyeong' is expected to receive the ORCID.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    authors = [
        {'given': given, 'family': 'Choi', 'affiliation': [], "role": "author"}
        for given in ('Mi-Kyeong', 'Mi-Hong')
    ]
    processor = CrossrefProcessing(IOD)
    processor.prefetch_doi_orcid_index([doi])
    rendered, _ = processor.get_agents_strings_list(doi, authors)
    self.assertEqual(
        rendered,
        ['Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]', 'Choi, Mi-Hong'],
    )

610 

def test_get_agents_strings_list_homonyms(self):
    """Two authors have the same family name and the same given name.

    Neither rendered author is expected to carry an ORCID.
    """
    # NOTE(review): no prefetch_doi_orcid_index call is made in this test;
    # confirm the ORCID index is still consulted for this DOI.
    doi = '10.9799/ksfan.2012.25.1.105'
    authors = [
        {'given': 'Mi-Kyeong', 'family': 'Choi', 'affiliation': [], "role": "author"}
        for _ in range(2)
    ]
    processor = CrossrefProcessing(IOD)
    rendered, _ = processor.get_agents_strings_list(doi, authors)
    self.assertEqual(rendered, ['Choi, Mi-Kyeong', 'Choi, Mi-Kyeong'])

632 

def test_get_agents_strings_list_inverted_names(self):
    """One author's given name equals the other author's family name.

    The first agent has given and family names swapped ('Choi' /
    'Mi-Kyeong'); no ORCID tag is expected on either output string.
    """
    first_agent = {
        'given': 'Choi',
        'family': 'Mi-Kyeong',
        'affiliation': [],
        'role': 'author',
    }
    second_agent = {
        'given': 'Mi-Hong',
        'family': 'Choi',
        'affiliation': [],
        'role': 'author',
    }

    processor = CrossrefProcessing(IOD)
    produced_authors, _ = processor.get_agents_strings_list(
        '10.9799/ksfan.2012.25.1.105', [first_agent, second_agent]
    )

    self.assertEqual(produced_authors, ['Mi-Kyeong, Choi', 'Choi, Mi-Hong'])

654 

def test_get_agents_strings_list_overlapping_surnames(self):
    """The surname of one agent ('M') is contained in the surname of others.

    The in-memory index holds a single entry for 'Malek, Sri Nurestri
    Abdul'; that person appears both as author and as editor and is the
    only one expected to be tagged with the ORCID, in both output lists.
    """
    def agent(given, family, role="author", sequence="additional"):
        # Same key order as a Crossref contributor entry in the fixtures.
        return {
            "given": given,
            "family": family,
            "sequence": sequence,
            "affiliation": [],
            "role": role,
        }

    authors_list = [
        agent("Puvaneswari", "Paravamsivam", sequence="first"),
        agent("Chua Kek", "Heng"),
        agent("Sri Nurestri Abdul", "Malek"),
        agent("Vikineswary", "Sabaratnam"),
        agent("Ravishankar Ram", "M"),
        agent("Sri Nurestri Abdul", "Malek", role="editor"),
        agent("Umah Rani", "Kuppusamy"),
    ]
    doi = '10.9799/ksfan.2012.25.1.105'
    tagged_malek = 'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]'

    processor = CrossrefProcessing(None)
    in_memory_index = CSVManager()
    in_memory_index.data = {
        'doi:10.9799/ksfan.2012.25.1.105': {'Malek, Sri Nurestri Abdul [0000-0001-6278-8559]'}
    }
    processor.orcid_index = in_memory_index
    processor.prefetch_doi_orcid_index([doi])

    produced = processor.get_agents_strings_list(doi, authors_list)
    produced_authors, produced_editors = produced

    wanted_authors = [
        'Paravamsivam, Puvaneswari',
        'Heng, Chua Kek',
        tagged_malek,
        'Sabaratnam, Vikineswary',
        'M, Ravishankar Ram',
        'Kuppusamy, Umah Rani',
    ]
    wanted_editors = [tagged_malek]
    self.assertEqual(
        (produced_authors, produced_editors),
        (wanted_authors, wanted_editors),
    )

717 

def test_get_agents_strings_list_short_surname_substring(self):
    """A short indexed surname must not match longer surnames containing it.

    The index holds 'Li, Liang'. 'Gladilin' and 'Poggioli' both contain
    'li' as a substring but must stay untagged; only the genuine 'Li'
    author may receive the ORCID.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    authors_list = [
        {"given": given, "family": family, "affiliation": [], "role": "author"}
        for given, family in (("L. K.", "Gladilin"), ("L.", "Poggioli"), ("L.", "Li"))
    ]

    processor = CrossrefProcessing(None)
    in_memory_index = CSVManager()
    in_memory_index.data = {
        'doi:10.9799/ksfan.2012.25.1.105': {'Li, Liang [0000-0001-6411-6107]'}
    }
    processor.orcid_index = in_memory_index
    processor.prefetch_doi_orcid_index([doi])

    produced_authors, _ = processor.get_agents_strings_list(doi, authors_list)

    self.assertEqual(
        produced_authors,
        ['Gladilin, L. K.', 'Poggioli, L.', 'Li, L. [orcid:0000-0001-6411-6107]'],
    )

736 

def test_id_worker(self):
    """id_worker appends prefixed, normalised ids to the list it is given.

    An ISSN passed as a plain string and an ISBN passed inside a list are
    both handled; the ISBN is expected without hyphens.
    """
    collected_issn = []
    collected_isbn = []

    CrossrefProcessing.id_worker(
        'ISSN 1050-124X', collected_issn, CrossrefProcessing.issn_worker
    )
    CrossrefProcessing.id_worker(
        ['978-1-56619-909-4'], collected_isbn, CrossrefProcessing.isbn_worker
    )

    self.assertEqual(
        (collected_issn, collected_isbn),
        (['issn:1050-124X'], ['isbn:9781566199094']),
    )

747 

def test_to_validated_id_list(self):
    """Exercise to_validated_id_list on four id dicts, one fresh processor each.

    Cases: a DOI expected to validate, a DOI expected to be rejected, an
    ORCID expected to validate, and the rejected DOI again after it has
    been appended to the processor's _redis_values_br list.
    """
    # CASE_1: DOI expected to be valid.
    processor = CrossrefProcessing(testing=True)
    validated = processor.to_validated_id_list(
        {'id': 'doi:10.13039/100005522', 'schema': 'doi'}
    )
    self.assertEqual(validated, ['doi:10.13039/100005522'])
    processor.doi_m.storage_manager.delete_storage()

    # CASE_2: DOI expected to be invalid, so nothing is returned.
    processor = CrossrefProcessing(testing=True)
    validated = processor.to_validated_id_list(
        {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    )
    self.assertEqual(validated, [])

    # CASE_3: ORCID expected to be valid.
    processor = CrossrefProcessing(testing=True)
    validated = processor.to_validated_id_list(
        {'id': 'orcid:0000-0003-4082-1500', 'schema': 'orcid'}
    )
    self.assertEqual(validated, ['orcid:0000-0003-4082-1500'])
    processor.orcid_m.storage_manager.delete_storage()

    # CASE_4: the DOI rejected in CASE_2 is listed in _redis_values_br.
    processor = CrossrefProcessing(testing=True)
    listed_doi = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    processor._redis_values_br.append(listed_doi['id'])
    validated = processor.to_validated_id_list(listed_doi)
    self.assertEqual(validated, ['doi:10.1089/bsp.2008.002'])
    # The id must also have been recorded as True in the temporary DOI storage.
    stored_flag = processor.tmp_doi_m.storage_manager.get_value('doi:10.1089/bsp.2008.002')
    self.assertEqual(stored_flag, True)
    processor.doi_m.storage_manager.delete_storage()

782 

783 

def test_to_validated_id_list_redis(self):
    """Run the four to_validated_id_list scenarios, one fresh processor each.

    The steps are the same as in test_to_validated_id_list: valid DOI,
    invalid DOI, valid ORCID, and the invalid DOI accepted once it is
    present in _redis_values_br.
    """
    rejected_doi = 'doi:10.1089/bsp.2008.002'

    # CASE_1: DOI expected to be valid.
    processor = CrossrefProcessing(testing=True)
    case_input = {'id': 'doi:10.13039/100005522', 'schema': 'doi'}
    self.assertEqual(
        processor.to_validated_id_list(case_input),
        ['doi:10.13039/100005522'],
    )
    processor.doi_m.storage_manager.delete_storage()

    # CASE_2: DOI expected to be invalid.
    processor = CrossrefProcessing(testing=True)
    case_input = {'id': rejected_doi, 'schema': 'doi'}
    self.assertEqual(processor.to_validated_id_list(case_input), [])

    # CASE_3: ORCID expected to be valid.
    processor = CrossrefProcessing(testing=True)
    case_input = {'id': 'orcid:0000-0003-4082-1500', 'schema': 'orcid'}
    self.assertEqual(
        processor.to_validated_id_list(case_input),
        ['orcid:0000-0003-4082-1500'],
    )
    processor.orcid_m.storage_manager.delete_storage()

    # CASE_4: invalid DOI that is present in _redis_values_br.
    processor = CrossrefProcessing(testing=True)
    case_input = {'id': rejected_doi, 'schema': 'doi'}
    processor._redis_values_br.append(case_input['id'])
    self.assertEqual(processor.to_validated_id_list(case_input), [rejected_doi])
    # Acceptance must be mirrored in the temporary DOI storage.
    stored_flag = processor.tmp_doi_m.storage_manager.get_value(rejected_doi)
    self.assertEqual(stored_flag, True)
    processor.doi_m.storage_manager.delete_storage()

818 

def test_find_crossref_orcid_with_index(self):
    """Test ORCID validation using ORCID index before API validation.

    Three lookups are made for the same DOI: an ORCID registered in the
    index, an ORCID absent from the index that is expected to validate,
    and an ORCID absent from the index that is expected to be rejected
    (empty string returned).
    """
    # Setup
    test_doi = "10.1234/test123"
    test_doi_prefixed = "doi:10.1234/test123"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    # Create CrossrefProcessing instance with ORCID index
    cp = CrossrefProcessing(testing=True)
    # Registered up front so the storage is removed even if an assertion
    # below fails; a trailing call would be skipped in that case.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    cp.orcid_index.add_value(test_doi_prefixed, f"{test_name} [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index([test_doi])

    # Test Case 1: ORCID found in index
    out_1 = cp.find_crossref_orcid(test_orcid, test_doi)
    exp_1 = f"orcid:{test_orcid}"
    self.assertEqual(out_1, exp_1)
    # Verify it was added to temporary storage
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

    # Test Case 2: ORCID not in index but valid via API
    out_2 = cp.find_crossref_orcid("0000-0003-4082-1500", test_doi)
    exp_2 = "orcid:0000-0003-4082-1500"
    self.assertEqual(out_2, exp_2)

    # Test Case 3: ORCID not in index and invalid
    out_3 = cp.find_crossref_orcid("0000-0000-0000-0000", test_doi)
    exp_3 = ""
    self.assertEqual(out_3, exp_3)

851 

def test_find_crossref_orcid_api_disabled_not_in_index(self):
    """API OFF + empty index: a syntactically valid ORCID must NOT be resolved.

    find_crossref_orcid must return an empty string and nothing may be
    written to the temporary ORCID storage.
    """
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered first so the storage is removed even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    test_doi = "10.9999/noindex"
    candidate = "0000-0003-4082-1500"  # syntactically valid

    out = cp.find_crossref_orcid(candidate, test_doi)
    self.assertEqual(out, "")
    # Must NOT be written to tmp storage
    self.assertIsNone(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{candidate}"))

864 

def test_find_crossref_orcid_api_disabled_from_index(self):
    """API OFF + present in the DOI-to-ORCID index: must resolve and be saved in tmp storage."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered first so the storage is removed even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    test_doi = "10.1234/test"
    test_doi_prefixed = "doi:10.1234/test"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    # The index is keyed by the prefixed DOI; the prefetch takes the bare DOI.
    cp.orcid_index.add_value(test_doi_prefixed, f"{test_name} [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index([test_doi])

    out = cp.find_crossref_orcid(test_orcid, test_doi)
    self.assertEqual(out, f"orcid:{test_orcid}")
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

881 

def test_find_crossref_orcid_api_disabled_in_storage(self):
    """API OFF + ORCID already valid in persistent storage: must be accepted."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # This test writes to the persistent storage, so the cleanup is
    # registered before the write and runs even if the assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    oid = "orcid:0000-0003-4082-1500"
    cp.orcid_m.storage_manager.set_value(oid, True)  # mark valid
    # find_crossref_orcid is called with the bare ORCID (prefix stripped).
    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/any")
    self.assertEqual(out, oid)

890 

def test_find_crossref_orcid_api_disabled_from_redis_snapshot(self):
    """API OFF + empty index/storage, but ORCID present in Redis snapshot: accept and seed tmp storage."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered first so the storage is removed even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    oid = "orcid:0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[oid])  # emulate per-chunk snapshot

    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/noindex")
    self.assertEqual(out, oid)
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(oid))

901 

def test_find_crossref_orcid_api_enabled_invalid_in_storage(self):
    """API ON + ORCID explicitly invalid in storage: reject immediately (no API/index)."""
    cp = CrossrefProcessing(use_orcid_api=True, testing=True)
    # This test writes to the persistent storage, so the cleanup is
    # registered before the write and runs even if the assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    oid = "orcid:0000-0002-9286-2630"
    cp.orcid_m.storage_manager.set_value(oid, False)  # mark invalid
    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/anything")
    self.assertEqual(out, "")

910 

def test_find_crossref_orcid_api_enabled_from_redis_snapshot(self):
    """API ON + empty storage/index, but ORCID present in Redis snapshot: accept without API call."""
    cp = CrossrefProcessing(use_orcid_api=True, testing=True)
    # Registered first so the storage is removed even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    oid = "orcid:0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[oid])

    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/noindex")
    self.assertEqual(out, oid)
    # Acceptance must also be recorded in the temporary ORCID storage.
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(oid))

921 

def test_get_agents_strings_list_api_disabled_no_index(self):
    """API OFF + empty index: ORCIDs provided in agent dict MUST NOT be appended to the author string."""
    agents_list = [
        {
            "given": "Jane",
            "family": "Doe",
            "role": "author",
            "ORCID": "https://orcid.org/0000-0003-4082-1500",  # present in metadata
        }
    ]
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered first so the storage is removed even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    authors_strings, editors_strings = cp.get_agents_strings_list("10.9999/noindex", agents_list)
    self.assertEqual(authors_strings, ["Doe, Jane"])  # no [orcid:...] tag
    self.assertEqual(editors_strings, [])

937 

def test_get_agents_strings_list_api_disabled_index_requires_prefixed_doi(self):
    """
    API OFF + DOI-to-ORCID index populated with a prefixed DOI key (doi:...).
    The DOI passed to get_agents_strings_list has no prefix: the function must
    normalise it before querying the index, otherwise the ORCID is not found.
    """
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered first so the storage is removed even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    # Index populated with a **prefixed** DOI
    doi_pref = "doi:10.1234/test-idx"
    test_orcid = "0000-0002-9999-8888"
    cp.orcid_index.add_value(doi_pref, f"Smith, John [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index(["10.1234/test-idx"])

    # Author without an ORCID in the metadata; DOI passed **without prefix**
    agents = [{
        "given": "John",
        "family": "Smith",
        "role": "author"
    }]

    authors, editors = cp.get_agents_strings_list("10.1234/test-idx", agents)
    # Must resolve through the index and append the [orcid:...] tag
    self.assertEqual(authors, ["Smith, John [orcid:0000-0002-9999-8888]"])
    self.assertEqual(editors, [])

964 

def test_find_crossref_orcid_api_disabled_redis_snapshot_unprefixed_orcid(self):
    """
    API OFF + empty index + empty storage, but the Redis snapshot holds the
    ORCID **without prefix**. The function must recognise it (after
    normalisation) and validate it.
    """
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered first so the storage is removed even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    # Redis snapshot with an ORCID **without prefix**
    raw_orcid = "0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[raw_orcid])

    out = cp.find_crossref_orcid(raw_orcid, "10.9999/noindex")
    self.assertEqual(out, f"orcid:{raw_orcid}")
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{raw_orcid}"))

980 

def test_update_redis_values_normalizes_inputs(self):
    """
    update_redis_values must always normalise:
      - DOI -> with the 'doi:' prefix
      - ORCID -> with the 'orcid:' prefix
    and discard entries that cannot be normalised.
    """
    cp = CrossrefProcessing(testing=True)
    # Registered first so the storage is removed even when an assertion fails.
    self.addCleanup(cp.storage_manager.delete_storage)

    # Each list mixes a bare id, an already prefixed id and a malformed value.
    cp.update_redis_values(
        br=["10.1001/jama.299.12.1471", "doi:10.2105/ajph.2006.101626", "xxx-bad"],
        ra=["0000-0002-1234-5678", "orcid:0000-0003-4082-1500", "bad-orcid"]
    )

    # All normalised (and the 'bad' ones dropped)
    self.assertIn("doi:10.1001/jama.299.12.1471", cp._redis_values_br)
    self.assertIn("doi:10.2105/ajph.2006.101626", cp._redis_values_br)
    self.assertNotIn("xxx-bad", cp._redis_values_br)

    self.assertIn("orcid:0000-0002-1234-5678", cp._redis_values_ra)
    self.assertIn("orcid:0000-0003-4082-1500", cp._redis_values_ra)
    self.assertNotIn("bad-orcid", cp._redis_values_ra)

1004 

1005 

def test_validated_as_with_storage_manager(storage_manager):
    """validated_as returns the stored flag for a DOI, or None if never stored.

    ``storage_manager`` is supplied by the caller (presumably a pytest
    fixture; it is not defined in this part of the file).
    """
    stored_as_valid = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    stored_as_invalid = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}
    never_stored = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}

    processor = CrossrefProcessing(storage_manager=storage_manager, testing=True)
    # Seed the DOI storage with one True and one False entry.
    for entry, flag in ((stored_as_valid, True), (stored_as_invalid, False)):
        processor.doi_m.storage_manager.set_value(entry["identifier"], flag)

    assert processor.validated_as(stored_as_valid) is True
    assert processor.validated_as(stored_as_invalid) is False
    assert processor.validated_as(never_stored) is None

1018 

1019 

class TestCrossrefProcessingWithMockedAPI(unittest.TestCase):
    """Integration tests using mocked Crossref API responses from conftest.py.

    Every test feeds one Crossref-style item dict to ``csv_creator`` and
    compares the produced row with a fully spelled-out expected dict.
    """

    def _make_processor(self, **kwargs):
        """Return a ``CrossrefProcessing(testing=True, **kwargs)`` with cleanup registered.

        The storage deletion is registered through ``addCleanup`` so that it
        also runs when an assertion in the test fails; a trailing
        ``delete_storage()`` call would be skipped in that case and leave
        state behind for the following tests.
        """
        processor = CrossrefProcessing(testing=True, **kwargs)
        self.addCleanup(processor.storage_manager.delete_storage)
        return processor

    def test_csv_creator_nature_article(self):
        """Test with Nature article from mocked API (doi:10.1038/nature12373)."""
        item = {
            "DOI": "10.1038/nature12373",
            "type": "journal-article",
            "title": ["Nanometre-scale thermometry in a living cell"],
            "author": [
                {"given": "G.", "family": "Kucsko", "sequence": "first"},
                {"given": "P. C.", "family": "Maurer", "sequence": "additional"},
                {"given": "M. D.", "family": "Lukin", "sequence": "additional"}
            ],
            "container-title": ["Nature"],
            "volume": "500",
            "issue": "7460",
            "page": "54-58",
            "issued": {"date-parts": [[2013, 7, 31]]},
            "ISSN": ["0028-0836", "1476-4687"],
            "publisher": "Springer Science and Business Media LLC",
            "member": "297",
            "prefix": "10.1038"
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1038/nature12373',
            'title': 'Nanometre-scale thermometry in a living cell',
            'author': 'Kucsko, G.; Maurer, P. C.; Lukin, M. D.',
            'pub_date': '2013-7-31',
            'venue': 'Nature [issn:0028-0836 issn:1476-4687]',
            'volume': '500',
            'issue': '7460',
            'page': '54-58',
            'type': 'journal article',
            'publisher': 'Springer Science and Business Media LLC [crossref:297]',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_plos_with_orcid_url(self):
        """Test PLOS article with ORCID in URL format from mocked API."""
        item = {
            "DOI": "10.1371/journal.pone.0284601",
            "type": "journal-article",
            "title": ["Biochemical evaluation of vaccination in rats"],
            "author": [
                {"given": "Mahsa", "family": "Teymoorzadeh", "sequence": "first"},
                {"given": "Razieh", "family": "Yazdanparast", "sequence": "additional",
                 "ORCID": "https://orcid.org/0000-0003-0530-4305", "authenticated-orcid": True}
            ],
            "container-title": ["PLOS ONE"],
            "volume": "18",
            "issue": "5",
            "page": "e0284601",
            "issued": {"date-parts": [[2023, 5, 4]]},
            "ISSN": ["1932-6203"],
            "publisher": "Public Library of Science (PLoS)"
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1371/journal.pone.0284601',
            'title': 'Biochemical evaluation of vaccination in rats',
            'author': 'Teymoorzadeh, Mahsa; Yazdanparast, Razieh [orcid:0000-0003-0530-4305]',
            'pub_date': '2023-5-4',
            'venue': 'PLOS ONE [issn:1932-6203]',
            'volume': '18',
            'issue': '5',
            # A single page id is expected to be expanded into a start-end range.
            'page': 'e0284601-e0284601',
            'type': 'journal article',
            'publisher': 'Public Library of Science (PLoS)',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_book_chapter_multiple_containers(self):
        """Test book chapter with multiple container-titles from mocked API."""
        item = {
            "DOI": "10.1007/978-3-030-00668-6_8",
            "type": "book-chapter",
            "title": ["The SPAR Ontologies"],
            "author": [
                {"given": "Silvio", "family": "Peroni", "sequence": "first"},
                {"given": "David", "family": "Shotton", "sequence": "additional"}
            ],
            "container-title": ["Lecture Notes in Computer Science", "The Semantic Web – ISWC 2018"],
            "page": "119-136",
            "issued": {"date-parts": [[2018]]},
            "ISBN": ["9783030006679", "9783030006686"],
            "publisher": "Springer International Publishing"
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1007/978-3-030-00668-6_8',
            'title': 'The SPAR Ontologies',
            'author': 'Peroni, Silvio; Shotton, David',
            'pub_date': '2018',
            # Only the first container title is expected in the venue.
            'venue': 'Lecture Notes in Computer Science [isbn:9783030006679 isbn:9783030006686]',
            'volume': '',
            'issue': '',
            'page': '119-136',
            'type': 'book chapter',
            'publisher': 'Springer International Publishing',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_date_parts_null(self):
        """Test handling of date-parts with null value: [[null]] from mocked API."""
        item = {
            "DOI": "10.1234/null-date",
            "type": "journal-article",
            "title": ["Article with null date"],
            "issued": {"date-parts": [[None]]}
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1234/null-date',
            'title': 'Article with null date',
            'author': '',
            'pub_date': '',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'journal article',
            'publisher': '',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_date_parts_empty(self):
        """Test handling of date-parts as empty list: [[]] from mocked API."""
        item = {
            "DOI": "10.1234/empty-date",
            "type": "journal-article",
            "title": ["Article with empty date-parts"],
            "issued": {"date-parts": [[]]}
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1234/empty-date',
            'title': 'Article with empty date-parts',
            'author': '',
            'pub_date': '',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'journal article',
            'publisher': '',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_date_parts_missing(self):
        """Test handling of issued without date-parts key from mocked API."""
        item = {
            "DOI": "10.1234/no-dateparts",
            "type": "journal-article",
            "title": ["Article without date-parts key"],
            "issued": {}
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1234/no-dateparts',
            'title': 'Article without date-parts key',
            'author': '',
            'pub_date': '',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'journal article',
            'publisher': '',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_html_in_title(self):
        """Test HTML markup in title is cleaned (from mocked API structure)."""
        item = {
            "DOI": "10.1234/html-title",
            "type": "journal-article",
            "title": ["A study of <i>Escherichia coli</i> in <b>biofilms</b>"],
            "issued": {"date-parts": [[2024, 1, 15]]}
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1234/html-title',
            'title': 'A study of Escherichia coli in biofilms',
            'author': '',
            'pub_date': '2024-1-15',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'journal article',
            'publisher': '',
            'editor': ''
        }
        self.assertEqual(row, expected)

    def test_csv_creator_with_editor(self):
        """Test article with both author and editor from mocked API structure."""
        item = {
            "DOI": "10.1234/with-editor",
            "type": "edited-book",
            "title": ["Edited volume test"],
            "author": [{"given": "John", "family": "Doe", "sequence": "first"}],
            "editor": [{"given": "Jane", "family": "Smith", "sequence": "first"}],
            "issued": {"date-parts": [[2024, 6, 20]]}
        }
        processor = self._make_processor()
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1234/with-editor',
            'title': 'Edited volume test',
            'author': 'Doe, John',
            'pub_date': '2024-6-20',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'edited book',
            'publisher': '',
            'editor': 'Smith, Jane'
        }
        self.assertEqual(row, expected)

    def test_csv_creator_no_inplace_modification(self):
        """Test that csv_creator does not modify the input item dict."""
        item = {
            "DOI": "10.1234/with-editor",
            "type": "edited-book",
            "title": ["Edited volume test"],
            "author": [{"given": "John", "family": "Doe", "sequence": "first"}],
            "editor": [{"given": "Jane", "family": "Smith", "sequence": "first"}],
            "issued": {"date-parts": [[2024, 6, 20]]}
        }
        # Independent copies of the contributor dicts, compared after the call.
        original_author = {"given": "John", "family": "Doe", "sequence": "first"}
        original_editor = {"given": "Jane", "family": "Smith", "sequence": "first"}

        processor = self._make_processor()
        processor.csv_creator(item)

        self.assertEqual(item['author'][0], original_author)
        self.assertEqual(item['editor'][0], original_editor)

    def test_csv_creator_member_as_string(self):
        """Test that member field as string (API format) is handled."""
        item = {
            "DOI": "10.1001/test.12345",
            "type": "journal-article",
            "title": ["Test"],
            "publisher": "American Medical Association (AMA)",
            "member": "10",
            "prefix": "10.1001",
            "issued": {"date-parts": [[2024]]}
        }
        processor = self._make_processor(publishers_filepath=PUBLISHERS_MAPPING)
        row = processor.csv_creator(item)

        expected = {
            'id': 'doi:10.1001/test.12345',
            'title': 'Test',
            'author': '',
            'pub_date': '2024',
            'venue': '',
            'volume': '',
            'issue': '',
            'page': '',
            'type': 'journal article',
            'publisher': 'American Medical Association (AMA) [crossref:10]',
            'editor': ''
        }
        self.assertEqual(row, expected)

1325 

1326 

1327 

1328 

1329 

1330 

1331 

1332 

1333 

1334 

1335 

1336 

1337 

1338