Coverage for test/crossref_processing

1# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it>

2# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it>

3# SPDX-FileCopyrightText: 2025 Arianna Moretti <arianna.moretti4@unibo.it>

5# SPDX-License-Identifier: ISC

7import json

8import os

9import unittest

11from oc_ds_converter.crossref.crossref_processing import CrossrefProcessing

12from oc_ds_converter.datasource.orcid_index import PublishersRedis

13from oc_ds_converter.lib.csvmanager import CSVManager

14from oc_ds_converter.lib.jsonmanager import load_json

# Root folder holding every fixture used by this test module.
TEST_DIR = os.path.join("test", "crossref_processing")

# Individual fixtures, all resolved relative to TEST_DIR.
JSON_FILE = os.path.join(TEST_DIR, "0.json")
TMP_SUPPORT_MATERIAL = os.path.join(TEST_DIR, "tmp_support")
IOD = os.path.join(TEST_DIR, "iod")
DATA = os.path.join(TEST_DIR, "40228.json")
PUBLISHERS_MAPPING = os.path.join(TEST_DIR, "publishers.csv")

25class TestCrossrefProcessing(unittest.TestCase):

def test_extract_all_ids_cited(self):
    """Check the first element returned by ``extract_all_ids(item, False)``.

    Every item of ``JSON_FILE`` is processed with a ``CrossrefProcessing``
    built with default arguments, and the first element of the result is
    compared, as a set, with the expected list of cited DOIs.
    """
    c_processing = CrossrefProcessing()
    # Registered before any assertion so the storage is deleted even when the test fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    with open(JSON_FILE, encoding="utf8") as f:
        result = json.load(f)

    # Loop-invariant, so it is built once.
    expected_br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }

    # NOTE(review): the nesting of the assertion was reconstructed from a listing that had
    # lost its indentation; it is assumed to run once per item - confirm against the repo.
    for entity_dict in result['items']:
        br = c_processing.extract_all_ids(entity_dict, False)[0]
        self.assertEqual(expected_br, set(br))

def test_extract_all_ids_cited_redis(self):
    """Check the first element returned by ``extract_all_ids(item, False)`` with ``testing=True``.

    Same scenario as the default-arguments variant, but the processor is
    created with ``CrossrefProcessing(testing=True)``.
    """
    c_processing = CrossrefProcessing(testing=True)
    # Registered before any assertion so the storage is deleted even when the test fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    with open(JSON_FILE, encoding="utf8") as f:
        result = json.load(f)

    # Loop-invariant, so it is built once.
    expected_br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }

    # NOTE(review): the nesting of the assertion was reconstructed from a listing that had
    # lost its indentation; it is assumed to run once per item - confirm against the repo.
    for entity_dict in result['items']:
        br = c_processing.extract_all_ids(entity_dict, False)[0]
        self.assertEqual(expected_br, set(br))

def test_get_redis_validity_list(self):
    """Check that ``get_redis_validity_list`` returns an empty list for these DOIs.

    The processor is built with default arguments and nothing is written to
    its Redis handles beforehand, so the expected result for the "br" ids
    is ``[]``.
    """
    c_processing = CrossrefProcessing()
    # Registered before the assertion so the storage is deleted even when the test fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }
    br_valid_list = c_processing.get_redis_validity_list(br, "br")
    self.assertEqual(br_valid_list, [])

def test_get_redis_validity_list_redis(self):
    """Check that ``get_redis_validity_list`` returns empty lists with ``testing=True``.

    Both the "br" lookup (DOIs) and the "ra" lookup (ORCIDs) are expected to
    return ``[]`` because nothing is written to the Redis handles beforehand.
    """
    c_processing = CrossrefProcessing(testing=True)
    # Registered before any assertion so the storage is deleted even when the test fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    # Both lookups run before the assertions, as in the original call order.
    br_valid_list = c_processing.get_redis_validity_list(br, "br")
    ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")

    self.assertEqual(br_valid_list, [])
    self.assertEqual(ra_valid_list, [])

def test_get_redis_validity_dict_w_fakeredis_db_values_sqlite(self):
    """Check that ids previously added to the Redis handles are reported by ``get_redis_validity_list``.

    One DOI is added to ``BR_redis`` and one ORCID to ``RA_redis``; only
    those two ids are expected back from the "br" and "ra" lookups. The
    processor is built with default arguments.
    """
    c_processing = CrossrefProcessing()
    # Cleanups run in LIFO order, so they are registered in reverse to keep the original
    # sequence (storage first, then the BR key, then the RA key). Registering them up
    # front guarantees they run even when an assertion fails.
    self.addCleanup(c_processing.RA_redis.delete, 'orcid:0000-0002-8090-6886')
    self.addCleanup(c_processing.BR_redis.delete, 'doi:10.2105/ajph.2006.101626')
    self.addCleanup(c_processing.storage_manager.delete_storage)

    c_processing.BR_redis.sadd('doi:10.2105/ajph.2006.101626', "omid:1")
    c_processing.RA_redis.sadd('orcid:0000-0002-8090-6886', "omid:2")

    br = {'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
          'doi:10.1177/003335490812300219'}
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    # Both lookups run before the assertions, as in the original call order.
    br_valid_list = c_processing.get_redis_validity_list(br, "br")
    ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")

    self.assertEqual(br_valid_list, ['doi:10.2105/ajph.2006.101626'])
    self.assertEqual(ra_valid_list, ['orcid:0000-0002-8090-6886'])

109

def test_get_redis_validity_dict_w_fakeredis_db_values_redis(self):
    """Check that ids previously added to the Redis handles are reported, with ``testing=True``.

    One DOI is added to ``BR_redis`` and one ORCID to ``RA_redis``; only
    those two ids are expected back from the "br" and "ra" lookups.
    """
    c_processing = CrossrefProcessing(testing=True)
    # Cleanups run in LIFO order, so they are registered in reverse to keep the original
    # sequence (storage first, then the BR key, then the RA key). Registering them up
    # front guarantees they run even when an assertion fails.
    self.addCleanup(c_processing.RA_redis.delete, 'orcid:0000-0002-8090-6886')
    self.addCleanup(c_processing.BR_redis.delete, 'doi:10.2105/ajph.2006.101626')
    self.addCleanup(c_processing.storage_manager.delete_storage)

    c_processing.BR_redis.sadd('doi:10.2105/ajph.2006.101626', "omid:1")
    c_processing.RA_redis.sadd('orcid:0000-0002-8090-6886', "omid:2")

    br = {'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
          'doi:10.1177/003335490812300219'}
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    # Both lookups run before the assertions, as in the original call order.
    br_valid_list = c_processing.get_redis_validity_list(br, "br")
    ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")

    self.assertEqual(br_valid_list, ['doi:10.2105/ajph.2006.101626'])
    self.assertEqual(ra_valid_list, ['orcid:0000-0002-8090-6886'])

130

def test_validated_as_default(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier"
    (value: string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    Only the "not validated before" case is exercised here, on a processor built with
    default arguments (described by the original author as the sqlite storage manager
    without a pre-existent db associated).
    """
    c_processing = CrossrefProcessing()
    # Registered before the assertion so the storage is deleted even when the test fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    validate_as_none = c_processing.validated_as(
        {"schema": "doi", "identifier": "doi:10.1001/10-v4n2-hsf10003"}
    )
    self.assertIsNone(validate_as_none)

146

def test_validated_as_default_redis(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier"
    (value: string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    Only the "not validated before" case is exercised here, on a processor built with
    ``testing=True`` (described by the original author as the redis storage manager
    without a pre-existent db associated).
    """
    c_processing = CrossrefProcessing(testing=True)
    # Registered before the assertion so the storage is deleted even when the test fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)

    validate_as_none = c_processing.validated_as(
        {"schema": "doi", "identifier": "doi:10.1001/10-v4n2-hsf10003"}
    )
    self.assertIsNone(validate_as_none)

161

def test_validated_as_redis_with_preexistent_data(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier"
    (value: string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested on a processor built with ``testing=True`` after storing a
    validity value for two of the three DOIs through the DOI manager's storage manager.
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing_redis = CrossrefProcessing(testing=True)
    doi_storage = c_processing_redis.doi_m.storage_manager
    # Registered before any write or assertion so the storage is deleted even when the test fails.
    self.addCleanup(doi_storage.delete_storage)

    # Seed the values directly on the DOIManager's storage manager.
    doi_storage.set_value(valid_doi_in_db["identifier"], True)
    doi_storage.set_value(invalid_doi_in_db["identifier"], False)

    validated_as_True = c_processing_redis.validated_as(valid_doi_in_db)
    validated_as_False = c_processing_redis.validated_as(invalid_doi_in_db)
    not_validated = c_processing_redis.validated_as(valid_doi_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)

189

def test_validated_as_inmemory(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier"
    (value: string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested after storing a validity value for two of the three DOIs
    through the DOI manager's storage manager.

    NOTE(review): the original docstring said this covers an "in Memory + Json storage
    manager", but the processor is built with ``testing=True`` exactly like the sibling
    Redis tests and no other storage manager is passed in - confirm which backend this
    test is meant to exercise.
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing = CrossrefProcessing(testing=True)
    doi_storage = c_processing.doi_m.storage_manager
    # Registered before any write or assertion so the storage is deleted even when the test fails.
    self.addCleanup(doi_storage.delete_storage)

    doi_storage.set_value(valid_doi_in_db["identifier"], True)
    doi_storage.set_value(invalid_doi_in_db["identifier"], False)

    validated_as_True = c_processing.validated_as(valid_doi_in_db)
    validated_as_False = c_processing.validated_as(invalid_doi_in_db)
    not_validated = c_processing.validated_as(valid_doi_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)

217

def test_validated_as_redis(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier"
    (value: string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested on a processor built with ``testing=True`` after storing a
    validity value for two of the three DOIs through the DOI manager's storage manager.
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing_redis = CrossrefProcessing(testing=True)
    # Registered before any write or assertion so the storage is deleted even when the test fails.
    # NOTE(review): values are written through doi_m.storage_manager but the cleanup targets
    # the processor's own storage_manager (kept as in the original); presumably both refer
    # to the same object - confirm.
    self.addCleanup(c_processing_redis.storage_manager.delete_storage)

    # Seed the values directly on the DOIManager's storage manager.
    c_processing_redis.doi_m.storage_manager.set_value(valid_doi_in_db["identifier"], True)
    c_processing_redis.doi_m.storage_manager.set_value(invalid_doi_in_db["identifier"], False)

    validated_as_True = c_processing_redis.validated_as(valid_doi_in_db)
    validated_as_False = c_processing_redis.validated_as(invalid_doi_in_db)
    not_validated = c_processing_redis.validated_as(valid_doi_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)

245

def test_get_id_manager(self):
    """Check that ``get_id_manager`` resolves an id manager from a prefixed id or a schema name.

    The lookup is done against the processor's ``venue_id_man_dict`` twice:
    once with a prefixed identifier ('issn:0003-987X') and once with the bare
    schema string ('issn'). Both returned managers must exist and must accept
    the ISSN as valid.
    """
    processor = CrossrefProcessing()
    venue_managers = processor.venue_id_man_dict
    issn_id = "issn:0003-987X"

    # Resolve both managers first, then run the checks, keeping the original call order.
    resolved = [processor.get_id_manager(key, venue_managers) for key in (issn_id, "issn")]

    for manager in resolved:
        assert manager is not None
    for manager in resolved:
        self.assertTrue(manager.is_valid(issn_id))

265

def test_csv_creator(self):
    """Run ``csv_creator`` over every item of ``DATA`` and check the resulting rows.

    The DOIs of the items are passed to ``prefetch_doi_orcid_index`` first.
    Eleven non-empty rows are expected, three specific ids must be among
    them, and the book chapter row must carry the expected type and publisher.
    """
    processor = CrossrefProcessing(orcid_index=IOD, publishers_filepath=None)
    data = load_json(DATA, None)  # type: ignore[arg-type]
    assert data is not None

    item_dois = (item.get("DOI") for item in data['items'])
    processor.prefetch_doi_orcid_index([doi for doi in item_dois if doi])

    # Falsy results from csv_creator are discarded.
    rows = [row for row in map(processor.csv_creator, data['items']) if row]
    self.assertEqual(len(rows), 11)

    produced_ids = [row['id'] for row in rows]
    for expected_id in (
        'doi:10.47886/9789251092637.ch7',
        'doi:10.9799/ksfan.2012.25.1.069',
        'doi:10.9799/ksfan.2012.25.1.105',
    ):
        self.assertIn(expected_id, produced_ids)

    chapter_row = next(row for row in rows if row['id'] == 'doi:10.47886/9789251092637.ch7')
    self.assertEqual(chapter_row['type'], 'book chapter')
    self.assertEqual(chapter_row['publisher'], 'American Fisheries Society [crossref:460]')

285

def test_csv_creator_cited(self):
    """Run ``csv_creator`` on every DOI-bearing reference of ``JSON_FILE`` with ``citing=False``.

    Each expected row carries only the prefixed DOI in 'id'; every other
    field is an empty string.
    """
    processor = CrossrefProcessing(orcid_index=IOD, publishers_filepath=None, citing=False)
    with open(JSON_FILE, encoding="utf8") as f:
        result = json.load(f)

    rows = []
    for item in result['items']:
        # Items without references, and references without a DOI, are skipped.
        references_with_doi = [ref for ref in item.get("reference") or [] if ref.get("DOI")]
        for reference in references_with_doi:
            row = processor.csv_creator(reference)
            if row:
                rows.append(row)

    empty_fields = ('title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page',
                    'type', 'publisher', 'editor')
    cited_ids = (
        'doi:10.2105/ajph.2006.101626',
        'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219',
        'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21',
        'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf',
        'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c',
        'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d',
        'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6',
        'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2',
        'doi:10.2105/ajph.2009.162677',
    )
    expected_rows = [{'id': cited_id, **dict.fromkeys(empty_fields, '')} for cited_id in cited_ids]

    # NOTE(review): the assertion is assumed to sit after the loop (nesting reconstructed
    # from a listing that had lost its indentation) - confirm against the repo.
    self.assertEqual(rows, expected_rows)

318

def test_get_pages(self):
    """A plain numeric page range ('469-476') is expected back unchanged."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(processor.get_crossref_pages({'page': '469-476'}), '469-476')

326

def test_get_pages_right_letter(self):
    """A single lettered page ('G22') is expected back as the range 'G22-G22'."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(processor.get_crossref_pages({'page': 'G22'}), 'G22-G22')

334

def test_get_pages_wrong_letter(self):
    """A range with a trailing letter on the first page ('583b-584') is expected back as '583-584'."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(processor.get_crossref_pages({'page': '583b-584'}), '583-584')

342

def test_get_pages_roman_letters(self):
    """A range written with roman numerals ('iv-l') is expected back unchanged."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(processor.get_crossref_pages({'page': 'iv-l'}), 'iv-l')

350

def test_get_pages_non_roman_letters(self):
    """A range made of letters that are not roman numerals ('kj-hh') is expected to yield ''."""
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(processor.get_crossref_pages({'page': 'kj-hh'}), '')

358

def test_load_publishers_mapping(self):
    """Check the mapping that ``load_publishers_mapping`` builds from ``PUBLISHERS_MAPPING``.

    The expected result maps each member id to a dict holding the publisher
    'name' and the set of its DOI 'prefixes'.
    """
    # (member id, publisher name, DOI prefixes)
    expected_rows = (
        ('1', 'Annals of Family Medicine', {'10.1370'}),
        ('2', 'American Association of Petroleum Geologists AAPG/Datapages', {'10.15530', '10.1306'}),
        ('3', 'American Association of Physics Teachers (AAPT)', {'10.1119'}),
        ('6', 'American College of Medical Physics (ACMP)', {'10.1120'}),
        ('9', 'Allen Press', {'10.1043'}),
        ('10', 'American Medical Association (AMA)', {'10.1001'}),
        ('11', 'American Economic Association', {'10.1257'}),
        ('460', 'American Fisheries Society', {'10.1577', '10.47886'}),
    )
    expected_mapping = {
        member: {'name': name, 'prefixes': prefixes}
        for member, name, prefixes in expected_rows
    }

    loaded_mapping = CrossrefProcessing.load_publishers_mapping(publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(loaded_mapping, expected_mapping)

372

def test_get_publisher_name(self):
    """The item's member ('460') is listed in the expected publishers mapping.

    The expected name is the publisher followed by its Crossref member id in
    square brackets.
    """
    doi = '10.47886/9789251092637.ch7'
    item = dict(
        publisher='American Fisheries Society',
        DOI='10.47886/9789251092637.ch7',
        prefix='10.47886',
        member='460',
    )
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )

385

def test_get_publisher_name_no_member(self):
    """The item carries no 'member' key, only a DOI prefix found in the expected mapping.

    The expected name still ends with the Crossref member id in square
    brackets.
    """
    doi = '10.47886/9789251092637.ch7'
    item = dict(
        publisher='American Fisheries Society',
        DOI='10.47886/9789251092637.ch7',
        prefix='10.47886',
    )
    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )

397

def test_get_publisher_name_redis_by_member(self):
    """Publisher name lookup through a ``PublishersRedis`` instance, item with a 'member' key.

    Publisher "460" is registered on the ``PublishersRedis`` object, which is
    then assigned to the processor's ``_publishers_redis`` attribute.
    """
    redis_publishers = PublishersRedis(testing=True)
    redis_publishers.set_publisher("460", "American Fisheries Society", {"10.47886"})

    processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    # Swap in the instance seeded above.
    processor._publishers_redis = redis_publishers

    doi = '10.47886/9789251092637.ch7'
    item = dict(
        publisher='American Fisheries Society',
        DOI='10.47886/9789251092637.ch7',
        prefix='10.47886',
        member='460',
    )
    self.assertEqual(
        processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )

416

def test_get_publisher_name_redis_by_prefix(self):
    """Publisher name lookup through a ``PublishersRedis`` instance, item without a 'member' key.

    Publisher "460" is registered with prefix "10.47886"; the item only
    carries that prefix, and the member id is still expected in the result.
    """
    redis_publishers = PublishersRedis(testing=True)
    redis_publishers.set_publisher("460", "American Fisheries Society", {"10.47886"})

    processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    # Swap in the instance seeded above.
    processor._publishers_redis = redis_publishers

    doi = '10.47886/9789251092637.ch7'
    item = dict(
        publisher='American Fisheries Society',
        DOI='10.47886/9789251092637.ch7',
        prefix='10.47886',
    )
    self.assertEqual(
        processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )

434

def test_get_publisher_name_redis_not_found(self):
    """Publisher name lookup through a ``PublishersRedis`` instance with no publisher registered.

    The expected result is the item's own 'publisher' value, without any
    Crossref member id appended.
    """
    redis_publishers = PublishersRedis(testing=True)

    processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    # Swap in the instance created above; no publisher is set on it in this test.
    processor._publishers_redis = redis_publishers

    doi = '10.9999/unknown'
    item = dict(
        publisher='Unknown Publisher',
        DOI='10.9999/unknown',
        prefix='10.9999',
    )
    self.assertEqual(processor.get_publisher_name(doi, item), 'Unknown Publisher')

451

def test_get_venue_name(self):
    """An item with only a 'container-title' is expected to yield that title as venue name."""
    # Blank output row of type 'journal article'.
    row = dict.fromkeys(
        ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page',
         'type', 'publisher', 'editor'),
        '',
    )
    row['type'] = 'journal article'
    item = {'container-title': ['Cerebrospinal Fluid [Working Title]']}

    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_venue_name(item, row),
        'Cerebrospinal Fluid [Working Title]',
    )

460

def test_get_venue_name_with_ISSN(self):
    """An item with a 'container-title' and two ISSNs.

    The expected venue name is the title followed by both ISSNs, prefixed
    with 'issn:' and enclosed in square brackets.
    """
    # Blank output row of type 'journal article'.
    row = dict.fromkeys(
        ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page',
         'type', 'publisher', 'editor'),
        '',
    )
    row['type'] = 'journal article'
    item = {
        "container-title": ["Disaster Medicine and Public Health Preparedness"],
        "ISSN": ["1935-7893", "1938-744X"],
    }

    processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        processor.get_venue_name(item, row),
        'Disaster Medicine and Public Health Preparedness [issn:1935-7893 issn:1938-744X]',
    )

471

def test_find_crossref_orcid(self):
    """Test that, given in input a string representing an ORCID, the method returns:
    - the prefixed ORCID ("orcid:...") if it is valid
    - an empty string if it is not valid
    The procedure is tested with:
    - a valid ORCID
    - an invalid ORCID
    - a non-string input
    - an ORCID whose validity was stored beforehand, first as False and then as True

    Each phase deletes the ORCID manager's storage in a ``finally`` clause, so
    a failing assertion cannot leave stored values behind for later phases or tests.
    """
    test_doi = "10.1234/test123"

    c_processing = CrossrefProcessing()
    try:
        # Valid ORCID
        out = c_processing.find_crossref_orcid('0000-0003-4082-1500', test_doi)
        self.assertEqual(out, "orcid:0000-0003-4082-1500")

        # Invalid ORCID
        out_invalid_id = c_processing.find_crossref_orcid('5500-0001-9759-3938', test_doi)
        self.assertEqual(out_invalid_id, "")

        # Non-string input
        out_non_string = c_processing.find_crossref_orcid(None, test_doi)
        self.assertEqual(out_non_string, "")
    finally:
        c_processing.orcid_m.storage_manager.delete_storage()

    # Store a validity value for the ORCID before the lookup. Per the original author's
    # comment, the False case sets a valid id as invalid to check that the API check is
    # avoided when the information is already in storage.
    stored_cases = (
        (False, ""),
        (True, "orcid:0000-0001-9759-3938"),
    )
    for stored_validity, expected in stored_cases:
        c_processing = CrossrefProcessing(testing=True)
        try:
            c_processing.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", stored_validity)
            out = c_processing.find_crossref_orcid('0000-0001-9759-3938', test_doi)
            self.assertEqual(out, expected)
        finally:
            c_processing.orcid_m.storage_manager.delete_storage()

522

def test_report_series_venue_id(self):
    """A 'report-series' item with both a container title and an ISSN.

    The ISSN is expected in the 'venue' field, in square brackets after the
    container title, while 'id' holds only the DOI.

    The original test first built a processor with ``orcid_index=IOD`` and
    overwrote it straight away; only the processor that is actually used is
    created here.
    """
    items = {'items': [{
        'DOI': '10.1007/978-3-030-00668-6_8',
        'container-title': ["troitel'stvo: nauka i obrazovanie [Construction: Science and Education]"],
        'ISSN': '2305-5502',
        'type': 'report-series'
    }]}
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    output = [crossref_processor.csv_creator(item) for item in items['items']]
    expected_output = [{
        'id': 'doi:10.1007/978-3-030-00668-6_8',
        'title': '',
        'author': '',
        'pub_date': '',
        'venue': "troitel'stvo: nauka i obrazovanie [Construction: Science and Education] [issn:2305-5502]",
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'report series',
        'publisher': '',
        'editor': '',
    }]
    self.assertEqual(output, expected_output)

537

def test_report_series_br_id(self):
    """A 'report-series' item with an ISSN but an empty container title.

    The ISSN is expected in the 'id' field next to the DOI, and 'venue' is
    expected to be empty.

    The original test first built a processor with ``orcid_index=IOD`` and
    overwrote it straight away; only the processor that is actually used is
    created here.
    """
    items = {'items': [{
        'DOI': '10.1007/978-3-030-00668-6_8',
        'container-title': [],
        'ISSN': '2305-5502',
        'type': 'report-series'
    }]}
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    output = [crossref_processor.csv_creator(item) for item in items['items']]
    expected_output = [{
        'id': 'doi:10.1007/978-3-030-00668-6_8 issn:2305-5502',
        'title': '',
        'author': '',
        'pub_date': '',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'report series',
        'publisher': '',
        'editor': '',
    }]
    self.assertEqual(output, expected_output)

552

def test_get_agents_strings_list(self):
    """Four authors for a DOI whose ORCID index entries were prefetched.

    Each author is expected as "Family, Given"; only 'Choi, Mi-Kyeong' is
    expected to carry an ORCID in square brackets.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    names = (
        ('Myung-Hee', 'Kim'),
        ('Jin-Seon', 'Seo'),
        ('Mi-Kyeong', 'Choi'),
        ('Eun-Young', 'Kim'),
    )
    authors_list = [
        {'given': given, 'family': family, 'affiliation': [], "role": "author"}
        for given, family in names
    ]

    processor = CrossrefProcessing(IOD)
    processor.prefetch_doi_orcid_index([doi])
    authors_strings_list, _ = processor.get_agents_strings_list(doi, authors_list)

    self.assertEqual(
        authors_strings_list,
        ['Kim, Myung-Hee', 'Seo, Jin-Seon', 'Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]',
         'Kim, Eun-Young'],
    )

587

def test_get_agents_strings_list_same_family(self):
    """Two authors share the family name and the initials of the given name.

    Only 'Choi, Mi-Kyeong' is expected to carry an ORCID; 'Choi, Mi-Hong'
    is expected without one.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    authors_list = [
        {'given': given, 'family': 'Choi', 'affiliation': [], "role": "author"}
        for given in ('Mi-Kyeong', 'Mi-Hong')
    ]

    processor = CrossrefProcessing(IOD)
    processor.prefetch_doi_orcid_index([doi])
    authors_strings_list, _ = processor.get_agents_strings_list(doi, authors_list)

    self.assertEqual(
        authors_strings_list,
        ['Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]', 'Choi, Mi-Hong'],
    )

610

def test_get_agents_strings_list_homonyms(self):
    """Two authors share both the family name and the given name.

    Neither of the two resulting strings is expected to carry an ORCID.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    # Two distinct but identical author records.
    authors_list = [
        {'given': 'Mi-Kyeong', 'family': 'Choi', 'affiliation': [], "role": "author"}
        for _ in range(2)
    ]

    processor = CrossrefProcessing(IOD)
    authors_strings_list, _ = processor.get_agents_strings_list(doi, authors_list)

    self.assertEqual(authors_strings_list, ['Choi, Mi-Kyeong', 'Choi, Mi-Kyeong'])

632

def test_get_agents_strings_list_inverted_names(self):
    """One author's given name is the family name of the other author.

    Neither of the two resulting strings is expected to carry an ORCID.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    names = (
        ('Choi', 'Mi-Kyeong'),
        ('Mi-Hong', 'Choi'),
    )
    authors_list = [
        {'given': given, 'family': family, 'affiliation': [], "role": "author"}
        for given, family in names
    ]

    processor = CrossrefProcessing(IOD)
    authors_strings_list, _ = processor.get_agents_strings_list(doi, authors_list)

    self.assertEqual(authors_strings_list, ['Mi-Kyeong, Choi', 'Choi, Mi-Hong'])

654

def test_get_agents_strings_list_overlapping_surnames(self):
    """The surname of one agent ('M') is contained in another's ('Malek').

    The ORCID index is replaced by an in-memory CSVManager holding one
    entry for 'Malek, Sri Nurestri Abdul'; the expected output tags only
    that person, both as author and as editor.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    # (given, family, sequence, role) for every agent, in input order.
    agent_specs = (
        ("Puvaneswari", "Paravamsivam", "first", "author"),
        ("Chua Kek", "Heng", "additional", "author"),
        ("Sri Nurestri Abdul", "Malek", "additional", "author"),
        ("Vikineswary", "Sabaratnam", "additional", "author"),
        ("Ravishankar Ram", "M", "additional", "author"),
        ("Sri Nurestri Abdul", "Malek", "additional", "editor"),
        ("Umah Rani", "Kuppusamy", "additional", "author"),
    )
    agents = [
        {
            "given": given_name,
            "family": family_name,
            "sequence": sequence,
            "affiliation": [],
            "role": role,
        }
        for given_name, family_name, sequence, role in agent_specs
    ]

    processor = CrossrefProcessing(None)
    # Swap in a hand-built index instead of loading one from disk.
    index = CSVManager()
    index.data = {f'doi:{doi}': {'Malek, Sri Nurestri Abdul [0000-0001-6278-8559]'}}
    processor.orcid_index = index
    processor.prefetch_doi_orcid_index([doi])

    authors, editors = processor.get_agents_strings_list(doi, agents)

    tagged_malek = 'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]'
    expected_authors = [
        'Paravamsivam, Puvaneswari',
        'Heng, Chua Kek',
        tagged_malek,
        'Sabaratnam, Vikineswary',
        'M, Ravishankar Ram',
        'Kuppusamy, Umah Rani',
    ]
    expected_editors = [tagged_malek]
    self.assertEqual((authors, editors), (expected_authors, expected_editors))

717

def test_id_worker(self):
    """id_worker fills the given list for both a string field and a list field.

    An ISSN passed as a plain string and an ISBN passed inside a list are
    each run through their worker; the collected ids are compared with
    the expected prefixed forms.
    """
    issn_ids, isbn_ids = [], []

    CrossrefProcessing.id_worker('ISSN 1050-124X', issn_ids, CrossrefProcessing.issn_worker)
    CrossrefProcessing.id_worker(['978-1-56619-909-4'], isbn_ids, CrossrefProcessing.isbn_worker)

    self.assertEqual(
        (issn_ids, isbn_ids),
        (['issn:1050-124X'], ['isbn:9781566199094']),
    )

728

def test_to_validated_id_list(self):
    """Exercise to_validated_id_list on four inputs, each with a fresh processor.

    The statement order, including where storages are deleted between
    cases, is kept exactly as in the original test.
    """
    # CASE_1: is valid
    processor = CrossrefProcessing(testing=True)
    valid_doi = {'id': 'doi:10.13039/100005522', 'schema': 'doi'}
    validated = processor.to_validated_id_list(valid_doi)
    self.assertEqual(validated, ['doi:10.13039/100005522'])
    processor.doi_m.storage_manager.delete_storage()

    # CASE_2: is invalid
    processor = CrossrefProcessing(testing=True)
    invalid_doi = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    validated = processor.to_validated_id_list(invalid_doi)
    self.assertEqual(validated, [])

    # CASE_3: valid orcid
    processor = CrossrefProcessing(testing=True)
    valid_orcid = {'id': 'orcid:0000-0003-4082-1500', 'schema': 'orcid'}
    validated = processor.to_validated_id_list(valid_orcid)
    self.assertEqual(validated, ['orcid:0000-0003-4082-1500'])
    processor.orcid_m.storage_manager.delete_storage()

    # CASE_4: invalid doi in self._redis_values_br
    processor = CrossrefProcessing(testing=True)
    redis_doi = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    processor._redis_values_br.append(redis_doi['id'])
    validated = processor.to_validated_id_list(redis_doi)
    self.assertEqual(validated, ['doi:10.1089/bsp.2008.002'])
    # The id is also expected to be recorded as True in the temporary DOI storage.
    stored_flag = processor.tmp_doi_m.storage_manager.get_value('doi:10.1089/bsp.2008.002')
    self.assertEqual(stored_flag, True)
    processor.doi_m.storage_manager.delete_storage()

763

764

def test_to_validated_id_list_redis(self):
    """Run the four to_validated_id_list cases with testing=True processors.

    Each case builds its own processor; storages are deleted at the same
    points as in the original test.
    """
    doi_in_snapshot = 'doi:10.1089/bsp.2008.002'

    # CASE_1: is valid
    cp = CrossrefProcessing(testing=True)
    case_input = {'id': 'doi:10.13039/100005522', 'schema': 'doi'}
    result = cp.to_validated_id_list(case_input)
    self.assertEqual(result, ['doi:10.13039/100005522'])
    cp.doi_m.storage_manager.delete_storage()

    # CASE_2: is invalid
    cp = CrossrefProcessing(testing=True)
    case_input = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    result = cp.to_validated_id_list(case_input)
    self.assertEqual(result, [])

    # CASE_3: valid orcid
    cp = CrossrefProcessing(testing=True)
    case_input = {'id': 'orcid:0000-0003-4082-1500', 'schema': 'orcid'}
    result = cp.to_validated_id_list(case_input)
    self.assertEqual(result, ['orcid:0000-0003-4082-1500'])
    cp.orcid_m.storage_manager.delete_storage()

    # CASE_4: invalid doi in self._redis_values_br
    cp = CrossrefProcessing(testing=True)
    case_input = {'id': doi_in_snapshot, 'schema': 'doi'}
    cp._redis_values_br.append(case_input['id'])
    result = cp.to_validated_id_list(case_input)
    self.assertEqual(result, [doi_in_snapshot])
    # The temporary DOI storage is expected to hold True for this id.
    tmp_value = cp.tmp_doi_m.storage_manager.get_value(doi_in_snapshot)
    self.assertEqual(tmp_value, True)
    cp.doi_m.storage_manager.delete_storage()

799

def test_find_crossref_orcid_with_index(self):
    """Test ORCID validation using ORCID index before API validation."""
    # Setup
    test_doi = "10.1234/test123"
    test_doi_prefixed = "doi:10.1234/test123"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    # Create CrossrefProcessing instance with ORCID index
    cp = CrossrefProcessing(testing=True)
    # Registered up front so the ORCID storage is removed even when an
    # assertion below fails; otherwise a failure would leak state into
    # later tests.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    cp.orcid_index.add_value(test_doi_prefixed, f"{test_name} [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index([test_doi])

    # Test Case 1: ORCID found in index
    out_1 = cp.find_crossref_orcid(test_orcid, test_doi)
    exp_1 = f"orcid:{test_orcid}"
    self.assertEqual(out_1, exp_1)
    # Verify it was added to temporary storage
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

    # Test Case 2: ORCID not in index but valid via API
    out_2 = cp.find_crossref_orcid("0000-0003-4082-1500", test_doi)
    exp_2 = "orcid:0000-0003-4082-1500"
    self.assertEqual(out_2, exp_2)

    # Test Case 3: ORCID not in index and invalid
    out_3 = cp.find_crossref_orcid("0000-0000-0000-0000", test_doi)
    exp_3 = ""
    self.assertEqual(out_3, exp_3)

832

def test_find_crossref_orcid_api_disabled_not_in_index(self):
    """API OFF + empty index: a syntactically valid ORCID must NOT be resolved."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even if an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    test_doi = "10.9999/noindex"
    candidate = "0000-0003-4082-1500"  # syntactically valid

    out = cp.find_crossref_orcid(candidate, test_doi)
    self.assertEqual(out, "")
    # Must NOT be written to tmp storage
    self.assertIsNone(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{candidate}"))

845

def test_find_crossref_orcid_api_disabled_from_index(self):
    """API OFF + present in DOI→ORCID index: must resolve and be saved in tmp storage."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even if an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    test_doi = "10.1234/test"
    test_doi_prefixed = "doi:10.1234/test"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    # The index is keyed by the prefixed DOI; the lookup below uses the bare DOI.
    cp.orcid_index.add_value(test_doi_prefixed, f"{test_name} [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index([test_doi])

    out = cp.find_crossref_orcid(test_orcid, test_doi)
    self.assertEqual(out, f"orcid:{test_orcid}")
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

862

def test_find_crossref_orcid_api_disabled_in_storage(self):
    """API OFF + ORCID already valid in persistent storage: must be accepted."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even if the assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    oid = "orcid:0000-0003-4082-1500"
    cp.orcid_m.storage_manager.set_value(oid, True)  # mark valid
    # find_crossref_orcid is called with the bare ORCID (prefix stripped).
    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/any")
    self.assertEqual(out, oid)

871

def test_find_crossref_orcid_api_disabled_from_redis_snapshot(self):
    """API OFF + empty index/storage, but ORCID present in Redis snapshot: accept and seed tmp storage."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even if an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    oid = "orcid:0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[oid])  # emulate per-chunk snapshot

    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/noindex")
    self.assertEqual(out, oid)
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(oid))

882

def test_find_crossref_orcid_api_enabled_invalid_in_storage(self):
    """API ON + ORCID explicitly invalid in storage: reject immediately (no API/index)."""
    cp = CrossrefProcessing(use_orcid_api=True, testing=True)
    # Guarantee storage removal even if the assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    oid = "orcid:0000-0002-9286-2630"
    cp.orcid_m.storage_manager.set_value(oid, False)  # mark invalid
    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/anything")
    self.assertEqual(out, "")

891

def test_find_crossref_orcid_api_enabled_from_redis_snapshot(self):
    """API ON + empty storage/index, but ORCID present in Redis snapshot: accept without API call."""
    cp = CrossrefProcessing(use_orcid_api=True, testing=True)
    # Guarantee storage removal even if an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    oid = "orcid:0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[oid])

    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/noindex")
    self.assertEqual(out, oid)
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(oid))

902

def test_get_agents_strings_list_api_disabled_no_index(self):
    """API OFF + empty index: ORCIDs provided in agent dict MUST NOT be appended to the author string."""
    agents_list = [
        {
            "given": "Jane",
            "family": "Doe",
            "role": "author",
            "ORCID": "https://orcid.org/0000-0003-4082-1500",  # present in metadata
        }
    ]
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even if an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)
    authors_strings, editors_strings = cp.get_agents_strings_list("10.9999/noindex", agents_list)
    self.assertEqual(authors_strings, ["Doe, Jane"])  # no [orcid:...] tag
    self.assertEqual(editors_strings, [])

918

def test_get_agents_strings_list_api_disabled_index_requires_prefixed_doi(self):
    """
    API OFF + DOI→ORCID index populated with a prefixed DOI key (doi:...).
    The DOI passed to get_agents_strings_list has no prefix: the function must
    normalize it before querying the index, otherwise the ORCID is not found.
    """
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even if an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    # Index populated with a **prefixed** DOI
    doi_pref = "doi:10.1234/test-idx"
    test_orcid = "0000-0002-9999-8888"
    cp.orcid_index.add_value(doi_pref, f"Smith, John [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index(["10.1234/test-idx"])

    # Author without an ORCID in the metadata; DOI passed **without prefix**
    agents = [{
        "given": "John",
        "family": "Smith",
        "role": "author"
    }]

    authors, editors = cp.get_agents_strings_list("10.1234/test-idx", agents)
    # Must resolve through the index and append the [orcid:...] tag
    self.assertEqual(authors, ["Smith, John [orcid:0000-0002-9999-8888]"])
    self.assertEqual(editors, [])

945

def test_find_crossref_orcid_api_disabled_redis_snapshot_unprefixed_orcid(self):
    """
    API OFF + empty index + empty storage, but the Redis snapshot contains an
    ORCID **without prefix**. The function must recognize it (by normalizing)
    and validate it.
    """
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even if an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    # Redis snapshot with an ORCID **without prefix**
    raw_orcid = "0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[raw_orcid])

    out = cp.find_crossref_orcid(raw_orcid, "10.9999/noindex")
    self.assertEqual(out, f"orcid:{raw_orcid}")
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{raw_orcid}"))

961

def test_update_redis_values_normalizes_inputs(self):
    """
    update_redis_values must always normalize:
    - DOI → with the 'doi:' prefix
    - ORCID → with the 'orcid:' prefix
    and drop entries that cannot be normalized.
    """
    cp = CrossrefProcessing(testing=True)
    # Guarantee storage removal even if an assertion fails.
    self.addCleanup(cp.storage_manager.delete_storage)

    cp.update_redis_values(
        br=["10.1001/jama.299.12.1471", "doi:10.2105/ajph.2006.101626", "xxx-bad"],
        ra=["0000-0002-1234-5678", "orcid:0000-0003-4082-1500", "bad-orcid"]
    )

    # Everything is normalized (and the 'bad' entries are discarded)
    self.assertIn("doi:10.1001/jama.299.12.1471", cp._redis_values_br)
    self.assertIn("doi:10.2105/ajph.2006.101626", cp._redis_values_br)
    self.assertNotIn("xxx-bad", cp._redis_values_br)

    self.assertIn("orcid:0000-0002-1234-5678", cp._redis_values_ra)
    self.assertIn("orcid:0000-0003-4082-1500", cp._redis_values_ra)
    self.assertNotIn("bad-orcid", cp._redis_values_ra)

985

986

def test_validated_as_with_storage_manager(storage_manager):
    """validated_as reflects what the DOI storage holds for an identifier.

    Two DOIs are pre-registered in the storage (one as True, one as
    False); a third is left unregistered. The test expects True, False
    and None respectively.
    """
    def doi_entry(identifier):
        # Shape of the id dict handed to validated_as in this test.
        return {"identifier": identifier, "schema": "doi"}

    stored_valid = doi_entry("doi:10.1001/2012.jama.10368")
    stored_invalid = doi_entry("doi:10.1001/2012.jama.1036")
    not_stored = doi_entry("doi:10.1001/2012.jama.10158")

    processor = CrossrefProcessing(storage_manager=storage_manager, testing=True)
    doi_storage = processor.doi_m.storage_manager
    doi_storage.set_value(stored_valid["identifier"], True)
    doi_storage.set_value(stored_invalid["identifier"], False)

    assert processor.validated_as(stored_valid) is True
    assert processor.validated_as(stored_invalid) is False
    assert processor.validated_as(not_stored) is None

999

1000

class TestCrossrefProcessingWithMockedAPI(unittest.TestCase):
    """Integration tests using mocked Crossref API responses from conftest.py.

    Every test hands one Crossref-style item dict to ``csv_creator`` and
    compares the returned row with an expected dict.
    """

    # Keys of the row dict these tests expect back from csv_creator.
    _ROW_FIELDS = (
        'id', 'title', 'author', 'pub_date', 'venue', 'volume',
        'issue', 'page', 'type', 'publisher', 'editor',
    )

    def _make_processor(self, **kwargs):
        """Return a ``CrossrefProcessing(testing=True, **kwargs)`` with cleanup registered.

        The storage deletion goes through ``addCleanup`` so it also runs
        when an assertion in the test fails, instead of leaking storage
        into later tests.
        """
        processor = CrossrefProcessing(testing=True, **kwargs)
        self.addCleanup(processor.storage_manager.delete_storage)
        return processor

    def _expected_row(self, **fields):
        """Return an expected row: every field is '' unless overridden in *fields*."""
        row = dict.fromkeys(self._ROW_FIELDS, '')
        row.update(fields)
        return row

    def test_csv_creator_nature_article(self):
        """Test with Nature article from mocked API (doi:10.1038/nature12373)."""
        item = {
            "DOI": "10.1038/nature12373",
            "type": "journal-article",
            "title": ["Nanometre-scale thermometry in a living cell"],
            "author": [
                {"given": "G.", "family": "Kucsko", "sequence": "first"},
                {"given": "P. C.", "family": "Maurer", "sequence": "additional"},
                {"given": "M. D.", "family": "Lukin", "sequence": "additional"}
            ],
            "container-title": ["Nature"],
            "volume": "500",
            "issue": "7460",
            "page": "54-58",
            "issued": {"date-parts": [[2013, 7, 31]]},
            "ISSN": ["0028-0836", "1476-4687"],
            "publisher": "Springer Science and Business Media LLC",
            "member": "297",
            "prefix": "10.1038"
        }
        row = self._make_processor().csv_creator(item)

        expected = self._expected_row(
            id='doi:10.1038/nature12373',
            title='Nanometre-scale thermometry in a living cell',
            author='Kucsko, G.; Maurer, P. C.; Lukin, M. D.',
            pub_date='2013-7-31',
            venue='Nature [issn:0028-0836 issn:1476-4687]',
            volume='500',
            issue='7460',
            page='54-58',
            type='journal article',
            publisher='Springer Science and Business Media LLC [crossref:297]',
        )
        self.assertEqual(row, expected)

    def test_csv_creator_plos_with_orcid_url(self):
        """Test PLOS article with ORCID in URL format from mocked API."""
        item = {
            "DOI": "10.1371/journal.pone.0284601",
            "type": "journal-article",
            "title": ["Biochemical evaluation of vaccination in rats"],
            "author": [
                {"given": "Mahsa", "family": "Teymoorzadeh", "sequence": "first"},
                {"given": "Razieh", "family": "Yazdanparast", "sequence": "additional",
                 "ORCID": "https://orcid.org/0000-0003-0530-4305", "authenticated-orcid": True}
            ],
            "container-title": ["PLOS ONE"],
            "volume": "18",
            "issue": "5",
            "page": "e0284601",
            "issued": {"date-parts": [[2023, 5, 4]]},
            "ISSN": ["1932-6203"],
            "publisher": "Public Library of Science (PLoS)"
        }
        row = self._make_processor().csv_creator(item)

        expected = self._expected_row(
            id='doi:10.1371/journal.pone.0284601',
            title='Biochemical evaluation of vaccination in rats',
            author='Teymoorzadeh, Mahsa; Yazdanparast, Razieh [orcid:0000-0003-0530-4305]',
            pub_date='2023-5-4',
            venue='PLOS ONE [issn:1932-6203]',
            volume='18',
            issue='5',
            # A single page value is expected back as a start-end range.
            page='e0284601-e0284601',
            type='journal article',
            publisher='Public Library of Science (PLoS)',
        )
        self.assertEqual(row, expected)

    def test_csv_creator_book_chapter_multiple_containers(self):
        """Test book chapter with multiple container-titles from mocked API."""
        item = {
            "DOI": "10.1007/978-3-030-00668-6_8",
            "type": "book-chapter",
            "title": ["The SPAR Ontologies"],
            "author": [
                {"given": "Silvio", "family": "Peroni", "sequence": "first"},
                {"given": "David", "family": "Shotton", "sequence": "additional"}
            ],
            "container-title": ["Lecture Notes in Computer Science", "The Semantic Web – ISWC 2018"],
            "page": "119-136",
            "issued": {"date-parts": [[2018]]},
            "ISBN": ["9783030006679", "9783030006686"],
            "publisher": "Springer International Publishing"
        }
        row = self._make_processor().csv_creator(item)

        expected = self._expected_row(
            id='doi:10.1007/978-3-030-00668-6_8',
            title='The SPAR Ontologies',
            author='Peroni, Silvio; Shotton, David',
            pub_date='2018',
            # Only the first container-title is expected in the venue.
            venue='Lecture Notes in Computer Science [isbn:9783030006679 isbn:9783030006686]',
            page='119-136',
            type='book chapter',
            publisher='Springer International Publishing',
        )
        self.assertEqual(row, expected)

    def test_csv_creator_date_parts_null(self):
        """Test handling of date-parts with null value: [[null]] from mocked API."""
        item = {
            "DOI": "10.1234/null-date",
            "type": "journal-article",
            "title": ["Article with null date"],
            "issued": {"date-parts": [[None]]}
        }
        row = self._make_processor().csv_creator(item)

        expected = self._expected_row(
            id='doi:10.1234/null-date',
            title='Article with null date',
            type='journal article',
        )
        self.assertEqual(row, expected)

    def test_csv_creator_date_parts_empty(self):
        """Test handling of date-parts as empty list: [[]] from mocked API."""
        item = {
            "DOI": "10.1234/empty-date",
            "type": "journal-article",
            "title": ["Article with empty date-parts"],
            "issued": {"date-parts": [[]]}
        }
        row = self._make_processor().csv_creator(item)

        expected = self._expected_row(
            id='doi:10.1234/empty-date',
            title='Article with empty date-parts',
            type='journal article',
        )
        self.assertEqual(row, expected)

    def test_csv_creator_date_parts_missing(self):
        """Test handling of issued without date-parts key from mocked API."""
        item = {
            "DOI": "10.1234/no-dateparts",
            "type": "journal-article",
            "title": ["Article without date-parts key"],
            "issued": {}
        }
        row = self._make_processor().csv_creator(item)

        expected = self._expected_row(
            id='doi:10.1234/no-dateparts',
            title='Article without date-parts key',
            type='journal article',
        )
        self.assertEqual(row, expected)

    def test_csv_creator_html_in_title(self):
        """Test HTML markup in title is cleaned (from mocked API structure)."""
        item = {
            "DOI": "10.1234/html-title",
            "type": "journal-article",
            "title": ["A study of <i>Escherichia coli</i> in <b>biofilms</b>"],
            "issued": {"date-parts": [[2024, 1, 15]]}
        }
        row = self._make_processor().csv_creator(item)

        expected = self._expected_row(
            id='doi:10.1234/html-title',
            title='A study of Escherichia coli in biofilms',
            pub_date='2024-1-15',
            type='journal article',
        )
        self.assertEqual(row, expected)

    def test_csv_creator_with_editor(self):
        """Test article with both author and editor from mocked API structure."""
        item = {
            "DOI": "10.1234/with-editor",
            "type": "edited-book",
            "title": ["Edited volume test"],
            "author": [{"given": "John", "family": "Doe", "sequence": "first"}],
            "editor": [{"given": "Jane", "family": "Smith", "sequence": "first"}],
            "issued": {"date-parts": [[2024, 6, 20]]}
        }
        row = self._make_processor().csv_creator(item)

        expected = self._expected_row(
            id='doi:10.1234/with-editor',
            title='Edited volume test',
            author='Doe, John',
            pub_date='2024-6-20',
            type='edited book',
            editor='Smith, Jane',
        )
        self.assertEqual(row, expected)

    def test_csv_creator_no_inplace_modification(self):
        """Test that csv_creator does not modify the input item dict."""
        item = {
            "DOI": "10.1234/with-editor",
            "type": "edited-book",
            "title": ["Edited volume test"],
            "author": [{"given": "John", "family": "Doe", "sequence": "first"}],
            "editor": [{"given": "Jane", "family": "Smith", "sequence": "first"}],
            "issued": {"date-parts": [[2024, 6, 20]]}
        }
        # Independent copies of the agent dicts, built before the call.
        original_author = {"given": "John", "family": "Doe", "sequence": "first"}
        original_editor = {"given": "Jane", "family": "Smith", "sequence": "first"}

        self._make_processor().csv_creator(item)

        self.assertEqual(item['author'][0], original_author)
        self.assertEqual(item['editor'][0], original_editor)

    def test_csv_creator_member_as_string(self):
        """Test that member field as string (API format) is handled."""
        item = {
            "DOI": "10.1001/test.12345",
            "type": "journal-article",
            "title": ["Test"],
            "publisher": "American Medical Association (AMA)",
            "member": "10",
            "prefix": "10.1001",
            "issued": {"date-parts": [[2024]]}
        }
        processor = self._make_processor(publishers_filepath=PUBLISHERS_MAPPING)
        row = processor.csv_creator(item)

        expected = self._expected_row(
            id='doi:10.1001/test.12345',
            title='Test',
            pub_date='2024',
            type='journal article',
            publisher='American Medical Association (AMA) [crossref:10]',
        )
        self.assertEqual(row, expected)

Coverage for test / crossref_processing_test.py: 100%

595 statements