Coverage for test / crossref_processing_test.py: 100%
595 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
1# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it>
2# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
3# SPDX-FileCopyrightText: 2025 Arianna Moretti <arianna.moretti4@unibo.it>
4#
5# SPDX-License-Identifier: ISC
7import json
8import os
9import unittest
11from oc_ds_converter.crossref.crossref_processing import CrossrefProcessing
12from oc_ds_converter.datasource.orcid_index import PublishersRedis
13from oc_ds_converter.lib.csvmanager import CSVManager
14from oc_ds_converter.lib.jsonmanager import load_json
# Fixture locations used throughout this module. The paths are relative, so
# they resolve against the working directory of the test run.
TEST_DIR = os.path.join("test", "crossref_processing")
JSON_FILE, TMP_SUPPORT_MATERIAL, IOD, DATA, PUBLISHERS_MAPPING = (
    os.path.join(TEST_DIR, name)
    for name in ("0.json", "tmp_support", "iod", "40228.json", "publishers.csv")
)
25class TestCrossrefProcessing(unittest.TestCase):
def test_extract_all_ids_cited(self):
    """Check the cited DOIs returned by ``extract_all_ids`` for the fixture.

    Every entry under ``items`` in ``JSON_FILE`` is passed to
    ``extract_all_ids`` with ``False`` as second argument. The first element
    of the result obtained for the last entry is compared, as a set, with
    the expected identifiers. The processor is built with the default
    constructor arguments.
    """
    c_processing = CrossrefProcessing()
    # Registered before any assertion so the storage is removed even when
    # the test fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)
    with open(JSON_FILE, encoding="utf8") as f:
        items = json.load(f)['items']
    for entity_dict in items:
        br = c_processing.extract_all_ids(entity_dict, False)[0]
    expected_br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }
    self.assertEqual(expected_br, set(br))
def test_extract_all_ids_cited_redis(self):
    """Check the cited DOIs returned by ``extract_all_ids`` with ``testing=True``.

    Same scenario as the default-constructor variant, but the processor is
    created with ``testing=True``. The first element of the result obtained
    for the last entry of ``JSON_FILE`` is compared, as a set, with the
    expected identifiers.
    """
    c_processing = CrossrefProcessing(testing=True)
    # Registered before any assertion so the storage is removed even when
    # the test fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)
    with open(JSON_FILE, encoding="utf8") as f:
        items = json.load(f)['items']
    for entity_dict in items:
        br = c_processing.extract_all_ids(entity_dict, False)[0]
    expected_br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }
    self.assertEqual(expected_br, set(br))
def test_get_redis_validity_list(self):
    """``get_redis_validity_list`` returns ``[]`` when no id was seeded.

    This test adds nothing to the processor's Redis indexes, so the list
    returned for the "br" identifiers is expected to be empty.
    """
    c_processing = CrossrefProcessing()
    # Cleanup is registered first so it also runs when the assertion fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)
    br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }
    self.assertEqual(c_processing.get_redis_validity_list(br, "br"), [])
def test_get_redis_validity_list_redis(self):
    """``get_redis_validity_list`` returns ``[]`` for "br" and "ra" ids.

    The processor is created with ``testing=True`` and nothing is seeded by
    this test, so both lookups are expected to return an empty list.
    """
    c_processing = CrossrefProcessing(testing=True)
    # Cleanup is registered first so it also runs when an assertion fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)
    br = {
        'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219', 'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21', 'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf', 'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c', 'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d', 'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6', 'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2', 'doi:10.2105/ajph.2009.162677',
    }
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}
    # Both lookups are performed before asserting, as in the original flow.
    br_valid_list = c_processing.get_redis_validity_list(br, "br")
    ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")
    self.assertEqual(br_valid_list, [])
    self.assertEqual(ra_valid_list, [])
def test_get_redis_validity_dict_w_fakeredis_db_values_sqlite(self):
    """Only ids seeded into the Redis indexes are reported as known.

    One DOI is added to ``BR_redis`` and one ORCID to ``RA_redis``; the
    validity lists returned for larger id sets must contain exactly those
    two identifiers. The processor uses the default constructor arguments.
    """
    c_processing = CrossrefProcessing()
    known_doi = 'doi:10.2105/ajph.2006.101626'
    known_orcid = 'orcid:0000-0002-8090-6886'
    # Cleanups run in reverse registration order (storage, BR key, RA key)
    # and are executed even when an assertion fails, so the seeded keys
    # cannot leak into other tests.
    self.addCleanup(c_processing.RA_redis.delete, known_orcid)
    self.addCleanup(c_processing.BR_redis.delete, known_doi)
    self.addCleanup(c_processing.storage_manager.delete_storage)

    c_processing.BR_redis.sadd(known_doi, "omid:1")
    c_processing.RA_redis.sadd(known_orcid, "omid:2")

    br = {'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
          'doi:10.1177/003335490812300219'}
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    br_valid_list = c_processing.get_redis_validity_list(br, "br")
    ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")
    self.assertEqual(br_valid_list, [known_doi])
    self.assertEqual(ra_valid_list, [known_orcid])
def test_get_redis_validity_dict_w_fakeredis_db_values_redis(self):
    """Only ids seeded into the Redis indexes are reported as known.

    Same scenario as the ``_sqlite`` variant, with the processor created
    using ``testing=True``.
    """
    c_processing = CrossrefProcessing(testing=True)
    known_doi = 'doi:10.2105/ajph.2006.101626'
    known_orcid = 'orcid:0000-0002-8090-6886'
    # Cleanups run in reverse registration order (storage, BR key, RA key)
    # and are executed even when an assertion fails, so the seeded keys
    # cannot leak into other tests.
    self.addCleanup(c_processing.RA_redis.delete, known_orcid)
    self.addCleanup(c_processing.BR_redis.delete, known_doi)
    self.addCleanup(c_processing.storage_manager.delete_storage)

    c_processing.BR_redis.sadd(known_doi, "omid:1")
    c_processing.RA_redis.sadd(known_orcid, "omid:2")

    br = {'doi:10.2105/ajph.2006.101626', 'doi:10.1001/jama.299.12.1471',
          'doi:10.1177/003335490812300219'}
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    br_valid_list = c_processing.get_redis_validity_list(br, "br")
    ra_valid_list = c_processing.get_redis_validity_list(ra, "ra")
    self.assertEqual(br_valid_list, [known_doi])
    self.assertEqual(ra_valid_list, [known_orcid])
def test_validated_as_default(self):
    """``validated_as`` returns ``None`` for an id never validated before.

    ``validated_as`` takes a dict with the keys "schema" and "identifier"
    and is expected to return:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    Only the last case is exercised here, with the default storage manager
    (sqlite, per the original description of this test) and no pre-existent
    db associated.
    """
    c_processing = CrossrefProcessing()
    # Registered first so the storage is removed even if the assertion fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)
    validate_as_none = c_processing.validated_as(
        {"schema": "doi", "identifier": "doi:10.1001/10-v4n2-hsf10003"}
    )
    self.assertIsNone(validate_as_none)
def test_validated_as_default_redis(self):
    """``validated_as`` returns ``None`` for an unknown id (``testing=True``).

    ``validated_as`` takes a dict with the keys "schema" and "identifier"
    and is expected to return:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    Only the last case is exercised here, with the redis storage manager
    and no pre-existent db associated.
    """
    c_processing = CrossrefProcessing(testing=True)
    # Registered first so the storage is removed even if the assertion fails.
    self.addCleanup(c_processing.storage_manager.delete_storage)
    validate_as_none = c_processing.validated_as(
        {"schema": "doi", "identifier": "doi:10.1001/10-v4n2-hsf10003"}
    )
    self.assertIsNone(validate_as_none)
def test_validated_as_redis_with_preexistent_data(self):
    """``validated_as`` reflects verdicts already stored for a DOI.

    ``validated_as`` takes a dict with the keys "schema" and "identifier"
    and is expected to return:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    All three cases are tested with the redis storage manager and
    pre-existent data associated.
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing_redis = CrossrefProcessing(testing=True)
    doi_storage = c_processing_redis.doi_m.storage_manager
    # Registered before seeding so the stored verdicts are removed even when
    # an assertion fails.
    self.addCleanup(doi_storage.delete_storage)

    # Verdicts are written directly on the DOI manager's storage manager.
    doi_storage.set_value(valid_doi_in_db["identifier"], True)
    doi_storage.set_value(invalid_doi_in_db["identifier"], False)

    validated_as_True = c_processing_redis.validated_as(valid_doi_in_db)
    validated_as_False = c_processing_redis.validated_as(invalid_doi_in_db)
    not_validated = c_processing_redis.validated_as(valid_doi_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)
def test_validated_as_inmemory(self):
    """``validated_as`` reflects verdicts already stored for a DOI.

    ``validated_as`` takes a dict with the keys "schema" and "identifier"
    and is expected to return:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before

    NOTE(review): the test name and its previous description refer to an
    "in Memory + Json" storage manager, but the processor is built with
    ``testing=True`` exactly like the redis-backed tests and no storage
    manager is injected. Confirm whether a dedicated storage manager should
    be passed in here.
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing = CrossrefProcessing(testing=True)
    doi_storage = c_processing.doi_m.storage_manager
    # Registered before seeding so the stored verdicts are removed even when
    # an assertion fails.
    self.addCleanup(doi_storage.delete_storage)

    doi_storage.set_value(valid_doi_in_db["identifier"], True)
    doi_storage.set_value(invalid_doi_in_db["identifier"], False)

    validated_as_True = c_processing.validated_as(valid_doi_in_db)
    validated_as_False = c_processing.validated_as(invalid_doi_in_db)
    not_validated = c_processing.validated_as(valid_doi_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)
def test_validated_as_redis(self):
    """``validated_as`` reflects verdicts already stored for a DOI.

    ``validated_as`` takes a dict with the keys "schema" and "identifier"
    and is expected to return:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested with the REDIS storage manager, both for ids
    that have a stored verdict and for one that has none.
    """
    valid_doi_not_in_db = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    valid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    invalid_doi_in_db = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    c_processing_redis = CrossrefProcessing(testing=True)
    doi_storage = c_processing_redis.doi_m.storage_manager
    # The verdicts are written on the DOI manager's storage, so that storage
    # is cleaned as well as the processor-level one the test originally
    # deleted. Both run even when an assertion fails.
    # NOTE(review): if the two attributes are the same object,
    # delete_storage() runs twice on it - confirm that is harmless.
    self.addCleanup(c_processing_redis.storage_manager.delete_storage)
    self.addCleanup(doi_storage.delete_storage)

    doi_storage.set_value(valid_doi_in_db["identifier"], True)
    doi_storage.set_value(invalid_doi_in_db["identifier"], False)

    validated_as_True = c_processing_redis.validated_as(valid_doi_in_db)
    validated_as_False = c_processing_redis.validated_as(invalid_doi_in_db)
    not_validated = c_processing_redis.validated_as(valid_doi_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)
def test_get_id_manager(self):
    """``get_id_manager`` resolves the ISSN manager from a schema or an id.

    Given either the bare schema string (``"issn"``) or a prefixed id
    (``"issn:0003-987X"``) plus the dictionary mapping schemas to their id
    managers, the method must return a manager that validates the ISSN.
    Note that each instance of the processing class needs its own instances
    of the id managers, in order to avoid conflicts while validating data.
    """
    c_processing = CrossrefProcessing()
    id_man_dict = c_processing.venue_id_man_dict
    issn_id = "issn:0003-987X"

    for lookup_key in (issn_id, "issn"):
        with self.subTest(lookup_key=lookup_key):
            manager = c_processing.get_id_manager(lookup_key, id_man_dict)
            # A unittest assertion is used for the real check because bare
            # ``assert`` statements are stripped under ``python -O``.
            self.assertIsNotNone(manager)
            assert manager is not None  # narrows the type for static checkers
            self.assertTrue(manager.is_valid(issn_id))
def test_csv_creator(self):
    """``csv_creator`` builds the expected rows for the ``DATA`` fixture.

    The DOIs found in the fixture are prefetched from the ORCID index
    first; then every item is converted and the non-empty rows are kept.
    The test checks the number of rows, the presence of three known ids,
    and the type and publisher of one specific row.
    """
    c_processing = CrossrefProcessing(orcid_index=IOD, publishers_filepath=None)
    data = load_json(DATA, None)  # type: ignore[arg-type]
    # A unittest assertion is used for the real check because bare
    # ``assert`` statements are stripped under ``python -O``.
    self.assertIsNotNone(data)
    assert data is not None  # narrows the type for static checkers

    items = data['items']
    dois_to_prefetch = [item.get("DOI") for item in items if item.get("DOI")]
    c_processing.prefetch_doi_orcid_index(dois_to_prefetch)

    # Items for which csv_creator returns a falsy value are discarded.
    output = [row for row in map(c_processing.csv_creator, items) if row]

    self.assertEqual(len(output), 11)
    output_ids = [row['id'] for row in output]
    for expected_id in ('doi:10.47886/9789251092637.ch7',
                        'doi:10.9799/ksfan.2012.25.1.069',
                        'doi:10.9799/ksfan.2012.25.1.105'):
        self.assertIn(expected_id, output_ids)

    first_item = next(row for row in output if row['id'] == 'doi:10.47886/9789251092637.ch7')
    self.assertEqual(first_item['type'], 'book chapter')
    self.assertEqual(first_item['publisher'], 'American Fisheries Society [crossref:460]')
def test_csv_creator_cited(self):
    """``csv_creator`` with ``citing=False`` yields id-only rows for references.

    Each reference carrying a "DOI" key in ``JSON_FILE`` is converted; the
    resulting rows are expected to hold the prefixed DOI in ``id`` and an
    empty string in every other field, in fixture order.
    """
    c_processing_cited = CrossrefProcessing(orcid_index=IOD, publishers_filepath=None, citing=False)
    with open(JSON_FILE, encoding="utf8") as f:
        result = json.load(f)

    output = []
    for item in result['items']:
        # Entities without citations contribute nothing.
        for reference_dict in item.get("reference") or ():
            if not reference_dict.get("DOI"):
                continue
            tabular_data = c_processing_cited.csv_creator(reference_dict)
            if tabular_data:
                output.append(tabular_data)

    cited_dois = [
        'doi:10.2105/ajph.2006.101626',
        'doi:10.1001/jama.299.12.1471',
        'doi:10.1177/003335490812300219',
        'doi:10.1089/bsp.2008.0020',
        'doi:10.1097/01.ccm.0000151067.76074.21',
        'doi:10.1177/003335490912400218',
        'doi:10.1097/dmp.0b013e31817196bf',
        'doi:10.1056/nejmsa021807',
        'doi:10.1097/dmp.0b013e31819d977c',
        'doi:10.1097/dmp.0b013e31819f1ae2',
        'doi:10.1097/dmp.0b013e318194898d',
        'doi:10.1378/chest.07-2693',
        'doi:10.1016/s0196-0644(99)70224-6',
        'doi:10.1097/01.ccm.0000151072.17826.72',
        'doi:10.1097/01.bcr.0000155527.76205.a2',
        'doi:10.2105/ajph.2009.162677',
    ]
    blank_fields = ('title', 'author', 'pub_date', 'venue', 'volume', 'issue',
                    'page', 'type', 'publisher', 'editor')
    expected_output = [{'id': doi, **dict.fromkeys(blank_fields, '')} for doi in cited_dois]
    self.assertEqual(output, expected_output)
def test_get_pages(self):
    """A plain numeric page range is returned unchanged."""
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(crossref_processor.get_crossref_pages({'page': '469-476'}), '469-476')
def test_get_pages_right_letter(self):
    """A single lettered page ('G22') is expanded to the range 'G22-G22'."""
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(crossref_processor.get_crossref_pages({'page': 'G22'}), 'G22-G22')
def test_get_pages_wrong_letter(self):
    """For '583b-584' the expected page range is '583-584'."""
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(crossref_processor.get_crossref_pages({'page': '583b-584'}), '583-584')
def test_get_pages_roman_letters(self):
    """A range written in roman numerals ('iv-l') is returned unchanged."""
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(crossref_processor.get_crossref_pages({'page': 'iv-l'}), 'iv-l')
def test_get_pages_non_roman_letters(self):
    """For the non-roman letters 'kj-hh' the expected result is an empty string."""
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(crossref_processor.get_crossref_pages({'page': 'kj-hh'}), '')
def test_load_publishers_mapping(self):
    """``load_publishers_mapping`` maps each member id to its name and prefixes.

    The mapping is loaded from ``PUBLISHERS_MAPPING`` through the class
    itself (no instance is created); prefixes are expected as sets.
    """
    rows = (
        ('1', 'Annals of Family Medicine', {'10.1370'}),
        ('2', 'American Association of Petroleum Geologists AAPG/Datapages', {'10.15530', '10.1306'}),
        ('3', 'American Association of Physics Teachers (AAPT)', {'10.1119'}),
        ('6', 'American College of Medical Physics (ACMP)', {'10.1120'}),
        ('9', 'Allen Press', {'10.1043'}),
        ('10', 'American Medical Association (AMA)', {'10.1001'}),
        ('11', 'American Economic Association', {'10.1257'}),
        ('460', 'American Fisheries Society', {'10.1577', '10.47886'}),
    )
    expected_output = {
        member: {'name': name, 'prefixes': prefixes}
        for member, name, prefixes in rows
    }
    output = CrossrefProcessing.load_publishers_mapping(publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(output, expected_output)
def test_get_publisher_name(self):
    """The item's member is in the publishers' mapping.

    The expected name is the publisher followed by its Crossref member id
    in square brackets.
    """
    doi = '10.47886/9789251092637.ch7'
    item = dict(
        publisher='American Fisheries Society',
        DOI='10.47886/9789251092637.ch7',
        prefix='10.47886',
        member='460',
    )
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        crossref_processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )
def test_get_publisher_name_no_member(self):
    """The item has no member, but its DOI prefix is in the publishers' mapping.

    The expected name still carries the Crossref member id in square
    brackets.
    """
    doi = '10.47886/9789251092637.ch7'
    item = dict(
        publisher='American Fisheries Society',
        DOI='10.47886/9789251092637.ch7',
        prefix='10.47886',
    )
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        crossref_processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )
def test_get_publisher_name_redis_by_member(self):
    """Publisher lookup through ``PublishersRedis`` when the item has a member.

    Publisher "460" is registered in a ``PublishersRedis(testing=True)``
    instance, which is then assigned to the processor's
    ``_publishers_redis`` attribute before the lookup.
    """
    publishers_redis = PublishersRedis(testing=True)
    publishers_redis.set_publisher("460", "American Fisheries Society", {"10.47886"})

    doi = '10.47886/9789251092637.ch7'
    item = dict(
        publisher='American Fisheries Society',
        DOI='10.47886/9789251092637.ch7',
        prefix='10.47886',
        member='460',
    )
    crossref_processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    crossref_processor._publishers_redis = publishers_redis
    self.assertEqual(
        crossref_processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )
def test_get_publisher_name_redis_by_prefix(self):
    """Publisher lookup through ``PublishersRedis`` when the item has no member.

    Publisher "460" is registered with the prefix "10.47886"; the item only
    carries that prefix, and the expected name still includes the member id.
    """
    publishers_redis = PublishersRedis(testing=True)
    publishers_redis.set_publisher("460", "American Fisheries Society", {"10.47886"})

    doi = '10.47886/9789251092637.ch7'
    item = dict(
        publisher='American Fisheries Society',
        DOI='10.47886/9789251092637.ch7',
        prefix='10.47886',
    )
    crossref_processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    crossref_processor._publishers_redis = publishers_redis
    self.assertEqual(
        crossref_processor.get_publisher_name(doi, item),
        'American Fisheries Society [crossref:460]',
    )
def test_get_publisher_name_redis_not_found(self):
    """With nothing registered in ``PublishersRedis`` the item's own name is used.

    The expected result is the item's ``publisher`` string, without any
    Crossref member id appended.
    """
    publishers_redis = PublishersRedis(testing=True)

    doi = '10.9999/unknown'
    item = dict(
        publisher='Unknown Publisher',
        DOI='10.9999/unknown',
        prefix='10.9999',
    )
    crossref_processor = CrossrefProcessing(
        orcid_index=None,
        publishers_filepath=None,
        use_redis_publishers=True,
        testing=True,
    )
    crossref_processor._publishers_redis = publishers_redis
    self.assertEqual(crossref_processor.get_publisher_name(doi, item), 'Unknown Publisher')
def test_get_venue_name(self):
    """A container title that already contains square brackets is returned as is.

    The item has no venue identifier, and the row type is 'journal article'.
    """
    blank_fields = ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page')
    row = {**dict.fromkeys(blank_fields, ''), 'type': 'journal article', 'publisher': '', 'editor': ''}
    item = {'container-title': ['Cerebrospinal Fluid [Working Title]']}
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        crossref_processor.get_venue_name(item, row),
        'Cerebrospinal Fluid [Working Title]',
    )
def test_get_venue_name_with_ISSN(self):
    """The item's ISSNs are appended to the venue name in square brackets."""
    blank_fields = ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page')
    row = {**dict.fromkeys(blank_fields, ''), 'type': 'journal article', 'publisher': '', 'editor': ''}
    item = {
        "container-title": ["Disaster Medicine and Public Health Preparedness"],
        "ISSN": ["1935-7893", "1938-744X"],
    }
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    self.assertEqual(
        crossref_processor.get_venue_name(item, row),
        'Disaster Medicine and Public Health Preparedness [issn:1935-7893 issn:1938-744X]',
    )
def test_find_crossref_orcid(self):
    """Test that, given in input a string representing an ORCID, the method returns:
    - the ORCID itself (with the "orcid:" prefix) if it is valid
    - an empty string if it is not valid
    The procedure is tested with:
    - a valid ORCID
    - an invalid ORCID
    - a non-string input
    - an ORCID whose verdict (False, then True) is already in storage
    """
    test_doi = "10.1234/test123"

    c_processing = CrossrefProcessing()
    try:
        # Valid ORCID
        out = c_processing.find_crossref_orcid('0000-0003-4082-1500', test_doi)
        self.assertEqual(out, "orcid:0000-0003-4082-1500")

        # Invalid ORCID
        out_invalid_id = c_processing.find_crossref_orcid('5500-0001-9759-3938', test_doi)
        self.assertEqual(out_invalid_id, "")

        # Non-string input
        out_non_string = c_processing.find_crossref_orcid(None, test_doi)
        self.assertEqual(out_non_string, "")
    finally:
        # Storage is removed even when one of the assertions above fails.
        c_processing.orcid_m.storage_manager.delete_storage()

    # Store a verdict for the id beforehand; per the original comment the
    # intent is to check that the api check is avoided when the info is
    # already in storage. Each verdict uses its own processor, and its
    # storage is deleted before the next one is created.
    stored_orcid = "orcid:0000-0001-9759-3938"
    for stored_verdict, expected in ((False, ""), (True, stored_orcid)):
        c_processing = CrossrefProcessing(testing=True)
        try:
            c_processing.orcid_m.storage_manager.set_value(stored_orcid, stored_verdict)
            out = c_processing.find_crossref_orcid('0000-0001-9759-3938', test_doi)
            self.assertEqual(out, expected)
        finally:
            c_processing.orcid_m.storage_manager.delete_storage()
def test_report_series_venue_id(self):
    """For a report series with a container title, the ISSN goes on the venue.

    The expected row keeps only the DOI in ``id`` and shows the ISSN in
    square brackets after the venue name.
    """
    items = {'items': [{
        'DOI': '10.1007/978-3-030-00668-6_8',
        'container-title': ["troitel'stvo: nauka i obrazovanie [Construction: Science and Education]"],
        'ISSN': '2305-5502',
        'type': 'report-series'
    }]}
    # A single processor is built: the test previously created another one
    # (with orcid_index=IOD) that was overwritten before being used.
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    output = [crossref_processor.csv_creator(item) for item in items['items']]
    expected_output = [{'id': 'doi:10.1007/978-3-030-00668-6_8', 'title': '', 'author': '', 'pub_date': '', 'venue': "troitel'stvo: nauka i obrazovanie [Construction: Science and Education] [issn:2305-5502]", 'volume': '', 'issue': '', 'page': '', 'type': 'report series', 'publisher': '', 'editor': ''}]
    self.assertEqual(output, expected_output)
def test_report_series_br_id(self):
    """For a report series without a container title, the ISSN goes on the id.

    The expected row lists the ISSN next to the DOI in ``id`` and leaves
    the venue empty.
    """
    items = {'items': [{
        'DOI': '10.1007/978-3-030-00668-6_8',
        'container-title': [],
        'ISSN': '2305-5502',
        'type': 'report-series'
    }]}
    # A single processor is built: the test previously created another one
    # (with orcid_index=IOD) that was overwritten before being used.
    crossref_processor = CrossrefProcessing(orcid_index=None, publishers_filepath=PUBLISHERS_MAPPING)
    output = [crossref_processor.csv_creator(item) for item in items['items']]
    expected_output = [{'id': 'doi:10.1007/978-3-030-00668-6_8 issn:2305-5502', 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': 'report series', 'publisher': '', 'editor': ''}]
    self.assertEqual(output, expected_output)
def test_get_agents_strings_list(self):
    """Authors are rendered as 'Family, Given', with an ORCID where one is expected.

    Of the four authors, only 'Choi, Mi-Kyeong' is expected to come back
    with an ORCID, after the DOI has been prefetched from the ORCID index.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    names = (
        ('Myung-Hee', 'Kim'),
        ('Jin-Seon', 'Seo'),
        ('Mi-Kyeong', 'Choi'),
        ('Eun-Young', 'Kim'),
    )
    authors_list = [
        {'given': given, 'family': family, 'affiliation': [], "role": "author"}
        for given, family in names
    ]
    crossref_processor = CrossrefProcessing(IOD)
    crossref_processor.prefetch_doi_orcid_index([doi])
    authors_strings_list, _ = crossref_processor.get_agents_strings_list(doi, authors_list)
    expected_authors_list = [
        'Kim, Myung-Hee',
        'Seo, Jin-Seon',
        'Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]',
        'Kim, Eun-Young',
    ]
    self.assertEqual(authors_strings_list, expected_authors_list)
def test_get_agents_strings_list_same_family(self):
    """Two authors have the same family name and the same given name initials.

    Only 'Choi, Mi-Kyeong' is expected to receive the ORCID.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    authors_list = [
        {'given': given, 'family': 'Choi', 'affiliation': [], "role": "author"}
        for given in ('Mi-Kyeong', 'Mi-Hong')
    ]
    crossref_processor = CrossrefProcessing(IOD)
    crossref_processor.prefetch_doi_orcid_index([doi])
    authors_strings_list, _ = crossref_processor.get_agents_strings_list(doi, authors_list)
    self.assertEqual(
        authors_strings_list,
        ['Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]', 'Choi, Mi-Hong'],
    )
def test_get_agents_strings_list_homonyms(self):
    """Two authors have the same family name and the same given name.

    Neither of the two is expected to carry an ORCID in the output. No
    prefetch of the ORCID index is performed in this test.
    """
    authors_list = [
        {'given': 'Mi-Kyeong', 'family': 'Choi', 'affiliation': [], "role": "author"}
        for _ in range(2)
    ]
    crossref_processor = CrossrefProcessing(IOD)
    authors_strings_list, _ = crossref_processor.get_agents_strings_list(
        '10.9799/ksfan.2012.25.1.105', authors_list
    )
    self.assertEqual(authors_strings_list, ['Choi, Mi-Kyeong', 'Choi, Mi-Kyeong'])
def test_get_agents_strings_list_inverted_names(self):
    """One author with an ORCID has as a name the surname of another.

    Both authors are expected in the output without an ORCID. No prefetch
    of the ORCID index is performed in this test.
    """
    names = (('Choi', 'Mi-Kyeong'), ('Mi-Hong', 'Choi'))
    authors_list = [
        {'given': given, 'family': family, 'affiliation': [], "role": "author"}
        for given, family in names
    ]
    crossref_processor = CrossrefProcessing(IOD)
    authors_strings_list, _ = crossref_processor.get_agents_strings_list(
        '10.9799/ksfan.2012.25.1.105', authors_list
    )
    self.assertEqual(authors_strings_list, ['Mi-Kyeong, Choi', 'Choi, Mi-Hong'])
def test_get_agents_strings_list_overlapping_surnames(self):
    """The surname of one author is included in the surname of another.

    The ORCID index is replaced by a ``CSVManager`` whose data holds a
    single entry for 'Malek, Sri Nurestri Abdul'. That person appears both
    as author and as editor and is expected to carry the ORCID in both
    lists; no other agent is expected to receive it.
    """
    doi = '10.9799/ksfan.2012.25.1.105'
    agents = (
        ("Puvaneswari", "Paravamsivam", "first", "author"),
        ("Chua Kek", "Heng", "additional", "author"),
        ("Sri Nurestri Abdul", "Malek", "additional", "author"),
        ("Vikineswary", "Sabaratnam", "additional", "author"),
        ("Ravishankar Ram", "M", "additional", "author"),
        ("Sri Nurestri Abdul", "Malek", "additional", "editor"),
        ("Umah Rani", "Kuppusamy", "additional", "author"),
    )
    authors_list = [
        {"given": given, "family": family, "sequence": sequence, "affiliation": [], "role": role}
        for given, family, sequence, role in agents
    ]

    crossref_processor = CrossrefProcessing(None)
    csv_manager = CSVManager()
    csv_manager.data = {'doi:10.9799/ksfan.2012.25.1.105': {'Malek, Sri Nurestri Abdul [0000-0001-6278-8559]'}}
    crossref_processor.orcid_index = csv_manager
    crossref_processor.prefetch_doi_orcid_index([doi])

    agents_strings = crossref_processor.get_agents_strings_list(doi, authors_list)
    authors_strings_list, editors_strings_list = agents_strings

    expected_authors_list = [
        'Paravamsivam, Puvaneswari',
        'Heng, Chua Kek',
        'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]',
        'Sabaratnam, Vikineswary',
        'M, Ravishankar Ram',
        'Kuppusamy, Umah Rani',
    ]
    expected_editors_list = ['Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]']
    self.assertEqual(
        (authors_strings_list, editors_strings_list),
        (expected_authors_list, expected_editors_list),
    )
def test_id_worker(self):
    """id_worker must fill the given list with prefixed ids for a string field and a list field."""
    issn_ids, isbn_ids = [], []
    # The ISSN field is a plain string, the ISBN field a one-element list.
    CrossrefProcessing.id_worker('ISSN 1050-124X', issn_ids, CrossrefProcessing.issn_worker)
    CrossrefProcessing.id_worker(['978-1-56619-909-4'], isbn_ids, CrossrefProcessing.isbn_worker)
    self.assertEqual((issn_ids, isbn_ids), (['issn:1050-124X'], ['isbn:9781566199094']))
def test_to_validated_id_list(self):
    """Run to_validated_id_list on four scenarios, each with a fresh processor.

    The storage deletions between cases are kept in their original
    positions; case 2 performs none.
    """
    # Case 1: DOI expected to come back as valid.
    processor = CrossrefProcessing(testing=True)
    valid_doi = {'id': 'doi:10.13039/100005522', 'schema': 'doi'}
    self.assertEqual(processor.to_validated_id_list(valid_doi), ['doi:10.13039/100005522'])
    processor.doi_m.storage_manager.delete_storage()

    # Case 2: DOI expected to be rejected (empty list).
    processor = CrossrefProcessing(testing=True)
    invalid_doi = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    self.assertEqual(processor.to_validated_id_list(invalid_doi), [])

    # Case 3: ORCID expected to come back as valid.
    processor = CrossrefProcessing(testing=True)
    valid_orcid = {'id': 'orcid:0000-0003-4082-1500', 'schema': 'orcid'}
    self.assertEqual(processor.to_validated_id_list(valid_orcid), ['orcid:0000-0003-4082-1500'])
    processor.orcid_m.storage_manager.delete_storage()

    # Case 4: the DOI rejected in case 2 is accepted once listed in _redis_values_br.
    processor = CrossrefProcessing(testing=True)
    snapshot_doi = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    processor._redis_values_br.append(snapshot_doi['id'])
    self.assertEqual(processor.to_validated_id_list(snapshot_doi), ['doi:10.1089/bsp.2008.002'])
    # The acceptance must also be recorded in the temporary DOI storage.
    stored_flag = processor.tmp_doi_m.storage_manager.get_value('doi:10.1089/bsp.2008.002')
    self.assertEqual(stored_flag, True)
    processor.doi_m.storage_manager.delete_storage()
def test_to_validated_id_list_redis(self):
    """Run the same four to_validated_id_list scenarios as test_to_validated_id_list.

    NOTE(review): the statements here are identical to those of
    test_to_validated_id_list; confirm whether a Redis-specific setup
    was intended.
    """
    # Case 1: valid DOI.
    first = CrossrefProcessing(testing=True)
    doi_ok = {'id': 'doi:10.13039/100005522', 'schema': 'doi'}
    got = first.to_validated_id_list(doi_ok)
    self.assertEqual(got, ['doi:10.13039/100005522'])
    first.doi_m.storage_manager.delete_storage()

    # Case 2: invalid DOI, nothing is returned (and no storage deletion follows).
    second = CrossrefProcessing(testing=True)
    doi_bad = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    got = second.to_validated_id_list(doi_bad)
    self.assertEqual(got, [])

    # Case 3: valid ORCID.
    third = CrossrefProcessing(testing=True)
    orcid_ok = {'id': 'orcid:0000-0003-4082-1500', 'schema': 'orcid'}
    got = third.to_validated_id_list(orcid_ok)
    self.assertEqual(got, ['orcid:0000-0003-4082-1500'])
    third.orcid_m.storage_manager.delete_storage()

    # Case 4: invalid DOI that is present in _redis_values_br is accepted.
    fourth = CrossrefProcessing(testing=True)
    doi_in_snapshot = {'id': 'doi:10.1089/bsp.2008.002', 'schema': 'doi'}
    fourth._redis_values_br.append(doi_in_snapshot['id'])
    got = fourth.to_validated_id_list(doi_in_snapshot)
    self.assertEqual(got, ['doi:10.1089/bsp.2008.002'])
    tmp_value = fourth.tmp_doi_m.storage_manager.get_value('doi:10.1089/bsp.2008.002')
    self.assertEqual(tmp_value, True)
    fourth.doi_m.storage_manager.delete_storage()
def test_find_crossref_orcid_with_index(self):
    """Check find_crossref_orcid for an indexed ORCID, a non-indexed one and an invalid one."""
    test_doi = "10.1234/test123"
    test_doi_prefixed = "doi:10.1234/test123"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    cp = CrossrefProcessing(testing=True)
    # Register the cleanup immediately so the storage is deleted even if
    # one of the assertions below fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    # Seed the index under the prefixed DOI, then prefetch with the bare DOI.
    cp.orcid_index.add_value(test_doi_prefixed, f"{test_name} [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index([test_doi])

    # Case 1: ORCID found in the index.
    out_1 = cp.find_crossref_orcid(test_orcid, test_doi)
    self.assertEqual(out_1, f"orcid:{test_orcid}")
    # It must also have been written to the temporary storage.
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

    # Case 2: ORCID not in the index but expected to validate via the API.
    out_2 = cp.find_crossref_orcid("0000-0003-4082-1500", test_doi)
    self.assertEqual(out_2, "orcid:0000-0003-4082-1500")

    # Case 3: ORCID not in the index and invalid: empty string is returned.
    out_3 = cp.find_crossref_orcid("0000-0000-0000-0000", test_doi)
    self.assertEqual(out_3, "")
def test_find_crossref_orcid_api_disabled_not_in_index(self):
    """API OFF + empty index: a syntactically valid ORCID must NOT be resolved."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    test_doi = "10.9999/noindex"
    candidate = "0000-0003-4082-1500"  # syntactically valid

    out = cp.find_crossref_orcid(candidate, test_doi)
    self.assertEqual(out, "")
    # The rejected ORCID must not have been written to the temporary storage.
    self.assertIsNone(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{candidate}"))
def test_find_crossref_orcid_api_disabled_from_index(self):
    """API OFF + present in DOI→ORCID index: must resolve and be saved in tmp storage."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    test_doi = "10.1234/test"
    test_doi_prefixed = "doi:10.1234/test"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    # Index entry is keyed by the prefixed DOI; prefetch uses the bare DOI.
    cp.orcid_index.add_value(test_doi_prefixed, f"{test_name} [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index([test_doi])

    out = cp.find_crossref_orcid(test_orcid, test_doi)
    self.assertEqual(out, f"orcid:{test_orcid}")
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))
def test_find_crossref_orcid_api_disabled_in_storage(self):
    """API OFF + ORCID already valid in persistent storage: must be accepted."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Registered before the storage is written so it is always removed,
    # including when the assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    oid = "orcid:0000-0003-4082-1500"
    cp.orcid_m.storage_manager.set_value(oid, True)  # mark valid
    # find_crossref_orcid receives the bare ORCID (prefix stripped).
    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/any")
    self.assertEqual(out, oid)
def test_find_crossref_orcid_api_disabled_from_redis_snapshot(self):
    """API OFF + empty index/storage, but ORCID present in Redis snapshot: accept and seed tmp storage."""
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    oid = "orcid:0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[oid])  # emulate per-chunk snapshot

    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/noindex")
    self.assertEqual(out, oid)
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(oid))
def test_find_crossref_orcid_api_enabled_invalid_in_storage(self):
    """API ON + ORCID explicitly invalid in storage: reject immediately (no API/index)."""
    cp = CrossrefProcessing(use_orcid_api=True, testing=True)
    # Registered before the storage is written so it is always removed,
    # including when the assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    oid = "orcid:0000-0002-9286-2630"
    cp.orcid_m.storage_manager.set_value(oid, False)  # mark invalid
    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/anything")
    self.assertEqual(out, "")
def test_find_crossref_orcid_api_enabled_from_redis_snapshot(self):
    """API ON + empty storage/index, but ORCID present in Redis snapshot: accept without API call."""
    cp = CrossrefProcessing(use_orcid_api=True, testing=True)
    # Guarantee storage removal even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    oid = "orcid:0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[oid])

    out = cp.find_crossref_orcid(oid.split(":")[1], "10.9999/noindex")
    self.assertEqual(out, oid)
    # The accepted ORCID must be recorded in the temporary storage.
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(oid))
def test_get_agents_strings_list_api_disabled_no_index(self):
    """API OFF + empty index: ORCIDs provided in agent dict MUST NOT be appended to the author string."""
    agents_list = [
        {
            "given": "Jane",
            "family": "Doe",
            "role": "author",
            "ORCID": "https://orcid.org/0000-0003-4082-1500",  # present in metadata
        }
    ]
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    authors_strings, editors_strings = cp.get_agents_strings_list("10.9999/noindex", agents_list)
    self.assertEqual(authors_strings, ["Doe, Jane"])  # no [orcid:...] tag
    self.assertEqual(editors_strings, [])
def test_get_agents_strings_list_api_disabled_index_requires_prefixed_doi(self):
    """
    API OFF + DOI→ORCID index populated under a prefixed DOI key (doi:...).
    The DOI handed to get_agents_strings_list has no prefix: the function must
    normalize it before querying the index, otherwise the ORCID is not found.
    """
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    # Index populated with a **prefixed** DOI
    doi_pref = "doi:10.1234/test-idx"
    test_orcid = "0000-0002-9999-8888"
    cp.orcid_index.add_value(doi_pref, f"Smith, John [orcid:{test_orcid}]")  # type: ignore[attr-defined]
    cp.prefetch_doi_orcid_index(["10.1234/test-idx"])

    # Author without ORCID in the metadata; DOI passed **without prefix**
    agents = [{
        "given": "John",
        "family": "Smith",
        "role": "author"
    }]

    authors, editors = cp.get_agents_strings_list("10.1234/test-idx", agents)
    # Must resolve through the index and append the [orcid:...] tag
    self.assertEqual(authors, ["Smith, John [orcid:0000-0002-9999-8888]"])
    self.assertEqual(editors, [])
def test_find_crossref_orcid_api_disabled_redis_snapshot_unprefixed_orcid(self):
    """
    API OFF + empty index + empty storage, but the Redis snapshot holds the ORCID **without prefix**.
    The function must recognize it (by normalizing) and validate it.
    """
    cp = CrossrefProcessing(use_orcid_api=False, testing=True)
    # Guarantee storage removal even when an assertion fails.
    self.addCleanup(cp.orcid_m.storage_manager.delete_storage)

    # Redis snapshot with an **unprefixed** ORCID
    raw_orcid = "0000-0003-4082-1500"
    cp.update_redis_values(br=[], ra=[raw_orcid])

    out = cp.find_crossref_orcid(raw_orcid, "10.9999/noindex")
    self.assertEqual(out, f"orcid:{raw_orcid}")
    self.assertTrue(cp.tmp_orcid_m.storage_manager.get_value(f"orcid:{raw_orcid}"))
def test_update_redis_values_normalizes_inputs(self):
    """
    update_redis_values must always normalize:
      - DOI   → with the 'doi:' prefix
      - ORCID → with the 'orcid:' prefix
    and drop entries that cannot be normalized.
    """
    cp = CrossrefProcessing(testing=True)
    # Guarantee storage removal even when an assertion fails.
    self.addCleanup(cp.storage_manager.delete_storage)

    # Each list mixes an unprefixed id, a prefixed id and a malformed value.
    cp.update_redis_values(
        br=["10.1001/jama.299.12.1471", "doi:10.2105/ajph.2006.101626", "xxx-bad"],
        ra=["0000-0002-1234-5678", "orcid:0000-0003-4082-1500", "bad-orcid"]
    )

    # Everything is normalized (and the malformed values are discarded)
    self.assertIn("doi:10.1001/jama.299.12.1471", cp._redis_values_br)
    self.assertIn("doi:10.2105/ajph.2006.101626", cp._redis_values_br)
    self.assertNotIn("xxx-bad", cp._redis_values_br)

    self.assertIn("orcid:0000-0002-1234-5678", cp._redis_values_ra)
    self.assertIn("orcid:0000-0003-4082-1500", cp._redis_values_ra)
    self.assertNotIn("bad-orcid", cp._redis_values_ra)
def test_validated_as_with_storage_manager(storage_manager):
    """validated_as must return the stored flag for known DOIs and None for an unknown one.

    ``storage_manager`` is supplied by the caller (presumably a pytest
    fixture; verify against conftest.py) and passed to CrossrefProcessing.
    """
    known_valid = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    known_invalid = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}
    never_stored = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}

    processor = CrossrefProcessing(storage_manager=storage_manager, testing=True)
    # Seed the DOI storage with one valid and one invalid identifier.
    processor.doi_m.storage_manager.set_value(known_valid["identifier"], True)
    processor.doi_m.storage_manager.set_value(known_invalid["identifier"], False)

    assert processor.validated_as(known_valid) is True
    assert processor.validated_as(known_invalid) is False
    assert processor.validated_as(never_stored) is None
1001class TestCrossrefProcessingWithMockedAPI(unittest.TestCase):
1002 """Integration tests using mocked Crossref API responses from conftest.py."""
def test_csv_creator_nature_article(self):
    """csv_creator on a Nature journal article (doi:10.1038/nature12373).

    The expected row joins the three authors with '; ', lists both ISSNs
    in the venue and tags the publisher with the item's ``member`` value.
    """
    item = {
        "DOI": "10.1038/nature12373",
        "type": "journal-article",
        "title": ["Nanometre-scale thermometry in a living cell"],
        "author": [
            {"given": "G.", "family": "Kucsko", "sequence": "first"},
            {"given": "P. C.", "family": "Maurer", "sequence": "additional"},
            {"given": "M. D.", "family": "Lukin", "sequence": "additional"}
        ],
        "container-title": ["Nature"],
        "volume": "500",
        "issue": "7460",
        "page": "54-58",
        "issued": {"date-parts": [[2013, 7, 31]]},
        "ISSN": ["0028-0836", "1476-4687"],
        "publisher": "Springer Science and Business Media LLC",
        "member": "297",
        "prefix": "10.1038"
    }
    processor = CrossrefProcessing(testing=True)
    # Guarantee storage removal even when the assertion fails.
    self.addCleanup(processor.storage_manager.delete_storage)
    row = processor.csv_creator(item)

    expected = {
        'id': 'doi:10.1038/nature12373',
        'title': 'Nanometre-scale thermometry in a living cell',
        'author': 'Kucsko, G.; Maurer, P. C.; Lukin, M. D.',
        'pub_date': '2013-7-31',
        'venue': 'Nature [issn:0028-0836 issn:1476-4687]',
        'volume': '500',
        'issue': '7460',
        'page': '54-58',
        'type': 'journal article',
        'publisher': 'Springer Science and Business Media LLC [crossref:297]',
        'editor': ''
    }
    self.assertEqual(row, expected)
def test_csv_creator_plos_with_orcid_url(self):
    """csv_creator on a PLOS article whose second author carries an ORCID in URL form.

    The expected row shows the ORCID as an ``[orcid:...]`` tag and the
    single page value 'e0284601' rendered as 'e0284601-e0284601'.
    """
    item = {
        "DOI": "10.1371/journal.pone.0284601",
        "type": "journal-article",
        "title": ["Biochemical evaluation of vaccination in rats"],
        "author": [
            {"given": "Mahsa", "family": "Teymoorzadeh", "sequence": "first"},
            {"given": "Razieh", "family": "Yazdanparast", "sequence": "additional",
             "ORCID": "https://orcid.org/0000-0003-0530-4305", "authenticated-orcid": True}
        ],
        "container-title": ["PLOS ONE"],
        "volume": "18",
        "issue": "5",
        "page": "e0284601",
        "issued": {"date-parts": [[2023, 5, 4]]},
        "ISSN": ["1932-6203"],
        "publisher": "Public Library of Science (PLoS)"
    }
    processor = CrossrefProcessing(testing=True)
    # Guarantee storage removal even when the assertion fails.
    self.addCleanup(processor.storage_manager.delete_storage)
    row = processor.csv_creator(item)

    expected = {
        'id': 'doi:10.1371/journal.pone.0284601',
        'title': 'Biochemical evaluation of vaccination in rats',
        'author': 'Teymoorzadeh, Mahsa; Yazdanparast, Razieh [orcid:0000-0003-0530-4305]',
        'pub_date': '2023-5-4',
        'venue': 'PLOS ONE [issn:1932-6203]',
        'volume': '18',
        'issue': '5',
        'page': 'e0284601-e0284601',
        'type': 'journal article',
        'publisher': 'Public Library of Science (PLoS)',
        'editor': ''
    }
    self.assertEqual(row, expected)
def test_csv_creator_book_chapter_multiple_containers(self):
    """csv_creator on a book chapter that lists two container titles.

    The expected venue uses the first container title followed by both ISBNs.
    """
    item = {
        "DOI": "10.1007/978-3-030-00668-6_8",
        "type": "book-chapter",
        "title": ["The SPAR Ontologies"],
        "author": [
            {"given": "Silvio", "family": "Peroni", "sequence": "first"},
            {"given": "David", "family": "Shotton", "sequence": "additional"}
        ],
        "container-title": ["Lecture Notes in Computer Science", "The Semantic Web – ISWC 2018"],
        "page": "119-136",
        "issued": {"date-parts": [[2018]]},
        "ISBN": ["9783030006679", "9783030006686"],
        "publisher": "Springer International Publishing"
    }
    processor = CrossrefProcessing(testing=True)
    # Guarantee storage removal even when the assertion fails.
    self.addCleanup(processor.storage_manager.delete_storage)
    row = processor.csv_creator(item)

    expected = {
        'id': 'doi:10.1007/978-3-030-00668-6_8',
        'title': 'The SPAR Ontologies',
        'author': 'Peroni, Silvio; Shotton, David',
        'pub_date': '2018',
        'venue': 'Lecture Notes in Computer Science [isbn:9783030006679 isbn:9783030006686]',
        'volume': '',
        'issue': '',
        'page': '119-136',
        'type': 'book chapter',
        'publisher': 'Springer International Publishing',
        'editor': ''
    }
    self.assertEqual(row, expected)
def test_csv_creator_date_parts_null(self):
    """csv_creator must yield an empty pub_date when date-parts is ``[[None]]``."""
    item = {
        "DOI": "10.1234/null-date",
        "type": "journal-article",
        "title": ["Article with null date"],
        "issued": {"date-parts": [[None]]}
    }
    processor = CrossrefProcessing(testing=True)
    # Guarantee storage removal even when the assertion fails.
    self.addCleanup(processor.storage_manager.delete_storage)
    row = processor.csv_creator(item)

    # Every field absent from the item is expected as an empty string.
    expected = {
        'id': 'doi:10.1234/null-date',
        'title': 'Article with null date',
        'author': '',
        'pub_date': '',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'journal article',
        'publisher': '',
        'editor': ''
    }
    self.assertEqual(row, expected)
def test_csv_creator_date_parts_empty(self):
    """csv_creator must yield an empty pub_date when date-parts is ``[[]]``."""
    item = {
        "DOI": "10.1234/empty-date",
        "type": "journal-article",
        "title": ["Article with empty date-parts"],
        "issued": {"date-parts": [[]]}
    }
    processor = CrossrefProcessing(testing=True)
    # Guarantee storage removal even when the assertion fails.
    self.addCleanup(processor.storage_manager.delete_storage)
    row = processor.csv_creator(item)

    # Every field absent from the item is expected as an empty string.
    expected = {
        'id': 'doi:10.1234/empty-date',
        'title': 'Article with empty date-parts',
        'author': '',
        'pub_date': '',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'journal article',
        'publisher': '',
        'editor': ''
    }
    self.assertEqual(row, expected)
def test_csv_creator_date_parts_missing(self):
    """csv_creator must yield an empty pub_date when ``issued`` has no date-parts key."""
    item = {
        "DOI": "10.1234/no-dateparts",
        "type": "journal-article",
        "title": ["Article without date-parts key"],
        "issued": {}
    }
    processor = CrossrefProcessing(testing=True)
    # Guarantee storage removal even when the assertion fails.
    self.addCleanup(processor.storage_manager.delete_storage)
    row = processor.csv_creator(item)

    # Every field absent from the item is expected as an empty string.
    expected = {
        'id': 'doi:10.1234/no-dateparts',
        'title': 'Article without date-parts key',
        'author': '',
        'pub_date': '',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'journal article',
        'publisher': '',
        'editor': ''
    }
    self.assertEqual(row, expected)
def test_csv_creator_html_in_title(self):
    """csv_creator must strip the ``<i>``/``<b>`` markup from the title."""
    item = {
        "DOI": "10.1234/html-title",
        "type": "journal-article",
        "title": ["A study of <i>Escherichia coli</i> in <b>biofilms</b>"],
        "issued": {"date-parts": [[2024, 1, 15]]}
    }
    processor = CrossrefProcessing(testing=True)
    # Guarantee storage removal even when the assertion fails.
    self.addCleanup(processor.storage_manager.delete_storage)
    row = processor.csv_creator(item)

    expected = {
        'id': 'doi:10.1234/html-title',
        'title': 'A study of Escherichia coli in biofilms',
        'author': '',
        'pub_date': '2024-1-15',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'journal article',
        'publisher': '',
        'editor': ''
    }
    self.assertEqual(row, expected)
def test_csv_creator_with_editor(self):
    """csv_creator on an edited book that has both an author and an editor."""
    item = {
        "DOI": "10.1234/with-editor",
        "type": "edited-book",
        "title": ["Edited volume test"],
        "author": [{"given": "John", "family": "Doe", "sequence": "first"}],
        "editor": [{"given": "Jane", "family": "Smith", "sequence": "first"}],
        "issued": {"date-parts": [[2024, 6, 20]]}
    }
    processor = CrossrefProcessing(testing=True)
    # Guarantee storage removal even when the assertion fails.
    self.addCleanup(processor.storage_manager.delete_storage)
    row = processor.csv_creator(item)

    # Author and editor must land in their own, separate columns.
    expected = {
        'id': 'doi:10.1234/with-editor',
        'title': 'Edited volume test',
        'author': 'Doe, John',
        'pub_date': '2024-6-20',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'edited book',
        'publisher': '',
        'editor': 'Smith, Jane'
    }
    self.assertEqual(row, expected)
def test_csv_creator_no_inplace_modification(self):
    """csv_creator must leave the first author and editor dicts of the input item untouched."""
    item = {
        "DOI": "10.1234/with-editor",
        "type": "edited-book",
        "title": ["Edited volume test"],
        "author": [{"given": "John", "family": "Doe", "sequence": "first"}],
        "editor": [{"given": "Jane", "family": "Smith", "sequence": "first"}],
        "issued": {"date-parts": [[2024, 6, 20]]}
    }
    # Independent copies of the agent dicts to compare against afterwards.
    original_author = {"given": "John", "family": "Doe", "sequence": "first"}
    original_editor = {"given": "Jane", "family": "Smith", "sequence": "first"}

    processor = CrossrefProcessing(testing=True)
    # Guarantee storage removal even when an assertion fails.
    self.addCleanup(processor.storage_manager.delete_storage)
    processor.csv_creator(item)

    self.assertEqual(item['author'][0], original_author)
    self.assertEqual(item['editor'][0], original_editor)
def test_csv_creator_member_as_string(self):
    """csv_creator must handle a ``member`` value given as a string.

    The processor is built with the publishers mapping file; the expected
    publisher string is tagged with ``[crossref:10]``.
    """
    item = {
        "DOI": "10.1001/test.12345",
        "type": "journal-article",
        "title": ["Test"],
        "publisher": "American Medical Association (AMA)",
        "member": "10",
        "prefix": "10.1001",
        "issued": {"date-parts": [[2024]]}
    }
    processor = CrossrefProcessing(
        publishers_filepath=PUBLISHERS_MAPPING,
        testing=True
    )
    # Guarantee storage removal even when the assertion fails.
    self.addCleanup(processor.storage_manager.delete_storage)
    row = processor.csv_creator(item)

    expected = {
        'id': 'doi:10.1001/test.12345',
        'title': 'Test',
        'author': '',
        'pub_date': '2024',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'journal article',
        'publisher': 'American Medical Association (AMA) [crossref:10]',
        'editor': ''
    }
    self.assertEqual(row, expected)