Coverage for test / processing_oroci_test.py: 99%
879 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-12 21:23 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-12 21:23 +0000
1# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it>
2# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it>
3# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
4#
5# SPDX-License-Identifier: ISC
7import os
8import unittest
10from oc_ds_converter.lib.jsonmanager import *
11from oc_ds_converter.openaire.openaire_processing import OpenaireProcessing
12#
# All fixture and output locations used by these tests are built from BASE.
BASE = os.path.join('test', 'openaire_processing')
DATA = os.path.join(BASE, 'jSonFile_1.json')
DATA_DIR = BASE
TMP_SUPPORT_MATERIAL = os.path.join(BASE, "tmp_support")
OUTPUT = os.path.join(BASE, 'meta_input')
MULTIPROCESS_OUTPUT = os.path.join(BASE, 'multi_process_test')
# Derived from TMP_SUPPORT_MATERIAL (instead of a second hard-coded string) so the
# memo file location cannot drift from the support folder it lives in.
MEMO_JSON_PATH = os.path.join(TMP_SUPPORT_MATERIAL, "memo.json")
21SAMPLE_ENTITY = {'collectedFrom': [{'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::081b82f96300b6a6e3d282bad31cb6e2', 'schema': 'DNET Identifier'}], 'name': 'Crossref'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::8ac8380272269217cb09a928c8caa993', 'schema': 'DNET Identifier'}], 'name': 'UnpayWall'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::806360c771262b4d6770e7cdf04b5c5a', 'schema': 'DNET Identifier'}], 'name': 'ORCID'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a', 'schema': 'DNET Identifier'}], 'name': 'Microsoft Academic Graph'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::9e3be59865b2c1c335d32dae2fe7b254', 'schema': 'DNET Identifier'}], 'name': 'Datacite'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::6f4922f45568161a8cdf4ad2299f6d23', 'schema': 'DNET Identifier'}], 'name': 'arXiv.org e-Print Archive'}, 'provisionMode': 'collected'}], 'creator': [{'name': 'Matteo Serra'}, {'name': 'Salvatore Mignemi'}, {'identifiers': [{'identifier': '0000-0001-5595-7537', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-5595-7537'}], 'name': 'Mariano Cadoni'}], 'dnetIdentifier': '50|doi_dedup___::41074cd388749ccbdb6668caaf059f4a', 'identifier': [{'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi', 'url': 'https://doi.org/10.1103/physrevd.84.084046'}, {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'}, {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi', 'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'}, {'identifier': '1107.5979', 'schema': 'arXiv', 
'url': 'http://arxiv.org/abs/1107.5979'}], 'objectSubType': 'Article', 'objectType': 'publication', 'publicationDate': '2011-10-21', 'publisher': [{'name': 'American Physical Society (APS)'}], 'title': 'Exact solutions with AdS asymptotics of Einstein and Einstein-Maxwell gravity minimally coupled to a scalar field'}
22SAMPLE_ENT2 = {"identifier":"000017d2c913b28e09291b811ce3609a","linkprovider":[{"identifiers":[{"identifier":"10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73","schema":"DNET Identifier"}],"name":"CORE (RIOXX-UK Aggregator)"},{"identifiers":[{"identifier":"10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357","schema":"DNET Identifier"}],"name":"PubMed Central"},{"identifiers":[{"identifier":"10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c","schema":"DNET Identifier"}],"name":"Europe PubMed Central"},{"identifiers":[{"identifier":"10|opendoar____::229754d7799160502a143a72f6789927","schema":"DNET Identifier"}],"name":"Publications at Bielefeld University"}],"publicationDate":"2014-02-01","publisher":[{"name":"Springer Nature"}],"relationship":{"inverse":"IsCitedBy","name":"Cites","schema":"datacite"},"source":{"collectedFrom":[{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73","schema":"DNET Identifier"}],"name":"CORE (RIOXX-UK Aggregator)"},"provisionMode":"collected"},{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357","schema":"DNET Identifier"}],"name":"PubMed Central"},"provisionMode":"collected"},{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c","schema":"DNET Identifier"}],"name":"Europe PubMed Central"},"provisionMode":"collected"},{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|opendoar____::229754d7799160502a143a72f6789927","schema":"DNET Identifier"}],"name":"Publications at Bielefeld University"},"provisionMode":"collected"}],"creator":[{"identifiers":[{"identifier":"0000-0002-6491-0754","schema":"ORCID","url":"https://orcid.org/0000-0002-6491-0754"}],"name":"Sattler, Sebastian"},{"name":"Mehlkop, Guido"},{"name":"Graeff, 
Peter"},{"identifiers":[{"identifier":"0000-0002-8090-6886","schema":"ORCID","url":"https://orcid.org/0000-0002-8090-6886"}],"name":"Sauer, Carsten"}],"dnetIdentifier":"50|pmid_dedup__::8936076da7a86820c24ede7ca3ff15b3","identifier":[{"identifier":"PMC3928621","schema":"pmc","url":"http://europepmc.org/articles/PMC3928621"},{"identifier":"24484640","schema":"pmid"},{"identifier":"24484640","schema":"pmid","url":"https://pubmed.ncbi.nlm.nih.gov/24484640"},{"identifier":"PMC3928621","schema":"pmc"}],"objectSubType":"Article","objectType":"publication","publicationDate":"2014-02-01","publisher":[{"name":"Springer Nature"}],"title":"Evaluating the drivers of and obstacles to the willingness to use cognitive enhancement drugs: the influence of drug characteristics, social environment, and personal characteristics"},"target":{"collectedFrom":[{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2","schema":"DNET Identifier"}],"name":"Crossref"},"provisionMode":"collected"},{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a","schema":"DNET Identifier"}],"name":"Microsoft Academic Graph"},"provisionMode":"collected"}],"creator":[{"name":"Harold G. Grasmick"},{"name":"Robert J. Bursik"}],"dnetIdentifier":"50|doi_________::816648c63de74835ec2b0a753a68f037","identifier":[{"identifier":"10.2307/3053861","schema":"doi","url":"https://doi.org/10.2307/3053861"}],"objectSubType":"Article","objectType":"publication","publicationDate":"1990-01-01","publisher":[{"name":"JSTOR"}],"title":"Conscience, significant others, and rational choice: Extending the deterrence model."}}
23SAMPLE_ENTITY_FOR_CSV_CREATOR = {'collectedFrom': [{'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73', 'schema': 'DNET Identifier'}], 'name': 'CORE (RIOXX-UK Aggregator)'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357', 'schema': 'DNET Identifier'}], 'name': 'PubMed Central'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c', 'schema': 'DNET Identifier'}], 'name': 'Europe PubMed Central'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|driver______::bee53aa31dc2cbb538c10c2b65fa5824', 'schema': 'DNET Identifier'}], 'name': 'DOAJ-Articles'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::566a9968b43628588e76be5a85a0f9e8', 'schema': 'DNET Identifier'}], 'name': "King's Research Portal"}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::c2cdfa5866e03cdd07d313cbc8fb8311', 'schema': 'DNET Identifier'}], 'name': 'Multidisciplinary Digital Publishing Institute'}, 'provisionMode': 'collected'}], 'creator': [{'name': 'Smith, Lee'}, {'name': 'Sawyer, Alexia'}, {'name': 'Gardner, Benjamin'}, {'name': 'Seppala, Katri'}, {'name': 'Ucci, Marcella'}, {'name': 'Marmot, Alexi'}, {'name': 'Lally, Pippa'}, {'name': 'Fisher, Abi'}], 'dnetIdentifier': '50|pmid_dedup__::a1a8687c2378a0d68314566dec29dafb', 'objectSubType': 'Article', 'objectType': 'publication', 'publicationDate': '2018-06-09', 'publisher': [{'name': 'MDPI'}], 'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study', 'identifier': {'valid': 
[], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:29890726', 'valid': None}]}, "redis_validity_lists":[[],[]]}
26class TestOpenaireProcessing(unittest.TestCase):
def delete_storege(self, storage_type=None, specific_path=None):
    """Remove a storage file from disk, if it is there.

    When ``specific_path`` is truthy, only that path is considered.
    Otherwise the target lives under ``<cwd>/storage``: ``id_valid_dict.db``
    when ``storage_type == "sqlite"``, ``id_value.json`` for any other value.
    A target that does not exist is silently ignored.
    """
    if specific_path:
        target = specific_path
    elif storage_type == "sqlite":
        target = os.path.join(os.getcwd(), "storage", "id_valid_dict.db")
        if not os.path.exists(target):
            # Fall back to the same name carrying the "?mode=rw" suffix.
            target = target + "?mode=rw"
    else:
        target = os.path.join(os.getcwd(), "storage", "id_value.json")

    if os.path.exists(target):
        os.remove(target)
def test_get_all_ids(self):
    """extract_all_ids(SAMPLE_ENT2) returns the expected ids at index 0 and the expected ORCIDs at index 1."""
    processor = OpenaireProcessing()
    extracted = processor.extract_all_ids(SAMPLE_ENT2)

    expected_br_ids = ['pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861']
    expected_ra_ids = ['orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754']
    # Order is irrelevant: only the collected elements are compared.
    self.assertCountEqual(expected_br_ids, extracted[0])
    self.assertCountEqual(expected_ra_ids, extracted[1])

    processor.storage_manager.delete_storage()
def test_get_all_ids_redis(self):
    """Same check as test_get_all_ids, on an instance built with testing=True."""
    processor = OpenaireProcessing(testing=True)
    extracted = processor.extract_all_ids(SAMPLE_ENT2)

    expected_br_ids = ['pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861']
    expected_ra_ids = ['orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754']
    # Order is irrelevant: only the collected elements are compared.
    self.assertCountEqual(expected_br_ids, extracted[0])
    self.assertCountEqual(expected_ra_ids, extracted[1])

    processor.storage_manager.delete_storage()
def test_get_redis_validity_list(self):
    """get_redis_validity_list is expected to return [] for both the "br" and the "ra" id sets.

    This test adds nothing to the redis handles before querying them.
    """
    br_ids = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
    ra_ids = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    processor = OpenaireProcessing()
    found_br = processor.get_redis_validity_list(br_ids, "br")
    found_ra = processor.get_redis_validity_list(ra_ids, "ra")

    self.assertEqual(found_ra, [])
    self.assertEqual(found_br, [])

    processor.storage_manager.delete_storage()
def test_get_redis_validity_list_redis(self):
    """Same check as test_get_redis_validity_list, on an instance built with testing=True."""
    br_ids = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
    ra_ids = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    processor = OpenaireProcessing(testing=True)
    found_br = processor.get_redis_validity_list(br_ids, "br")
    found_ra = processor.get_redis_validity_list(ra_ids, "ra")

    self.assertEqual(found_ra, [])
    self.assertEqual(found_br, [])

    processor.storage_manager.delete_storage()
def test_get_reids_validity_dict_w_fakeredis_db_values_sqlite(self):
    """Only the ids previously added to BR_redis / RA_redis are returned by get_redis_validity_list.

    One pmid and one orcid are added to the redis handles; the remaining queried ids
    are expected to be absent from the returned lists.
    """
    opp = OpenaireProcessing()

    def _cleanup():
        # Same operations, in the same order, as the original end-of-test cleanup.
        opp.storage_manager.delete_storage()
        opp.BR_redis.delete('pmid:24484640')
        opp.BR_redis.delete('pmcid:PMC3928621')
        opp.RA_redis.delete('orcid:0000-0002-8090-6886')

    # Registered before seeding so the keys are removed even when an assertion fails.
    self.addCleanup(_cleanup)

    opp.BR_redis.sadd('pmid:24484640', "omid:1")
    opp.RA_redis.sadd('orcid:0000-0002-8090-6886', "omid:2")

    br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    br_valid_list = opp.get_redis_validity_list(br, "br")
    ra_valid_list = opp.get_redis_validity_list(ra, "ra")

    self.assertEqual(br_valid_list, ["pmid:24484640"])
    self.assertEqual(ra_valid_list, ['orcid:0000-0002-8090-6886'])
def test_get_reids_validity_dict_w_fakeredis_db_values_redis(self):
    """Only the ids previously added to BR_redis / RA_redis are returned by get_redis_validity_list.

    Variant run on an instance built with testing=True. One pmid and one orcid are
    added to the redis handles; the remaining queried ids are expected to be absent
    from the returned lists.
    """
    opp = OpenaireProcessing(testing=True)

    def _cleanup():
        # Same operations, in the same order, as the original end-of-test cleanup.
        opp.storage_manager.delete_storage()
        opp.BR_redis.delete('pmid:24484640')
        opp.BR_redis.delete('pmcid:PMC3928621')
        opp.RA_redis.delete('orcid:0000-0002-8090-6886')

    # Registered before seeding so the keys are removed even when an assertion fails.
    self.addCleanup(_cleanup)

    opp.BR_redis.sadd('pmid:24484640', "omid:1")
    opp.RA_redis.sadd('orcid:0000-0002-8090-6886', "omid:2")

    br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    br_valid_list = opp.get_redis_validity_list(br, "br")
    ra_valid_list = opp.get_redis_validity_list(ra, "ra")

    self.assertEqual(br_valid_list, ["pmid:24484640"])
    self.assertEqual(ra_valid_list, ['orcid:0000-0002-8090-6886'])
def test_validated_as_default(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and
    "identifier" (value: string of the identifier), the method "validated_as" returns
    None for an id that was not validated before.

    The procedure is tested on an OpenaireProcessing built with default arguments
    (original note: default storage manager, sqlite, without a pre-existent db).
    """
    opp = OpenaireProcessing()
    # Registered up front so the storage is deleted even when the assertion fails.
    self.addCleanup(lambda: opp.storage_manager.delete_storage())

    validate_as_none = opp.validated_as({"schema":"pmid", "identifier": "pmid:23483834"})
    self.assertIsNone(validate_as_none)
def test_validated_as_default_redis(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and
    "identifier" (value: string of the identifier), the method "validated_as" returns
    None for an id that was not validated before.

    The procedure is tested on an OpenaireProcessing built with testing=True
    (original note: redis storage manager without a pre-existent db).
    """
    opp = OpenaireProcessing(testing=True)
    # Registered up front so the storage is deleted even when the assertion fails.
    self.addCleanup(lambda: opp.storage_manager.delete_storage())

    validate_as_none = opp.validated_as({"schema":"pmid", "identifier": "pmid:23483834"})
    self.assertIsNone(validate_as_none)
def test_validated_as_redis_with_preexistent_data(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and
    "identifier" (value: string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before

    The procedure is tested on an instance built with testing=True, after storing
    validity values directly on the pmid manager's storage manager.
    """
    valid_pmid_not_in_db = {"identifier":"pmid:2938", "schema":"pmid"}
    valid_pmid_in_db = {"identifier":"pmid:23483834", "schema":"pmid"}
    invalid_pmid_in_db = {"identifier":"pmid:18328387372097", "schema":"pmid"}

    # New class instance and set values directly on the id managers' storage_manager
    opp_redis = OpenaireProcessing(testing=True)
    # Registered before any value is stored, so the stored values are removed
    # even when one of the assertions below fails.
    self.addCleanup(lambda: opp_redis.pmid_m.storage_manager.delete_storage())

    opp_redis.pmid_m.storage_manager.set_value(valid_pmid_in_db["identifier"], True)
    opp_redis.pmid_m.storage_manager.set_value(invalid_pmid_in_db["identifier"], False)
    validated_as_True = opp_redis.validated_as(valid_pmid_in_db)
    validated_as_False = opp_redis.validated_as(invalid_pmid_in_db)
    not_validated = opp_redis.validated_as(valid_pmid_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)
def test_validated_as_inmemory(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and
    "identifier" (value: string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before

    NOTE(review): the original docstring says this exercises the in-memory + JSON
    storage manager, but the instance is built with testing=True exactly like the
    redis tests in this class; confirm which storage manager is actually meant.
    """
    valid_pmid_not_in_db = {"identifier":"pmid:2938", "schema":"pmid"}
    valid_pmid_in_db = {"identifier":"pmid:23483834", "schema":"pmid"}
    invalid_pmid_in_db = {"identifier":"pmid:18328387372097", "schema":"pmid"}

    # New class instance and set values directly on the id managers' storage_manager
    opp_sql = OpenaireProcessing(testing=True)
    # Registered before any value is stored, so the stored values are removed
    # even when one of the assertions below fails.
    self.addCleanup(lambda: opp_sql.pmid_m.storage_manager.delete_storage())

    opp_sql.pmid_m.storage_manager.set_value(valid_pmid_in_db["identifier"], True)
    opp_sql.pmid_m.storage_manager.set_value(invalid_pmid_in_db["identifier"], False)
    validated_as_True = opp_sql.validated_as(valid_pmid_in_db)
    validated_as_False = opp_sql.validated_as(invalid_pmid_in_db)
    not_validated = opp_sql.validated_as(valid_pmid_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)
def test_validated_as_redis(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and
    "identifier" (value: string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before

    The procedure is tested on an instance built with testing=True, both for ids
    whose validity was stored beforehand and for an id that was never stored.
    """
    valid_pmid_not_in_db = {"identifier":"pmid:2938", "schema":"pmid"}
    valid_pmid_in_db = {"identifier":"pmid:23483834", "schema":"pmid"}
    invalid_pmid_in_db = {"identifier":"pmid:18328387372097", "schema":"pmid"}

    # New class instance and set values directly on the id managers' storage_manager
    opp_redis = OpenaireProcessing(testing=True)
    # Registered before any value is stored, so the stored values are removed
    # even when one of the assertions below fails.
    self.addCleanup(lambda: opp_redis.pmid_m.storage_manager.delete_storage())

    opp_redis.pmid_m.storage_manager.set_value(valid_pmid_in_db["identifier"], True)
    opp_redis.pmid_m.storage_manager.set_value(invalid_pmid_in_db["identifier"], False)
    validated_as_True = opp_redis.validated_as(valid_pmid_in_db)
    validated_as_False = opp_redis.validated_as(invalid_pmid_in_db)
    not_validated = opp_redis.validated_as(valid_pmid_not_in_db)

    self.assertEqual(validated_as_True, True)
    self.assertEqual(validated_as_False, False)
    self.assertIsNone(not_validated)
def test_get_id_manager(self):
    """Check that get_id_manager resolves a manager both from a bare schema string
    (e.g. 'pmid') and from a prefixed id (e.g. 'pmid:12334'), given the dictionary
    mapping schema strings to id managers, and that the returned manager validates
    the sample id of that schema. Note that each instance of the Preprocessing class
    needs its own instances of the id managers, in order to avoid conflicts while
    validating data."""
    processor = OpenaireProcessing()
    managers = processor._id_man_dict

    # (schema string, prefixed sample id) for every schema under test
    samples = (
        ("pmid", "pmid:12345"),
        ("doi", "doi:10.1103/physrevd.84.084046"),
        ("pmcid", "pmcid:PMC5555555"),
        ("arxiv", "arxiv:1509.08217"),
    )
    for schema, prefixed_id in samples:
        manager_from_id = processor.get_id_manager(prefixed_id, managers)
        manager_from_schema = processor.get_id_manager(schema, managers)
        # whichever way it was looked up, the manager must accept the sample id
        self.assertTrue(manager_from_id.is_valid(prefixed_id))
        self.assertTrue(manager_from_schema.is_valid(prefixed_id))

    processor.storage_manager.delete_storage()
def test_get_id_manager_redis(self):
    """Check, on an instance built with testing=True, that get_id_manager resolves a
    manager both from a bare schema string (e.g. 'pmid') and from a prefixed id
    (e.g. 'pmid:12334'), given the dictionary mapping schema strings to id managers,
    and that the returned manager validates the sample id of that schema. Note that
    each instance of the Preprocessing class needs its own instances of the id
    managers, in order to avoid conflicts while validating data."""
    processor = OpenaireProcessing(testing=True)
    managers = processor._id_man_dict

    # (schema string, prefixed sample id) for every schema under test
    samples = (
        ("pmid", "pmid:12345"),
        ("doi", "doi:10.1103/physrevd.84.084046"),
        ("pmcid", "pmcid:PMC5555555"),
        ("arxiv", "arxiv:1509.08217"),
    )
    for schema, prefixed_id in samples:
        manager_from_id = processor.get_id_manager(prefixed_id, managers)
        manager_from_schema = processor.get_id_manager(schema, managers)
        # whichever way it was looked up, the manager must accept the sample id
        self.assertTrue(manager_from_id.is_valid(prefixed_id))
        self.assertTrue(manager_from_schema.is_valid(prefixed_id))

    processor.storage_manager.delete_storage()
def test_normalise_any_id(self):
    """
    Check that, given an id with a prefix, any doi, pmid, pmcid and arxiv id is correctly normalised
    """
    processor = OpenaireProcessing()

    # (expected normalised id, raw input handed to normalise_any_id)
    cases = (
        ("pmid:12345", "pmid:12345" + "abc"),
        ("doi:10.1103/physrevd.84.084046", "doi:" + "10.1103/physrevd.84.084046".upper()),
        ("arxiv:1509.08217" + "v1", "arxiv:1509.08217".replace(".", "....")),
        ("pmcid:PMC5555555", "pmcid:PMC5555555" + " "),
    )
    for expected, raw in cases:
        self.assertEqual(expected, processor.normalise_any_id(raw))

    processor.storage_manager.delete_storage()
def test_normalise_any_id_redis(self):
    """
    Check, on an instance built with testing=True, that given an id with a prefix,
    any doi, pmid, pmcid and arxiv id is correctly normalised
    """
    processor = OpenaireProcessing(testing=True)

    # (expected normalised id, raw input handed to normalise_any_id)
    cases = (
        ("pmid:12345", "pmid:12345" + "abc"),
        ("doi:10.1103/physrevd.84.084046", "doi:" + "10.1103/physrevd.84.084046".upper()),
        ("arxiv:1509.08217" + "v1", "arxiv:1509.08217".replace(".", "....")),
        ("pmcid:PMC5555555", "pmcid:PMC5555555" + " "),
    )
    for expected, raw in cases:
        self.assertEqual(expected, processor.normalise_any_id(raw))

    processor.storage_manager.delete_storage()
def test_get_norm_ids(self):
    """
    Check that, given a list of dictionaries representing the ids of an entity, get_norm_ids
    returns a reduced version of the same list, containing only the normalised version of the
    ids of the schemas managed by opencitations. Each reduced dictionary only contains two
    key-value pairs, i.e.: "identifier" and "schema".
    """
    processor = OpenaireProcessing()

    # Case 1: the same doi appears twice (different casing), plus another doi and an arXiv id.
    ids_with_duplicates = [
        {'identifier': '10.1103/PHYSREVD.84.084046', 'schema': 'doi',
         'url': 'https://doi.org/10.1103/physrevd.84.084046'},
        {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi',
         'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'},
        {'identifier': '1107.5979', 'schema': 'arXiv', 'url': 'http://arxiv.org/abs/1107.5979'},
    ]
    got_1 = processor.get_norm_ids(ids_with_duplicates)
    want_1 = [
        {'identifier': 'doi:10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': 'doi:10.48550/arxiv.1107.5979', 'schema': 'doi'},
        {'identifier': 'arxiv:1107.5979v1', 'schema': 'arxiv'},
    ]

    # Case 2: a 'handle' id mixed with a pmid and a doi.
    ids_with_unmanaged_schema = [
        {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '21887584', 'schema': 'pmid', 'url': 'https://pubmed.ncbi.nlm.nih.gov/21887584'},
        {'identifier': '10.1007/s12160-011-9282-0', 'schema': 'doi', 'url': 'https://doi.org/10.1007/s12160-011-9282-0'},
    ]
    got_2 = processor.get_norm_ids(ids_with_unmanaged_schema)
    want_2 = [
        {'identifier': 'pmid:21887584', 'schema': 'pmid'},
        {'identifier': 'doi:10.1007/s12160-011-9282-0', 'schema': 'doi'},
    ]

    # Case 3: nothing is expected to survive (a 'handle' id and a doi that is not normalisable).
    ids_nothing_usable = [
        {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '20.ABC/s12160-011-9282-FAKEID', 'schema': 'doi', 'url': 'https://doi.org/10.1007/s12160-011-9282-0'},
    ]
    got_3 = processor.get_norm_ids(ids_nothing_usable)
    want_3 = []

    self.assertEqual(got_1, want_1)
    self.assertEqual(got_2, want_2)
    self.assertEqual(got_3, want_3)
    processor.storage_manager.delete_storage()
def test_get_norm_ids_redis(self):
    """
    Check, on an instance built with testing=True, that given a list of dictionaries
    representing the ids of an entity, get_norm_ids returns a reduced version of the same
    list, containing only the normalised version of the ids of the schemas managed by
    opencitations. Each reduced dictionary only contains two key-value pairs, i.e.:
    "identifier" and "schema".
    """
    processor = OpenaireProcessing(testing=True)

    # Case 1: the same doi appears twice (different casing), plus another doi and an arXiv id.
    ids_with_duplicates = [
        {'identifier': '10.1103/PHYSREVD.84.084046', 'schema': 'doi',
         'url': 'https://doi.org/10.1103/physrevd.84.084046'},
        {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi',
         'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'},
        {'identifier': '1107.5979', 'schema': 'arXiv', 'url': 'http://arxiv.org/abs/1107.5979'},
    ]
    got_1 = processor.get_norm_ids(ids_with_duplicates)
    want_1 = [
        {'identifier': 'doi:10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': 'doi:10.48550/arxiv.1107.5979', 'schema': 'doi'},
        {'identifier': 'arxiv:1107.5979v1', 'schema': 'arxiv'},
    ]

    # Case 2: a 'handle' id mixed with a pmid and a doi.
    ids_with_unmanaged_schema = [
        {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '21887584', 'schema': 'pmid', 'url': 'https://pubmed.ncbi.nlm.nih.gov/21887584'},
        {'identifier': '10.1007/s12160-011-9282-0', 'schema': 'doi', 'url': 'https://doi.org/10.1007/s12160-011-9282-0'},
    ]
    got_2 = processor.get_norm_ids(ids_with_unmanaged_schema)
    want_2 = [
        {'identifier': 'pmid:21887584', 'schema': 'pmid'},
        {'identifier': 'doi:10.1007/s12160-011-9282-0', 'schema': 'doi'},
    ]

    # Case 3: nothing is expected to survive (a 'handle' id and a doi that is not normalisable).
    ids_nothing_usable = [
        {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '20.ABC/s12160-011-9282-FAKEID', 'schema': 'doi', 'url': 'https://doi.org/10.1007/s12160-011-9282-0'},
    ]
    got_3 = processor.get_norm_ids(ids_nothing_usable)
    want_3 = []

    self.assertEqual(got_1, want_1)
    self.assertEqual(got_2, want_2)
    self.assertEqual(got_3, want_3)
    processor.storage_manager.delete_storage()
def test_dict_to_cache(self):
    """dict_to_cache creates the file at MEMO_JSON_PATH; delete_storege then removes it."""
    processor = OpenaireProcessing()
    payload = {"dict_type": "sample"}

    # Make sure no memo file from a previous run is present.
    memo_already_there = os.path.exists(MEMO_JSON_PATH)
    if memo_already_there:
        os.remove(MEMO_JSON_PATH)
    self.assertFalse(os.path.exists(MEMO_JSON_PATH))

    processor.dict_to_cache(payload, MEMO_JSON_PATH)
    self.assertTrue(os.path.exists(MEMO_JSON_PATH))

    self.delete_storege(specific_path=MEMO_JSON_PATH)
    self.assertFalse(os.path.exists(MEMO_JSON_PATH))

    processor.storage_manager.delete_storage()
def test_csv_creator_base(self):
    """
    Check that, given an updated openaire entity (i.e.: where the "identifier" field was
    modified after having checked the presence of the given identifiers in the storage
    memory), csv_creator returns the expected meta csv row for the entity.
    """
    processor = OpenaireProcessing()
    produced_row = processor.csv_creator(SAMPLE_ENTITY_FOR_CSV_CREATOR)

    # Fields expected to be empty, then the populated ones.
    wanted_row = dict.fromkeys(('venue', 'volume', 'issue', 'page', 'editor'), '')
    wanted_row.update({
        'id': 'pmid:29890726',
        'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study',
        'author': 'Smith Lee; Sawyer Alexia; Gardner Benjamin; Seppala Katri; Ucci Marcella; Marmot Alexi; Lally Pippa; Fisher Abi',
        'pub_date': '2018-06-09',
        'type': 'journal article',
        'publisher': 'MDPI',
    })
    self.assertEqual(produced_row, wanted_row)

    processor.storage_manager.delete_storage()
def test_csv_creator_base_redis(self):
    """
    Check, on an instance built with testing=True, that given an updated openaire entity
    (i.e.: where the "identifier" field was modified after having checked the presence of
    the given identifiers in the storage memory), csv_creator returns the expected meta
    csv row for the entity.
    """
    processor = OpenaireProcessing(testing=True)
    produced_row = processor.csv_creator(SAMPLE_ENTITY_FOR_CSV_CREATOR)

    # Fields expected to be empty, then the populated ones.
    wanted_row = dict.fromkeys(('venue', 'volume', 'issue', 'page', 'editor'), '')
    wanted_row.update({
        'id': 'pmid:29890726',
        'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study',
        'author': 'Smith Lee; Sawyer Alexia; Gardner Benjamin; Seppala Katri; Ucci Marcella; Marmot Alexi; Lally Pippa; Fisher Abi',
        'pub_date': '2018-06-09',
        'type': 'journal article',
        'publisher': 'MDPI',
    })
    self.assertEqual(produced_row, wanted_row)

    processor.storage_manager.delete_storage()
def test_csv_creator_not_accepted_id(self):
    """
    Check that, given an updated openaire entity with NO ids managed by opencitations
    (i.e.: a handle id), csv_creator returns an empty dict, i.e. no meta csv row is created.
    """
    import copy  # local import: only needed to isolate the shared fixture

    op = OpenaireProcessing()

    replaced_entity = {'schema': 'handle', 'identifier': 'handle:11245/1.357137', 'valid': None}
    # Deep copy on purpose: a shallow copy shares the nested "identifier" dict with
    # the module-level SAMPLE_ENTITY_FOR_CSV_CREATOR, so replacing "to_be_val" would
    # silently change the fixture for every test that runs afterwards.
    modified_entity = copy.deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
    modified_entity["identifier"]["to_be_val"] = [replaced_entity]

    csv_row = op.csv_creator(modified_entity)
    expected_row = {}  # because there is no ID accepted in opencitations for this entity
    self.assertEqual(csv_row, expected_row)

    op.storage_manager.delete_storage()
def test_csv_creator_not_accepted_id_redis(self):
    '''
    Check that, given an updated openaire entity whose only id to be validated uses a schema
    which is not managed by opencitations (here: a handle id), no meta csv row is created.
    Same scenario as the non-redis test, with the processor created with testing=True.
    '''
    import copy  # local import: only needed to clone the shared fixture

    op = OpenaireProcessing(testing=True)

    replaced_entity = {'schema': 'handle', 'identifier': 'handle:11245/1.357137', 'valid': None}
    # Deep copy on purpose: a shallow copy shares the nested "identifier" dict with the
    # module-level SAMPLE_ENTITY_FOR_CSV_CREATOR, so overwriting "to_be_val" below would
    # silently alter the fixture for every other test reading it.
    modified_entity = copy.deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
    modified_entity["identifier"]["to_be_val"] = [replaced_entity]
    csv_row = op.csv_creator(modified_entity)
    expected_row = {}  # because there is no ID accepted in opencitations for this entity
    self.assertEqual(csv_row, expected_row)

    op.storage_manager.delete_storage()
def test_csv_creator_invalid_id(self):
    '''
    Check that, given an updated openaire entity whose only id to be validated is a handle
    written without the "handle:" prefix (schema not managed by opencitations and value
    not normalised), no meta csv row is created.
    '''
    import copy  # local import: only needed to clone the shared fixture

    op = OpenaireProcessing()

    replaced_entity = {'schema': 'handle', 'identifier': '20.500.11820/fake', 'valid': None}
    # Deep copy on purpose: a shallow copy shares the nested "identifier" dict with the
    # module-level SAMPLE_ENTITY_FOR_CSV_CREATOR, so overwriting "to_be_val" below would
    # silently alter the fixture for every other test reading it.
    modified_entity = copy.deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
    modified_entity["identifier"]["to_be_val"] = [replaced_entity]
    csv_row = op.csv_creator(modified_entity)
    expected_row = {}  # because there is no ID accepted in opencitations for this entity
    self.assertEqual(csv_row, expected_row)

    op.storage_manager.delete_storage()
def test_csv_creator_invalid_id_redis(self):
    '''
    Check that, given an updated openaire entity whose only id to be validated is a handle
    written without the "handle:" prefix (schema not managed by opencitations and value
    not normalised), no meta csv row is created.
    Same scenario as the non-redis test, with the processor created with testing=True.
    '''
    import copy  # local import: only needed to clone the shared fixture

    op = OpenaireProcessing(testing=True)

    replaced_entity = {'schema': 'handle', 'identifier': '20.500.11820/fake', 'valid': None}
    # Deep copy on purpose: a shallow copy shares the nested "identifier" dict with the
    # module-level SAMPLE_ENTITY_FOR_CSV_CREATOR, so overwriting "to_be_val" below would
    # silently alter the fixture for every other test reading it.
    modified_entity = copy.deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
    modified_entity["identifier"]["to_be_val"] = [replaced_entity]
    csv_row = op.csv_creator(modified_entity)
    expected_row = {}  # because there is no ID accepted in opencitations for this entity
    self.assertEqual(csv_row, expected_row)

    op.storage_manager.delete_storage()
def test_get_publisher_name_base(self):
    '''
    get_publisher_name receives a list of dois and a dict with the publisher's data and
    must return the publisher's name as a string.

    Base scenario: the processor is created without any publishers mapping, so only the
    name found in the publisher dict is expected back (no crossref member appended).
    '''
    processor = OpenaireProcessing()

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}

    # first call: the doi list only holds an empty string
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_frontiers = processor.get_publisher_name(["10.3389/fnana.2012.00034"], frontiers)
    name_oup = processor.get_publisher_name(["10.2527/1995.7392834x"], oup)

    self.assertEqual(name_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_oup, "Oxford University Press (OUP)")

    # remove the storage created for this test
    processor.storage_manager.delete_storage()
def test_get_publisher_name_base_redis(self):
    '''
    get_publisher_name receives a list of dois and a dict with the publisher's data and
    must return the publisher's name as a string.

    Base scenario, processor created with testing=True and without any publishers
    mapping: only the name found in the publisher dict is expected back
    (no crossref member appended).
    '''
    processor = OpenaireProcessing(testing=True)

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}

    # first call: the doi list only holds an empty string
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_frontiers = processor.get_publisher_name(["10.3389/fnana.2012.00034"], frontiers)
    name_oup = processor.get_publisher_name(["10.2527/1995.7392834x"], oup)

    self.assertEqual(name_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_oup, "Oxford University Press (OUP)")

    # remove the storage created for this test
    processor.storage_manager.delete_storage()
def test_get_publisher_name_publishers_mapping(self):
    '''
    get_publisher_name receives a list of dois and a dict with the publisher's data and
    must return the publisher's name as a string (possibly followed by its crossref ID).

    Scenario with a publishers mapping file. The crossref member is meant to be added only if:
        - the doi prefix is a crossref doi prefix,
        - the prefix is present in the mapping,
        - the publisher name provided by the datasource corresponds to the one in the mapping.
    For the three inputs used here the plain publisher name is expected back,
    with no crossref member appended.
    '''
    processor = OpenaireProcessing(publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}

    # first call: the doi list only holds an empty string
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_frontiers = processor.get_publisher_name(["10.3389/fnana.2012.00034"], frontiers)
    name_oup = processor.get_publisher_name(["10.2527/1995.7392834x"], oup)

    self.assertEqual(name_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_oup, "Oxford University Press (OUP)")

    # remove the storage created for this test
    processor.storage_manager.delete_storage()
def test_get_publisher_name_publishers_mapping_redis(self):
    '''
    get_publisher_name receives a list of dois and a dict with the publisher's data and
    must return the publisher's name as a string (possibly followed by its crossref ID).

    Scenario with a publishers mapping file and a processor created with testing=True.
    The crossref member is meant to be added only if:
        - the doi prefix is a crossref doi prefix,
        - the prefix is present in the mapping,
        - the publisher name provided by the datasource corresponds to the one in the mapping.
    For the three inputs used here the plain publisher name is expected back,
    with no crossref member appended.
    '''
    processor = OpenaireProcessing(testing=True,publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}

    # first call: the doi list only holds an empty string
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_frontiers = processor.get_publisher_name(["10.3389/fnana.2012.00034"], frontiers)
    name_oup = processor.get_publisher_name(["10.2527/1995.7392834x"], oup)

    self.assertEqual(name_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_oup, "Oxford University Press (OUP)")

    # remove the storage created for this test
    processor.storage_manager.delete_storage()
def test_get_publisher_name_publishers_mapping_multi_dois(self):
    '''
    Check get_publisher_name on entities carrying more than one doi, when a
    prefix-to-publisher-data mapping file is provided.

    Expectations encoded by the cases below:
        - the crossref member is appended ("Name [crossref:N]") when the publisher name given
          by OPENAIRE corresponds to the name mapped to one of the entity's doi prefixes,
          whatever the position of that doi in the list;
        - a name which does not correspond is returned as it is, without crossref member;
        - a missing or empty publisher name yields an empty string.
    '''
    op = OpenaireProcessing(publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")

    # this prefix is in the mapping and corresponds to American Physiological Society
    mapped_doi = "10.1152/sample_doi"
    # second doi of the entity; CASE 3 implies its prefix does not trigger the match by itself
    other_doi = "10.1153/sample_doi"

    # CASE 1: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of
    # the entity's dois prefixes in the prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
    matching_name_output = op.get_publisher_name([mapped_doi, other_doi], {'name': 'American Physiological Society'})
    self.assertEqual(matching_name_output, "American Physiological Society [crossref:24]")

    # CASE 2: The Publisher Name provided by OPENAIRE does not correspond to the Publisher Name mapped to
    # one of the entity's dois prefixes in the prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: The publisher name provided by Openaire is retrieved without any crossref member
    other_name_output = op.get_publisher_name([mapped_doi, other_doi], {'name': 'Sample Publisher Name'})
    self.assertEqual(other_name_output, "Sample Publisher Name")

    # CASE 3: as CASE 1, BUT the doi with the mapped prefix is not the first doi of the list
    # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
    reordered_output = op.get_publisher_name([other_doi, mapped_doi], {'name': 'American Physiological Society'})
    self.assertEqual(reordered_output, "American Physiological Society [crossref:24]")

    # CASE 4: OPENAIRE does not provide a publisher name but one of the entity's DOI prefixes is in the
    # prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: empty string
    empty_name_output = op.get_publisher_name([mapped_doi, other_doi], {'name': ''})
    empty_dict_output = op.get_publisher_name([mapped_doi, other_doi], {})
    empty_string_output = op.get_publisher_name([mapped_doi, other_doi], '')

    self.assertEqual(empty_name_output, "")
    self.assertEqual(empty_dict_output, "")
    self.assertEqual(empty_string_output, "")

    # The storage is deleted only once, after the last use of `op`: deleting it halfway and
    # then going on with the same instance is what the storage notes of this file warn against.
    op.storage_manager.delete_storage()
def test_get_publisher_name_publishers_mapping_multi_dois_redis(self):
    '''
    Check get_publisher_name on entities carrying more than one doi, when a
    prefix-to-publisher-data mapping file is provided and the processor is created
    with testing=True.

    Expectations encoded by the cases below:
        - the crossref member is appended ("Name [crossref:N]") when the publisher name given
          by OPENAIRE corresponds to the name mapped to one of the entity's doi prefixes,
          whatever the position of that doi in the list;
        - a name which does not correspond is returned as it is, without crossref member;
        - a missing or empty publisher name yields an empty string.
    '''
    op = OpenaireProcessing(testing=True, publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")

    # this prefix is in the mapping and corresponds to American Physiological Society
    mapped_doi = "10.1152/sample_doi"
    # second doi of the entity; CASE 3 implies its prefix does not trigger the match by itself
    other_doi = "10.1153/sample_doi"

    # CASE 1: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of
    # the entity's dois prefixes in the prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
    matching_name_output = op.get_publisher_name([mapped_doi, other_doi], {'name': 'American Physiological Society'})
    self.assertEqual(matching_name_output, "American Physiological Society [crossref:24]")

    # CASE 2: The Publisher Name provided by OPENAIRE does not correspond to the Publisher Name mapped to
    # one of the entity's dois prefixes in the prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: The publisher name provided by Openaire is retrieved without any crossref member
    other_name_output = op.get_publisher_name([mapped_doi, other_doi], {'name': 'Sample Publisher Name'})
    self.assertEqual(other_name_output, "Sample Publisher Name")

    # CASE 3: as CASE 1, BUT the doi with the mapped prefix is not the first doi of the list
    # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
    reordered_output = op.get_publisher_name([other_doi, mapped_doi], {'name': 'American Physiological Society'})
    self.assertEqual(reordered_output, "American Physiological Society [crossref:24]")

    # CASE 4: OPENAIRE does not provide a publisher name but one of the entity's DOI prefixes is in the
    # prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: empty string
    empty_name_output = op.get_publisher_name([mapped_doi, other_doi], {'name': ''})
    empty_dict_output = op.get_publisher_name([mapped_doi, other_doi], {})
    empty_string_output = op.get_publisher_name([mapped_doi, other_doi], '')

    self.assertEqual(empty_name_output, "")
    self.assertEqual(empty_dict_output, "")
    self.assertEqual(empty_string_output, "")

    # The storage is deleted only once, after the last use of `op`: deleting it halfway and
    # then going on with the same instance is what the storage notes of this file warn against.
    op.storage_manager.delete_storage()
def test_manage_arxiv_single_id(self):
    '''
    Exercise manage_arxiv_single_id on id lists holding exactly one id dict.

    Cases covered:
        - a doi which is not an arxiv doi: the list is returned unchanged;
        - an arxiv doi: it is replaced by the correspondent arxiv id, version v1
          (arxiv dois never carry a version);
        - an arxiv id without version: it is returned as version v1;
        - an arxiv id with version (written with stray dots): it is returned normalised,
          keeping its version.
    The arxiv dicts expected back do not carry the 'valid' key.
    '''
    processor = OpenaireProcessing()

    not_arxiv_doi = [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}]
    arxiv_doi = [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]
    arxiv_unversioned = [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217', 'valid': None}]
    arxiv_versioned = [{'schema': 'arxiv', 'identifier': 'arxiv:1509...08217v3', 'valid': None}]

    # CASE 1: not-arxiv doi -> the input list is returned
    self.assertEqual(
        processor.manage_arxiv_single_id(not_arxiv_doi),
        [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}],
    )

    # CASE 2: arxiv doi -> replaced with its correspondent arxiv v1
    self.assertEqual(
        processor.manage_arxiv_single_id(arxiv_doi),
        [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}],
    )

    # CASE 3: arxiv id without version -> replaced with its v1
    self.assertEqual(
        processor.manage_arxiv_single_id(arxiv_unversioned),
        [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}],
    )

    # CASE 4: arxiv id with version -> the id is just normalised
    self.assertEqual(
        processor.manage_arxiv_single_id(arxiv_versioned),
        [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v3'}],
    )

    # remove the storage created for this test
    processor.storage_manager.delete_storage()
def test_manage_arxiv_single_id_redis(self):
    '''
    Exercise manage_arxiv_single_id on id lists holding exactly one id dict,
    with the processor created with testing=True.

    Cases covered:
        - a doi which is not an arxiv doi: the list is returned unchanged;
        - an arxiv doi: it is replaced by the correspondent arxiv id, version v1
          (arxiv dois never carry a version);
        - an arxiv id without version: it is returned as version v1;
        - an arxiv id with version (written with stray dots): it is returned normalised,
          keeping its version.
    The arxiv dicts expected back do not carry the 'valid' key.
    '''
    processor = OpenaireProcessing(testing=True)

    not_arxiv_doi = [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}]
    arxiv_doi = [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]
    arxiv_unversioned = [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217', 'valid': None}]
    arxiv_versioned = [{'schema': 'arxiv', 'identifier': 'arxiv:1509...08217v3', 'valid': None}]

    # CASE 1: not-arxiv doi -> the input list is returned
    self.assertEqual(
        processor.manage_arxiv_single_id(not_arxiv_doi),
        [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}],
    )

    # CASE 2: arxiv doi -> replaced with its correspondent arxiv v1
    self.assertEqual(
        processor.manage_arxiv_single_id(arxiv_doi),
        [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}],
    )

    # CASE 3: arxiv id without version -> replaced with its v1
    self.assertEqual(
        processor.manage_arxiv_single_id(arxiv_unversioned),
        [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}],
    )

    # CASE 4: arxiv id with version -> the id is just normalised
    self.assertEqual(
        processor.manage_arxiv_single_id(arxiv_versioned),
        [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v3'}],
    )

    # remove the storage created for this test
    processor.storage_manager.delete_storage()
def test_manage_doi_prefixes_priorities(self):
    '''
    Check which id manage_doi_prefixes_priorities keeps when an entity only has dois
    with "critical" prefixes (figshare 10.6084, arxiv 10.48550, zenodo 10.5281, ...).

    Every case builds its own input list and compares the output with an independent
    expected literal, so that an in-place modification of the input cannot make a
    check pass by accident.
    '''
    def _doi(identifier):
        # id dict as it is stored among the ids still to be validated
        return {'schema': 'doi', 'identifier': identifier, 'valid': None}

    op = OpenaireProcessing()

    # CASE1: 1 figshare doi (priority 1) with version --> returned as it is
    out_1 = op.manage_doi_prefixes_priorities([_doi('doi:10.6084/1234.1234v3')])
    self.assertEqual(out_1, [_doi('doi:10.6084/1234.1234v3')])

    # CASE2: 1 figshare doi (priority 1) without version --> returned with version v1
    out_2 = op.manage_doi_prefixes_priorities([_doi('doi:10.6084/1234.1234')])
    self.assertEqual([_doi('doi:10.6084/1234.1234v1')], out_2)

    # CASE3: 1 arxiv doi (always without version) --> returned as correspondent arxiv id version v1
    out_3 = op.manage_doi_prefixes_priorities([_doi('doi:10.48550/1234.1234')])
    self.assertEqual([{'identifier': 'arxiv:1234.1234v1', 'schema': 'arxiv'}], out_3)

    # CASE4: >1 arxiv or figshare dois and at least one has version --> return the one(s) with version
    out_4 = op.manage_doi_prefixes_priorities([_doi('doi:10.48550/1234.1234'), _doi('doi:10.6084/5678v3')])
    self.assertEqual([_doi('doi:10.6084/5678v3')], out_4)

    # CASE5: >1 arxiv or figshare dois and none has version --> return, as first choice,
    # the arxiv version v1 of the first arxiv doi encountered
    out_5 = op.manage_doi_prefixes_priorities([_doi('doi:10.6084/5678'), _doi('doi:10.48550/1234.1234')])
    self.assertEqual([{'identifier': 'arxiv:1234.1234v1', 'schema': 'arxiv'}], out_5)

    # CASE6: >1 figshare dois and none has version --> return version v1 of the first figshare doi encountered
    out_6 = op.manage_doi_prefixes_priorities([_doi('doi:10.6084/5678'), _doi('doi:10.6084/1234')])
    self.assertEqual([_doi('doi:10.6084/5678v1')], out_6)

    # CASE7: more than one zenodo doi --> return the one with the highest number: it is the last one
    # assigned and thus it is a version doi and not the collector doi (which is the first one to be
    # assigned when a publication is uploaded on zenodo). Checked both without and with "doi:" prefix.
    out_7 = op.manage_doi_prefixes_priorities([_doi('10.5281/zenodo.111'), _doi('10.5281/zenodo.112')])
    out_7_1 = op.manage_doi_prefixes_priorities([_doi('doi:10.5281/zenodo.111'), _doi('doi:10.5281/zenodo.112')])
    self.assertEqual([_doi('10.5281/zenodo.112')], out_7)
    self.assertEqual([_doi('doi:10.5281/zenodo.112')], out_7_1)

    # CASE8: None of the previous cases: return the first syntactically valid DOI with highest priority prefix
    es_8 = [
        _doi('doi:10.5281/zenodo.111'),
        _doi('doi:10.1184/abc'),
        _doi('doi:10.25384/efg'),
    ]
    out_8 = op.manage_doi_prefixes_priorities(es_8)
    self.assertEqual([_doi('doi:10.1184/abc')], out_8)

    # CASE8_1: first syntactically valid DOI with highest priority prefix is returned
    # (here the zenodo doi is written without the "doi:" prefix)
    es_8_1 = [
        _doi('10.5281/zenodo.4725899'),
        _doi('doi:10.1184/abc'),
        _doi('doi:10.25384/efg'),
    ]
    out_8_1 = op.manage_doi_prefixes_priorities(es_8_1)
    self.assertEqual([_doi('doi:10.1184/abc')], out_8_1)

    # CASE8_2: more valid ids among the ones with a max priority prefix --> return the first one encountered
    es_8_2 = [
        _doi('doi:10.5281/zenodo.4725899'),
        _doi('doi:10.1184/R1/12841247.v1'),
        _doi('doi:10.25384/sage.c.4112909'),
    ]
    out_8_2 = op.manage_doi_prefixes_priorities(es_8_2)
    self.assertEqual([_doi('doi:10.1184/R1/12841247.v1')], out_8_2)

    op.storage_manager.delete_storage()
def test_manage_doi_prefixes_priorities_redis(self):
    '''
    Check which id manage_doi_prefixes_priorities keeps when an entity only has dois
    with "critical" prefixes (figshare 10.6084, arxiv 10.48550, zenodo 10.5281, ...),
    with the processor created with testing=True.

    Every case builds its own input list and compares the output with an independent
    expected literal, so that an in-place modification of the input cannot make a
    check pass by accident.
    '''
    def _doi(identifier):
        # id dict as it is stored among the ids still to be validated
        return {'schema': 'doi', 'identifier': identifier, 'valid': None}

    op = OpenaireProcessing(testing=True)

    # CASE1: 1 figshare doi (priority 1) with version --> returned as it is
    out_1 = op.manage_doi_prefixes_priorities([_doi('doi:10.6084/1234.1234v3')])
    self.assertEqual(out_1, [_doi('doi:10.6084/1234.1234v3')])

    # CASE2: 1 figshare doi (priority 1) without version --> returned with version v1
    out_2 = op.manage_doi_prefixes_priorities([_doi('doi:10.6084/1234.1234')])
    self.assertEqual([_doi('doi:10.6084/1234.1234v1')], out_2)

    # CASE3: 1 arxiv doi (always without version) --> returned as correspondent arxiv id version v1
    out_3 = op.manage_doi_prefixes_priorities([_doi('doi:10.48550/1234.1234')])
    self.assertEqual([{'identifier': 'arxiv:1234.1234v1', 'schema': 'arxiv'}], out_3)

    # CASE4: >1 arxiv or figshare dois and at least one has version --> return the one(s) with version
    out_4 = op.manage_doi_prefixes_priorities([_doi('doi:10.48550/1234.1234'), _doi('doi:10.6084/5678v3')])
    self.assertEqual([_doi('doi:10.6084/5678v3')], out_4)

    # CASE5: >1 arxiv or figshare dois and none has version --> return, as first choice,
    # the arxiv version v1 of the first arxiv doi encountered
    out_5 = op.manage_doi_prefixes_priorities([_doi('doi:10.6084/5678'), _doi('doi:10.48550/1234.1234')])
    self.assertEqual([{'identifier': 'arxiv:1234.1234v1', 'schema': 'arxiv'}], out_5)

    # CASE6: >1 figshare dois and none has version --> return version v1 of the first figshare doi encountered
    out_6 = op.manage_doi_prefixes_priorities([_doi('doi:10.6084/5678'), _doi('doi:10.6084/1234')])
    self.assertEqual([_doi('doi:10.6084/5678v1')], out_6)

    # CASE7: more than one zenodo doi --> return the one with the highest number: it is the last one
    # assigned and thus it is a version doi and not the collector doi (which is the first one to be
    # assigned when a publication is uploaded on zenodo). Checked both without and with "doi:" prefix.
    out_7 = op.manage_doi_prefixes_priorities([_doi('10.5281/zenodo.111'), _doi('10.5281/zenodo.112')])
    out_7_1 = op.manage_doi_prefixes_priorities([_doi('doi:10.5281/zenodo.111'), _doi('doi:10.5281/zenodo.112')])
    self.assertEqual([_doi('10.5281/zenodo.112')], out_7)
    self.assertEqual([_doi('doi:10.5281/zenodo.112')], out_7_1)

    # CASE8: None of the previous cases: return the first syntactically valid DOI with highest priority prefix
    es_8 = [
        _doi('doi:10.5281/zenodo.111'),
        _doi('doi:10.1184/abc'),
        _doi('doi:10.25384/efg'),
    ]
    out_8 = op.manage_doi_prefixes_priorities(es_8)
    self.assertEqual([_doi('doi:10.1184/abc')], out_8)

    # CASE8_1: first syntactically valid DOI with highest priority prefix is returned
    # (here the zenodo doi is written without the "doi:" prefix)
    es_8_1 = [
        _doi('10.5281/zenodo.4725899'),
        _doi('doi:10.1184/abc'),
        _doi('doi:10.25384/efg'),
    ]
    out_8_1 = op.manage_doi_prefixes_priorities(es_8_1)
    self.assertEqual([_doi('doi:10.1184/abc')], out_8_1)

    # CASE8_2: more valid ids among the ones with a max priority prefix --> return the first one encountered
    es_8_2 = [
        _doi('doi:10.5281/zenodo.4725899'),
        _doi('doi:10.1184/R1/12841247.v1'),
        _doi('doi:10.25384/sage.c.4112909'),
    ]
    out_8_2 = op.manage_doi_prefixes_priorities(es_8_2)
    self.assertEqual([_doi('doi:10.1184/R1/12841247.v1')], out_8_2)

    op.storage_manager.delete_storage()
1090 def test_to_validated_id_list(self):
1091 # NOTE: in tests using the sqlite storage method it must be avoided to delete the storage
1092 # while using the same OpenaireProcessing() instance, otherwise the process would try to
1093 # store data in a filepath that has just been deleted, with no new connection created after it.
1095 # 2 OPTIONS: 1) instantiate OpenaireProcessing only once at the beginning and delete the
1096 # storage only at the end; 2) create a new OpenaireProcessing instance at every check and
1097 # delete the storage each time after the check is done.
1099 op = OpenaireProcessing()
1100 # CASE1_1: No already validated ids + 1 id to be validated, which is valid
1101 inp_1 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None}]}
1102 out_1 = op.to_validated_id_list(inp_1)
1103 exp_1 = ['pmid:20662931']
1104 self.assertEqual(out_1, exp_1)
1105 op.storage_manager.delete_storage()
1107 op = OpenaireProcessing()
1108 # CASE1_2: No already validated ids + 1 id to be validated, which is invalid
1109 inp_2 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:abc', 'valid': None}]}
1110 out_2 = op.to_validated_id_list(inp_2)
1111 exp_2 = []
1112 self.assertEqual(out_2, exp_2)
1114 op = OpenaireProcessing()
1115 # CASE1_3: No already validated ids + 1 id to be validated, which is a valid arxiv doi
1116 inp_3 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]}
1117 out_3 = op.to_validated_id_list(inp_3)
1118 exp_3 = ['arxiv:1509.08217v1']
1119 self.assertEqual(out_3, exp_3)
1120 op.storage_manager.delete_storage()
1123 op = OpenaireProcessing()
1124 # CASE1_4: No already validated ids + 1 id to be validated, which hasn't a valid schema
1125 inp_4 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': "0", 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]}
1126 out_4 = op.to_validated_id_list(inp_4)
1127 exp_4 = []
1128 self.assertEqual(out_4, exp_4)
1129 op.storage_manager.delete_storage()
1131 op = OpenaireProcessing()
1132 # CASE1_5: No already validated ids + 1 id to be validated, which is not valid
1133 inp_5 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': "doi", 'identifier': 'doi:INVALID/fake', 'valid': None}]}
1134 out_5 = op.to_validated_id_list(inp_5)
1135 exp_5 = []
1136 self.assertEqual(out_5, exp_5)
1137 op.storage_manager.delete_storage()
1139 op = OpenaireProcessing()
1140 # CASE1_9: No already validated ids + 1 id to be validated, which is a valid PMC
1141 inp_9 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': "pmcid", 'identifier': 'pmcid:PMC2873764', 'valid': None}]}
1142 out_9 = op.to_validated_id_list(inp_9)
1143 exp_9 = ['pmcid:PMC2873764']
1144 self.assertEqual(out_9, exp_9)
1145 op.storage_manager.delete_storage()
1147 op = OpenaireProcessing()
1148 # CASE2_1: No already validated ids + >1 id to be validated, both valid and with accepted schemas
1149 inp_6 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
1150 {'schema': 'doi', 'identifier': 'doi:10.1007/s12160-011-9282-0', 'valid': None}]}
1151 out_6 = op.to_validated_id_list(inp_6)
1152 exp_6 = ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0']
1153 self.assertCountEqual(out_6, exp_6) #Test that sequence first contains the same elements as second, regardless of their order
1154 op.storage_manager.delete_storage()
1156 op = OpenaireProcessing()
1157 # CASE2_2: No already validated ids + >1 id to be validated, both valid, one of the two is an arxiv id
1158 inp_8 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
1159 {'schema': 'arxiv', 'identifier': 'arxiv:1107.5979', 'valid': None}]}
1160 out_8 = op.to_validated_id_list(inp_8)
1161 exp_8 = ['pmid:20662931']
1162 self.assertEqual(out_8, exp_8)
1163 op.storage_manager.delete_storage()
1165 op = OpenaireProcessing()
1166 # CASE2_3: No already validated ids + >1 id to be validated, both valid, one of the two is an arxiv doi
1167 inp_7 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None}, {'schema': "doi", 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]}
1168 out_7 = op.to_validated_id_list(inp_7)
1169 exp_7 = ['pmid:20662931']
1170 self.assertEqual(out_7, exp_7)
1171 op.storage_manager.delete_storage()
1173 op = OpenaireProcessing()
1174 # CASE2_4: No already validated ids + >1 id to be validated, both valid, one of the two is a PMC
1175 inp_10 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
1176 {'schema': "pmcid", 'identifier': 'pmcid:PMC2873764', 'valid': None}]}
1177 out_10 = op.to_validated_id_list(inp_10)
1178 exp_10 = ['pmid:20662931']
1179 self.assertEqual(out_10, exp_10)
1180 op.storage_manager.delete_storage()
1182 op = OpenaireProcessing()
1183 # CASE2_5: No already validated ids + >1 id to be validated, 1 valid pmid, 1 valid doi, 1 valid doi with a "critic" prefix
1184 # for opencitations entities management
1186 inp_11 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
1187 {'schema': 'doi', 'identifier': 'doi:10.1007/s12160-011-9282-0', 'valid': None},
1188 {'schema': 'doi',
1189 'identifier': 'doi:10.48550/arXiv.1509.08217',
1190 'valid': None}
1191 ]}
1192 out_11 = op.to_validated_id_list(inp_11)
1193 exp_11 = ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0']
1194 self.assertCountEqual(out_11, exp_11) #Test that sequence first contains the same elements as second, regardless of their order
1195 op.storage_manager.delete_storage()
1197 op = OpenaireProcessing()
1198 # CASE2_6: No already validated ids + >1 id to be validated, one doi with a "critic" prefix and a PMCID
1199 # for opencitations entities management
1201 inp_12 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmcid', 'identifier': 'pmcid:PMC5555555', 'valid': None},
1202 {'schema': 'doi',
1203 'identifier': 'doi:10.48550/arXiv.1509.08217',
1204 'valid': None}
1205 ]}
1206 out_12 = op.to_validated_id_list(inp_12)
1207 exp_12 = ['pmcid:PMC5555555']
1208 self.assertEqual(out_12, exp_12)
1209 op.storage_manager.delete_storage()
1211 op = OpenaireProcessing()
1212 # CASE2_7: no already validated ids + >1 id to be validated, one doi with a "critic" prefix for opencitations
1213 # ingestion workflow and an ARXIV
1215 inp_13 = {'valid': [], 'not_valid': [], 'to_be_val': [
1216 {'schema': 'arxiv', 'identifier': 'arxiv:1107.5979v1', 'valid': None},
1217 {'schema': 'doi', 'identifier': 'doi:10.1184/R1/12841247.v1', 'valid': None}
1218 ]}
1219 out_13 = op.to_validated_id_list(inp_13)
1220 exp_13 = ['arxiv:1107.5979v1']
1221 self.assertEqual(out_13, exp_13)
1222 op.storage_manager.delete_storage()
1224 op = OpenaireProcessing()
1225 # CASE2_8: no already validated ids and more dois with "critic" prefixes for opencitations
1226 # ingestion workflow
1228 inp_14 = {'valid': [], 'not_valid': [], 'to_be_val': [
1229 {'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.4725899', 'valid': None},
1230 {'schema': 'doi', 'identifier': 'doi:10.1184/r1/12841247.v1', 'valid': None}
1231 ]}
1232 out_14 = op.to_validated_id_list(inp_14)
1233 exp_14 = ['doi:10.1184/r1/12841247.v1']
1234 self.assertEqual(out_14, exp_14)
1235 op.storage_manager.delete_storage()
1237 op = OpenaireProcessing()
1238 # CASE3: an already validated id and more dois with "critic" prefixes for opencitations
1239 # ingestion workflow
1241 inp_15 = {'valid': [], 'not_valid': [], 'to_be_val': [
1242 {'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.4725899', 'valid': None},
1243 {'schema': 'doi', 'identifier': 'doi:10.1184/r1/12841247.v1', 'valid': None},
1244 {'schema': 'doi', 'identifier': 'doi:10.7557/5.5607', 'valid': None},
1245 {}
1246 ]}
1247 out_15 = op.to_validated_id_list(inp_15)
1248 exp_15 = ['doi:10.7557/5.5607']
1249 self.assertEqual(out_15, exp_15)
1250 op.storage_manager.delete_storage()
def test_to_validated_id_list_redis(self):
    """Check ``to_validated_id_list`` on ``OpenaireProcessing(testing=True)``.

    Every case builds a fresh ``OpenaireProcessing`` instance, validates one
    input dictionary and deletes the storage afterwards. The deletion is done
    in a ``finally`` clause so that a failing case does not leave its storage
    behind for the following ones.
    """
    # NOTE: in tests using the sqlite storage method, the storage must not be
    # deleted while the same OpenaireProcessing() instance is still in use,
    # otherwise the process would try to store data in a filepath that has just
    # been deleted, with no new connection created after it.
    # 2 OPTIONS: 1) instantiate OpenaireProcessing only once at the beginning and
    # delete the storage only at the end; 2) create a new OpenaireProcessing
    # instance at every check and delete the storage each time after the check
    # is done. This test follows option 2.

    def id_entry(schema, identifier):
        # One identifier still waiting for validation.
        return {'schema': schema, 'identifier': identifier, 'valid': None}

    def id_dict(*entries):
        # Input with no already validated / already rejected identifiers.
        return {'valid': [], 'not_valid': [], 'to_be_val': list(entries)}

    # (label, input, expected output, True when the output order is not relevant)
    cases = [
        ("CASE1_1: 1 id to be validated, which is valid",
         id_dict(id_entry('pmid', 'pmid:20662931')),
         ['pmid:20662931'], False),
        ("CASE1_2: 1 id to be validated, which is invalid",
         id_dict(id_entry('pmid', 'pmid:abc')),
         [], False),
        ("CASE1_3: 1 id to be validated, which is a valid arxiv doi",
         id_dict(id_entry('doi', 'doi:10.48550/arXiv.1509.08217')),
         ['arxiv:1509.08217v1'], False),
        ("CASE1_4: 1 id to be validated, which hasn't a valid schema",
         id_dict(id_entry("0", 'doi:10.48550/arXiv.1509.08217')),
         [], False),
        ("CASE1_5: 1 id to be validated, which is not valid",
         id_dict(id_entry("doi", 'doi:INVALID/fake')),
         [], False),
        ("CASE1_9: 1 id to be validated, which is a valid PMC",
         id_dict(id_entry("pmcid", 'pmcid:PMC2873764')),
         ['pmcid:PMC2873764'], False),
        ("CASE2_1: >1 id to be validated, both valid and with accepted schemas",
         id_dict(id_entry('pmid', 'pmid:20662931'),
                 id_entry('doi', 'doi:10.1007/s12160-011-9282-0')),
         ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'], True),
        ("CASE2_2: >1 id to be validated, both valid, one of the two is an arxiv id",
         id_dict(id_entry('pmid', 'pmid:20662931'),
                 id_entry('arxiv', 'arxiv:1107.5979')),
         ['pmid:20662931'], False),
        ("CASE2_3: >1 id to be validated, both valid, one of the two is an arxiv doi",
         id_dict(id_entry('pmid', 'pmid:20662931'),
                 id_entry("doi", 'doi:10.48550/arXiv.1509.08217')),
         ['pmid:20662931'], False),
        ("CASE2_4: >1 id to be validated, both valid, one of the two is a PMC",
         id_dict(id_entry('pmid', 'pmid:20662931'),
                 id_entry("pmcid", 'pmcid:PMC2873764')),
         ['pmid:20662931'], False),
        ("CASE2_5: 1 valid pmid, 1 valid doi, 1 valid doi with a \"critic\" prefix "
         "for opencitations entities management",
         id_dict(id_entry('pmid', 'pmid:20662931'),
                 id_entry('doi', 'doi:10.1007/s12160-011-9282-0'),
                 id_entry('doi', 'doi:10.48550/arXiv.1509.08217')),
         ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'], True),
        ("CASE2_6: one doi with a \"critic\" prefix for opencitations entities "
         "management and a PMCID",
         id_dict(id_entry('pmcid', 'pmcid:PMC5555555'),
                 id_entry('doi', 'doi:10.48550/arXiv.1509.08217')),
         ['pmcid:PMC5555555'], False),
        ("CASE2_7: one doi with a \"critic\" prefix for opencitations ingestion "
         "workflow and an ARXIV",
         id_dict(id_entry('arxiv', 'arxiv:1107.5979v1'),
                 id_entry('doi', 'doi:10.1184/R1/12841247.v1')),
         ['arxiv:1107.5979v1'], False),
        ("CASE2_8: more dois with \"critic\" prefixes for opencitations ingestion "
         "workflow",
         id_dict(id_entry('doi', 'doi:10.5281/zenodo.4725899'),
                 id_entry('doi', 'doi:10.1184/r1/12841247.v1')),
         ['doi:10.1184/r1/12841247.v1'], False),
        # The input below has no already validated id ('valid' is empty): it mixes
        # two dois with "critic" prefixes, one further doi and an empty entry.
        ("CASE3: dois with \"critic\" prefixes, one further doi and an empty entry",
         id_dict(id_entry('doi', 'doi:10.5281/zenodo.4725899'),
                 id_entry('doi', 'doi:10.1184/r1/12841247.v1'),
                 id_entry('doi', 'doi:10.7557/5.5607'),
                 {}),
         ['doi:10.7557/5.5607'], False),
    ]

    for label, inp, exp, any_order in cases:
        op = OpenaireProcessing(testing=True)
        try:
            out = op.to_validated_id_list(inp)
            # assertCountEqual: same elements, regardless of their order.
            check = self.assertCountEqual if any_order else self.assertEqual
            check(out, exp, msg=label)
        finally:
            op.storage_manager.delete_storage()
def test_add_authors_to_agent_list(self):
    """Check the agent list built by ``add_authors_to_agent_list`` from three creators.

    Two creators carry only a name, the third also carries an ORCID identifier;
    the expected agents are author dictionaries, the last one with an ``orcid`` key.
    """
    op = OpenaireProcessing()
    try:
        entity = {'creator': [{'name': 'Carlos Hoyos'}, {'name': 'Yaron Oz'}, {'identifiers': [{'identifier': '0000-0001-6946-5074', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-6946-5074'}], 'name': 'Bom Soo Kim'}]}
        actual = op.add_authors_to_agent_list(entity, [])
        expected = [
            {'role': 'author', 'name': 'Carlos Hoyos', 'family': '', 'given': ''},
            {'role': 'author', 'name': 'Yaron Oz', 'family': '', 'given': ''},
            {'role': 'author', 'name': 'Bom Soo Kim', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'},
        ]
        self.assertEqual(expected, actual)
    finally:
        # Delete the storage even when the assertion above fails.
        op.storage_manager.delete_storage()
def test_add_authors_to_agent_list_redis(self):
    """Check ``add_authors_to_agent_list`` on ``OpenaireProcessing(testing=True)``.

    Two creators carry only a name, the third also carries an ORCID identifier;
    the expected agents are author dictionaries, the last one with an ``orcid`` key.
    """
    op = OpenaireProcessing(testing=True)
    try:
        entity = {'creator': [{'name': 'Carlos Hoyos'}, {'name': 'Yaron Oz'}, {'identifiers': [{'identifier': '0000-0001-6946-5074', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-6946-5074'}], 'name': 'Bom Soo Kim'}]}
        actual = op.add_authors_to_agent_list(entity, [])
        expected = [
            {'role': 'author', 'name': 'Carlos Hoyos', 'family': '', 'given': ''},
            {'role': 'author', 'name': 'Yaron Oz', 'family': '', 'given': ''},
            {'role': 'author', 'name': 'Bom Soo Kim', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'},
        ]
        self.assertEqual(expected, actual)
    finally:
        # Delete the storage even when the assertion above fails.
        op.storage_manager.delete_storage()
def test_add_authors_to_agent_list_no_creator(self):
    """Check that an empty ``creator`` list yields an empty agent list."""
    op = OpenaireProcessing()
    try:
        entity = {'creator': []}
        actual = op.add_authors_to_agent_list(entity, [])
        expected = []
        self.assertEqual(expected, actual)
    finally:
        # Delete the storage even when the assertion above fails.
        op.storage_manager.delete_storage()
def test_add_authors_to_agent_list_no_creator_redis(self):
    """Check that an empty ``creator`` list yields an empty agent list (``testing=True``)."""
    op = OpenaireProcessing(testing=True)
    try:
        entity = {'creator': []}
        actual = op.add_authors_to_agent_list(entity, [])
        expected = []
        self.assertEqual(expected, actual)
    finally:
        # Delete the storage even when the assertion above fails.
        op.storage_manager.delete_storage()
def test_get_agents_strings_list(self):
    """Check the pair returned by ``get_agents_strings_list`` for three authors.

    The first element is expected to hold one string per author (with the ORCID
    in square brackets when the agent has one), the second an empty list.
    """
    best_doi = "doi:10.1007/jhep03(2014)050"
    agents = [
        {'role': 'author', 'name': 'Hoyos, Carlos', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Oz, Yaron', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Kim, Bom Soo', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'},
    ]
    op = OpenaireProcessing()
    try:
        actual = op.get_agents_strings_list(best_doi, agents)
        expected = (['Hoyos Carlos', 'Oz Yaron', 'Kim Bom Soo [orcid:0000-0001-6946-5074]'], [])
        self.assertEqual(actual, expected)
    finally:
        # Delete the storage even when the assertion above fails.
        op.storage_manager.delete_storage()
def test_get_agents_strings_list_redis(self):
    """Check ``get_agents_strings_list`` on ``OpenaireProcessing(testing=True)``.

    The first element of the returned pair is expected to hold one string per
    author (with the ORCID in square brackets when the agent has one), the
    second an empty list.
    """
    best_doi = "doi:10.1007/jhep03(2014)050"
    agents = [
        {'role': 'author', 'name': 'Hoyos, Carlos', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Oz, Yaron', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Kim, Bom Soo', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'},
    ]
    op = OpenaireProcessing(testing=True)
    try:
        actual = op.get_agents_strings_list(best_doi, agents)
        expected = (['Hoyos Carlos', 'Oz Yaron', 'Kim Bom Soo [orcid:0000-0001-6946-5074]'], [])
        self.assertEqual(actual, expected)
    finally:
        # Delete the storage even when the assertion above fails.
        op.storage_manager.delete_storage()
def test_find_openaire_orcid(self):
    """Check ``find_openaire_orcid`` with an empty and with a pre-filled ORCID storage.

    The ORCID storage is deleted in ``finally`` clauses, so a failing assertion
    does not leave stored values behind for the next instance.
    """
    # NOTE(review): this body is identical to test_find_openaire_orcid_redis (both
    # build OpenaireProcessing(testing=True)), while the other non-redis tests use
    # OpenaireProcessing() - confirm whether testing=True is intended here.
    op = OpenaireProcessing(testing=True)
    try:
        inp = [{'identifier': '0000-0001-9759-3938', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-9759-3938'}]
        out = op.find_openaire_orcid(inp)
        exp = "orcid:0000-0001-9759-3938"
        self.assertEqual(out, exp)

        # Same identifier, but declared with a schema other than ORCID.
        inp_wrong_schema = [{'identifier': '0000-0001-9759-3938', 'schema': 'fake_schema', 'url': 'https://orcid.org/0000-0001-9759-3938'}]
        out_wrong_schema = op.find_openaire_orcid(inp_wrong_schema)
        exp_wrong_schema = ""
        self.assertEqual(out_wrong_schema, exp_wrong_schema)

        # ORCID schema, but an identifier expected to be rejected.
        inp_invalid_id = [{'identifier': '5500-0001-9759-3938', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-9759-3938'}]
        out_invalid_id = op.find_openaire_orcid(inp_invalid_id)
        exp_invalid_id = ""
        self.assertEqual(out_invalid_id, exp_invalid_id)
    finally:
        op.orcid_m.storage_manager.delete_storage()

    # First round: set a valid id as invalid in storage, so to check that the api
    # check is avoided if the info is already in storage. Second round: store the
    # same id as valid and expect it to be returned.
    for stored_validity, exp in ((False, ""), (True, "orcid:0000-0001-9759-3938")):
        op = OpenaireProcessing(testing=True)
        try:
            op.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", stored_validity)
            inp = [{'identifier': '0000-0001-9759-3938', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-9759-3938'}]
            out = op.find_openaire_orcid(inp)
            self.assertEqual(out, exp)
        finally:
            op.orcid_m.storage_manager.delete_storage()
def test_find_openaire_orcid_redis(self):
    """Check ``find_openaire_orcid`` on ``OpenaireProcessing(testing=True)``.

    The method is exercised with an empty ORCID storage first, then with the
    same ORCID stored as invalid and as valid. The ORCID storage is deleted in
    ``finally`` clauses, so a failing assertion does not leave stored values
    behind for the next instance.
    """
    op = OpenaireProcessing(testing=True)
    try:
        inp = [{'identifier': '0000-0001-9759-3938', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-9759-3938'}]
        out = op.find_openaire_orcid(inp)
        exp = "orcid:0000-0001-9759-3938"
        self.assertEqual(out, exp)

        # Same identifier, but declared with a schema other than ORCID.
        inp_wrong_schema = [{'identifier': '0000-0001-9759-3938', 'schema': 'fake_schema', 'url': 'https://orcid.org/0000-0001-9759-3938'}]
        out_wrong_schema = op.find_openaire_orcid(inp_wrong_schema)
        exp_wrong_schema = ""
        self.assertEqual(out_wrong_schema, exp_wrong_schema)

        # ORCID schema, but an identifier expected to be rejected.
        inp_invalid_id = [{'identifier': '5500-0001-9759-3938', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-9759-3938'}]
        out_invalid_id = op.find_openaire_orcid(inp_invalid_id)
        exp_invalid_id = ""
        self.assertEqual(out_invalid_id, exp_invalid_id)
    finally:
        op.orcid_m.storage_manager.delete_storage()

    # First round: set a valid id as invalid in storage, so to check that the api
    # check is avoided if the info is already in storage. Second round: store the
    # same id as valid and expect it to be returned.
    for stored_validity, exp in ((False, ""), (True, "orcid:0000-0001-9759-3938")):
        op = OpenaireProcessing(testing=True)
        try:
            op.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", stored_validity)
            inp = [{'identifier': '0000-0001-9759-3938', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-9759-3938'}]
            out = op.find_openaire_orcid(inp)
            self.assertEqual(out, exp)
        finally:
            op.orcid_m.storage_manager.delete_storage()
def test_update_redis_values(self):
    """Check that ``update_redis_values`` stores the two given id lists on the instance.

    After the call, ``_redis_values_br`` and ``_redis_values_ra`` are expected to
    equal the lists passed in.
    """
    br = ["pmid:2", "pmid:3"]
    ra = ["orcid:0000-0003-0530-4305"]
    op = OpenaireProcessing(testing=True)
    try:
        op.update_redis_values(br, ra)
        self.assertEqual(op._redis_values_br, br)
        self.assertEqual(op._redis_values_ra, ra)
    finally:
        # Same cleanup as the sibling tests, which delete the storage of every
        # OpenaireProcessing instance they create.
        op.storage_manager.delete_storage()
def test_find_openaire_orcid_with_index(self):
    """Test ORCID validation using ORCID index before API validation"""
    # Setup
    test_doi = "10.1234/test123"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    # Create OpenaireProcessing instance and register the author in its ORCID index
    op = OpenaireProcessing()
    try:
        # Correct format for add_value: id_string -> value
        op.orcid_index.add_value(test_doi, f"{test_name} [orcid:{test_orcid}]")

        # Test Case 1: ORCID found in index
        inp_1 = [{'identifier': test_orcid, 'schema': 'ORCID'}]
        out_1 = op.find_openaire_orcid(inp_1, test_doi)
        exp_1 = f"orcid:{test_orcid}"
        self.assertEqual(out_1, exp_1)
        # Verify it was added to temporary storage
        self.assertTrue(op.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

        # Test Case 2: ORCID not in index, still expected to be accepted
        # (presumably through API validation - TODO confirm)
        inp_2 = [{'identifier': '0000-0003-4082-1500', 'schema': 'ORCID'}]
        out_2 = op.find_openaire_orcid(inp_2, test_doi)
        exp_2 = "orcid:0000-0003-4082-1500"
        self.assertEqual(out_2, exp_2)

        # Test Case 3: ORCID not in index and invalid
        inp_3 = [{'identifier': '0000-0000-0000-0000', 'schema': 'ORCID'}]
        out_3 = op.find_openaire_orcid(inp_3, test_doi)
        exp_3 = ""
        self.assertEqual(out_3, exp_3)

        # Test Case 4: Valid ORCID but no DOI provided
        # NOTE(review): this ORCID is already in the temporary storage after
        # Case 1, so it may be resolved from there and not via API - confirm.
        inp_4 = [{'identifier': test_orcid, 'schema': 'ORCID'}]
        out_4 = op.find_openaire_orcid(inp_4)  # No DOI
        exp_4 = f"orcid:{test_orcid}"
        self.assertEqual(out_4, exp_4)
    finally:
        # Cleanup, performed even when one of the assertions above fails
        op.storage_manager.delete_storage()
def test_validated_as_with_storage_manager(storage_manager):
    """Check ``validated_as`` against validity values stored in the DOI storage.

    A DOI stored as valid must give ``True``, a DOI stored as invalid ``False``
    and a DOI that was never stored ``None``.

    ``storage_manager`` is forwarded to ``OpenaireProcessing``; where it comes
    from is not visible in this module (presumably a fixture - TODO confirm).
    """
    stored_valid = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    stored_invalid = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}
    never_stored = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}

    processor = OpenaireProcessing(storage_manager=storage_manager, testing=True)

    # Record the validity of the two known DOIs before querying.
    doi_storage = processor.doi_m.storage_manager
    doi_storage.set_value(stored_valid["identifier"], True)
    doi_storage.set_value(stored_invalid["identifier"], False)

    expectations = (
        (stored_valid, True),
        (stored_invalid, False),
        (never_stored, None),
    )
    for id_dict, expected in expectations:
        assert processor.validated_as(id_dict) is expected
# Run the test suite when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()