Coverage for test / processing_oroci_test.py: 99%

879 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-12 21:23 +0000

1# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it> 

2# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it> 

3# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

4# 

5# SPDX-License-Identifier: ISC 

6 

7import os 

8import unittest 

9 

10from oc_ds_converter.lib.jsonmanager import * 

11from oc_ds_converter.openaire.openaire_processing import OpenaireProcessing 

12# 

13 

# Root folder holding the fixtures for the OpenAIRE processing tests.
BASE = os.path.join('test', 'openaire_processing')
DATA_DIR = BASE

# Input fixture.
DATA = os.path.join(BASE, 'jSonFile_1.json')

# Scratch locations written by the tests.
TMP_SUPPORT_MATERIAL = os.path.join(BASE, "tmp_support")
OUTPUT = os.path.join(BASE, 'meta_input')
MULTIPROCESS_OUTPUT = os.path.join(BASE, 'multi_process_test')

# NOTE: spelled out with forward slashes instead of being joined from BASE.
MEMO_JSON_PATH = "test/openaire_processing/tmp_support/memo.json"

# One OpenAIRE publication record carrying DOI and arXiv identifiers.
# The "collectedFrom" entries all share one shape, so they are generated from
# (DNET identifier, provider name) pairs; key order matches the raw record.
SAMPLE_ENTITY = {
    'collectedFrom': [
        {
            'completionStatus': 'complete',
            'provider': {
                'identifiers': [{'identifier': dnet_id, 'schema': 'DNET Identifier'}],
                'name': provider_name,
            },
            'provisionMode': 'collected',
        }
        for dnet_id, provider_name in (
            ('10|openaire____::081b82f96300b6a6e3d282bad31cb6e2', 'Crossref'),
            ('10|openaire____::8ac8380272269217cb09a928c8caa993', 'UnpayWall'),
            ('10|openaire____::806360c771262b4d6770e7cdf04b5c5a', 'ORCID'),
            ('10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a', 'Microsoft Academic Graph'),
            ('10|openaire____::9e3be59865b2c1c335d32dae2fe7b254', 'Datacite'),
            ('10|opendoar____::6f4922f45568161a8cdf4ad2299f6d23', 'arXiv.org e-Print Archive'),
        )
    ],
    'creator': [
        {'name': 'Matteo Serra'},
        {'name': 'Salvatore Mignemi'},
        {
            'identifiers': [
                {
                    'identifier': '0000-0001-5595-7537',
                    'schema': 'ORCID',
                    'url': 'https://orcid.org/0000-0001-5595-7537',
                },
            ],
            'name': 'Mariano Cadoni',
        },
    ],
    'dnetIdentifier': '50|doi_dedup___::41074cd388749ccbdb6668caaf059f4a',
    'identifier': [
        {
            'identifier': '10.1103/physrevd.84.084046',
            'schema': 'doi',
            'url': 'https://doi.org/10.1103/physrevd.84.084046',
        },
        {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'},
        {
            'identifier': '10.48550/arxiv.1107.5979',
            'schema': 'doi',
            'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979',
        },
        {
            'identifier': '1107.5979',
            'schema': 'arXiv',
            'url': 'http://arxiv.org/abs/1107.5979',
        },
    ],
    'objectSubType': 'Article',
    'objectType': 'publication',
    'publicationDate': '2011-10-21',
    'publisher': [{'name': 'American Physical Society (APS)'}],
    'title': 'Exact solutions with AdS asymptotics of Einstein and Einstein-Maxwell gravity minimally coupled to a scalar field',
}

# One OpenAIRE citation link: a "Cites" relationship between a source record
# (PMID/PMC identifiers, two creators with ORCIDs) and a target record (DOI).
# Every string literal is kept on a single physical line; the repeated
# provider entries are generated from (DNET identifier, provider name) pairs.
SAMPLE_ENT2 = {
    "identifier": "000017d2c913b28e09291b811ce3609a",
    "linkprovider": [
        {
            "identifiers": [{"identifier": dnet_id, "schema": "DNET Identifier"}],
            "name": provider_name,
        }
        for dnet_id, provider_name in (
            ("10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73", "CORE (RIOXX-UK Aggregator)"),
            ("10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357", "PubMed Central"),
            ("10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", "Europe PubMed Central"),
            ("10|opendoar____::229754d7799160502a143a72f6789927", "Publications at Bielefeld University"),
        )
    ],
    "publicationDate": "2014-02-01",
    "publisher": [{"name": "Springer Nature"}],
    "relationship": {"inverse": "IsCitedBy", "name": "Cites", "schema": "datacite"},
    "source": {
        "collectedFrom": [
            {
                "completionStatus": "complete",
                "provider": {
                    "identifiers": [{"identifier": dnet_id, "schema": "DNET Identifier"}],
                    "name": provider_name,
                },
                "provisionMode": "collected",
            }
            for dnet_id, provider_name in (
                ("10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73", "CORE (RIOXX-UK Aggregator)"),
                ("10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357", "PubMed Central"),
                ("10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", "Europe PubMed Central"),
                ("10|opendoar____::229754d7799160502a143a72f6789927", "Publications at Bielefeld University"),
            )
        ],
        "creator": [
            {
                "identifiers": [
                    {
                        "identifier": "0000-0002-6491-0754",
                        "schema": "ORCID",
                        "url": "https://orcid.org/0000-0002-6491-0754",
                    },
                ],
                "name": "Sattler, Sebastian",
            },
            {"name": "Mehlkop, Guido"},
            {"name": "Graeff, Peter"},
            {
                "identifiers": [
                    {
                        "identifier": "0000-0002-8090-6886",
                        "schema": "ORCID",
                        "url": "https://orcid.org/0000-0002-8090-6886",
                    },
                ],
                "name": "Sauer, Carsten",
            },
        ],
        "dnetIdentifier": "50|pmid_dedup__::8936076da7a86820c24ede7ca3ff15b3",
        "identifier": [
            {
                "identifier": "PMC3928621",
                "schema": "pmc",
                "url": "http://europepmc.org/articles/PMC3928621",
            },
            {"identifier": "24484640", "schema": "pmid"},
            {
                "identifier": "24484640",
                "schema": "pmid",
                "url": "https://pubmed.ncbi.nlm.nih.gov/24484640",
            },
            {"identifier": "PMC3928621", "schema": "pmc"},
        ],
        "objectSubType": "Article",
        "objectType": "publication",
        "publicationDate": "2014-02-01",
        "publisher": [{"name": "Springer Nature"}],
        "title": "Evaluating the drivers of and obstacles to the willingness to use cognitive enhancement drugs: the influence of drug characteristics, social environment, and personal characteristics",
    },
    "target": {
        "collectedFrom": [
            {
                "completionStatus": "complete",
                "provider": {
                    "identifiers": [{"identifier": dnet_id, "schema": "DNET Identifier"}],
                    "name": provider_name,
                },
                "provisionMode": "collected",
            }
            for dnet_id, provider_name in (
                ("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "Crossref"),
                ("10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a", "Microsoft Academic Graph"),
            )
        ],
        "creator": [{"name": "Harold G. Grasmick"}, {"name": "Robert J. Bursik"}],
        "dnetIdentifier": "50|doi_________::816648c63de74835ec2b0a753a68f037",
        "identifier": [
            {
                "identifier": "10.2307/3053861",
                "schema": "doi",
                "url": "https://doi.org/10.2307/3053861",
            },
        ],
        "objectSubType": "Article",
        "objectType": "publication",
        "publicationDate": "1990-01-01",
        "publisher": [{"name": "JSTOR"}],
        "title": "Conscience, significant others, and rational choice: Extending the deterrence model.",
    },
}

# OpenAIRE publication record in the shape passed to csv_creator by the tests:
# "identifier" is a dict of 'valid' / 'not_valid' / 'to_be_val' lists and a
# "redis_validity_lists" entry is present. The "collectedFrom" entries are
# generated from (DNET identifier, provider name) pairs.
SAMPLE_ENTITY_FOR_CSV_CREATOR = {
    'collectedFrom': [
        {
            'completionStatus': 'complete',
            'provider': {
                'identifiers': [{'identifier': dnet_id, 'schema': 'DNET Identifier'}],
                'name': provider_name,
            },
            'provisionMode': 'collected',
        }
        for dnet_id, provider_name in (
            ('10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73', 'CORE (RIOXX-UK Aggregator)'),
            ('10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357', 'PubMed Central'),
            ('10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c', 'Europe PubMed Central'),
            ('10|driver______::bee53aa31dc2cbb538c10c2b65fa5824', 'DOAJ-Articles'),
            ('10|opendoar____::566a9968b43628588e76be5a85a0f9e8', "King's Research Portal"),
            ('10|openaire____::c2cdfa5866e03cdd07d313cbc8fb8311', 'Multidisciplinary Digital Publishing Institute'),
        )
    ],
    'creator': [
        {'name': 'Smith, Lee'},
        {'name': 'Sawyer, Alexia'},
        {'name': 'Gardner, Benjamin'},
        {'name': 'Seppala, Katri'},
        {'name': 'Ucci, Marcella'},
        {'name': 'Marmot, Alexi'},
        {'name': 'Lally, Pippa'},
        {'name': 'Fisher, Abi'},
    ],
    'dnetIdentifier': '50|pmid_dedup__::a1a8687c2378a0d68314566dec29dafb',
    'objectSubType': 'Article',
    'objectType': 'publication',
    'publicationDate': '2018-06-09',
    'publisher': [{'name': 'MDPI'}],
    'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study',
    'identifier': {
        'valid': [],
        'not_valid': [],
        'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:29890726', 'valid': None}],
    },
    "redis_validity_lists": [[], []],
}

24 

25 

26class TestOpenaireProcessing(unittest.TestCase): 

27 

def delete_storege(self, storage_type=None, specific_path=None):
    """Remove a storage file from disk if it is there.

    When ``specific_path`` is truthy, that path is the target. Otherwise the
    target lives under ``<cwd>/storage``: ``id_valid_dict.db`` when
    ``storage_type`` is ``"sqlite"``, ``id_value.json`` for any other value.
    A missing target is silently ignored.
    """
    if specific_path:
        target = specific_path
    elif storage_type == "sqlite":
        target = os.path.join(os.getcwd(), "storage", "id_valid_dict.db")
        if not os.path.exists(target):
            # Fall back to the same name carrying a "?mode=rw" suffix.
            target = target + "?mode=rw"
    else:
        target = os.path.join(os.getcwd(), "storage", "id_value.json")

    if os.path.exists(target):
        os.remove(target)

42 

def test_get_all_ids(self):
    """extract_all_ids on SAMPLE_ENT2 yields the expected two id groups.

    Element 0 of the result is compared with the pmid/pmcid/doi ids and
    element 1 with the ORCID ids, both regardless of order. Uses an
    OpenaireProcessing instance built with default arguments.
    """
    processor = OpenaireProcessing()
    extracted = processor.extract_all_ids(SAMPLE_ENT2)

    expected_first = ['pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861']
    expected_second = ['orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754']
    self.assertCountEqual(expected_first, extracted[0])
    self.assertCountEqual(expected_second, extracted[1])

    processor.storage_manager.delete_storage()

50 

def test_get_all_ids_redis(self):
    """Same check as test_get_all_ids, on an instance built with testing=True.

    Element 0 of the extract_all_ids result must hold the pmid/pmcid/doi
    ids of SAMPLE_ENT2 and element 1 its ORCID ids, in any order.
    """
    processor = OpenaireProcessing(testing=True)
    id_groups = processor.extract_all_ids(SAMPLE_ENT2)

    self.assertCountEqual(
        ['pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'],
        id_groups[0],
    )
    self.assertCountEqual(
        ['orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'],
        id_groups[1],
    )
    processor.storage_manager.delete_storage()

57 

def test_get_redis_validity_list(self):
    """get_redis_validity_list returns [] when this test has seeded no id.

    Checked for both the "br" and the "ra" lookups on an instance built
    with default arguments.
    """
    bibliographic_ids = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
    agent_ids = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    processor = OpenaireProcessing()
    found_br = processor.get_redis_validity_list(bibliographic_ids, "br")
    found_ra = processor.get_redis_validity_list(agent_ids, "ra")

    nothing_found = []
    self.assertEqual(found_ra, nothing_found)
    self.assertEqual(found_br, nothing_found)

    processor.storage_manager.delete_storage()

71 

def test_get_redis_validity_list_redis(self):
    """get_redis_validity_list returns [] for "br" and "ra" ids.

    Same check as test_get_redis_validity_list, run on an instance built
    with testing=True; no id is seeded by this test beforehand.
    """
    processor = OpenaireProcessing(testing=True)

    br_ids = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
    ra_ids = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}
    br_result = processor.get_redis_validity_list(br_ids, "br")
    ra_result = processor.get_redis_validity_list(ra_ids, "ra")

    self.assertEqual(ra_result, [])
    self.assertEqual(br_result, [])
    processor.storage_manager.delete_storage()

84 

def test_get_reids_validity_dict_w_fakeredis_db_values_sqlite(self):
    """get_redis_validity_list returns only the ids seeded in redis.

    One "br" id and one "ra" id are added to the instance's BR_redis and
    RA_redis before the lookup; each returned list must contain exactly
    the seeded id. Uses an instance built with default arguments.
    """
    opp = OpenaireProcessing()
    try:
        opp.BR_redis.sadd('pmid:24484640', "omid:1")
        opp.RA_redis.sadd('orcid:0000-0002-8090-6886', "omid:2")

        br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
        ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

        br_valid_list = opp.get_redis_validity_list(br, "br")
        ra_valid_list = opp.get_redis_validity_list(ra, "ra")

        self.assertEqual(br_valid_list, ["pmid:24484640"])
        self.assertEqual(ra_valid_list, ['orcid:0000-0002-8090-6886'])
    finally:
        # Clean up even when an assertion fails, so the seeded keys and the
        # storage are not left behind for later tests.
        opp.storage_manager.delete_storage()

        opp.BR_redis.delete('pmid:24484640')
        opp.BR_redis.delete('pmcid:PMC3928621')
        opp.RA_redis.delete('orcid:0000-0002-8090-6886')

105 

def test_get_reids_validity_dict_w_fakeredis_db_values_redis(self):
    """get_redis_validity_list returns only the ids seeded in redis.

    Same scenario as the ``_sqlite`` variant, on an instance built with
    testing=True: one "br" id and one "ra" id are added to BR_redis and
    RA_redis, and each lookup must return exactly the seeded id.
    """
    opp = OpenaireProcessing(testing=True)
    try:
        opp.BR_redis.sadd('pmid:24484640', "omid:1")
        opp.RA_redis.sadd('orcid:0000-0002-8090-6886', "omid:2")

        br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
        ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

        br_valid_list = opp.get_redis_validity_list(br, "br")
        ra_valid_list = opp.get_redis_validity_list(ra, "ra")

        self.assertEqual(br_valid_list, ["pmid:24484640"])
        self.assertEqual(ra_valid_list, ['orcid:0000-0002-8090-6886'])
    finally:
        # Clean up even when an assertion fails, so the seeded keys and the
        # storage are not left behind for later tests.
        opp.storage_manager.delete_storage()
        opp.BR_redis.delete('pmid:24484640')
        opp.BR_redis.delete('pmcid:PMC3928621')
        opp.RA_redis.delete('orcid:0000-0002-8090-6886')

126 

def test_validated_as_default(self):
    """validated_as returns None for an id that was never validated.

    validated_as takes an id dict with the keys "schema" (the schema
    name) and "identifier" (the identifier string). Its intended contract:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    Only the None case is exercised here, on an instance built with
    default arguments (described by the original authors as the sqlite
    storage manager without a pre-existent db).
    """
    processor = OpenaireProcessing()
    unseen_id = {"schema": "pmid", "identifier": "pmid:23483834"}
    outcome = processor.validated_as(unseen_id)
    self.assertEqual(outcome, None)
    processor.storage_manager.delete_storage()

142 

def test_validated_as_default_redis(self):
    """validated_as returns None for an id that was never validated.

    validated_as takes an id dict with the keys "schema" and "identifier"
    and is meant to answer True (known valid), False (known invalid) or
    None (never validated). Only the None case is exercised here, on an
    instance built with testing=True (described by the original authors as
    the redis storage manager without a pre-existent db).
    """
    processor = OpenaireProcessing(testing=True)
    outcome = processor.validated_as(
        {"schema": "pmid", "identifier": "pmid:23483834"}
    )
    self.assertEqual(outcome, None)
    processor.storage_manager.delete_storage()

158 

def test_validated_as_redis_with_preexistent_data(self):
    """validated_as reflects values already present in the pmid storage.

    Two pmids are written straight to the pmid manager's storage_manager
    (one as True, one as False); a third is left out. validated_as must
    then answer True, False and None respectively. The instance is built
    with testing=True.
    """
    unknown_pmid = {"identifier": "pmid:2938", "schema": "pmid"}
    stored_valid_pmid = {"identifier": "pmid:23483834", "schema": "pmid"}
    stored_invalid_pmid = {"identifier": "pmid:18328387372097", "schema": "pmid"}

    # Seed the validation results directly on the id manager's storage.
    processor = OpenaireProcessing(testing=True)
    processor.pmid_m.storage_manager.set_value(stored_valid_pmid["identifier"], True)
    processor.pmid_m.storage_manager.set_value(stored_invalid_pmid["identifier"], False)

    answer_valid = processor.validated_as(stored_valid_pmid)
    answer_invalid = processor.validated_as(stored_invalid_pmid)
    answer_unknown = processor.validated_as(unknown_pmid)

    self.assertEqual(answer_valid, True)
    self.assertEqual(answer_invalid, False)
    self.assertEqual(answer_unknown, None)

    processor.pmid_m.storage_manager.delete_storage()

186 

187 

def test_validated_as_inmemory(self):
    """validated_as answers True / False / None from stored pmid values.

    A pmid stored as True, one stored as False and one never stored must
    yield True, False and None respectively.

    NOTE(review): the original description says this covers the
    "in Memory + Json" storage manager, yet the instance is created with
    testing=True exactly like the redis tests in this file -- confirm
    which storage backend is actually exercised.
    """
    never_stored = {"identifier": "pmid:2938", "schema": "pmid"}
    stored_as_valid = {"identifier": "pmid:23483834", "schema": "pmid"}
    stored_as_invalid = {"identifier": "pmid:18328387372097", "schema": "pmid"}

    # Fresh instance; values go directly to the pmid manager's storage.
    processor = OpenaireProcessing(testing=True)
    for id_dict, validity in ((stored_as_valid, True), (stored_as_invalid, False)):
        processor.pmid_m.storage_manager.set_value(id_dict["identifier"], validity)

    got_valid = processor.validated_as(stored_as_valid)
    got_invalid = processor.validated_as(stored_as_invalid)
    got_unknown = processor.validated_as(never_stored)

    self.assertEqual(got_valid, True)
    self.assertEqual(got_invalid, False)
    self.assertEqual(got_unknown, None)

    processor.pmid_m.storage_manager.delete_storage()

217 

218 

def test_validated_as_redis(self):
    """validated_as answers True / False / None from stored pmid values.

    On an instance built with testing=True, one pmid is stored as True and
    another as False through the pmid manager's storage_manager; a third
    pmid is not stored. validated_as must return True, False and None for
    them, in that order.
    """
    absent = {"identifier": "pmid:2938", "schema": "pmid"}
    present_valid = {"identifier": "pmid:23483834", "schema": "pmid"}
    present_invalid = {"identifier": "pmid:18328387372097", "schema": "pmid"}

    opp_redis = OpenaireProcessing(testing=True)
    # Write the validation outcomes directly to the id manager's storage.
    opp_redis.pmid_m.storage_manager.set_value(present_valid["identifier"], True)
    opp_redis.pmid_m.storage_manager.set_value(present_invalid["identifier"], False)

    result_valid = opp_redis.validated_as(present_valid)
    result_invalid = opp_redis.validated_as(present_invalid)
    result_absent = opp_redis.validated_as(absent)

    self.assertEqual(result_valid, True)
    self.assertEqual(result_invalid, False)
    self.assertEqual(result_absent, None)
    opp_redis.pmid_m.storage_manager.delete_storage()

247 

def test_get_id_manager(self):
    """get_id_manager resolves a manager from a prefixed id or a schema name.

    For pmid, doi, pmcid and arxiv, the manager is looked up twice in the
    instance's _id_man_dict: once with a prefixed id (e.g. 'pmid:12345')
    and once with the bare schema string (e.g. 'pmid'). Both managers must
    report the prefixed id as valid. Uses an instance built with default
    arguments.
    """
    op = OpenaireProcessing()
    managers_by_schema = op._id_man_dict

    # (prefixed id, schema string), checked in this order.
    lookups = (
        ("pmid:12345", "pmid"),
        ("doi:10.1103/physrevd.84.084046", "doi"),
        ("pmcid:PMC5555555", "pmcid"),
        ("arxiv:1509.08217", "arxiv"),
    )
    for prefixed_id, schema in lookups:
        manager_from_id = op.get_id_manager(prefixed_id, managers_by_schema)
        manager_from_schema = op.get_id_manager(schema, managers_by_schema)

        # The returned manager must be usable on an id of its own schema.
        self.assertTrue(manager_from_id.is_valid(prefixed_id))
        self.assertTrue(manager_from_schema.is_valid(prefixed_id))

    op.storage_manager.delete_storage()

294 

def test_get_id_manager_redis(self):
    """get_id_manager resolves a manager from a prefixed id or a schema name.

    Same check as test_get_id_manager on an instance built with
    testing=True: for pmid, doi, pmcid and arxiv, the manager obtained
    from the prefixed id and the one obtained from the schema string must
    both report the prefixed id as valid.
    """
    op = OpenaireProcessing(testing=True)
    id_man_dict = op._id_man_dict

    sample_id_by_schema = {
        "pmid": "pmid:12345",
        "doi": "doi:10.1103/physrevd.84.084046",
        "pmcid": "pmcid:PMC5555555",
        "arxiv": "arxiv:1509.08217",
    }
    for schema_name, sample_id in sample_id_by_schema.items():
        via_id = op.get_id_manager(sample_id, id_man_dict)
        via_schema = op.get_id_manager(schema_name, id_man_dict)

        # Whichever way it was obtained, the manager must accept the id.
        self.assertTrue(via_id.is_valid(sample_id))
        self.assertTrue(via_schema.is_valid(sample_id))

    op.storage_manager.delete_storage()

341 

342 

def test_normalise_any_id(self):
    """normalise_any_id cleans up prefixed pmid, doi, arxiv and pmcid ids.

    Inputs and expectations: trailing letters after a pmid are dropped, an
    upper-cased doi is lower-cased, an arxiv id with repeated dots is
    collapsed and gets a "v1" suffix, and a trailing space after a pmcid
    is removed. Uses an instance built with default arguments.
    """
    op = OpenaireProcessing()

    # (expected normalised id, raw input), checked in this order.
    expectations = (
        ("pmid:12345", "pmid:12345abc"),
        ("doi:10.1103/physrevd.84.084046", "doi:10.1103/PHYSREVD.84.084046"),
        ("arxiv:1509.08217v1", "arxiv:1509....08217"),
        ("pmcid:PMC5555555", "pmcid:PMC5555555 "),
    )
    for expected, raw in expectations:
        self.assertEqual(expected, op.normalise_any_id(raw))

    op.storage_manager.delete_storage()

360 

def test_normalise_any_id_redis(self):
    """normalise_any_id cleans up prefixed pmid, doi, arxiv and pmcid ids.

    Same inputs as test_normalise_any_id, on an instance built with
    testing=True.
    """
    op = OpenaireProcessing(testing=True)

    clean_pmid = "pmid:12345"
    clean_doi = "doi:10.1103/physrevd.84.084046"
    clean_arxiv = "arxiv:1509.08217"
    clean_pmc = "pmcid:PMC5555555"

    # Deliberately dirty variants of the ids above.
    dirty_pmid = clean_pmid + "abc"
    dirty_doi = "doi:" + clean_doi.split(":")[1].upper()
    dirty_arxiv = clean_arxiv.replace(".", "....")
    dirty_pmc = clean_pmc + " "

    self.assertEqual(clean_pmid, op.normalise_any_id(dirty_pmid))
    self.assertEqual(clean_doi, op.normalise_any_id(dirty_doi))
    # The normalised arxiv id carries a "v1" suffix.
    self.assertEqual(clean_arxiv + "v1", op.normalise_any_id(dirty_arxiv))
    self.assertEqual(clean_pmc, op.normalise_any_id(dirty_pmc))

    op.storage_manager.delete_storage()

378 

def test_get_norm_ids(self):
    """get_norm_ids reduces a list of id dicts to normalised, managed ids.

    Three inputs are checked on an instance built with default arguments:
    - a list containing the same doi twice (different case): the expected
      output has one entry per distinct id, each with only "identifier"
      and "schema";
    - a list including a 'handle' id: it is absent from the expected
      output;
    - a list with a 'handle' id and a doi that cannot be normalised: the
      expected output is empty.
    """
    op = OpenaireProcessing()

    ids_with_duplicates = [
        {'identifier': '10.1103/PHYSREVD.84.084046', 'schema': 'doi',
         'url': 'https://doi.org/10.1103/physrevd.84.084046'},
        {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi',
         'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'},
        {'identifier': '1107.5979', 'schema': 'arXiv',
         'url': 'http://arxiv.org/abs/1107.5979'},
    ]
    ids_with_unmanaged_schema = [
        {'identifier': '11245/1.357137', 'schema': 'handle',
         'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '21887584', 'schema': 'pmid',
         'url': 'https://pubmed.ncbi.nlm.nih.gov/21887584'},
        {'identifier': '10.1007/s12160-011-9282-0', 'schema': 'doi',
         'url': 'https://doi.org/10.1007/s12160-011-9282-0'},
    ]
    ids_all_rejected = [
        {'identifier': '11245/1.357137', 'schema': 'handle',
         'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '20.ABC/s12160-011-9282-FAKEID', 'schema': 'doi',
         'url': 'https://doi.org/10.1007/s12160-011-9282-0'},
    ]

    normalised_1 = op.get_norm_ids(ids_with_duplicates)
    normalised_2 = op.get_norm_ids(ids_with_unmanaged_schema)
    normalised_3 = op.get_norm_ids(ids_all_rejected)

    self.assertEqual(normalised_1, [
        {'identifier': 'doi:10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': 'doi:10.48550/arxiv.1107.5979', 'schema': 'doi'},
        {'identifier': 'arxiv:1107.5979v1', 'schema': 'arxiv'},
    ])
    self.assertEqual(normalised_2, [
        {'identifier': 'pmid:21887584', 'schema': 'pmid'},
        {'identifier': 'doi:10.1007/s12160-011-9282-0', 'schema': 'doi'},
    ])
    self.assertEqual(normalised_3, [])
    op.storage_manager.delete_storage()

417 

418 

def test_get_norm_ids_redis(self):
    """get_norm_ids reduces a list of id dicts to normalised, managed ids.

    Same three scenarios as test_get_norm_ids (duplicated doi, unmanaged
    'handle' schema, nothing acceptable), on an instance built with
    testing=True. Each expected entry only has "identifier" and "schema".
    """
    op = OpenaireProcessing(testing=True)

    # Scenario 1: the same doi appears twice with different casing.
    duplicated_input = [
        {'identifier': '10.1103/PHYSREVD.84.084046', 'schema': 'doi',
         'url': 'https://doi.org/10.1103/physrevd.84.084046'},
        {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi',
         'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'},
        {'identifier': '1107.5979', 'schema': 'arXiv', 'url': 'http://arxiv.org/abs/1107.5979'}]
    duplicated_output = op.get_norm_ids(duplicated_input)
    duplicated_expected = [
        {'identifier': 'doi:10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': 'doi:10.48550/arxiv.1107.5979', 'schema': 'doi'},
        {'identifier': 'arxiv:1107.5979v1', 'schema': 'arxiv'}]

    # Scenario 2: a 'handle' id sits next to a pmid and a doi.
    unmanaged_input = [
        {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '21887584', 'schema': 'pmid', 'url': 'https://pubmed.ncbi.nlm.nih.gov/21887584'},
        {'identifier': '10.1007/s12160-011-9282-0', 'schema': 'doi', 'url': 'https://doi.org/10.1007/s12160-011-9282-0'}]
    unmanaged_output = op.get_norm_ids(unmanaged_input)
    unmanaged_expected = [
        {'identifier': 'pmid:21887584', 'schema': 'pmid'},
        {'identifier': 'doi:10.1007/s12160-011-9282-0', 'schema': 'doi'}]

    # Scenario 3: a 'handle' id plus a doi that cannot be normalised.
    rejected_input = [
        {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '20.ABC/s12160-011-9282-FAKEID', 'schema': 'doi', 'url': 'https://doi.org/10.1007/s12160-011-9282-0'}]
    rejected_output = op.get_norm_ids(rejected_input)
    rejected_expected = []

    self.assertEqual(duplicated_output, duplicated_expected)
    self.assertEqual(unmanaged_output, unmanaged_expected)
    self.assertEqual(rejected_output, rejected_expected)
    op.storage_manager.delete_storage()

457 

def test_dict_to_cache(self):
    """dict_to_cache creates a file at the requested path.

    MEMO_JSON_PATH is removed first if present, must exist after the
    call, and is removed again through the class's delete helper.
    """
    op = OpenaireProcessing()
    payload = {"dict_type": "sample"}

    # Start from a clean slate so the existence check is meaningful.
    if os.path.exists(MEMO_JSON_PATH):
        os.remove(MEMO_JSON_PATH)
    self.assertFalse(os.path.exists(MEMO_JSON_PATH))

    op.dict_to_cache(payload, MEMO_JSON_PATH)
    self.assertTrue(os.path.exists(MEMO_JSON_PATH))

    self.delete_storege(specific_path=MEMO_JSON_PATH)
    self.assertFalse(os.path.exists(MEMO_JSON_PATH))

    op.storage_manager.delete_storage()

469 

470 

def test_csv_creator_base(self):
    """csv_creator builds the expected meta csv row for an updated entity.

    The input is SAMPLE_ENTITY_FOR_CSV_CREATOR, whose "identifier" field
    is already in the valid / not_valid / to_be_val shape. Uses an
    instance built with default arguments.
    """
    op = OpenaireProcessing()
    produced_row = op.csv_creator(SAMPLE_ENTITY_FOR_CSV_CREATOR)

    wanted_row = dict(
        id='pmid:29890726',
        title='Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study',
        author='Smith Lee; Sawyer Alexia; Gardner Benjamin; Seppala Katri; Ucci Marcella; Marmot Alexi; Lally Pippa; Fisher Abi',
        pub_date='2018-06-09',
        venue='',
        volume='',
        issue='',
        page='',
        type='journal article',
        publisher='MDPI',
        editor='',
    )
    self.assertEqual(produced_row, wanted_row)

    op.storage_manager.delete_storage()

496 

def test_csv_creator_base_redis(self):
    """csv_creator builds the expected meta csv row for an updated entity.

    Same check as test_csv_creator_base, on an instance built with
    testing=True.
    """
    op = OpenaireProcessing(testing=True)

    # Fields with no value in the sample entity are expected as ''.
    blank_fields = {'venue': '', 'volume': '', 'issue': '', 'page': '', 'editor': ''}
    filled_fields = {
        'id': 'pmid:29890726',
        'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study',
        'author': 'Smith Lee; Sawyer Alexia; Gardner Benjamin; Seppala Katri; Ucci Marcella; Marmot Alexi; Lally Pippa; Fisher Abi',
        'pub_date': '2018-06-09',
        'type': 'journal article',
        'publisher': 'MDPI',
    }
    expected = {**filled_fields, **blank_fields}

    actual = op.csv_creator(SAMPLE_ENTITY_FOR_CSV_CREATOR)
    self.assertEqual(actual, expected)

    op.storage_manager.delete_storage()

522 

def test_csv_creator_not_accepted_id(self):
    '''
    Check that, given an updated openaire entity with NO ids managed by opencitations
    (i.e.: a handle id), no meta csv row is created: csv_creator returns an empty dict.
    '''
    import copy

    op = OpenaireProcessing()

    replaced_entity = {'schema': 'handle', 'identifier': 'handle:11245/1.357137', 'valid': None}

    # Deep copy on purpose: a shallow copy would share the nested "identifier" dict
    # with the module-level SAMPLE_ENTITY_FOR_CSV_CREATOR, so the assignment below
    # would silently alter the fixture seen by every other test.
    modified_entity = copy.deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
    modified_entity["identifier"]["to_be_val"] = [replaced_entity]

    csv_row = op.csv_creator(modified_entity)
    expected_row = {}  # because there is no ID accepted in opencitations for this entity
    self.assertEqual(csv_row, expected_row)

    op.storage_manager.delete_storage()

540 

def test_csv_creator_not_accepted_id_redis(self):
    '''
    Check that, given an updated openaire entity with NO ids managed by opencitations
    (i.e.: a handle id), no meta csv row is created: csv_creator returns an empty dict.
    The processor is built with testing=True.
    '''
    import copy

    op = OpenaireProcessing(testing=True)

    replaced_entity = {'schema': 'handle', 'identifier': 'handle:11245/1.357137', 'valid': None}

    # Deep copy on purpose: a shallow copy would share the nested "identifier" dict
    # with the module-level SAMPLE_ENTITY_FOR_CSV_CREATOR, so the assignment below
    # would silently alter the fixture seen by every other test.
    modified_entity = copy.deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
    modified_entity["identifier"]["to_be_val"] = [replaced_entity]

    csv_row = op.csv_creator(modified_entity)
    expected_row = {}  # because there is no ID accepted in opencitations for this entity
    self.assertEqual(csv_row, expected_row)

    op.storage_manager.delete_storage()

558 

def test_csv_creator_invalid_id(self):
    '''
    Check that, when the only id left to validate is the handle-schema identifier
    '20.500.11820/fake', no meta csv row is created: csv_creator returns an empty dict.
    '''
    import copy

    op = OpenaireProcessing()

    replaced_entity = {'schema': 'handle', 'identifier': '20.500.11820/fake', 'valid': None}

    # Deep copy on purpose: a shallow copy would share the nested "identifier" dict
    # with the module-level SAMPLE_ENTITY_FOR_CSV_CREATOR, so the assignment below
    # would silently alter the fixture seen by every other test.
    modified_entity = copy.deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
    modified_entity["identifier"]["to_be_val"] = [replaced_entity]

    csv_row = op.csv_creator(modified_entity)
    expected_row = {}  # because there is no ID accepted in opencitations for this entity
    self.assertEqual(csv_row, expected_row)

    op.storage_manager.delete_storage()

576 

577 

def test_csv_creator_invalid_id_redis(self):
    '''
    Check that, when the only id left to validate is the handle-schema identifier
    '20.500.11820/fake', no meta csv row is created: csv_creator returns an empty dict.
    The processor is built with testing=True.
    '''
    import copy

    op = OpenaireProcessing(testing=True)

    replaced_entity = {'schema': 'handle', 'identifier': '20.500.11820/fake', 'valid': None}

    # Deep copy on purpose: a shallow copy would share the nested "identifier" dict
    # with the module-level SAMPLE_ENTITY_FOR_CSV_CREATOR, so the assignment below
    # would silently alter the fixture seen by every other test.
    modified_entity = copy.deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
    modified_entity["identifier"]["to_be_val"] = [replaced_entity]

    csv_row = op.csv_creator(modified_entity)
    expected_row = {}  # because there is no ID accepted in opencitations for this entity
    self.assertEqual(csv_row, expected_row)

    op.storage_manager.delete_storage()

595 

def test_get_publisher_name_base(self):
    '''
    get_publisher_name receives a list of dois plus a dict describing the publisher
    and must return the publisher's name as a string.

    Base scenario: the processor is created without a publishers mapping, so each
    call is expected to hand back exactly the name found in the input dict.
    '''
    processor = OpenaireProcessing()

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}
    frontiers_doi = "10.3389/fnana.2012.00034"
    oup_doi = "10.2527/1995.7392834x"

    # The first call passes an empty string in place of a doi.
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_frontiers = processor.get_publisher_name([frontiers_doi], frontiers)
    name_oup = processor.get_publisher_name([oup_doi], oup)

    self.assertEqual(name_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_oup, "Oxford University Press (OUP)")

    processor.storage_manager.delete_storage()

621 

def test_get_publisher_name_base_redis(self):
    '''
    get_publisher_name receives a list of dois plus a dict describing the publisher
    and must return the publisher's name as a string.

    Base scenario with the processor built with testing=True and no publishers
    mapping: each call is expected to hand back exactly the name found in the
    input dict.
    '''
    processor = OpenaireProcessing(testing=True)

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}
    frontiers_doi = "10.3389/fnana.2012.00034"
    oup_doi = "10.2527/1995.7392834x"

    # The first call passes an empty string in place of a doi.
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_frontiers = processor.get_publisher_name([frontiers_doi], frontiers)
    name_oup = processor.get_publisher_name([oup_doi], oup)

    self.assertEqual(name_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_oup, "Oxford University Press (OUP)")

    processor.storage_manager.delete_storage()

647 

def test_get_publisher_name_publishers_mapping(self):
    '''
    get_publisher_name receives a list of dois plus a dict describing the publisher
    and must return the publisher's normalised name (possibly with its crossref ID).

    Here a publishers mapping file is supplied. The crossref member is meant to be
    appended only if:
    - the doi prefix is a crossref doi prefix,
    - the prefix is present in the mapping,
    - the publisher name provided by the datasource matches the mapped one.

    All three inputs below are expected to come back as the bare publisher name
    (presumably because they do not satisfy those conditions).
    '''
    processor = OpenaireProcessing(publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}
    frontiers_doi = "10.3389/fnana.2012.00034"
    oup_doi = "10.2527/1995.7392834x"

    # The first call passes an empty string in place of a doi.
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_frontiers = processor.get_publisher_name([frontiers_doi], frontiers)
    name_oup = processor.get_publisher_name([oup_doi], oup)

    self.assertEqual(name_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_oup, "Oxford University Press (OUP)")

    processor.storage_manager.delete_storage()

679 

def test_get_publisher_name_publishers_mapping_redis(self):
    '''
    get_publisher_name receives a list of dois plus a dict describing the publisher
    and must return the publisher's normalised name (possibly with its crossref ID).

    Here a publishers mapping file is supplied and the processor is built with
    testing=True. The crossref member is meant to be appended only if:
    - the doi prefix is a crossref doi prefix,
    - the prefix is present in the mapping,
    - the publisher name provided by the datasource matches the mapped one.

    All three inputs below are expected to come back as the bare publisher name
    (presumably because they do not satisfy those conditions).
    '''
    processor = OpenaireProcessing(testing=True,publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}
    frontiers_doi = "10.3389/fnana.2012.00034"
    oup_doi = "10.2527/1995.7392834x"

    # The first call passes an empty string in place of a doi.
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_frontiers = processor.get_publisher_name([frontiers_doi], frontiers)
    name_oup = processor.get_publisher_name([oup_doi], oup)

    self.assertEqual(name_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_oup, "Oxford University Press (OUP)")

    processor.storage_manager.delete_storage()

711 

def test_get_publisher_name_publishers_mapping_multi_dois(self):
    '''
    Check that, given several dois and a dictionary representing a publisher's data, the string
    of the publisher's normalised name (and possibly its crossref ID) is returned.

    Mapping provided: publisher name retrieved + crossref member returned, only if:
    - the doi prefix is a crossref doi prefix,
    - it is present in the mapping,
    - the name of the publisher provided by the datasource corresponds to the mapped one.

    The storage is deleted once, at the very end: the same OpenaireProcessing instance is
    used by all four cases, so it must not be torn down between them.
    '''
    op = OpenaireProcessing(publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")

    mapped_doi = "10.1152/sample_doi"  # this prefix is in the mapping and corresponds to American Physiological Society
    other_doi = "10.1153/sample_doi"

    # CASE 1: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of the
    # entity's dois prefixes in the prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
    pub_input_1 = {'name': 'American Physiological Society'}
    pub_output_1 = op.get_publisher_name([mapped_doi, other_doi], pub_input_1)
    self.assertEqual(pub_output_1, "American Physiological Society [crossref:24]")

    # CASE 2: The Publisher Name provided by OPENAIRE does not correspond to the Publisher Name mapped to one of the
    # entity's dois prefixes in the prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: The publisher name provided by Openaire is retrieved without any crossref member
    pub_input_2 = {'name': 'Sample Publisher Name'}
    pub_output_2 = op.get_publisher_name([mapped_doi, other_doi], pub_input_2)
    self.assertEqual(pub_output_2, "Sample Publisher Name")

    # CASE 3: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of the
    # entity's dois prefixes in the prefix-to-publisher-data mapping in input BUT it is not the first doi of the list
    # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
    pub_input_3 = {'name': 'American Physiological Society'}
    pub_output_3 = op.get_publisher_name([other_doi, mapped_doi], pub_input_3)
    self.assertEqual(pub_output_3, "American Physiological Society [crossref:24]")

    # CASE 4: OPENAIRE does not provide a publisher name but one of the entity's DOI prefixes is in the
    # prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: empty string
    pub_input_4 = {'name': ''}
    pub_input_4_1 = {}
    pub_input_4_2 = ''

    pub_output_4 = op.get_publisher_name([mapped_doi, other_doi], pub_input_4)
    pub_output_4_1 = op.get_publisher_name([mapped_doi, other_doi], pub_input_4_1)
    pub_output_4_2 = op.get_publisher_name([mapped_doi, other_doi], pub_input_4_2)

    self.assertEqual(pub_output_4, "")
    self.assertEqual(pub_output_4_1, "")
    self.assertEqual(pub_output_4_2, "")

    op.storage_manager.delete_storage()

782 

def test_get_publisher_name_publishers_mapping_multi_dois_redis(self):
    '''
    Check that, given several dois and a dictionary representing a publisher's data, the string
    of the publisher's normalised name (and possibly its crossref ID) is returned.
    The processor is built with testing=True.

    Mapping provided: publisher name retrieved + crossref member returned, only if:
    - the doi prefix is a crossref doi prefix,
    - it is present in the mapping,
    - the name of the publisher provided by the datasource corresponds to the mapped one.

    The storage is deleted once, at the very end: the same OpenaireProcessing instance is
    used by all four cases, so it must not be torn down between them.
    '''
    op = OpenaireProcessing(testing=True, publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")

    mapped_doi = "10.1152/sample_doi"  # this prefix is in the mapping and corresponds to American Physiological Society
    other_doi = "10.1153/sample_doi"

    # CASE 1: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of the
    # entity's dois prefixes in the prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
    pub_input_1 = {'name': 'American Physiological Society'}
    pub_output_1 = op.get_publisher_name([mapped_doi, other_doi], pub_input_1)
    self.assertEqual(pub_output_1, "American Physiological Society [crossref:24]")

    # CASE 2: The Publisher Name provided by OPENAIRE does not correspond to the Publisher Name mapped to one of the
    # entity's dois prefixes in the prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: The publisher name provided by Openaire is retrieved without any crossref member
    pub_input_2 = {'name': 'Sample Publisher Name'}
    pub_output_2 = op.get_publisher_name([mapped_doi, other_doi], pub_input_2)
    self.assertEqual(pub_output_2, "Sample Publisher Name")

    # CASE 3: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of the
    # entity's dois prefixes in the prefix-to-publisher-data mapping in input BUT it is not the first doi of the list
    # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
    pub_input_3 = {'name': 'American Physiological Society'}
    pub_output_3 = op.get_publisher_name([other_doi, mapped_doi], pub_input_3)
    self.assertEqual(pub_output_3, "American Physiological Society [crossref:24]")

    # CASE 4: OPENAIRE does not provide a publisher name but one of the entity's DOI prefixes is in the
    # prefix-to-publisher-data mapping in input
    # EXPECTED OUTPUT: empty string
    pub_input_4 = {'name': ''}
    pub_input_4_1 = {}
    pub_input_4_2 = ''

    pub_output_4 = op.get_publisher_name([mapped_doi, other_doi], pub_input_4)
    pub_output_4_1 = op.get_publisher_name([mapped_doi, other_doi], pub_input_4_1)
    pub_output_4_2 = op.get_publisher_name([mapped_doi, other_doi], pub_input_4_2)

    self.assertEqual(pub_output_4, "")
    self.assertEqual(pub_output_4_1, "")
    self.assertEqual(pub_output_4_2, "")

    op.storage_manager.delete_storage()

853 

def test_manage_arxiv_single_id(self):
    '''Exercise manage_arxiv_single_id on entities carrying exactly one identifier.

    Expectations checked below: a non-arxiv doi is handed back untouched; an arxiv
    doi is swapped for the corresponding arxiv id at version v1; an arxiv id with
    no version gets v1 appended; an arxiv id that already carries a version is
    only normalised.'''
    processor = OpenaireProcessing()

    # CASE 1: the unique input id dict in list is a not-arxiv doi : the input list is returned
    plain_doi = [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}]
    plain_doi_result = processor.manage_arxiv_single_id(plain_doi)
    self.assertEqual(plain_doi_result, [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}])

    # CASE 2: the unique input id dict in list is an arxiv doi: the doi is replaced with its correspondent arxiv v1
    arxiv_doi = [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]
    arxiv_doi_result = processor.manage_arxiv_single_id(arxiv_doi)
    self.assertEqual(arxiv_doi_result, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}])

    # CASE 3: the unique input id dict in list is an arxiv id without version:
    # the arxiv id is replaced with its v1
    unversioned_arxiv = [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217', 'valid': None}]
    unversioned_result = processor.manage_arxiv_single_id(unversioned_arxiv)
    self.assertEqual(unversioned_result, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}])

    # CASE 4: the unique input id dict in list is an arxiv id with version: the id is just normalised
    # (the input deliberately contains "..." and is expected to come back with a single dot)
    versioned_arxiv = [{'schema': 'arxiv', 'identifier': 'arxiv:1509...08217v3', 'valid': None}]
    versioned_result = processor.manage_arxiv_single_id(versioned_arxiv)
    self.assertEqual(versioned_result, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v3'}])

    processor.storage_manager.delete_storage()

886 

def test_manage_arxiv_single_id_redis(self):
    '''Exercise manage_arxiv_single_id on entities carrying exactly one identifier,
    with the processor built with testing=True.

    Expectations checked below: a non-arxiv doi is handed back untouched; an arxiv
    doi is swapped for the corresponding arxiv id at version v1; an arxiv id with
    no version gets v1 appended; an arxiv id that already carries a version is
    only normalised.'''
    processor = OpenaireProcessing(testing=True)

    # CASE 1: the unique input id dict in list is a not-arxiv doi : the input list is returned
    plain_doi = [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}]
    plain_doi_result = processor.manage_arxiv_single_id(plain_doi)
    self.assertEqual(plain_doi_result, [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}])

    # CASE 2: the unique input id dict in list is an arxiv doi: the doi is replaced with its correspondent arxiv v1
    arxiv_doi = [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]
    arxiv_doi_result = processor.manage_arxiv_single_id(arxiv_doi)
    self.assertEqual(arxiv_doi_result, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}])

    # CASE 3: the unique input id dict in list is an arxiv id without version:
    # the arxiv id is replaced with its v1
    unversioned_arxiv = [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217', 'valid': None}]
    unversioned_result = processor.manage_arxiv_single_id(unversioned_arxiv)
    self.assertEqual(unversioned_result, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}])

    # CASE 4: the unique input id dict in list is an arxiv id with version: the id is just normalised
    # (the input deliberately contains "..." and is expected to come back with a single dot)
    versioned_arxiv = [{'schema': 'arxiv', 'identifier': 'arxiv:1509...08217v3', 'valid': None}]
    versioned_result = processor.manage_arxiv_single_id(versioned_arxiv)
    self.assertEqual(versioned_result, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v3'}])

    processor.storage_manager.delete_storage()

919 

def test_manage_doi_prefixes_priorities(self):
    '''
    Run manage_doi_prefixes_priorities over the scenarios labelled CASE1 to CASE8_2
    below and compare each returned list with the expected selection.
    '''
    def doi(identifier):
        # Not-yet-validated doi entry, in the dict shape used by every input below.
        return {'schema': 'doi', 'identifier': identifier, 'valid': None}

    def arxiv(identifier):
        # Expected arxiv entry; note that it carries no 'valid' key.
        return {'identifier': identifier, 'schema': 'arxiv'}

    processor = OpenaireProcessing()

    # CASE1: 1 figshare doi (priority 1) with version --> returned as it is
    given_1 = [doi('doi:10.6084/1234.1234v3')]
    got_1 = processor.manage_doi_prefixes_priorities(given_1)
    self.assertEqual(got_1, given_1)

    # CASE2: 1 figshare doi (priority 1) without version --> returned with version v1
    given_2 = [doi('doi:10.6084/1234.1234')]
    wanted_2 = [doi('doi:10.6084/1234.1234v1')]
    got_2 = processor.manage_doi_prefixes_priorities(given_2)
    self.assertEqual(wanted_2, got_2)

    # CASE3: 1 arxiv doi (always without version) --> returned as correspondent arxiv id version v1
    given_3 = [doi('doi:10.48550/1234.1234')]
    got_3 = processor.manage_doi_prefixes_priorities(given_3)
    wanted_3 = [arxiv('arxiv:1234.1234v1')]
    self.assertEqual(wanted_3, got_3)

    # CASE4: >1 arxiv doi or figshare and at least one has version --> return the one(s) with version
    given_4 = [doi('doi:10.48550/1234.1234'), doi('doi:10.6084/5678v3')]
    got_4 = processor.manage_doi_prefixes_priorities(given_4)
    wanted_4 = [doi('doi:10.6084/5678v3')]
    self.assertEqual(wanted_4, got_4)

    # CASE5: >1 arxiv doi or figshare and none has version --> return, as first choice,
    # the arxiv version v1 of the first arxiv doi encountered
    given_5 = [doi('doi:10.6084/5678'), doi('doi:10.48550/1234.1234')]
    got_5 = processor.manage_doi_prefixes_priorities(given_5)
    wanted_5 = [arxiv('arxiv:1234.1234v1')]
    self.assertEqual(wanted_5, got_5)

    # CASE6: >1 figshare dois and none has version --> return version v1 doi of the first figshare doi encountered
    given_6 = [doi('doi:10.6084/5678'), doi('doi:10.6084/1234')]
    got_6 = processor.manage_doi_prefixes_priorities(given_6)
    wanted_6 = [doi('doi:10.6084/5678v1')]
    self.assertEqual(wanted_6, got_6)

    # CASE7: more than one zenodo doi --> return the one with the highest number: it is the last one assigned and thus it
    # is a version doi and not the collector doi (which is the first one to be assigned when a publication is uploaded on zenodo).
    given_7 = [doi('10.5281/zenodo.111'), doi('10.5281/zenodo.112')]
    given_7_1 = [doi('doi:10.5281/zenodo.111'), doi('doi:10.5281/zenodo.112')]
    got_7 = processor.manage_doi_prefixes_priorities(given_7)
    got_7_1 = processor.manage_doi_prefixes_priorities(given_7_1)
    wanted_7 = [doi('10.5281/zenodo.112')]
    wanted_7_1 = [doi('doi:10.5281/zenodo.112')]
    self.assertEqual(wanted_7, got_7)
    self.assertEqual(wanted_7_1, got_7_1)

    # CASE8: None of the previous cases: return the first syntactically valid DOI with highest priority prefix
    given_8 = [
        doi('doi:10.5281/zenodo.111'),
        doi('doi:10.1184/abc'),
        doi('doi:10.25384/efg'),
    ]
    got_8 = processor.manage_doi_prefixes_priorities(given_8)
    wanted_8 = [doi('doi:10.1184/abc')]
    self.assertEqual(wanted_8, got_8)

    # CASE8_1: first syntactically valid DOI with highest priority prefix is returned
    given_8_1 = [
        doi('10.5281/zenodo.4725899'),
        doi('doi:10.1184/abc'),
        doi('doi:10.25384/efg'),
    ]
    got_8_1 = processor.manage_doi_prefixes_priorities(given_8_1)
    wanted_8_1 = [doi('doi:10.1184/abc')]
    self.assertEqual(wanted_8_1, got_8_1)

    # CASE8_2:
    # more valid ids among the ones with a max priority prefix --> return the first one encountered
    given_8_2 = [
        doi('doi:10.5281/zenodo.4725899'),
        doi('doi:10.1184/R1/12841247.v1'),
        doi('doi:10.25384/sage.c.4112909'),
    ]
    got_8_2 = processor.manage_doi_prefixes_priorities(given_8_2)
    wanted_8_2 = [doi('doi:10.1184/R1/12841247.v1')]
    self.assertEqual(wanted_8_2, got_8_2)

    processor.storage_manager.delete_storage()

1004 

def test_manage_doi_prefixes_priorities_redis(self):
    '''
    Run manage_doi_prefixes_priorities, on a processor built with testing=True, over
    the scenarios labelled CASE1 to CASE8_2 below and compare each returned list
    with the expected selection.
    '''
    def doi(identifier):
        # Not-yet-validated doi entry, in the dict shape used by every input below.
        return {'schema': 'doi', 'identifier': identifier, 'valid': None}

    def arxiv(identifier):
        # Expected arxiv entry; note that it carries no 'valid' key.
        return {'identifier': identifier, 'schema': 'arxiv'}

    processor = OpenaireProcessing(testing=True)

    # CASE1: 1 figshare doi (priority 1) with version --> returned as it is
    given_1 = [doi('doi:10.6084/1234.1234v3')]
    got_1 = processor.manage_doi_prefixes_priorities(given_1)
    self.assertEqual(got_1, given_1)

    # CASE2: 1 figshare doi (priority 1) without version --> returned with version v1
    given_2 = [doi('doi:10.6084/1234.1234')]
    wanted_2 = [doi('doi:10.6084/1234.1234v1')]
    got_2 = processor.manage_doi_prefixes_priorities(given_2)
    self.assertEqual(wanted_2, got_2)

    # CASE3: 1 arxiv doi (always without version) --> returned as correspondent arxiv id version v1
    given_3 = [doi('doi:10.48550/1234.1234')]
    got_3 = processor.manage_doi_prefixes_priorities(given_3)
    wanted_3 = [arxiv('arxiv:1234.1234v1')]
    self.assertEqual(wanted_3, got_3)

    # CASE4: >1 arxiv doi or figshare and at least one has version --> return the one(s) with version
    given_4 = [doi('doi:10.48550/1234.1234'), doi('doi:10.6084/5678v3')]
    got_4 = processor.manage_doi_prefixes_priorities(given_4)
    wanted_4 = [doi('doi:10.6084/5678v3')]
    self.assertEqual(wanted_4, got_4)

    # CASE5: >1 arxiv doi or figshare and none has version --> return, as first choice,
    # the arxiv version v1 of the first arxiv doi encountered
    given_5 = [doi('doi:10.6084/5678'), doi('doi:10.48550/1234.1234')]
    got_5 = processor.manage_doi_prefixes_priorities(given_5)
    wanted_5 = [arxiv('arxiv:1234.1234v1')]
    self.assertEqual(wanted_5, got_5)

    # CASE6: >1 figshare dois and none has version --> return version v1 doi of the first figshare doi encountered
    given_6 = [doi('doi:10.6084/5678'), doi('doi:10.6084/1234')]
    got_6 = processor.manage_doi_prefixes_priorities(given_6)
    wanted_6 = [doi('doi:10.6084/5678v1')]
    self.assertEqual(wanted_6, got_6)

    # CASE7: more than one zenodo doi --> return the one with the highest number: it is the last one assigned and thus it
    # is a version doi and not the collector doi (which is the first one to be assigned when a publication is uploaded on zenodo).
    given_7 = [doi('10.5281/zenodo.111'), doi('10.5281/zenodo.112')]
    given_7_1 = [doi('doi:10.5281/zenodo.111'), doi('doi:10.5281/zenodo.112')]
    got_7 = processor.manage_doi_prefixes_priorities(given_7)
    got_7_1 = processor.manage_doi_prefixes_priorities(given_7_1)
    wanted_7 = [doi('10.5281/zenodo.112')]
    wanted_7_1 = [doi('doi:10.5281/zenodo.112')]
    self.assertEqual(wanted_7, got_7)
    self.assertEqual(wanted_7_1, got_7_1)

    # CASE8: None of the previous cases: return the first syntactically valid DOI with highest priority prefix
    given_8 = [
        doi('doi:10.5281/zenodo.111'),
        doi('doi:10.1184/abc'),
        doi('doi:10.25384/efg'),
    ]
    got_8 = processor.manage_doi_prefixes_priorities(given_8)
    wanted_8 = [doi('doi:10.1184/abc')]
    self.assertEqual(wanted_8, got_8)

    # CASE8_1: first syntactically valid DOI with highest priority prefix is returned
    given_8_1 = [
        doi('10.5281/zenodo.4725899'),
        doi('doi:10.1184/abc'),
        doi('doi:10.25384/efg'),
    ]
    got_8_1 = processor.manage_doi_prefixes_priorities(given_8_1)
    wanted_8_1 = [doi('doi:10.1184/abc')]
    self.assertEqual(wanted_8_1, got_8_1)

    # CASE8_2:
    # more valid ids among the ones with a max priority prefix --> return the first one encountered
    given_8_2 = [
        doi('doi:10.5281/zenodo.4725899'),
        doi('doi:10.1184/R1/12841247.v1'),
        doi('doi:10.25384/sage.c.4112909'),
    ]
    got_8_2 = processor.manage_doi_prefixes_priorities(given_8_2)
    wanted_8_2 = [doi('doi:10.1184/R1/12841247.v1')]
    self.assertEqual(wanted_8_2, got_8_2)

    processor.storage_manager.delete_storage()

1089 

def test_to_validated_id_list(self):
    """Exercise ``to_validated_id_list`` on default ``OpenaireProcessing`` instances.

    Each scenario is a dict with the keys ``valid``, ``not_valid`` and
    ``to_be_val``; only ``to_be_val`` is populated here. The list returned by
    ``to_validated_id_list`` is compared with the expected identifiers.

    Every scenario gets its own ``OpenaireProcessing()`` instance and the
    storage of that instance is deleted when the scenario ends. The previous
    version forgot that cleanup for CASE1_2; it is now enforced for every
    scenario through ``try/finally``, also when an assertion fails.
    """
    # NOTE: with the sqlite storage method the storage must not be deleted
    # while the same OpenaireProcessing() instance keeps being used, otherwise
    # the process would try to store data in a file path that has just been
    # deleted, with no new connection created after it. Hence: one new
    # instance per check, storage deleted right after the check.

    def pending(schema, identifier):
        # A single identifier that still has to be validated.
        return {'schema': schema, 'identifier': identifier, 'valid': None}

    def id_dict(*to_be_val):
        # Input with no previously validated / rejected identifiers.
        return {'valid': [], 'not_valid': [], 'to_be_val': list(to_be_val)}

    # (label, input, expected output, whether the output order is checked)
    cases = [
        (
            "CASE1_1: one id to be validated, which is valid",
            id_dict(pending('pmid', 'pmid:20662931')),
            ['pmid:20662931'],
            True,
        ),
        (
            "CASE1_2: one id to be validated, which is invalid",
            id_dict(pending('pmid', 'pmid:abc')),
            [],
            True,
        ),
        (
            "CASE1_3: one id to be validated, which is a valid arxiv doi",
            id_dict(pending('doi', 'doi:10.48550/arXiv.1509.08217')),
            ['arxiv:1509.08217v1'],
            True,
        ),
        (
            "CASE1_4: one id to be validated, which hasn't a valid schema",
            id_dict(pending("0", 'doi:10.48550/arXiv.1509.08217')),
            [],
            True,
        ),
        (
            "CASE1_5: one id to be validated, which is not valid",
            id_dict(pending("doi", 'doi:INVALID/fake')),
            [],
            True,
        ),
        (
            "CASE1_9: one id to be validated, which is a valid PMC",
            id_dict(pending("pmcid", 'pmcid:PMC2873764')),
            ['pmcid:PMC2873764'],
            True,
        ),
        (
            "CASE2_1: two ids to be validated, both valid and with accepted schemas",
            id_dict(
                pending('pmid', 'pmid:20662931'),
                pending('doi', 'doi:10.1007/s12160-011-9282-0'),
            ),
            ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'],
            False,
        ),
        (
            "CASE2_2: two ids to be validated, one of the two is an arxiv id",
            id_dict(
                pending('pmid', 'pmid:20662931'),
                pending('arxiv', 'arxiv:1107.5979'),
            ),
            ['pmid:20662931'],
            True,
        ),
        (
            "CASE2_3: two ids to be validated, one of the two is an arxiv doi",
            id_dict(
                pending('pmid', 'pmid:20662931'),
                pending("doi", 'doi:10.48550/arXiv.1509.08217'),
            ),
            ['pmid:20662931'],
            True,
        ),
        (
            "CASE2_4: two ids to be validated, one of the two is a PMC",
            id_dict(
                pending('pmid', 'pmid:20662931'),
                pending("pmcid", 'pmcid:PMC2873764'),
            ),
            ['pmid:20662931'],
            True,
        ),
        (
            "CASE2_5: a pmid, a doi and a doi with a 'critic' prefix "
            "for opencitations entities management",
            id_dict(
                pending('pmid', 'pmid:20662931'),
                pending('doi', 'doi:10.1007/s12160-011-9282-0'),
                pending('doi', 'doi:10.48550/arXiv.1509.08217'),
            ),
            ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'],
            False,
        ),
        (
            "CASE2_6: a PMCID and a doi with a 'critic' prefix "
            "for opencitations entities management",
            id_dict(
                pending('pmcid', 'pmcid:PMC5555555'),
                pending('doi', 'doi:10.48550/arXiv.1509.08217'),
            ),
            ['pmcid:PMC5555555'],
            True,
        ),
        (
            "CASE2_7: an ARXIV and a doi with a 'critic' prefix "
            "for opencitations ingestion workflow",
            id_dict(
                pending('arxiv', 'arxiv:1107.5979v1'),
                pending('doi', 'doi:10.1184/R1/12841247.v1'),
            ),
            ['arxiv:1107.5979v1'],
            True,
        ),
        (
            "CASE2_8: only dois with 'critic' prefixes "
            "for opencitations ingestion workflow",
            id_dict(
                pending('doi', 'doi:10.5281/zenodo.4725899'),
                pending('doi', 'doi:10.1184/r1/12841247.v1'),
            ),
            ['doi:10.1184/r1/12841247.v1'],
            True,
        ),
        (
            # The original comment spoke of "an already validated id", but the
            # 'valid' list of this input is empty: what the input really holds
            # is two 'critic'-prefix dois, one more doi and an empty entry.
            "CASE3: dois with 'critic' prefixes, one more doi and an empty entry",
            id_dict(
                pending('doi', 'doi:10.5281/zenodo.4725899'),
                pending('doi', 'doi:10.1184/r1/12841247.v1'),
                pending('doi', 'doi:10.7557/5.5607'),
                {},
            ),
            ['doi:10.7557/5.5607'],
            True,
        ),
    ]

    for label, inp, expected, ordered in cases:
        op = OpenaireProcessing()
        try:
            out = op.to_validated_id_list(inp)
            if ordered:
                self.assertEqual(out, expected, msg=label)
            else:
                # Same elements, regardless of their order.
                self.assertCountEqual(out, expected, msg=label)
        finally:
            # Always drop the storage of this instance, so that a failing
            # (or, as CASE1_2 used to be, forgotten) scenario cannot leave
            # data behind for the following ones.
            op.storage_manager.delete_storage()

1251 

def test_to_validated_id_list_redis(self):
    """Run the ``to_validated_id_list`` scenarios on ``OpenaireProcessing(testing=True)``.

    Each scenario builds a fresh ``OpenaireProcessing(testing=True)``
    instance, feeds it a dict whose ``to_be_val`` list holds the identifiers
    to validate, compares the returned list with the expected one and then
    deletes the storage of that instance.
    NOTE(review): judging by the test name, ``testing=True`` selects the
    Redis test configuration -- confirm against ``OpenaireProcessing``.
    """
    # NOTE: in tests using the sqlite storage method the storage must not be
    # deleted while the same OpenaireProcessing() instance is still in use,
    # otherwise data would be stored in a file path that has just been
    # deleted. This test follows the second of the two possible options:
    # a new instance for every check, storage deleted after each check.

    def candidate(schema, identifier):
        # Identifier entry that has not been validated yet.
        return {'schema': schema, 'identifier': identifier, 'valid': None}

    def request(*entries):
        # No identifier is already known as valid or invalid.
        return {'valid': [], 'not_valid': [], 'to_be_val': list(entries)}

    same_list = self.assertEqual
    same_items = self.assertCountEqual  # order of the elements is ignored

    scenarios = (
        # CASE1_1: a single id to be validated, which is valid
        (request(candidate('pmid', 'pmid:20662931')),
         ['pmid:20662931'], same_list),
        # CASE1_2: a single id to be validated, which is invalid
        (request(candidate('pmid', 'pmid:abc')),
         [], same_list),
        # CASE1_3: a single id to be validated, which is a valid arxiv doi
        (request(candidate('doi', 'doi:10.48550/arXiv.1509.08217')),
         ['arxiv:1509.08217v1'], same_list),
        # CASE1_4: a single id to be validated, which hasn't a valid schema
        (request(candidate("0", 'doi:10.48550/arXiv.1509.08217')),
         [], same_list),
        # CASE1_5: a single id to be validated, which is not valid
        (request(candidate("doi", 'doi:INVALID/fake')),
         [], same_list),
        # CASE1_9: a single id to be validated, which is a valid PMC
        (request(candidate("pmcid", 'pmcid:PMC2873764')),
         ['pmcid:PMC2873764'], same_list),
        # CASE2_1: two ids, both valid and with accepted schemas
        (request(candidate('pmid', 'pmid:20662931'),
                 candidate('doi', 'doi:10.1007/s12160-011-9282-0')),
         ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'], same_items),
        # CASE2_2: two ids, one of the two is an arxiv id
        (request(candidate('pmid', 'pmid:20662931'),
                 candidate('arxiv', 'arxiv:1107.5979')),
         ['pmid:20662931'], same_list),
        # CASE2_3: two ids, one of the two is an arxiv doi
        (request(candidate('pmid', 'pmid:20662931'),
                 candidate("doi", 'doi:10.48550/arXiv.1509.08217')),
         ['pmid:20662931'], same_list),
        # CASE2_4: two ids, one of the two is a PMC
        (request(candidate('pmid', 'pmid:20662931'),
                 candidate("pmcid", 'pmcid:PMC2873764')),
         ['pmid:20662931'], same_list),
        # CASE2_5: a pmid, a doi and a doi with a "critic" prefix
        # for opencitations entities management
        (request(candidate('pmid', 'pmid:20662931'),
                 candidate('doi', 'doi:10.1007/s12160-011-9282-0'),
                 candidate('doi', 'doi:10.48550/arXiv.1509.08217')),
         ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'], same_items),
        # CASE2_6: a PMCID and a doi with a "critic" prefix
        # for opencitations entities management
        (request(candidate('pmcid', 'pmcid:PMC5555555'),
                 candidate('doi', 'doi:10.48550/arXiv.1509.08217')),
         ['pmcid:PMC5555555'], same_list),
        # CASE2_7: an ARXIV and a doi with a "critic" prefix for the
        # opencitations ingestion workflow
        (request(candidate('arxiv', 'arxiv:1107.5979v1'),
                 candidate('doi', 'doi:10.1184/R1/12841247.v1')),
         ['arxiv:1107.5979v1'], same_list),
        # CASE2_8: only dois with "critic" prefixes for the opencitations
        # ingestion workflow
        (request(candidate('doi', 'doi:10.5281/zenodo.4725899'),
                 candidate('doi', 'doi:10.1184/r1/12841247.v1')),
         ['doi:10.1184/r1/12841247.v1'], same_list),
        # CASE3: dois with "critic" prefixes, one more doi and an empty entry
        # (the 'valid' list is empty in this input as well)
        (request(candidate('doi', 'doi:10.5281/zenodo.4725899'),
                 candidate('doi', 'doi:10.1184/r1/12841247.v1'),
                 candidate('doi', 'doi:10.7557/5.5607'),
                 {}),
         ['doi:10.7557/5.5607'], same_list),
    )

    for payload, wanted, check in scenarios:
        processor = OpenaireProcessing(testing=True)
        produced = processor.to_validated_id_list(payload)
        check(produced, wanted)
        processor.storage_manager.delete_storage()

1413 

1414 

def test_add_authors_to_agent_list(self):
    """Three creators, the last one with an ORCID, become three author agents.

    ``add_authors_to_agent_list`` is called with an empty list as second
    argument on a default ``OpenaireProcessing`` instance; only the creator
    carrying an ORCID identifier is expected to get an ``orcid`` key.
    """
    expected_agents = [
        {'role': 'author', 'name': 'Carlos Hoyos', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Yaron Oz', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Bom Soo Kim', 'family': '', 'given': '',
         'orcid': 'orcid:0000-0001-6946-5074'},
    ]
    entity = {
        'creator': [
            {'name': 'Carlos Hoyos'},
            {'name': 'Yaron Oz'},
            {
                'identifiers': [
                    {
                        'identifier': '0000-0001-6946-5074',
                        'schema': 'ORCID',
                        'url': 'https://orcid.org/0000-0001-6946-5074',
                    }
                ],
                'name': 'Bom Soo Kim',
            },
        ]
    }

    processor = OpenaireProcessing()
    produced_agents = processor.add_authors_to_agent_list(entity, [])
    self.assertEqual(expected_agents, produced_agents)
    processor.storage_manager.delete_storage()

1422 

1423 

def test_add_authors_to_agent_list_redis(self):
    """Same author-extraction check, on ``OpenaireProcessing(testing=True)``.

    Three creators (the last one with an ORCID identifier) are passed to
    ``add_authors_to_agent_list`` together with an empty list; three author
    agents are expected, the last one carrying the prefixed ORCID.
    """
    orcid_identifier = {
        'identifier': '0000-0001-6946-5074',
        'schema': 'ORCID',
        'url': 'https://orcid.org/0000-0001-6946-5074',
    }
    entity = {
        'creator': [
            {'name': 'Carlos Hoyos'},
            {'name': 'Yaron Oz'},
            {'identifiers': [orcid_identifier], 'name': 'Bom Soo Kim'},
        ]
    }
    expected_agents = [
        {'role': 'author', 'name': 'Carlos Hoyos', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Yaron Oz', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Bom Soo Kim', 'family': '', 'given': '',
         'orcid': 'orcid:0000-0001-6946-5074'},
    ]

    processor = OpenaireProcessing(testing=True)
    produced_agents = processor.add_authors_to_agent_list(entity, [])
    self.assertEqual(expected_agents, produced_agents)
    processor.storage_manager.delete_storage()

1431 

def test_add_authors_to_agent_list_no_creator(self):
    """An entity with an empty ``creator`` list yields an empty agent list."""
    processor = OpenaireProcessing()
    entity_without_creators = {'creator': []}

    produced_agents = processor.add_authors_to_agent_list(entity_without_creators, [])

    # Expected value first, produced value second, as in the sibling tests.
    self.assertEqual([], produced_agents)
    processor.storage_manager.delete_storage()

1439 

1440 

def test_add_authors_to_agent_list_no_creator_redis(self):
    """Empty ``creator`` list on ``OpenaireProcessing(testing=True)``: no agents."""
    processor = OpenaireProcessing(testing=True)
    entity_without_creators = {'creator': []}

    produced_agents = processor.add_authors_to_agent_list(entity_without_creators, [])

    # Expected value first, produced value second, as in the sibling tests.
    self.assertEqual([], produced_agents)
    processor.storage_manager.delete_storage()

1448 

def test_get_agents_strings_list(self):
    """Author agents named "Family, Given" are rendered as plain strings.

    ``get_agents_strings_list`` is expected to return a pair: the list of
    author strings (comma dropped, ORCID appended in square brackets when
    present) and a second list, empty for this input.
    """
    agents = [
        {'role': 'author', 'name': 'Hoyos, Carlos', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Oz, Yaron', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Kim, Bom Soo', 'family': '', 'given': '',
         'orcid': 'orcid:0000-0001-6946-5074'},
    ]
    doi = "doi:10.1007/jhep03(2014)050"
    expected = (
        ['Hoyos Carlos', 'Oz Yaron', 'Kim Bom Soo [orcid:0000-0001-6946-5074]'],
        [],
    )

    processor = OpenaireProcessing()
    produced = processor.get_agents_strings_list(doi, agents)
    self.assertEqual(produced, expected)
    processor.storage_manager.delete_storage()

1456 

def test_get_agents_strings_list_redis(self):
    """``get_agents_strings_list`` check on ``OpenaireProcessing(testing=True)``.

    Three author agents named "Family, Given" (the last one with an ORCID)
    must come back as a pair made of the author strings and an empty list.
    """
    doi = "doi:10.1007/jhep03(2014)050"

    def author(name, **extra):
        # Author agent with empty family / given names, as in the input data.
        agent = {'role': 'author', 'name': name, 'family': '', 'given': ''}
        agent.update(extra)
        return agent

    agents = [
        author('Hoyos, Carlos'),
        author('Oz, Yaron'),
        author('Kim, Bom Soo', orcid='orcid:0000-0001-6946-5074'),
    ]

    processor = OpenaireProcessing(testing=True)
    produced = processor.get_agents_strings_list(doi, agents)
    self.assertEqual(
        produced,
        (['Hoyos Carlos', 'Oz Yaron', 'Kim Bom Soo [orcid:0000-0001-6946-5074]'], []),
    )
    processor.storage_manager.delete_storage()

1464 

def test_find_openaire_orcid(self):
    """Check ``find_openaire_orcid`` with and without pre-filled ORCID storage.

    Three phases, each ending with the deletion of the ORCID manager storage:
    plain lookups on an untouched instance, a lookup after the ORCID has been
    stored as invalid, and a lookup after it has been stored as valid.

    NOTE(review): this body is identical to ``test_find_openaire_orcid_redis``,
    ``testing=True`` included -- confirm whether this variant was meant to use
    the default constructor instead.
    """
    stored_key = "orcid:0000-0001-9759-3938"

    def identifiers(identifier='0000-0001-9759-3938', schema='ORCID'):
        # A fresh single-element identifier list for every call; the url is
        # the same in all the scenarios, the invalid-id one included.
        return [{'identifier': identifier, 'schema': schema,
                 'url': 'https://orcid.org/0000-0001-9759-3938'}]

    # Phase 1: nothing stored beforehand.
    op = OpenaireProcessing(testing=True)
    for lookup, expected in (
        (identifiers(), "orcid:0000-0001-9759-3938"),          # valid ORCID
        (identifiers(schema='fake_schema'), ""),               # wrong schema
        (identifiers(identifier='5500-0001-9759-3938'), ""),   # invalid id
    ):
        self.assertEqual(op.find_openaire_orcid(lookup), expected)
    op.orcid_m.storage_manager.delete_storage()

    # Phase 2: a valid id is marked as invalid in storage, so as to check
    # that the stored information wins and the api check is avoided.
    op = OpenaireProcessing(testing=True)
    op.orcid_m.storage_manager.set_value(stored_key, False)
    self.assertEqual(op.find_openaire_orcid(identifiers()), "")
    op.orcid_m.storage_manager.delete_storage()

    # Phase 3: the same id marked as valid in storage.
    op = OpenaireProcessing(testing=True)
    op.orcid_m.storage_manager.set_value(stored_key, True)
    self.assertEqual(op.find_openaire_orcid(identifiers()), "orcid:0000-0001-9759-3938")
    op.orcid_m.storage_manager.delete_storage()

1502 

1503 

def test_find_openaire_orcid_redis(self):
    """``find_openaire_orcid`` on ``OpenaireProcessing(testing=True)`` instances.

    First the method is called on an instance with untouched storage (valid
    ORCID, wrong schema, invalid identifier); then twice more on new
    instances whose ORCID storage has been pre-filled with ``False`` and
    ``True`` for the same ORCID. The ORCID manager storage is deleted at the
    end of every phase.
    """
    valid_orcid = '0000-0001-9759-3938'
    prefixed_orcid = "orcid:0000-0001-9759-3938"
    orcid_url = 'https://orcid.org/0000-0001-9759-3938'

    # --- untouched storage -------------------------------------------------
    processor = OpenaireProcessing(testing=True)

    found = processor.find_openaire_orcid(
        [{'identifier': valid_orcid, 'schema': 'ORCID', 'url': orcid_url}]
    )
    self.assertEqual(found, prefixed_orcid)

    found_wrong_schema = processor.find_openaire_orcid(
        [{'identifier': valid_orcid, 'schema': 'fake_schema', 'url': orcid_url}]
    )
    self.assertEqual(found_wrong_schema, "")

    found_invalid_id = processor.find_openaire_orcid(
        [{'identifier': '5500-0001-9759-3938', 'schema': 'ORCID', 'url': orcid_url}]
    )
    self.assertEqual(found_invalid_id, "")

    processor.orcid_m.storage_manager.delete_storage()

    # --- storage pre-filled ------------------------------------------------
    # The valid id is first stored as invalid (so as to check that the api
    # check is avoided when the information is already in storage), then as
    # valid; the result must mirror what was stored.
    for stored_flag, expected in ((False, ""), (True, prefixed_orcid)):
        processor = OpenaireProcessing(testing=True)
        processor.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", stored_flag)
        found = processor.find_openaire_orcid(
            [{'identifier': valid_orcid, 'schema': 'ORCID', 'url': orcid_url}]
        )
        self.assertEqual(found, expected)
        processor.orcid_m.storage_manager.delete_storage()

1541 

def test_update_redis_values(self):
    """``update_redis_values`` exposes its two arguments on the instance.

    After the call, ``_redis_values_br`` and ``_redis_values_ra`` must be
    equal to the lists that were passed in.
    """
    processor = OpenaireProcessing(testing=True)
    bibliographic_ids = ["pmid:2", "pmid:3"]
    agent_ids = ["orcid:0000-0003-0530-4305"]

    processor.update_redis_values(bibliographic_ids, agent_ids)

    for attribute, passed_in in (
        ("_redis_values_br", bibliographic_ids),
        ("_redis_values_ra", agent_ids),
    ):
        self.assertEqual(getattr(processor, attribute), passed_in)

1549 

1550 

def test_find_openaire_orcid_with_index(self):
    """Test ORCID validation using ORCID index before API validation.

    An entry "name [orcid:...]" is added to ``orcid_index`` under a test DOI,
    then ``find_openaire_orcid`` is called with and without that DOI for
    ORCIDs that are, respectively, in the index, outside of it, and invalid.
    """
    # Setup
    test_doi = "10.1234/test123"
    test_orcid = "0000-0002-1234-5678"
    test_name = "Smith, John"

    def orcid_input(identifier):
        # Single ORCID identifier, without the optional url field.
        return [{'identifier': identifier, 'schema': 'ORCID'}]

    op = OpenaireProcessing()
    # add_value maps an id string to its value
    op.orcid_index.add_value(test_doi, f"{test_name} [orcid:{test_orcid}]")

    # Case 1: ORCID found in the index of the given DOI ...
    self.assertEqual(
        op.find_openaire_orcid(orcid_input(test_orcid), test_doi),
        f"orcid:{test_orcid}",
    )
    # ... and recorded in the temporary ORCID storage as well.
    self.assertTrue(op.tmp_orcid_m.storage_manager.get_value(f"orcid:{test_orcid}"))

    # Case 2: ORCID not in the index but expected to be accepted anyway
    # (via the API, according to the original author).
    self.assertEqual(
        op.find_openaire_orcid(orcid_input('0000-0003-4082-1500'), test_doi),
        "orcid:0000-0003-4082-1500",
    )

    # Case 3: ORCID not in the index and invalid.
    self.assertEqual(
        op.find_openaire_orcid(orcid_input('0000-0000-0000-0000'), test_doi),
        "",
    )

    # Case 4: the indexed ORCID again, this time without passing any DOI.
    # NOTE(review): the original comment says this validates via the API;
    # the value stored during case 1 may be what answers here -- confirm.
    self.assertEqual(
        op.find_openaire_orcid(orcid_input(test_orcid)),
        f"orcid:{test_orcid}",
    )

    # Cleanup
    op.storage_manager.delete_storage()

1591 

1592 

def test_validated_as_with_storage_manager(storage_manager):
    """``validated_as`` must report what the DOI storage already knows.

    ``storage_manager`` is handed over to ``OpenaireProcessing`` as is
    (presumably it is supplied by a pytest fixture -- confirm). One DOI is
    stored as valid and one as invalid; ``validated_as`` must return exactly
    ``True`` / ``False`` for them and ``None`` for a DOI that was never stored.
    """
    stored_as_valid = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    stored_as_invalid = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}
    never_stored = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}

    processor = OpenaireProcessing(storage_manager=storage_manager, testing=True)

    # Pre-fill the DOI storage with the two known outcomes.
    for entity, outcome in ((stored_as_valid, True), (stored_as_invalid, False)):
        processor.doi_m.storage_manager.set_value(entity["identifier"], outcome)

    # Identity checks: a merely truthy / falsy answer is not enough.
    for entity, outcome in (
        (stored_as_valid, True),
        (stored_as_invalid, False),
        (never_stored, None),
    ):
        assert processor.validated_as(entity) is outcome

1605 

1606 

# Run the tests through unittest's command-line runner when this file is
# executed directly as a script.
if __name__ == '__main__':
    unittest.main()