Coverage for test / processing_oroci_test.py: 99%

879 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it> 

2# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it> 

3# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

4# 

5# SPDX-License-Identifier: ISC 

6 

7import os 

8import unittest 

9 

10from oc_ds_converter.lib.jsonmanager import * 

11from oc_ds_converter.openaire.openaire_processing import OpenaireProcessing 

12# 

13 

# Locations of the test fixtures and of the outputs produced by the tests.
# Everything lives under test/openaire_processing.
BASE = os.path.join('test', 'openaire_processing')
# Single sample input file used by the processing tests.
DATA = os.path.join(BASE, 'jSonFile_1.json')
DATA_DIR = BASE
# Scratch directory for temporary support material (caches, memo files).
TMP_SUPPORT_MATERIAL = os.path.join(BASE, "tmp_support")
# Where the meta-input CSV tables are written.
OUTPUT = os.path.join(BASE, 'meta_input')
# Output directory used by the multiprocess variant of the tests.
MULTIPROCESS_OUTPUT = os.path.join(BASE, 'multi_process_test')
# Cache file written/removed by test_dict_to_cache.
MEMO_JSON_PATH = "test/openaire_processing/tmp_support/memo.json"

21SAMPLE_ENTITY = {'collectedFrom': [{'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::081b82f96300b6a6e3d282bad31cb6e2', 'schema': 'DNET Identifier'}], 'name': 'Crossref'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::8ac8380272269217cb09a928c8caa993', 'schema': 'DNET Identifier'}], 'name': 'UnpayWall'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::806360c771262b4d6770e7cdf04b5c5a', 'schema': 'DNET Identifier'}], 'name': 'ORCID'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a', 'schema': 'DNET Identifier'}], 'name': 'Microsoft Academic Graph'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::9e3be59865b2c1c335d32dae2fe7b254', 'schema': 'DNET Identifier'}], 'name': 'Datacite'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::6f4922f45568161a8cdf4ad2299f6d23', 'schema': 'DNET Identifier'}], 'name': 'arXiv.org e-Print Archive'}, 'provisionMode': 'collected'}], 'creator': [{'name': 'Matteo Serra'}, {'name': 'Salvatore Mignemi'}, {'identifiers': [{'identifier': '0000-0001-5595-7537', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-5595-7537'}], 'name': 'Mariano Cadoni'}], 'dnetIdentifier': '50|doi_dedup___::41074cd388749ccbdb6668caaf059f4a', 'identifier': [{'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi', 'url': 'https://doi.org/10.1103/physrevd.84.084046'}, {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'}, {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi', 'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'}, {'identifier': '1107.5979', 'schema': 'arXiv', 
'url': 'http://arxiv.org/abs/1107.5979'}], 'objectSubType': 'Article', 'objectType': 'publication', 'publicationDate': '2011-10-21', 'publisher': [{'name': 'American Physical Society (APS)'}], 'title': 'Exact solutions with AdS asymptotics of Einstein and Einstein-Maxwell gravity minimally coupled to a scalar field'} 

22SAMPLE_ENT2 = {"identifier":"000017d2c913b28e09291b811ce3609a","linkprovider":[{"identifiers":[{"identifier":"10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73","schema":"DNET Identifier"}],"name":"CORE (RIOXX-UK Aggregator)"},{"identifiers":[{"identifier":"10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357","schema":"DNET Identifier"}],"name":"PubMed Central"},{"identifiers":[{"identifier":"10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c","schema":"DNET Identifier"}],"name":"Europe PubMed Central"},{"identifiers":[{"identifier":"10|opendoar____::229754d7799160502a143a72f6789927","schema":"DNET Identifier"}],"name":"Publications at Bielefeld University"}],"publicationDate":"2014-02-01","publisher":[{"name":"Springer Nature"}],"relationship":{"inverse":"IsCitedBy","name":"Cites","schema":"datacite"},"source":{"collectedFrom":[{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73","schema":"DNET Identifier"}],"name":"CORE (RIOXX-UK Aggregator)"},"provisionMode":"collected"},{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357","schema":"DNET Identifier"}],"name":"PubMed Central"},"provisionMode":"collected"},{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c","schema":"DNET Identifier"}],"name":"Europe PubMed Central"},"provisionMode":"collected"},{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|opendoar____::229754d7799160502a143a72f6789927","schema":"DNET Identifier"}],"name":"Publications at Bielefeld University"},"provisionMode":"collected"}],"creator":[{"identifiers":[{"identifier":"0000-0002-6491-0754","schema":"ORCID","url":"https://orcid.org/0000-0002-6491-0754"}],"name":"Sattler, Sebastian"},{"name":"Mehlkop, Guido"},{"name":"Graeff, 
Peter"},{"identifiers":[{"identifier":"0000-0002-8090-6886","schema":"ORCID","url":"https://orcid.org/0000-0002-8090-6886"}],"name":"Sauer, Carsten"}],"dnetIdentifier":"50|pmid_dedup__::8936076da7a86820c24ede7ca3ff15b3","identifier":[{"identifier":"PMC3928621","schema":"pmc","url":"http://europepmc.org/articles/PMC3928621"},{"identifier":"24484640","schema":"pmid"},{"identifier":"24484640","schema":"pmid","url":"https://pubmed.ncbi.nlm.nih.gov/24484640"},{"identifier":"PMC3928621","schema":"pmc"}],"objectSubType":"Article","objectType":"publication","publicationDate":"2014-02-01","publisher":[{"name":"Springer Nature"}],"title":"Evaluating the drivers of and obstacles to the willingness to use cognitive enhancement drugs: the influence of drug characteristics, social environment, and personal characteristics"},"target":{"collectedFrom":[{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2","schema":"DNET Identifier"}],"name":"Crossref"},"provisionMode":"collected"},{"completionStatus":"complete","provider":{"identifiers":[{"identifier":"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a","schema":"DNET Identifier"}],"name":"Microsoft Academic Graph"},"provisionMode":"collected"}],"creator":[{"name":"Harold G. Grasmick"},{"name":"Robert J. Bursik"}],"dnetIdentifier":"50|doi_________::816648c63de74835ec2b0a753a68f037","identifier":[{"identifier":"10.2307/3053861","schema":"doi","url":"https://doi.org/10.2307/3053861"}],"objectSubType":"Article","objectType":"publication","publicationDate":"1990-01-01","publisher":[{"name":"JSTOR"}],"title":"Conscience, significant others, and rational choice: Extending the deterrence model."}} 

23SAMPLE_ENTITY_FOR_CSV_CREATOR = {'collectedFrom': [{'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73', 'schema': 'DNET Identifier'}], 'name': 'CORE (RIOXX-UK Aggregator)'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357', 'schema': 'DNET Identifier'}], 'name': 'PubMed Central'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c', 'schema': 'DNET Identifier'}], 'name': 'Europe PubMed Central'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|driver______::bee53aa31dc2cbb538c10c2b65fa5824', 'schema': 'DNET Identifier'}], 'name': 'DOAJ-Articles'}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::566a9968b43628588e76be5a85a0f9e8', 'schema': 'DNET Identifier'}], 'name': "King's Research Portal"}, 'provisionMode': 'collected'}, {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::c2cdfa5866e03cdd07d313cbc8fb8311', 'schema': 'DNET Identifier'}], 'name': 'Multidisciplinary Digital Publishing Institute'}, 'provisionMode': 'collected'}], 'creator': [{'name': 'Smith, Lee'}, {'name': 'Sawyer, Alexia'}, {'name': 'Gardner, Benjamin'}, {'name': 'Seppala, Katri'}, {'name': 'Ucci, Marcella'}, {'name': 'Marmot, Alexi'}, {'name': 'Lally, Pippa'}, {'name': 'Fisher, Abi'}], 'dnetIdentifier': '50|pmid_dedup__::a1a8687c2378a0d68314566dec29dafb', 'objectSubType': 'Article', 'objectType': 'publication', 'publicationDate': '2018-06-09', 'publisher': [{'name': 'MDPI'}], 'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study', 'identifier': {'valid': 
[], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:29890726', 'valid': None}]}, "redis_validity_lists":[[],[]]} 

24 

25 

26class TestOpenaireProcessing(unittest.TestCase): 

27 

28 def delete_storege(self, storage_type=None, specific_path=None): 

29 if not specific_path: 

30 if storage_type == "sqlite": 

31 auto_db_created_path = os.path.join(os.getcwd(), "storage", "id_valid_dict.db") 

32 auto_db_created_path = auto_db_created_path if os.path.exists(auto_db_created_path) else auto_db_created_path+"?mode=rw" 

33 if os.path.exists(auto_db_created_path): 

34 os.remove(auto_db_created_path) 

35 else: 

36 auto_db_created_path = os.path.join(os.getcwd(), "storage", "id_value.json") 

37 if os.path.exists(auto_db_created_path): 

38 os.remove(auto_db_created_path) 

39 elif specific_path: 

40 if os.path.exists(specific_path): 

41 os.remove(specific_path) 

42 

43 def test_get_all_ids(self): 

44 opp = OpenaireProcessing() 

45 allids = opp.extract_all_ids(SAMPLE_ENT2) 

46 self.assertCountEqual(['pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'], allids[0]) 

47 self.assertCountEqual(['orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'], allids[1]) 

48 

49 opp.storage_manager.delete_storage() 

50 

51 def test_get_all_ids_redis(self): 

52 opp = OpenaireProcessing(testing=True) 

53 allids = opp.extract_all_ids(SAMPLE_ENT2) 

54 self.assertCountEqual(['pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'], allids[0]) 

55 self.assertCountEqual(['orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'], allids[1]) 

56 opp.storage_manager.delete_storage() 

57 

58 def test_get_redis_validity_list(self): 

59 br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'} 

60 ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'} 

61 

62 opp = OpenaireProcessing() 

63 br_valid_list = opp.get_redis_validity_list(br, "br") 

64 exp_exp_br_valid_list = [] 

65 ra_valid_list = opp.get_redis_validity_list(ra, "ra") 

66 exp_exp_ra_valid_list = [] 

67 self.assertEqual(ra_valid_list, exp_exp_ra_valid_list) 

68 self.assertEqual(br_valid_list, exp_exp_br_valid_list) 

69 

70 opp.storage_manager.delete_storage() 

71 

72 def test_get_redis_validity_list_redis(self): 

73 br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'} 

74 ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'} 

75 

76 opp = OpenaireProcessing(testing=True) 

77 br_valid_list = opp.get_redis_validity_list(br, "br") 

78 exp_exp_br_valid_list = [] 

79 ra_valid_list = opp.get_redis_validity_list(ra, "ra") 

80 exp_exp_ra_valid_list = [] 

81 self.assertEqual(ra_valid_list, exp_exp_ra_valid_list) 

82 self.assertEqual(br_valid_list, exp_exp_br_valid_list) 

83 opp.storage_manager.delete_storage() 

84 

85 def test_get_reids_validity_dict_w_fakeredis_db_values_sqlite(self): 

86 opp = OpenaireProcessing() 

87 opp.BR_redis.sadd('pmid:24484640', "omid:1") 

88 opp.RA_redis.sadd('orcid:0000-0002-8090-6886', "omid:2") 

89 

90 br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'} 

91 ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'} 

92 

93 br_validity_dict = opp.get_redis_validity_list(br, "br") 

94 exp_br_valid_list = ["pmid:24484640"] 

95 ra_validity_dict = opp.get_redis_validity_list(ra, "ra") 

96 exp_ra_valid_list = ['orcid:0000-0002-8090-6886'] 

97 self.assertEqual(br_validity_dict, exp_br_valid_list) 

98 self.assertEqual(ra_validity_dict, exp_ra_valid_list) 

99 

100 opp.storage_manager.delete_storage() 

101 

102 opp.BR_redis.delete('pmid:24484640') 

103 opp.BR_redis.delete('pmcid:PMC3928621') 

104 opp.RA_redis.delete('orcid:0000-0002-8090-6886') 

105 

106 def test_get_reids_validity_dict_w_fakeredis_db_values_redis(self): 

107 opp = OpenaireProcessing(testing=True) 

108 opp.BR_redis.sadd('pmid:24484640', "omid:1") 

109 opp.RA_redis.sadd('orcid:0000-0002-8090-6886', "omid:2") 

110 

111 

112 br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'} 

113 ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'} 

114 

115 br_validity_dict = opp.get_redis_validity_list(br, "br") 

116 exp_br_valid_list = ["pmid:24484640"] 

117 ra_validity_dict = opp.get_redis_validity_list(ra, "ra") 

118 exp_ra_valid_list = ['orcid:0000-0002-8090-6886'] 

119 self.assertEqual(br_validity_dict, exp_br_valid_list) 

120 self.assertEqual(ra_validity_dict, exp_ra_valid_list) 

121 

122 opp.storage_manager.delete_storage() 

123 opp.BR_redis.delete('pmid:24484640') 

124 opp.BR_redis.delete('pmcid:PMC3928621') 

125 opp.RA_redis.delete('orcid:0000-0002-8090-6886') 

126 

127 def test_validated_as_default(self): 

128 """ 

129 Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value: 

130 string of the identifier, the method "validated_as" returns: 

131 - True if the id was already validated as valid 

132 - False if the id was already validated as invalid 

133 - None if the id was not validated before 

134 The procedure is tested 

135 - With default storage manager (sqlite) without a pre-existent db associated 

136 """ 

137 

138 opp = OpenaireProcessing() 

139 validate_as_none = opp.validated_as({"schema":"pmid", "identifier": "pmid:23483834"}) 

140 self.assertEqual(validate_as_none, None) 

141 opp.storage_manager.delete_storage() 

142 

143 def test_validated_as_default_redis(self): 

144 ''' 

145 Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value: 

146 string of the identifier, the method "validated_as" returns: 

147 - True if the id was already validated as valid 

148 - False if the id was already validated as invalid 

149 - None if the id was not validated before 

150 The procedure is tested 

151 - With redis storage manager without a pre-existent db associated 

152 ''' 

153 

154 opp = OpenaireProcessing(testing=True) 

155 validate_as_none = opp.validated_as({"schema":"pmid", "identifier": "pmid:23483834"}) 

156 self.assertEqual(validate_as_none, None) 

157 opp.storage_manager.delete_storage() 

158 

159 def test_validated_as_redis_with_preexistent_data(self): 

160 ''' 

161 Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value: 

162 string of the identifier, the method "validated_as" returns: 

163 - True if the id was already validated as valid 

164 - False if the id was already validated as invalid 

165 - None if the id was not validated before 

166 The procedure is tested 

167 - With redis storage manager and pre-existent data associated 

168 ''' 

169 valid_pmid_not_in_db = {"identifier":"pmid:2938", "schema":"pmid"} 

170 valid_pmid_in_db = {"identifier":"pmid:23483834", "schema":"pmid"} 

171 invalid_pmid_in_db = {"identifier":"pmid:18328387372097", "schema":"pmid"} 

172 

173 # New class instance and set values directly on the id managers' storage_manager 

174 opp_redis = OpenaireProcessing(testing=True) 

175 opp_redis.pmid_m.storage_manager.set_value(valid_pmid_in_db["identifier"], True) 

176 opp_redis.pmid_m.storage_manager.set_value(invalid_pmid_in_db["identifier"], False) 

177 validated_as_True = opp_redis.validated_as(valid_pmid_in_db) 

178 validated_as_False = opp_redis.validated_as(invalid_pmid_in_db) 

179 not_validated = opp_redis.validated_as(valid_pmid_not_in_db) 

180 

181 self.assertEqual(validated_as_True, True) 

182 self.assertEqual(validated_as_False, False) 

183 self.assertEqual(not_validated, None) 

184 

185 opp_redis.pmid_m.storage_manager.delete_storage() 

186 

187 

188 def test_validated_as_inmemory(self): 

189 ''' 

190 Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value: 

191 string of the identifier, the method "validated_as" returns: 

192 - True if the id was already validated as valid 

193 - False if the id was already validated as invalid 

194 - None if the id was not validated before 

195 The procedure is tested 

196 - With in Memory + Json storage manager and a pre-existent db associated 

197 - With in Memory + Json storage manager without a pre-existent db associated 

198 ''' 

199 

200 valid_pmid_not_in_db = {"identifier":"pmid:2938", "schema":"pmid"} 

201 valid_pmid_in_db = {"identifier":"pmid:23483834", "schema":"pmid"} 

202 invalid_pmid_in_db = {"identifier":"pmid:18328387372097", "schema":"pmid"} 

203 

204 # New class instance and set values directly on the id managers' storage_manager 

205 opp_sql = OpenaireProcessing(testing=True) 

206 opp_sql.pmid_m.storage_manager.set_value(valid_pmid_in_db["identifier"], True) 

207 opp_sql.pmid_m.storage_manager.set_value(invalid_pmid_in_db["identifier"], False) 

208 validated_as_True = opp_sql.validated_as(valid_pmid_in_db) 

209 validated_as_False = opp_sql.validated_as(invalid_pmid_in_db) 

210 not_validated = opp_sql.validated_as(valid_pmid_not_in_db) 

211 

212 self.assertEqual(validated_as_True, True) 

213 self.assertEqual(validated_as_False, False) 

214 self.assertEqual(not_validated, None) 

215 

216 opp_sql.pmid_m.storage_manager.delete_storage() 

217 

218 

219 def test_validated_as_redis(self): 

220 ''' 

221 Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value: 

222 string of the identifier, the method "validated_as" returns: 

223 - True if the id was already validated as valid 

224 - False if the id was already validated as invalid 

225 - None if the id was not validated before 

226 The procedure is tested 

227 - With REDIS storage manager and a pre-existent db associated 

228 - With REDIS storage manager without a pre-existent db associated 

229 ''' 

230 

231 valid_pmid_not_in_db = {"identifier":"pmid:2938", "schema":"pmid"} 

232 valid_pmid_in_db = {"identifier":"pmid:23483834", "schema":"pmid"} 

233 invalid_pmid_in_db = {"identifier":"pmid:18328387372097", "schema":"pmid"} 

234 

235 # New class instance and set values directly on the id managers' storage_manager 

236 opp_redis = OpenaireProcessing(testing=True) 

237 opp_redis.pmid_m.storage_manager.set_value(valid_pmid_in_db["identifier"], True) 

238 opp_redis.pmid_m.storage_manager.set_value(invalid_pmid_in_db["identifier"], False) 

239 validated_as_True = opp_redis.validated_as(valid_pmid_in_db) 

240 validated_as_False = opp_redis.validated_as(invalid_pmid_in_db) 

241 not_validated = opp_redis.validated_as(valid_pmid_not_in_db) 

242 

243 self.assertEqual(validated_as_True, True) 

244 self.assertEqual(validated_as_False, False) 

245 self.assertEqual(not_validated, None) 

246 opp_redis.pmid_m.storage_manager.delete_storage() 

247 

248 def test_get_id_manager(self): 

249 """Check that, given in input the string of a schema (e.g.:'pmid') or an id with a prefix (e.g.: 'pmid:12334') 

250 and a dictionary mapping the strings of the schemas to their id managers, the method returns the correct 

251 id manager. Note that each instance of the Preprocessing class needs its own instances of the id managers, 

252 in order to avoid conflicts while validating data""" 

253 

254 op = OpenaireProcessing() 

255 id_man_dict = op._id_man_dict 

256 

257 pmid_id = "pmid:12345" 

258 pmid_string = "pmid" 

259 pmid_man_exp = op.get_id_manager(pmid_id, id_man_dict) 

260 pmid_man_exp_2 = op.get_id_manager(pmid_string, id_man_dict) 

261 

262 #check that the idmanager for the pmid was returned and that it works as expected 

263 self.assertTrue(pmid_man_exp.is_valid(pmid_id)) 

264 self.assertTrue(pmid_man_exp_2.is_valid(pmid_id)) 

265 

266 doi_id = "doi:10.1103/physrevd.84.084046" 

267 doi_string = "doi" 

268 doi_man_exp = op.get_id_manager(doi_id, id_man_dict) 

269 doi_man_exp_2 = op.get_id_manager(doi_string, id_man_dict) 

270 

271 #check that the idmanager for the doi was returned and that it works as expected 

272 self.assertTrue(doi_man_exp.is_valid(doi_id)) 

273 self.assertTrue(doi_man_exp_2.is_valid(doi_id)) 

274 

275 pmc_id = "pmcid:PMC5555555" 

276 pmc_string = "pmcid" 

277 pmc_man_exp = op.get_id_manager(pmc_id, id_man_dict) 

278 pmc_man_exp_2 = op.get_id_manager(pmc_string, id_man_dict) 

279 

280 #check that the idmanager for the pmc was returned and that it works as expected 

281 self.assertTrue(pmc_man_exp.is_valid(pmc_id)) 

282 self.assertTrue(pmc_man_exp_2.is_valid(pmc_id)) 

283 

284 arxiv_id = "arxiv:1509.08217" 

285 arxiv_string = "arxiv" 

286 arxiv_man_exp = op.get_id_manager(arxiv_id, id_man_dict) 

287 arxiv_man_exp_2 = op.get_id_manager(arxiv_string, id_man_dict) 

288 

289 #check that the idmanager for the arxiv was returned and that it works as expected 

290 self.assertTrue(arxiv_man_exp.is_valid(arxiv_id)) 

291 self.assertTrue(arxiv_man_exp_2.is_valid(arxiv_id)) 

292 

293 op.storage_manager.delete_storage() 

294 

295 def test_get_id_manager_redis(self): 

296 """Check that, given in input the string of a schema (e.g.:'pmid') or an id with a prefix (e.g.: 'pmid:12334') 

297 and a dictionary mapping the strings of the schemas to their id managers, the method returns the correct 

298 id manager. Note that each instance of the Preprocessing class needs its own instances of the id managers, 

299 in order to avoid conflicts while validating data""" 

300 

301 op = OpenaireProcessing(testing=True) 

302 id_man_dict = op._id_man_dict 

303 

304 pmid_id = "pmid:12345" 

305 pmid_string = "pmid" 

306 pmid_man_exp = op.get_id_manager(pmid_id, id_man_dict) 

307 pmid_man_exp_2 = op.get_id_manager(pmid_string, id_man_dict) 

308 

309 #check that the idmanager for the pmid was returned and that it works as expected 

310 self.assertTrue(pmid_man_exp.is_valid(pmid_id)) 

311 self.assertTrue(pmid_man_exp_2.is_valid(pmid_id)) 

312 

313 doi_id = "doi:10.1103/physrevd.84.084046" 

314 doi_string = "doi" 

315 doi_man_exp = op.get_id_manager(doi_id, id_man_dict) 

316 doi_man_exp_2 = op.get_id_manager(doi_string, id_man_dict) 

317 

318 #check that the idmanager for the doi was returned and that it works as expected 

319 self.assertTrue(doi_man_exp.is_valid(doi_id)) 

320 self.assertTrue(doi_man_exp_2.is_valid(doi_id)) 

321 

322 pmc_id = "pmcid:PMC5555555" 

323 pmc_string = "pmcid" 

324 pmc_man_exp = op.get_id_manager(pmc_id, id_man_dict) 

325 pmc_man_exp_2 = op.get_id_manager(pmc_string, id_man_dict) 

326 

327 #check that the idmanager for the pmc was returned and that it works as expected 

328 self.assertTrue(pmc_man_exp.is_valid(pmc_id)) 

329 self.assertTrue(pmc_man_exp_2.is_valid(pmc_id)) 

330 

331 arxiv_id = "arxiv:1509.08217" 

332 arxiv_string = "arxiv" 

333 arxiv_man_exp = op.get_id_manager(arxiv_id, id_man_dict) 

334 arxiv_man_exp_2 = op.get_id_manager(arxiv_string, id_man_dict) 

335 

336 #check that the idmanager for the arxiv was returned and that it works as expected 

337 self.assertTrue(arxiv_man_exp.is_valid(arxiv_id)) 

338 self.assertTrue(arxiv_man_exp_2.is_valid(arxiv_id)) 

339 

340 op.storage_manager.delete_storage() 

341 

342 

343 def test_normalise_any_id(self): 

344 ''' 

345 Check that, given an id with a prefix, any doi, pmid, pmcid and arxiv id is correctly normalised 

346 ''' 

347 op = OpenaireProcessing() 

348 

349 pmid_id = "pmid:12345" 

350 doi_id = "doi:10.1103/physrevd.84.084046" 

351 arxiv_id = "arxiv:1509.08217" 

352 pmc_id = "pmcid:PMC5555555" 

353 

354 self.assertEqual(pmid_id, op.normalise_any_id(pmid_id+"abc")) 

355 self.assertEqual(doi_id, op.normalise_any_id("doi:" + doi_id.split(":")[1].upper())) 

356 self.assertEqual(arxiv_id + "v1", op.normalise_any_id(arxiv_id.replace(".", "...."))) 

357 self.assertEqual(pmc_id, op.normalise_any_id(pmc_id+" ")) 

358 

359 op.storage_manager.delete_storage() 

360 

361 def test_normalise_any_id_redis(self): 

362 ''' 

363 Check that, given an id with a prefix, any doi, pmid, pmcid and arxiv id is correctly normalised 

364 ''' 

365 op = OpenaireProcessing(testing=True) 

366 

367 pmid_id = "pmid:12345" 

368 doi_id = "doi:10.1103/physrevd.84.084046" 

369 arxiv_id = "arxiv:1509.08217" 

370 pmc_id = "pmcid:PMC5555555" 

371 

372 self.assertEqual(pmid_id, op.normalise_any_id(pmid_id+"abc")) 

373 self.assertEqual(doi_id, op.normalise_any_id("doi:" + doi_id.split(":")[1].upper())) 

374 self.assertEqual(arxiv_id + "v1", op.normalise_any_id(arxiv_id.replace(".", "...."))) 

375 self.assertEqual(pmc_id, op.normalise_any_id(pmc_id+" ")) 

376 

377 op.storage_manager.delete_storage() 

378 

379 def test_get_norm_ids(self): 

380 ''' 

381 Check that, given a list of dictionaries representing the ids of an entity, the method returns a reduced version 

382 of the same list, containing only the normalised version of the ids of the schemas managed by opencitations. 

383 Each reduced dictionary only contains two key-value pairs, i.e.: "identifier" and "schema". 

384 ''' 

385 op = OpenaireProcessing() 

386 

387 list_of_ids_to_norm_with_duplicates = [ 

388 {'identifier': '10.1103/PHYSREVD.84.084046', 'schema': 'doi', 

389 'url': 'https://doi.org/10.1103/physrevd.84.084046'}, 

390 {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'}, 

391 {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi', 

392 'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'}, 

393 {'identifier': '1107.5979', 'schema': 'arXiv', 'url': 'http://arxiv.org/abs/1107.5979'}] 

394 norm_ids = op.get_norm_ids(list_of_ids_to_norm_with_duplicates) 

395 exp_norm_ids = [{'identifier': 'doi:10.1103/physrevd.84.084046', 'schema': 'doi'}, 

396 {'identifier': 'doi:10.48550/arxiv.1107.5979', 'schema': 'doi'}, 

397 {'identifier': 'arxiv:1107.5979v1', 'schema': 'arxiv'}] 

398 

399 list_of_ids_w_not_managed_schema = [ 

400 {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'}, 

401 {'identifier': '21887584', 'schema': 'pmid', 'url': 'https://pubmed.ncbi.nlm.nih.gov/21887584'}, 

402 {'identifier': '10.1007/s12160-011-9282-0', 'schema': 'doi','url': 'https://doi.org/10.1007/s12160-011-9282-0'}] 

403 norm_ids_2 = op.get_norm_ids(list_of_ids_w_not_managed_schema) 

404 exp_norm_ids_2 = [{'identifier': 'pmid:21887584', 'schema': 'pmid'}, 

405 {'identifier': 'doi:10.1007/s12160-011-9282-0', 'schema': 'doi'}] 

406 

407 list_of_ids_not_managed_and_not_normalisable_only = [ 

408 {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'}, 

409 {'identifier': '20.ABC/s12160-011-9282-FAKEID', 'schema': 'doi','url': 'https://doi.org/10.1007/s12160-011-9282-0'}] 

410 norm_ids_3 = op.get_norm_ids(list_of_ids_not_managed_and_not_normalisable_only) 

411 exp_norm_ids_3 = [] 

412 

413 self.assertEqual(norm_ids, exp_norm_ids) 

414 self.assertEqual(norm_ids_2, exp_norm_ids_2) 

415 self.assertEqual(norm_ids_3, exp_norm_ids_3) 

416 op.storage_manager.delete_storage() 

417 

418 

419 def test_get_norm_ids_redis(self): 

420 ''' 

421 Check that, given a list of dictionaries representing the ids of an entity, the method returns a reduced version 

422 of the same list, containing only the normalised version of the ids of the schemas managed by opencitations. 

423 Each reduced dictionary only contains two key-value pairs, i.e.: "identifier" and "schema". 

424 ''' 

425 op = OpenaireProcessing(testing=True) 

426 

427 list_of_ids_to_norm_with_duplicates = [ 

428 {'identifier': '10.1103/PHYSREVD.84.084046', 'schema': 'doi', 

429 'url': 'https://doi.org/10.1103/physrevd.84.084046'}, 

430 {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'}, 

431 {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi', 

432 'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'}, 

433 {'identifier': '1107.5979', 'schema': 'arXiv', 'url': 'http://arxiv.org/abs/1107.5979'}] 

434 norm_ids = op.get_norm_ids(list_of_ids_to_norm_with_duplicates) 

435 exp_norm_ids = [{'identifier': 'doi:10.1103/physrevd.84.084046', 'schema': 'doi'}, 

436 {'identifier': 'doi:10.48550/arxiv.1107.5979', 'schema': 'doi'}, 

437 {'identifier': 'arxiv:1107.5979v1', 'schema': 'arxiv'}] 

438 

439 list_of_ids_w_not_managed_schema = [ 

440 {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'}, 

441 {'identifier': '21887584', 'schema': 'pmid', 'url': 'https://pubmed.ncbi.nlm.nih.gov/21887584'}, 

442 {'identifier': '10.1007/s12160-011-9282-0', 'schema': 'doi','url': 'https://doi.org/10.1007/s12160-011-9282-0'}] 

443 norm_ids_2 = op.get_norm_ids(list_of_ids_w_not_managed_schema) 

444 exp_norm_ids_2 = [{'identifier': 'pmid:21887584', 'schema': 'pmid'}, 

445 {'identifier': 'doi:10.1007/s12160-011-9282-0', 'schema': 'doi'}] 

446 

447 list_of_ids_not_managed_and_not_normalisable_only = [ 

448 {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'}, 

449 {'identifier': '20.ABC/s12160-011-9282-FAKEID', 'schema': 'doi','url': 'https://doi.org/10.1007/s12160-011-9282-0'}] 

450 norm_ids_3 = op.get_norm_ids(list_of_ids_not_managed_and_not_normalisable_only) 

451 exp_norm_ids_3 = [] 

452 

453 self.assertEqual(norm_ids, exp_norm_ids) 

454 self.assertEqual(norm_ids_2, exp_norm_ids_2) 

455 self.assertEqual(norm_ids_3, exp_norm_ids_3) 

456 op.storage_manager.delete_storage() 

457 

458 def test_dict_to_cache(self): 

459 op = OpenaireProcessing() 

460 sample_dict = {"dict_type": "sample"} 

461 if os.path.exists(MEMO_JSON_PATH): 

462 os.remove(MEMO_JSON_PATH) 

463 self.assertFalse(os.path.exists(MEMO_JSON_PATH)) 

464 op.dict_to_cache(sample_dict, MEMO_JSON_PATH) 

465 self.assertTrue(os.path.exists(MEMO_JSON_PATH)) 

466 self.delete_storege(specific_path=MEMO_JSON_PATH) 

467 self.assertFalse(os.path.exists(MEMO_JSON_PATH)) 

468 op.storage_manager.delete_storage() 

469 

470 

471 def test_csv_creator_base(self): 

472 ''' 

473 Check that, given an updated openaire entity (i.e.: where the "identifier" field was modified 

474 after having checked the presence of the given identifiers in the storage memory) a meta csv 

475 table for the entity is created 

476 ''' 

477 

478 op = OpenaireProcessing() 

479 csv_row = op.csv_creator(SAMPLE_ENTITY_FOR_CSV_CREATOR) 

480 expected_row = { 

481 'id': 'pmid:29890726', 

482 'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study', 

483 'author': 'Smith Lee; Sawyer Alexia; Gardner Benjamin; Seppala Katri; Ucci Marcella; Marmot Alexi; Lally Pippa; Fisher Abi', 

484 'pub_date': '2018-06-09', 

485 'venue': '', 

486 'volume': '', 

487 'issue': '', 

488 'page': '', 

489 'type': 'journal article', 

490 'publisher': 'MDPI', 

491 'editor': '' 

492 } 

493 self.assertEqual(csv_row, expected_row) 

494 

495 op.storage_manager.delete_storage() 

496 

497 def test_csv_creator_base_redis(self): 

498 ''' 

499 Check that, given an updated openaire entity (i.e.: where the "identifier" field was modified 

500 after having checked the presence of the given identifiers in the storage memory) a meta csv 

501 table for the entity is created 

502 ''' 

503 

504 op = OpenaireProcessing(testing=True) 

505 csv_row = op.csv_creator(SAMPLE_ENTITY_FOR_CSV_CREATOR) 

506 expected_row = { 

507 'id': 'pmid:29890726', 

508 'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study', 

509 'author': 'Smith Lee; Sawyer Alexia; Gardner Benjamin; Seppala Katri; Ucci Marcella; Marmot Alexi; Lally Pippa; Fisher Abi', 

510 'pub_date': '2018-06-09', 

511 'venue': '', 

512 'volume': '', 

513 'issue': '', 

514 'page': '', 

515 'type': 'journal article', 

516 'publisher': 'MDPI', 

517 'editor': '' 

518 } 

519 self.assertEqual(csv_row, expected_row) 

520 

521 op.storage_manager.delete_storage() 

522 

523 def test_csv_creator_not_accepted_id(self): 

524 ''' 

525 Check that, given an updated openaire entity with NO ids managed by opencitations (i.e.: an handle id), 

526 no meta csv rows are created. 

527 ''' 

528 

529 op = OpenaireProcessing() 

530 

531 replaced_entity = {'schema': 'handle', 'identifier': 'handle:11245/1.357137', 'valid': None} 

532 MODIFIED_ENTITY = {k:v for k,v in SAMPLE_ENTITY_FOR_CSV_CREATOR.items()} 

533 MODIFIED_ENTITY["identifier"]["to_be_val"]= [] 

534 MODIFIED_ENTITY["identifier"]["to_be_val"].append(replaced_entity) 

535 csv_row = op.csv_creator(MODIFIED_ENTITY) 

536 expected_row = {} #because there is no ID accepted in opencitations for this entity 

537 self.assertEqual(csv_row, expected_row) 

538 

539 op.storage_manager.delete_storage() 

540 

541 def test_csv_creator_not_accepted_id_redis(self): 

542 ''' 

543 Check that, given an updated openaire entity with NO ids managed by opencitations (i.e.: an handle id), 

544 no meta csv rows are created. 

545 ''' 

546 

547 op = OpenaireProcessing(testing=True) 

548 

549 replaced_entity = {'schema': 'handle', 'identifier': 'handle:11245/1.357137', 'valid': None} 

550 MODIFIED_ENTITY = {k:v for k,v in SAMPLE_ENTITY_FOR_CSV_CREATOR.items()} 

551 MODIFIED_ENTITY["identifier"]["to_be_val"]= [] 

552 MODIFIED_ENTITY["identifier"]["to_be_val"].append(replaced_entity) 

553 csv_row = op.csv_creator(MODIFIED_ENTITY) 

554 expected_row = {} #because there is no ID accepted in opencitations for this entity 

555 self.assertEqual(csv_row, expected_row) 

556 

557 op.storage_manager.delete_storage() 

558 

559 def test_csv_creator_invalid_id(self): 

560 ''' 

561 Check that, given an updated openaire entity with NO ids managed by opencitations (i.e.: an handle id), 

562 no meta csv rows are created. 

563 ''' 

564 

565 op = OpenaireProcessing() 

566 

567 replaced_entity = {'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None} 

568 MODIFIED_ENTITY = {k: v for k, v in SAMPLE_ENTITY_FOR_CSV_CREATOR.items()} 

569 MODIFIED_ENTITY["identifier"]["to_be_val"] = [] 

570 MODIFIED_ENTITY["identifier"]["to_be_val"].append(replaced_entity) 

571 csv_row = op.csv_creator(MODIFIED_ENTITY) 

572 expected_row = {} # because there is no ID accepted in opencitations for this entity 

573 self.assertEqual(csv_row, expected_row) 

574 

575 op.storage_manager.delete_storage() 

576 

577 

578 def test_csv_creator_invalid_id_redis(self): 

579 ''' 

580 Check that, given an updated openaire entity with NO ids managed by opencitations (i.e.: an handle id), 

581 no meta csv rows are created. 

582 ''' 

583 

584 op = OpenaireProcessing(testing=True) 

585 

586 replaced_entity = {'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None} 

587 MODIFIED_ENTITY = {k: v for k, v in SAMPLE_ENTITY_FOR_CSV_CREATOR.items()} 

588 MODIFIED_ENTITY["identifier"]["to_be_val"] = [] 

589 MODIFIED_ENTITY["identifier"]["to_be_val"].append(replaced_entity) 

590 csv_row = op.csv_creator(MODIFIED_ENTITY) 

591 expected_row = {} # because there is no ID accepted in opencitations for this entity 

592 self.assertEqual(csv_row, expected_row) 

593 

594 op.storage_manager.delete_storage() 

595 

596 def test_get_publisher_name_base(self): 

597 ''' 

598 Check that, given a doi and a dictionary representing a publisher's data, the string of the publisher's 

599 normalised name (and possibly its crossref ID) is returned. 

600 

601 Base functionalities: No publisher mapping in input -> only Publisher name retrieved from the datasource dump 

602 ''' 

603 op = OpenaireProcessing() 

604 no_doi_pub_input = {'name': 'Blackwell Publishing Ltd'} 

605 

606 doi_pub_1_input = {'name': 'Frontiers Media SA'} 

607 doi1 = "10.3389/fnana.2012.00034" 

608 

609 doi_pub_2_input = {'name': 'Oxford University Press (OUP)'} 

610 doi2 = "10.2527/1995.7392834x" 

611 

612 no_doi_pub_output = op.get_publisher_name([""], no_doi_pub_input) 

613 doi_pub_output_1 = op.get_publisher_name([doi1], doi_pub_1_input) 

614 doi_pub_output_2 = op.get_publisher_name([doi2], doi_pub_2_input) 

615 

616 self.assertEqual(doi_pub_output_1, "Frontiers Media SA") 

617 self.assertEqual(no_doi_pub_output, "Blackwell Publishing Ltd") 

618 self.assertEqual(doi_pub_output_2, "Oxford University Press (OUP)") 

619 

620 op.storage_manager.delete_storage() 

621 

622 def test_get_publisher_name_base_redis(self): 

623 ''' 

624 Check that, given a doi and a dictionary representing a publisher's data, the string of the publisher's 

625 normalised name (and possibly its crossref ID) is returned. 

626 

627 Base functionalities: No publisher mapping in input -> only Publisher name retrieved from the datasource dump 

628 ''' 

629 op = OpenaireProcessing(testing=True) 

630 no_doi_pub_input = {'name': 'Blackwell Publishing Ltd'} 

631 

632 doi_pub_1_input = {'name': 'Frontiers Media SA'} 

633 doi1 = "10.3389/fnana.2012.00034" 

634 

635 doi_pub_2_input = {'name': 'Oxford University Press (OUP)'} 

636 doi2 = "10.2527/1995.7392834x" 

637 

638 no_doi_pub_output = op.get_publisher_name([""], no_doi_pub_input) 

639 doi_pub_output_1 = op.get_publisher_name([doi1], doi_pub_1_input) 

640 doi_pub_output_2 = op.get_publisher_name([doi2], doi_pub_2_input) 

641 

642 self.assertEqual(doi_pub_output_1, "Frontiers Media SA") 

643 self.assertEqual(no_doi_pub_output, "Blackwell Publishing Ltd") 

644 self.assertEqual(doi_pub_output_2, "Oxford University Press (OUP)") 

645 

646 op.storage_manager.delete_storage() 

647 

648 def test_get_publisher_name_publishers_mapping(self): 

649 ''' 

650 Check that, given a doi and a dictionary representing a publisher's data, the string of the publisher's 

651 normalised name (and possibly its crossref ID) is returned. 

652 

653 Mapping Provided: Publisher name retrieved + crossref member returned, 

654 only if : 

655 - the doi prefix is a crossref doi prefix, 

656 - it is present in the mapping, 

657 -the name of the publisher provided by the datasource corresponds to the from the datasource dump 

658 ''' 

659 

660 op = OpenaireProcessing(publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json") 

661 

662 no_doi_pub_input = {'name': 'Blackwell Publishing Ltd'} 

663 

664 doi_pub_1_input = {'name': 'Frontiers Media SA'} 

665 doi1 = "10.3389/fnana.2012.00034" 

666 

667 doi_pub_2_input = {'name': 'Oxford University Press (OUP)'} 

668 doi2 = "10.2527/1995.7392834x" 

669 

670 no_doi_pub_output = op.get_publisher_name([""], no_doi_pub_input) 

671 doi_pub_output_1 = op.get_publisher_name([doi1], doi_pub_1_input) 

672 doi_pub_output_2 = op.get_publisher_name([doi2], doi_pub_2_input) 

673 

674 self.assertEqual(doi_pub_output_1, "Frontiers Media SA") 

675 self.assertEqual(no_doi_pub_output, "Blackwell Publishing Ltd") 

676 self.assertEqual(doi_pub_output_2, "Oxford University Press (OUP)") 

677 

678 op.storage_manager.delete_storage() 

679 

680 def test_get_publisher_name_publishers_mapping_redis(self): 

681 ''' 

682 Check that, given a doi and a dictionary representing a publisher's data, the string of the publisher's 

683 normalised name (and possibly its crossref ID) is returned. 

684 

685 Mapping Provided: Publisher name retrieved + crossref member returned, 

686 only if : 

687 - the doi prefix is a crossref doi prefix, 

688 - it is present in the mapping, 

689 -the name of the publisher provided by the datasource corresponds to the from the datasource dump 

690 ''' 

691 

692 op = OpenaireProcessing(testing=True,publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json") 

693 

694 no_doi_pub_input = {'name': 'Blackwell Publishing Ltd'} 

695 

696 doi_pub_1_input = {'name': 'Frontiers Media SA'} 

697 doi1 = "10.3389/fnana.2012.00034" 

698 

699 doi_pub_2_input = {'name': 'Oxford University Press (OUP)'} 

700 doi2 = "10.2527/1995.7392834x" 

701 

702 no_doi_pub_output = op.get_publisher_name([""], no_doi_pub_input) 

703 doi_pub_output_1 = op.get_publisher_name([doi1], doi_pub_1_input) 

704 doi_pub_output_2 = op.get_publisher_name([doi2], doi_pub_2_input) 

705 

706 self.assertEqual(doi_pub_output_1, "Frontiers Media SA") 

707 self.assertEqual(no_doi_pub_output, "Blackwell Publishing Ltd") 

708 self.assertEqual(doi_pub_output_2, "Oxford University Press (OUP)") 

709 

710 op.storage_manager.delete_storage() 

711 

712 def test_get_publisher_name_publishers_mapping_multi_dois(self): 

713 ''' 

714 Check that, given a doi and a dictionary representing a publisher's data, the string of the publisher's 

715 normalised name (and possibly its crossref ID) is returned. 

716 

717 Mapping Provided: Publisher name retrieved + crossref member returned, 

718 only if : 

719 - the doi prefix is a crossref doi prefix, 

720 - it is present in the mapping, 

721 -the name of the publisher provided by the datasource corresponds to the from the datasource dump 

722 ''' 

723 

724 op = OpenaireProcessing(publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json") 

725 

726 # CASE 1: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of the 

727 # entity's dois prefixes in the prefix-to-publisher-data mapping in input 

728 # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member 

729 

730 ent_1_doi_1 = "10.1152/sample_doi" #this prefix is in the mapping and corresponds to American Physiological Society 

731 ent_1_doi_2 = "10.1153/sample_doi" 

732 pub_input_1 = {'name': 'American Physiological Society'} 

733 

734 no_doi_pub_output = op.get_publisher_name([ent_1_doi_1, ent_1_doi_2], pub_input_1) 

735 

736 self.assertEqual(no_doi_pub_output, "American Physiological Society [crossref:24]") 

737 

738 # CASE 2: The Publisher Name provided by OPENAIRE does not correspond to the Publisher Name mapped to one of the 

739 # entity's dois prefixes in the prefix-to-publisher-data mapping in input 

740 # EXPECTED OUTPUT: The publisher name provided by Openaire is retrieved without any crossref member 

741 

742 ent_2_doi_1 = "10.1152/sample_doi" #this prefix is in the mapping and corresponds to American Physiological Society 

743 ent_2_doi_2 = "10.1153/sample_doi" 

744 pub_input_2 = {'name': 'Sample Publisher Name'} 

745 

746 no_doi_pub_output2 = op.get_publisher_name([ent_2_doi_1, ent_2_doi_2], pub_input_2) 

747 self.assertEqual(no_doi_pub_output2, "Sample Publisher Name") 

748 

749 # CASE 3: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of the 

750 # entity's dois prefixes in the prefix-to-publisher-data mapping in input BUT it is not the first doi of the list 

751 # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member 

752 

753 ent_3_doi_1 = "10.1152/sample_doi" #this prefix is in the mapping and corresponds to American Physiological Society 

754 ent_3_doi_2 = "10.1153/sample_doi" 

755 pub_input_3 = {'name': 'American Physiological Society'} 

756 

757 doi_pub_output3 = op.get_publisher_name([ent_3_doi_2, ent_3_doi_1], pub_input_3) 

758 

759 self.assertEqual(doi_pub_output3, "American Physiological Society [crossref:24]") 

760 

761 op.storage_manager.delete_storage() 

762 

763 # CASE 4: OPENAIRE does not provide a publisher name but one of the entity's DOI prefixes is in the 

764 # prefix-to-publisher-data mapping in input 

765 # EXPECTED OUTPUT: empty string 

766 

767 ent_4_doi_1 = "10.1152/sample_doi" #this prefix is in the mapping and corresponds to American Physiological Society 

768 ent_4_doi_2 = "10.1153/sample_doi" 

769 pub_input_4 = {'name': ''} 

770 pub_input_4_1 = {} 

771 pub_input_4_2 = '' 

772 

773 doi_pub_output4 = op.get_publisher_name([ent_4_doi_1, ent_4_doi_2], pub_input_4) 

774 doi_pub_output4_1 = op.get_publisher_name([ent_4_doi_1, ent_4_doi_2], pub_input_4_1) 

775 doi_pub_output4_2= op.get_publisher_name([ent_4_doi_1, ent_4_doi_2], pub_input_4_2) 

776 

777 self.assertEqual(doi_pub_output4, "") 

778 self.assertEqual(doi_pub_output4_1, "") 

779 self.assertEqual(doi_pub_output4_2, "") 

780 

781 op.storage_manager.delete_storage() 

782 

783 def test_get_publisher_name_publishers_mapping_multi_dois_redis(self): 

784 ''' 

785 Check that, given a doi and a dictionary representing a publisher's data, the string of the publisher's 

786 normalised name (and possibly its crossref ID) is returned. 

787 

788 Mapping Provided: Publisher name retrieved + crossref member returned, 

789 only if : 

790 - the doi prefix is a crossref doi prefix, 

791 - it is present in the mapping, 

792 -the name of the publisher provided by the datasource corresponds to the from the datasource dump 

793 ''' 

794 

795 op = OpenaireProcessing(testing=True, publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json") 

796 

797 # CASE 1: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of the 

798 # entity's dois prefixes in the prefix-to-publisher-data mapping in input 

799 # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member 

800 

801 ent_1_doi_1 = "10.1152/sample_doi" #this prefix is in the mapping and corresponds to American Physiological Society 

802 ent_1_doi_2 = "10.1153/sample_doi" 

803 pub_input_1 = {'name': 'American Physiological Society'} 

804 

805 no_doi_pub_output = op.get_publisher_name([ent_1_doi_1, ent_1_doi_2], pub_input_1) 

806 

807 self.assertEqual(no_doi_pub_output, "American Physiological Society [crossref:24]") 

808 

809 # CASE 2: The Publisher Name provided by OPENAIRE does not correspond to the Publisher Name mapped to one of the 

810 # entity's dois prefixes in the prefix-to-publisher-data mapping in input 

811 # EXPECTED OUTPUT: The publisher name provided by Openaire is retrieved without any crossref member 

812 

813 ent_2_doi_1 = "10.1152/sample_doi" #this prefix is in the mapping and corresponds to American Physiological Society 

814 ent_2_doi_2 = "10.1153/sample_doi" 

815 pub_input_2 = {'name': 'Sample Publisher Name'} 

816 

817 no_doi_pub_output2 = op.get_publisher_name([ent_2_doi_1, ent_2_doi_2], pub_input_2) 

818 self.assertEqual(no_doi_pub_output2, "Sample Publisher Name") 

819 

820 # CASE 3: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of the 

821 # entity's dois prefixes in the prefix-to-publisher-data mapping in input BUT it is not the first doi of the list 

822 # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member 

823 

824 ent_3_doi_1 = "10.1152/sample_doi" #this prefix is in the mapping and corresponds to American Physiological Society 

825 ent_3_doi_2 = "10.1153/sample_doi" 

826 pub_input_3 = {'name': 'American Physiological Society'} 

827 

828 doi_pub_output3 = op.get_publisher_name([ent_3_doi_2, ent_3_doi_1], pub_input_3) 

829 

830 self.assertEqual(doi_pub_output3, "American Physiological Society [crossref:24]") 

831 

832 op.storage_manager.delete_storage() 

833 

834 # CASE 4: OPENAIRE does not provide a publisher name but one of the entity's DOI prefixes is in the 

835 # prefix-to-publisher-data mapping in input 

836 # EXPECTED OUTPUT: empty string 

837 

838 ent_4_doi_1 = "10.1152/sample_doi" #this prefix is in the mapping and corresponds to American Physiological Society 

839 ent_4_doi_2 = "10.1153/sample_doi" 

840 pub_input_4 = {'name': ''} 

841 pub_input_4_1 = {} 

842 pub_input_4_2 = '' 

843 

844 doi_pub_output4 = op.get_publisher_name([ent_4_doi_1, ent_4_doi_2], pub_input_4) 

845 doi_pub_output4_1 = op.get_publisher_name([ent_4_doi_1, ent_4_doi_2], pub_input_4_1) 

846 doi_pub_output4_2= op.get_publisher_name([ent_4_doi_1, ent_4_doi_2], pub_input_4_2) 

847 

848 self.assertEqual(doi_pub_output4, "") 

849 self.assertEqual(doi_pub_output4_1, "") 

850 self.assertEqual(doi_pub_output4_2, "") 

851 

852 op.storage_manager.delete_storage() 

853 

854 def test_manage_arxiv_single_id(self): 

855 '''Check the correct management of entities with only one ID, in particular in 

856 case it is an arxiv. In this case, if it is an arxiv DOI, we return the normalised 

857 version of the correspondent arxiv. Both in case of an arxiv id and of an arxiv doi, 

858 we return the versioned arxiv id where the version is available (never in ARXIV doi). 

859 If no version is provided, we normalise the arxiv id as arxiv id version 1. 

860 In all the other id cases (pmid, pmc, handle (which is discarded in a later step) ''' 

861 sample_doi_any = [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}] 

862 sample_doi_arxiv = [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}] 

863 sample_arxiv_no_ver = [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217', 'valid': None}] 

864 sample_arxiv_ver = [{'schema': 'arxiv', 'identifier': 'arxiv:1509...08217v3', 'valid': None}] 

865 

866 op = OpenaireProcessing() 

867 

868 # CASE 1: the unique input id dict in list is a not-arxiv doi : the input list is returned 

869 out_sample_doi_any = op.manage_arxiv_single_id(sample_doi_any) 

870 self.assertEqual(out_sample_doi_any, [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}]) 

871 

872 # CASE 2: the unique input id dict in list is an arxiv doi: the doi is replaced with its correspondent arxiv v1 

873 out_sample_doi_arxiv = op.manage_arxiv_single_id(sample_doi_arxiv) 

874 self.assertEqual(out_sample_doi_arxiv, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}]) 

875 

876 # CASE 3: the unique input id dict in list is an arxiv id without version: 

877 # the arxiv id is replaced with its v1 

878 out_sample_arxiv_no_ver = op.manage_arxiv_single_id(sample_arxiv_no_ver) 

879 self.assertEqual(out_sample_arxiv_no_ver, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}]) 

880 

881 # CASE 4: the unique input id dict in list is an arxiv id with version: the id is just normalised 

882 out_sample_arxiv_ver = op.manage_arxiv_single_id(sample_arxiv_ver) 

883 self.assertEqual(out_sample_arxiv_ver, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v3'}]) 

884 

885 op.storage_manager.delete_storage() 

886 

887 def test_manage_arxiv_single_id_redis(self): 

888 '''Check the correct management of entities with only one ID, in particular in 

889 case it is an arxiv. In this case, if it is an arxiv DOI, we return the normalised 

890 version of the correspondent arxiv. Both in case of an arxiv id and of an arxiv doi, 

891 we return the versioned arxiv id where the version is available (never in ARXIV doi). 

892 If no version is provided, we normalise the arxiv id as arxiv id version 1. 

893 In all the other id cases (pmid, pmc, handle (which is discarded in a later step) ''' 

894 sample_doi_any = [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}] 

895 sample_doi_arxiv = [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}] 

896 sample_arxiv_no_ver = [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217', 'valid': None}] 

897 sample_arxiv_ver = [{'schema': 'arxiv', 'identifier': 'arxiv:1509...08217v3', 'valid': None}] 

898 

899 op = OpenaireProcessing(testing=True) 

900 

901 # CASE 1: the unique input id dict in list is a not-arxiv doi : the input list is returned 

902 out_sample_doi_any = op.manage_arxiv_single_id(sample_doi_any) 

903 self.assertEqual(out_sample_doi_any, [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}]) 

904 

905 # CASE 2: the unique input id dict in list is an arxiv doi: the doi is replaced with its correspondent arxiv v1 

906 out_sample_doi_arxiv = op.manage_arxiv_single_id(sample_doi_arxiv) 

907 self.assertEqual(out_sample_doi_arxiv, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}]) 

908 

909 # CASE 3: the unique input id dict in list is an arxiv id without version: 

910 # the arxiv id is replaced with its v1 

911 out_sample_arxiv_no_ver = op.manage_arxiv_single_id(sample_arxiv_no_ver) 

912 self.assertEqual(out_sample_arxiv_no_ver, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}]) 

913 

914 # CASE 4: the unique input id dict in list is an arxiv id with version: the id is just normalised 

915 out_sample_arxiv_ver = op.manage_arxiv_single_id(sample_arxiv_ver) 

916 self.assertEqual(out_sample_arxiv_ver, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v3'}]) 

917 

918 op.storage_manager.delete_storage() 

919 

920 def test_manage_doi_prefixes_priorities(self): 

921 op = OpenaireProcessing() 

922 

923 # CASE1: 1 figshare doi (priority 1) with version --> returned as it is 

924 es_1 = [{'schema': 'doi', 'identifier': 'doi:10.6084/1234.1234v3', 'valid': None}] 

925 out_1 = op.manage_doi_prefixes_priorities(es_1) 

926 self.assertEqual(out_1, es_1) 

927 

928 # CASE2: 1 figshare doi (priority 1) without version --> returned with version v1 

929 es_2 = [{'schema': 'doi', 'identifier': 'doi:10.6084/1234.1234', 'valid': None}] 

930 exp_2 = [{'schema': 'doi', 'identifier': 'doi:10.6084/1234.1234v1', 'valid': None}] 

931 out_2 = op.manage_doi_prefixes_priorities(es_2) 

932 self.assertEqual(exp_2, out_2) 

933 

934 # CASE3: 1 arxiv doi (always without and version) --> returned as correspondent arxiv id version v1 

935 es_3 = [{'schema': 'doi', 'identifier': 'doi:10.48550/1234.1234', 'valid': None}] 

936 out_3 = op.manage_doi_prefixes_priorities(es_3) 

937 exp_3 = [{'identifier': 'arxiv:1234.1234v1', 'schema': 'arxiv'}] 

938 self.assertEqual(exp_3, out_3) 

939 

940 # CASE4: >1 arxiv doi or figshare and at least one has version --> return the one(s) with version 

941 es_4 = [{'schema': 'doi', 'identifier': 'doi:10.48550/1234.1234', 'valid': None}, {'schema': 'doi', 'identifier': 'doi:10.6084/5678v3', 'valid': None}] 

942 out_4 = op.manage_doi_prefixes_priorities(es_4) 

943 exp_4 = [{'schema': 'doi', 'identifier': 'doi:10.6084/5678v3', 'valid': None}] 

944 self.assertEqual(exp_4, out_4) 

945 

946 # CASE5: >1 arxiv doi or figshare and none has version --> return, as first choice, the arxiv version v1 of the first arxiv doi encountered 

947 es_5 = [{'schema': 'doi', 'identifier': 'doi:10.6084/5678', 'valid': None}, {'schema': 'doi', 'identifier': 'doi:10.48550/1234.1234', 'valid': None}] 

948 out_5 = op.manage_doi_prefixes_priorities(es_5) 

949 exp_5 = [{'identifier': 'arxiv:1234.1234v1', 'schema': 'arxiv'}] 

950 self.assertEqual(exp_5, out_5) 

951 

952 # CASE6: >1 figshare dois and none has version --> return, return version v1 doi of the first figshare doi encountered 

953 es_6 = [{'schema': 'doi', 'identifier': 'doi:10.6084/5678', 'valid': None}, {'schema': 'doi', 'identifier': 'doi:10.6084/1234', 'valid': None}] 

954 out_6 = op.manage_doi_prefixes_priorities(es_6) 

955 exp_6 = [{'identifier': 'doi:10.6084/5678v1', 'schema': 'doi', 'valid': None}] 

956 self.assertEqual(exp_6, out_6) 

957 

958 # CASE7: >1 more than one zenodo doi --> return the one with the highest number: it is the last one assigned and thus it 

959 # is a version doi and not the collector doi (which is the first one to be assigned when a publication is uploaded on zenodo). 

960 es_7 = [{'schema': 'doi', 'identifier': '10.5281/zenodo.111', 'valid': None}, {'schema': 'doi', 'identifier': '10.5281/zenodo.112', 'valid': None}] 

961 es_7_1 = [{'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.111', 'valid': None}, {'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.112', 'valid': None}] 

962 out_7 = op.manage_doi_prefixes_priorities(es_7) 

963 out_7_1 = op.manage_doi_prefixes_priorities(es_7_1) 

964 exp_7 = [{'identifier': '10.5281/zenodo.112', 'schema': 'doi', 'valid': None}] 

965 exp_7_1 = [{'identifier': 'doi:10.5281/zenodo.112', 'schema': 'doi', 'valid': None}] 

966 self.assertEqual(exp_7, out_7) 

967 self.assertEqual(exp_7_1, out_7_1) 

968 

969 # CASE8: None of the previous cases: return the first VALID DOI with highest priority prefix 

970 #No one of the ids is valid, return an empty list 

971 es_8 = [ 

972 {'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.111', 'valid': None}, 

973 {'schema': 'doi', 'identifier': 'doi:10.1184/abc', 'valid': None}, 

974 {'schema': 'doi', 'identifier': 'doi:10.25384/efg', 'valid': None}, 

975 ] 

976 

977 out_8 = op.manage_doi_prefixes_priorities(es_8) 

978 exp_8 = [] 

979 self.assertEqual(exp_8, out_8) 

980 

981 # CASE8_1: 

982 # No valid id among the ones with a max priority prefix --> return the first valid ID in order of prefix priority 

983 es_8_1 = [ 

984 {'schema': 'doi', 'identifier': '10.5281/zenodo.4725899', 'valid': None}, 

985 {'schema': 'doi', 'identifier': 'doi:10.1184/abc', 'valid': None}, 

986 {'schema': 'doi', 'identifier': 'doi:10.25384/efg', 'valid': None}, 

987 ] 

988 

989 out_8_1 = op.manage_doi_prefixes_priorities(es_8_1) 

990 exp_8_1 = [{'schema': 'doi', 'identifier': '10.5281/zenodo.4725899', 'valid': None}] 

991 self.assertEqual(exp_8_1, out_8_1) 

992 

993 # CASE8_2: 

994 # more valid ids among the ones with a max priority prefix --> return the first one encountered 

995 es_8_2 = [ 

996 {'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.4725899', 'valid': None}, 

997 {'schema': 'doi', 'identifier': 'doi:10.1184/R1/12841247.v1', 'valid': None}, 

998 {'schema': 'doi', 'identifier': 'doi:10.25384/sage.c.4112909', 'valid': None}, 

999 ] 

1000 

1001 out_8_2 = op.manage_doi_prefixes_priorities(es_8_2) 

1002 exp_8_2 = [{'schema': 'doi', 'identifier': 'doi:10.1184/R1/12841247.v1', 'valid': None}] 

1003 self.assertEqual(exp_8_2, out_8_2) 

1004 

1005 op.storage_manager.delete_storage() 

1006 

1007 def test_manage_doi_prefixes_priorities_redis(self): 

1008 op = OpenaireProcessing(testing=True) 

1009 

1010 # CASE1: 1 figshare doi (priority 1) with version --> returned as it is 

1011 es_1 = [{'schema': 'doi', 'identifier': 'doi:10.6084/1234.1234v3', 'valid': None}] 

1012 out_1 = op.manage_doi_prefixes_priorities(es_1) 

1013 self.assertEqual(out_1, es_1) 

1014 

1015 # CASE2: 1 figshare doi (priority 1) without version --> returned with version v1 

1016 es_2 = [{'schema': 'doi', 'identifier': 'doi:10.6084/1234.1234', 'valid': None}] 

1017 exp_2 = [{'schema': 'doi', 'identifier': 'doi:10.6084/1234.1234v1', 'valid': None}] 

1018 out_2 = op.manage_doi_prefixes_priorities(es_2) 

1019 self.assertEqual(exp_2, out_2) 

1020 

1021 # CASE3: 1 arxiv doi (always without and version) --> returned as correspondent arxiv id version v1 

1022 es_3 = [{'schema': 'doi', 'identifier': 'doi:10.48550/1234.1234', 'valid': None}] 

1023 out_3 = op.manage_doi_prefixes_priorities(es_3) 

1024 exp_3 = [{'identifier': 'arxiv:1234.1234v1', 'schema': 'arxiv'}] 

1025 self.assertEqual(exp_3, out_3) 

1026 

1027 # CASE4: >1 arxiv doi or figshare and at least one has version --> return the one(s) with version 

1028 es_4 = [{'schema': 'doi', 'identifier': 'doi:10.48550/1234.1234', 'valid': None}, {'schema': 'doi', 'identifier': 'doi:10.6084/5678v3', 'valid': None}] 

1029 out_4 = op.manage_doi_prefixes_priorities(es_4) 

1030 exp_4 = [{'schema': 'doi', 'identifier': 'doi:10.6084/5678v3', 'valid': None}] 

1031 self.assertEqual(exp_4, out_4) 

1032 

1033 # CASE5: >1 arxiv doi or figshare and none has version --> return, as first choice, the arxiv version v1 of the first arxiv doi encountered 

1034 es_5 = [{'schema': 'doi', 'identifier': 'doi:10.6084/5678', 'valid': None}, {'schema': 'doi', 'identifier': 'doi:10.48550/1234.1234', 'valid': None}] 

1035 out_5 = op.manage_doi_prefixes_priorities(es_5) 

1036 exp_5 = [{'identifier': 'arxiv:1234.1234v1', 'schema': 'arxiv'}] 

1037 self.assertEqual(exp_5, out_5) 

1038 

1039 # CASE6: >1 figshare dois and none has version --> return, return version v1 doi of the first figshare doi encountered 

1040 es_6 = [{'schema': 'doi', 'identifier': 'doi:10.6084/5678', 'valid': None}, {'schema': 'doi', 'identifier': 'doi:10.6084/1234', 'valid': None}] 

1041 out_6 = op.manage_doi_prefixes_priorities(es_6) 

1042 exp_6 = [{'identifier': 'doi:10.6084/5678v1', 'schema': 'doi', 'valid': None}] 

1043 self.assertEqual(exp_6, out_6) 

1044 

1045 # CASE7: >1 more than one zenodo doi --> return the one with the highest number: it is the last one assigned and thus it 

1046 # is a version doi and not the collector doi (which is the first one to be assigned when a publication is uploaded on zenodo). 

1047 es_7 = [{'schema': 'doi', 'identifier': '10.5281/zenodo.111', 'valid': None}, {'schema': 'doi', 'identifier': '10.5281/zenodo.112', 'valid': None}] 

1048 es_7_1 = [{'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.111', 'valid': None}, {'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.112', 'valid': None}] 

1049 out_7 = op.manage_doi_prefixes_priorities(es_7) 

1050 out_7_1 = op.manage_doi_prefixes_priorities(es_7_1) 

1051 exp_7 = [{'identifier': '10.5281/zenodo.112', 'schema': 'doi', 'valid': None}] 

1052 exp_7_1 = [{'identifier': 'doi:10.5281/zenodo.112', 'schema': 'doi', 'valid': None}] 

1053 self.assertEqual(exp_7, out_7) 

1054 self.assertEqual(exp_7_1, out_7_1) 

1055 

1056 # CASE8: None of the previous cases: return the first VALID DOI with highest priority prefix 

1057 #No one of the ids is valid, return an empty list 

1058 es_8 = [ 

1059 {'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.111', 'valid': None}, 

1060 {'schema': 'doi', 'identifier': 'doi:10.1184/abc', 'valid': None}, 

1061 {'schema': 'doi', 'identifier': 'doi:10.25384/efg', 'valid': None}, 

1062 ] 

1063 

1064 out_8 = op.manage_doi_prefixes_priorities(es_8) 

1065 exp_8 = [] 

1066 self.assertEqual(exp_8, out_8) 

1067 

1068 # CASE8_1: 

1069 # No valid id among the ones with a max priority prefix --> return the first valid ID in order of prefix priority 

1070 es_8_1 = [ 

1071 {'schema': 'doi', 'identifier': '10.5281/zenodo.4725899', 'valid': None}, 

1072 {'schema': 'doi', 'identifier': 'doi:10.1184/abc', 'valid': None}, 

1073 {'schema': 'doi', 'identifier': 'doi:10.25384/efg', 'valid': None}, 

1074 ] 

1075 

1076 out_8_1 = op.manage_doi_prefixes_priorities(es_8_1) 

1077 exp_8_1 = [{'schema': 'doi', 'identifier': '10.5281/zenodo.4725899', 'valid': None}] 

1078 self.assertEqual(exp_8_1, out_8_1) 

1079 

1080 # CASE8_2: 

1081 # more valid ids among the ones with a max priority prefix --> return the first one encountered 

1082 es_8_2 = [ 

1083 {'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.4725899', 'valid': None}, 

1084 {'schema': 'doi', 'identifier': 'doi:10.1184/R1/12841247.v1', 'valid': None}, 

1085 {'schema': 'doi', 'identifier': 'doi:10.25384/sage.c.4112909', 'valid': None}, 

1086 ] 

1087 

1088 out_8_2 = op.manage_doi_prefixes_priorities(es_8_2) 

1089 exp_8_2 = [{'schema': 'doi', 'identifier': 'doi:10.1184/R1/12841247.v1', 'valid': None}] 

1090 self.assertEqual(exp_8_2, out_8_2) 

1091 

1092 op.storage_manager.delete_storage() 

1093 

def test_to_validated_id_list(self):
    """Check that to_validated_id_list returns only valid ids with accepted schemas.

    NOTE: in tests using the sqlite storage method it must be avoided to delete
    the storage while using the same OpenaireProcessing() instance, otherwise the
    process would try to store data in a filepath that has just been deleted,
    with no new connection created after it.

    2 OPTIONS: 1) instantiate OpenaireProcessing only once at the beginning and
    delete the storage only at the end; 2) create a new OpenaireProcessing
    instance at every check and delete the storage each time after the check is
    done. This test follows option 2.
    """
    op = OpenaireProcessing()
    # CASE1_1: No already validated ids + 1 id to be validated, which is valid
    inp_1 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None}]}
    out_1 = op.to_validated_id_list(inp_1)
    exp_1 = ['pmid:20662931']
    self.assertEqual(out_1, exp_1)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE1_2: No already validated ids + 1 id to be validated, which is invalid
    inp_2 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:999920662931', 'valid': None}]}
    out_2 = op.to_validated_id_list(inp_2)
    exp_2 = []
    self.assertEqual(out_2, exp_2)
    # FIX: this was the only case that left its storage behind before the next
    # OpenaireProcessing() was created; clean it up like every other case (and
    # like the redis twin of this test) so leftovers cannot leak into later checks.
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE1_3: No already validated ids + 1 id to be validated, which is a valid arxiv doi
    inp_3 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]}
    out_3 = op.to_validated_id_list(inp_3)
    exp_3 = ['arxiv:1509.08217v1']
    self.assertEqual(out_3, exp_3)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE1_4: No already validated ids + 1 id to be validated, which hasn't a valid schema
    inp_4 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': "0", 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]}
    out_4 = op.to_validated_id_list(inp_4)
    exp_4 = []
    self.assertEqual(out_4, exp_4)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE1_5: No already validated ids + 1 id to be validated, which is not valid
    inp_5 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': "doi", 'identifier': 'doi:10.0000/fake_id', 'valid': None}]}
    out_5 = op.to_validated_id_list(inp_5)
    exp_5 = []
    self.assertEqual(out_5, exp_5)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE1_9: No already validated ids + 1 id to be validated, which is a valid PMC
    inp_9 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': "pmcid", 'identifier': 'pmcid:PMC2873764', 'valid': None}]}
    out_9 = op.to_validated_id_list(inp_9)
    exp_9 = ['pmcid:PMC2873764']
    self.assertEqual(out_9, exp_9)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE2_1: No already validated ids + >1 id to be validated, both valid and with accepted schemas
    inp_6 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
                                                         {'schema': 'doi', 'identifier': 'doi:10.1007/s12160-011-9282-0', 'valid': None}]}
    out_6 = op.to_validated_id_list(inp_6)
    exp_6 = ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0']
    # order is not guaranteed here, so compare as multisets
    self.assertCountEqual(out_6, exp_6)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE2_2: No already validated ids + >1 id to be validated, both valid, one of the two is an arxiv id
    inp_8 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
                                                         {'schema': 'arxiv', 'identifier': 'arxiv:1107.5979', 'valid': None}]}
    out_8 = op.to_validated_id_list(inp_8)
    exp_8 = ['pmid:20662931']
    self.assertEqual(out_8, exp_8)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE2_3: No already validated ids + >1 id to be validated, both valid, one of the two is an arxiv doi
    inp_7 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None}, {'schema': "doi", 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]}
    out_7 = op.to_validated_id_list(inp_7)
    exp_7 = ['pmid:20662931']
    self.assertEqual(out_7, exp_7)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE2_4: No already validated ids + >1 id to be validated, both valid, one of the two is a PMC
    inp_10 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
                                                          {'schema': "pmcid", 'identifier': 'pmcid:PMC2873764', 'valid': None}]}
    out_10 = op.to_validated_id_list(inp_10)
    exp_10 = ['pmid:20662931']
    self.assertEqual(out_10, exp_10)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE2_5: No already validated ids + >1 id to be validated, 1 valid pmid, 1 valid doi,
    # 1 valid doi with a "critic" prefix for opencitations entities management
    inp_11 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
                                                          {'schema': 'doi', 'identifier': 'doi:10.1007/s12160-011-9282-0', 'valid': None},
                                                          {'schema': 'doi',
                                                           'identifier': 'doi:10.48550/arXiv.1509.08217',
                                                           'valid': None}
                                                          ]}
    out_11 = op.to_validated_id_list(inp_11)
    exp_11 = ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0']
    # order is not guaranteed here, so compare as multisets
    self.assertCountEqual(out_11, exp_11)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE2_6: No already validated ids + >1 id to be validated, one doi with a "critic"
    # prefix and a PMCID for opencitations entities management
    inp_12 = {'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmcid', 'identifier': 'pmcid:PMC5555555', 'valid': None},
                                                          {'schema': 'doi',
                                                           'identifier': 'doi:10.48550/arXiv.1509.08217',
                                                           'valid': None}
                                                          ]}
    out_12 = op.to_validated_id_list(inp_12)
    exp_12 = ['pmcid:PMC5555555']
    self.assertEqual(out_12, exp_12)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE2_7: no already validated ids + >1 id to be validated, one doi with a "critic"
    # prefix for opencitations ingestion workflow and an ARXIV
    inp_13 = {'valid': [], 'not_valid': [], 'to_be_val': [
        {'schema': 'arxiv', 'identifier': 'arxiv:1107.5979v1', 'valid': None},
        {'schema': 'doi', 'identifier': 'doi:10.1184/R1/12841247.v1', 'valid': None}
    ]}
    out_13 = op.to_validated_id_list(inp_13)
    exp_13 = ['arxiv:1107.5979v1']
    self.assertEqual(out_13, exp_13)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE2_8: no already validated ids and more dois with "critic" prefixes for
    # opencitations ingestion workflow
    inp_14 = {'valid': [], 'not_valid': [], 'to_be_val': [
        {'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.4725899', 'valid': None},
        {'schema': 'doi', 'identifier': 'doi:10.1184/r1/12841247.v1', 'valid': None}
    ]}
    out_14 = op.to_validated_id_list(inp_14)
    exp_14 = ['doi:10.1184/r1/12841247.v1']
    self.assertEqual(out_14, exp_14)
    op.storage_manager.delete_storage()

    op = OpenaireProcessing()
    # CASE3: an already validated id and more dois with "critic" prefixes for
    # opencitations ingestion workflow (an empty dict entry must be tolerated)
    inp_15 = {'valid': [], 'not_valid': [], 'to_be_val': [
        {'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.4725899', 'valid': None},
        {'schema': 'doi', 'identifier': 'doi:10.1184/r1/12841247.v1', 'valid': None},
        {'schema': 'doi', 'identifier': 'doi:10.7557/5.5607', 'valid': None},
        {}
    ]}
    out_15 = op.to_validated_id_list(inp_15)
    exp_15 = ['doi:10.7557/5.5607']
    self.assertEqual(out_15, exp_15)
    op.storage_manager.delete_storage()
1255 

def test_to_validated_id_list_redis(self):
    """Run the same scenarios as test_to_validated_id_list against the
    redis-backed storage (``testing=True``).

    NOTE: in tests using the sqlite storage method it must be avoided to delete
    the storage while using the same OpenaireProcessing() instance, otherwise
    the process would try to store data in a filepath that has just been
    deleted, with no new connection created after it. Here a fresh instance is
    created per scenario and its storage wiped right after the check.
    """
    # Each scenario: (input mapping, expected output, True when order is irrelevant)
    scenarios = [
        # CASE1_1: one id to be validated, valid pmid
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None}]},
         ['pmid:20662931'], False),
        # CASE1_2: one id to be validated, invalid pmid
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:999920662931', 'valid': None}]},
         [], False),
        # CASE1_3: one id to be validated, valid arxiv doi
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]},
         ['arxiv:1509.08217v1'], False),
        # CASE1_4: one id to be validated, schema not accepted
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': "0", 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]},
         [], False),
        # CASE1_5: one id to be validated, invalid doi
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': "doi", 'identifier': 'doi:10.0000/fake_id', 'valid': None}]},
         [], False),
        # CASE1_9: one id to be validated, valid PMC
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': "pmcid", 'identifier': 'pmcid:PMC2873764', 'valid': None}]},
         ['pmcid:PMC2873764'], False),
        # CASE2_1: two valid ids with accepted schemas (order not guaranteed)
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
                                                      {'schema': 'doi', 'identifier': 'doi:10.1007/s12160-011-9282-0', 'valid': None}]},
         ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'], True),
        # CASE2_2: valid pmid + arxiv id
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
                                                      {'schema': 'arxiv', 'identifier': 'arxiv:1107.5979', 'valid': None}]},
         ['pmid:20662931'], False),
        # CASE2_3: valid pmid + arxiv doi
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
                                                      {'schema': "doi", 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]},
         ['pmid:20662931'], False),
        # CASE2_4: valid pmid + PMC
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
                                                      {'schema': "pmcid", 'identifier': 'pmcid:PMC2873764', 'valid': None}]},
         ['pmid:20662931'], False),
        # CASE2_5: valid pmid, valid doi, doi with a "critic" prefix (order not guaranteed)
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:20662931', 'valid': None},
                                                      {'schema': 'doi', 'identifier': 'doi:10.1007/s12160-011-9282-0', 'valid': None},
                                                      {'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]},
         ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'], True),
        # CASE2_6: doi with a "critic" prefix + PMCID
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'pmcid', 'identifier': 'pmcid:PMC5555555', 'valid': None},
                                                      {'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]},
         ['pmcid:PMC5555555'], False),
        # CASE2_7: doi with a "critic" prefix + ARXIV
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'arxiv', 'identifier': 'arxiv:1107.5979v1', 'valid': None},
                                                      {'schema': 'doi', 'identifier': 'doi:10.1184/R1/12841247.v1', 'valid': None}]},
         ['arxiv:1107.5979v1'], False),
        # CASE2_8: more dois with "critic" prefixes
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.4725899', 'valid': None},
                                                      {'schema': 'doi', 'identifier': 'doi:10.1184/r1/12841247.v1', 'valid': None}]},
         ['doi:10.1184/r1/12841247.v1'], False),
        # CASE3: more dois with "critic" prefixes plus an empty dict entry
        ({'valid': [], 'not_valid': [], 'to_be_val': [{'schema': 'doi', 'identifier': 'doi:10.5281/zenodo.4725899', 'valid': None},
                                                      {'schema': 'doi', 'identifier': 'doi:10.1184/r1/12841247.v1', 'valid': None},
                                                      {'schema': 'doi', 'identifier': 'doi:10.7557/5.5607', 'valid': None},
                                                      {}]},
         ['doi:10.7557/5.5607'], False),
    ]
    for payload, expected, unordered in scenarios:
        op = OpenaireProcessing(testing=True)
        result = op.to_validated_id_list(payload)
        if unordered:
            self.assertCountEqual(result, expected)
        else:
            self.assertEqual(result, expected)
        op.storage_manager.delete_storage()
1418 

def test_add_authors_to_agent_list(self):
    """Creators (with and without an ORCID) are converted into author agent dicts."""
    processor = OpenaireProcessing()
    entity = {'creator': [{'name': 'Carlos Hoyos'}, {'name': 'Yaron Oz'}, {'identifiers': [{'identifier': '0000-0001-6946-5074', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-6946-5074'}], 'name': 'Bom Soo Kim'}]}
    expected = [{'role': 'author', 'name': 'Carlos Hoyos', 'family': '', 'given': ''}, {'role': 'author', 'name': 'Yaron Oz', 'family': '', 'given': ''}, {'role': 'author', 'name': 'Bom Soo Kim', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'}]
    self.assertEqual(expected, processor.add_authors_to_agent_list(entity, []))
    processor.storage_manager.delete_storage()
1426 

1427 

def test_add_authors_to_agent_list_redis(self):
    """Same creator conversion check, run against the redis-backed storage."""
    processor = OpenaireProcessing(testing=True)
    entity = {'creator': [{'name': 'Carlos Hoyos'}, {'name': 'Yaron Oz'}, {'identifiers': [{'identifier': '0000-0001-6946-5074', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-6946-5074'}], 'name': 'Bom Soo Kim'}]}
    expected = [{'role': 'author', 'name': 'Carlos Hoyos', 'family': '', 'given': ''}, {'role': 'author', 'name': 'Yaron Oz', 'family': '', 'given': ''}, {'role': 'author', 'name': 'Bom Soo Kim', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'}]
    self.assertEqual(expected, processor.add_authors_to_agent_list(entity, []))
    processor.storage_manager.delete_storage()
1435 

def test_add_authors_to_agent_list_no_creator(self):
    """An empty 'creator' list must produce an empty agent list."""
    processor = OpenaireProcessing()
    self.assertEqual([], processor.add_authors_to_agent_list({'creator': []}, []))
    processor.storage_manager.delete_storage()
1443 

1444 

def test_add_authors_to_agent_list_no_creator_redis(self):
    """Empty 'creator' list check, run against the redis-backed storage."""
    processor = OpenaireProcessing(testing=True)
    self.assertEqual([], processor.add_authors_to_agent_list({'creator': []}, []))
    processor.storage_manager.delete_storage()
1452 

def test_get_agents_strings_list(self):
    """Agents are rendered as display strings, with the ORCID appended in brackets."""
    target_doi = "doi:10.1007/jhep03(2014)050"
    agents = [
        {'role': 'author', 'name': 'Hoyos, Carlos', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Oz, Yaron', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Kim, Bom Soo', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'},
    ]
    processor = OpenaireProcessing()
    rendered = processor.get_agents_strings_list(target_doi, agents)
    self.assertEqual(rendered, (['Hoyos Carlos', 'Oz Yaron', 'Kim Bom Soo [orcid:0000-0001-6946-5074]'], []))
    processor.storage_manager.delete_storage()
1460 

def test_get_agents_strings_list_redis(self):
    """Agent string rendering check, run against the redis-backed storage."""
    target_doi = "doi:10.1007/jhep03(2014)050"
    agents = [
        {'role': 'author', 'name': 'Hoyos, Carlos', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Oz, Yaron', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Kim, Bom Soo', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'},
    ]
    processor = OpenaireProcessing(testing=True)
    rendered = processor.get_agents_strings_list(target_doi, agents)
    self.assertEqual(rendered, (['Hoyos Carlos', 'Oz Yaron', 'Kim Bom Soo [orcid:0000-0001-6946-5074]'], []))
    processor.storage_manager.delete_storage()
1468 

def test_find_openaire_orcid(self):
    """find_openaire_orcid returns a normalised orcid string for valid entries
    and honours validity flags already present in storage.

    NOTE(review): despite the non-redis name, this test also instantiates
    OpenaireProcessing with testing=True, like its *_redis twin — confirm intended.
    """
    def make(ident, schema='ORCID'):
        # Fresh identifier-entry list per call, mirroring the original literals.
        return [{'identifier': ident, 'schema': schema, 'url': 'https://orcid.org/0000-0001-9759-3938'}]

    processor = OpenaireProcessing(testing=True)
    # A well-formed ORCID with the right schema is accepted and normalised.
    self.assertEqual(processor.find_openaire_orcid(make('0000-0001-9759-3938')), "orcid:0000-0001-9759-3938")
    # An unknown schema is rejected.
    self.assertEqual(processor.find_openaire_orcid(make('0000-0001-9759-3938', 'fake_schema')), "")
    # A syntactically invalid identifier is rejected too.
    self.assertEqual(processor.find_openaire_orcid(make('5500-0001-9759-3938')), "")
    processor.orcid_m.storage_manager.delete_storage()

    # Pre-marking a valid id as invalid in storage must short-circuit the API
    # check and yield an empty result.
    processor = OpenaireProcessing(testing=True)
    processor.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", False)
    self.assertEqual(processor.find_openaire_orcid(make('0000-0001-9759-3938')), "")
    processor.orcid_m.storage_manager.delete_storage()

    # Conversely, a stored True flag must be trusted.
    processor = OpenaireProcessing(testing=True)
    processor.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", True)
    self.assertEqual(processor.find_openaire_orcid(make('0000-0001-9759-3938')), "orcid:0000-0001-9759-3938")
    processor.orcid_m.storage_manager.delete_storage()
1506 

1507 

def test_find_openaire_orcid_redis(self):
    """ORCID validation checks against the redis-backed storage: normalisation,
    schema/format rejection, and trusting validity flags already in storage."""
    def entry(ident, schema='ORCID'):
        # Build a fresh identifier-entry list per call, as the original did.
        return [{'identifier': ident, 'schema': schema, 'url': 'https://orcid.org/0000-0001-9759-3938'}]

    worker = OpenaireProcessing(testing=True)
    # Valid ORCID → normalised string.
    self.assertEqual(worker.find_openaire_orcid(entry('0000-0001-9759-3938')), "orcid:0000-0001-9759-3938")
    # Wrong schema → empty string.
    self.assertEqual(worker.find_openaire_orcid(entry('0000-0001-9759-3938', 'fake_schema')), "")
    # Malformed identifier → empty string.
    self.assertEqual(worker.find_openaire_orcid(entry('5500-0001-9759-3938')), "")
    worker.orcid_m.storage_manager.delete_storage()

    # A stored False flag short-circuits the API check for an otherwise valid id.
    worker = OpenaireProcessing(testing=True)
    worker.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", False)
    self.assertEqual(worker.find_openaire_orcid(entry('0000-0001-9759-3938')), "")
    worker.orcid_m.storage_manager.delete_storage()

    # A stored True flag is likewise trusted.
    worker = OpenaireProcessing(testing=True)
    worker.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", True)
    self.assertEqual(worker.find_openaire_orcid(entry('0000-0001-9759-3938')), "orcid:0000-0001-9759-3938")
    worker.orcid_m.storage_manager.delete_storage()
1545 

def test_update_redis_values(self):
    """update_redis_values stores the given br/ra id lists on the instance."""
    bib_resources = ["pmid:2", "pmid:3"]
    resp_agents = ["orcid:0000-0003-0530-4305"]
    processor = OpenaireProcessing(testing=True)
    processor.update_redis_values(bib_resources, resp_agents)
    self.assertEqual(processor._redis_values_br, bib_resources)
    self.assertEqual(processor._redis_values_ra, resp_agents)
1553 

1554 

def test_find_openaire_orcid_with_index(self):
    """Test ORCID validation using ORCID index before API validation."""
    indexed_doi = "10.1234/test123"
    indexed_orcid = "0000-0002-1234-5678"
    author_name = "Smith, John"

    # Seed the ORCID index: id_string -> "Name [orcid:...]" value.
    processor = OpenaireProcessing()
    processor.orcid_index.add_value(indexed_doi, f"{author_name} [orcid:{indexed_orcid}]")

    # Case 1: the ORCID is found in the index for this DOI.
    hit = processor.find_openaire_orcid([{'identifier': indexed_orcid, 'schema': 'ORCID'}], indexed_doi)
    self.assertEqual(hit, f"orcid:{indexed_orcid}")
    # The index hit must also be recorded in the temporary storage.
    self.assertTrue(processor.tmp_orcid_m.storage_manager.get_value(f"orcid:{indexed_orcid}"))

    # Case 2: not in the index, but validated via the API.
    api_hit = processor.find_openaire_orcid([{'identifier': '0000-0003-4082-1500', 'schema': 'ORCID'}], indexed_doi)
    self.assertEqual(api_hit, "orcid:0000-0003-4082-1500")

    # Case 3: not in the index and invalid -> empty string.
    miss = processor.find_openaire_orcid([{'identifier': '0000-0000-0000-0000', 'schema': 'ORCID'}], indexed_doi)
    self.assertEqual(miss, "")

    # Case 4: valid ORCID without a DOI still validates via the API.
    no_doi_hit = processor.find_openaire_orcid([{'identifier': indexed_orcid, 'schema': 'ORCID'}])
    self.assertEqual(no_doi_hit, f"orcid:{indexed_orcid}")

    # Cleanup
    processor.storage_manager.delete_storage()
1595 

1596 

def test_validated_as_with_storage_manager(storage_manager):
    """validated_as reflects stored validity flags: True/False when present,
    None for ids the storage has never seen (pytest-style, fixture-driven)."""
    unseen = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}
    stored_valid = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    stored_invalid = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}

    processor = OpenaireProcessing(storage_manager=storage_manager, testing=True)
    # Pre-load one valid and one invalid flag into the doi manager's storage.
    processor.doi_m.storage_manager.set_value(stored_valid["identifier"], True)
    processor.doi_m.storage_manager.set_value(stored_invalid["identifier"], False)

    assert processor.validated_as(stored_valid) is True
    assert processor.validated_as(stored_invalid) is False
    assert processor.validated_as(unseen) is None
1609 

1610 

1611if __name__ == '__main__': 

1612 unittest.main()