Coverage for test/processing_oroci

1# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it>

2# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it>

3# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelo.massari@unibo.it>

5# SPDX-License-Identifier: ISC

7import os

8import unittest

10from oc_ds_converter.lib.jsonmanager import *

11from oc_ds_converter.openaire.openaire_processing import OpenaireProcessing

12#

# Relative locations of the fixtures and scratch directories used by the
# OpenAIRE processing tests. Everything hangs off BASE.
BASE = os.path.join('test', 'openaire_processing')
# Single JSON input file.
DATA = os.path.join(BASE, 'jSonFile_1.json')
# Directory-style input: same folder as BASE.
DATA_DIR = BASE
# Scratch folder for support material produced while processing.
TMP_SUPPORT_MATERIAL = os.path.join(BASE, "tmp_support")
OUTPUT = os.path.join(BASE, 'meta_input')
MULTIPROCESS_OUTPUT = os.path.join(BASE, 'multi_process_test')
# Cache file written by the dict_to_cache test; lives under TMP_SUPPORT_MATERIAL.
MEMO_JSON_PATH = "test/openaire_processing/tmp_support/memo.json"

# Fixture: one OpenAIRE entity record (a publication carrying DOI and arXiv
# identifiers, collected from six providers).
SAMPLE_ENTITY = {
    'collectedFrom': [
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::081b82f96300b6a6e3d282bad31cb6e2', 'schema': 'DNET Identifier'}], 'name': 'Crossref'}, 'provisionMode': 'collected'},
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::8ac8380272269217cb09a928c8caa993', 'schema': 'DNET Identifier'}], 'name': 'UnpayWall'}, 'provisionMode': 'collected'},
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::806360c771262b4d6770e7cdf04b5c5a', 'schema': 'DNET Identifier'}], 'name': 'ORCID'}, 'provisionMode': 'collected'},
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a', 'schema': 'DNET Identifier'}], 'name': 'Microsoft Academic Graph'}, 'provisionMode': 'collected'},
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::9e3be59865b2c1c335d32dae2fe7b254', 'schema': 'DNET Identifier'}], 'name': 'Datacite'}, 'provisionMode': 'collected'},
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::6f4922f45568161a8cdf4ad2299f6d23', 'schema': 'DNET Identifier'}], 'name': 'arXiv.org e-Print Archive'}, 'provisionMode': 'collected'},
    ],
    'creator': [
        {'name': 'Matteo Serra'},
        {'name': 'Salvatore Mignemi'},
        {'identifiers': [{'identifier': '0000-0001-5595-7537', 'schema': 'ORCID', 'url': 'https://orcid.org/0000-0001-5595-7537'}], 'name': 'Mariano Cadoni'},
    ],
    'dnetIdentifier': '50|doi_dedup___::41074cd388749ccbdb6668caaf059f4a',
    'identifier': [
        {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi', 'url': 'https://doi.org/10.1103/physrevd.84.084046'},
        {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi', 'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'},
        {'identifier': '1107.5979', 'schema': 'arXiv', 'url': 'http://arxiv.org/abs/1107.5979'},
    ],
    'objectSubType': 'Article',
    'objectType': 'publication',
    'publicationDate': '2011-10-21',
    'publisher': [{'name': 'American Physical Society (APS)'}],
    'title': 'Exact solutions with AdS asymptotics of Einstein and Einstein-Maxwell gravity minimally coupled to a scalar field',
}

# Fixture: one OpenAIRE citation link ("Cites" relationship) between a source
# publication identified by pmid/pmc and a target publication identified by doi.
SAMPLE_ENT2 = {
    "identifier": "000017d2c913b28e09291b811ce3609a",
    "linkprovider": [
        {"identifiers": [{"identifier": "10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73", "schema": "DNET Identifier"}], "name": "CORE (RIOXX-UK Aggregator)"},
        {"identifiers": [{"identifier": "10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357", "schema": "DNET Identifier"}], "name": "PubMed Central"},
        {"identifiers": [{"identifier": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", "schema": "DNET Identifier"}], "name": "Europe PubMed Central"},
        {"identifiers": [{"identifier": "10|opendoar____::229754d7799160502a143a72f6789927", "schema": "DNET Identifier"}], "name": "Publications at Bielefeld University"},
    ],
    "publicationDate": "2014-02-01",
    "publisher": [{"name": "Springer Nature"}],
    "relationship": {"inverse": "IsCitedBy", "name": "Cites", "schema": "datacite"},
    "source": {
        "collectedFrom": [
            {"completionStatus": "complete", "provider": {"identifiers": [{"identifier": "10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73", "schema": "DNET Identifier"}], "name": "CORE (RIOXX-UK Aggregator)"}, "provisionMode": "collected"},
            {"completionStatus": "complete", "provider": {"identifiers": [{"identifier": "10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357", "schema": "DNET Identifier"}], "name": "PubMed Central"}, "provisionMode": "collected"},
            {"completionStatus": "complete", "provider": {"identifiers": [{"identifier": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", "schema": "DNET Identifier"}], "name": "Europe PubMed Central"}, "provisionMode": "collected"},
            {"completionStatus": "complete", "provider": {"identifiers": [{"identifier": "10|opendoar____::229754d7799160502a143a72f6789927", "schema": "DNET Identifier"}], "name": "Publications at Bielefeld University"}, "provisionMode": "collected"},
        ],
        "creator": [
            {"identifiers": [{"identifier": "0000-0002-6491-0754", "schema": "ORCID", "url": "https://orcid.org/0000-0002-6491-0754"}], "name": "Sattler, Sebastian"},
            {"name": "Mehlkop, Guido"},
            {"name": "Graeff, Peter"},
            {"identifiers": [{"identifier": "0000-0002-8090-6886", "schema": "ORCID", "url": "https://orcid.org/0000-0002-8090-6886"}], "name": "Sauer, Carsten"},
        ],
        "dnetIdentifier": "50|pmid_dedup__::8936076da7a86820c24ede7ca3ff15b3",
        "identifier": [
            {"identifier": "PMC3928621", "schema": "pmc", "url": "http://europepmc.org/articles/PMC3928621"},
            {"identifier": "24484640", "schema": "pmid"},
            {"identifier": "24484640", "schema": "pmid", "url": "https://pubmed.ncbi.nlm.nih.gov/24484640"},
            {"identifier": "PMC3928621", "schema": "pmc"},
        ],
        "objectSubType": "Article",
        "objectType": "publication",
        "publicationDate": "2014-02-01",
        "publisher": [{"name": "Springer Nature"}],
        "title": "Evaluating the drivers of and obstacles to the willingness to use cognitive enhancement drugs: the influence of drug characteristics, social environment, and personal characteristics",
    },
    "target": {
        "collectedFrom": [
            {"completionStatus": "complete", "provider": {"identifiers": [{"identifier": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "schema": "DNET Identifier"}], "name": "Crossref"}, "provisionMode": "collected"},
            {"completionStatus": "complete", "provider": {"identifiers": [{"identifier": "10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a", "schema": "DNET Identifier"}], "name": "Microsoft Academic Graph"}, "provisionMode": "collected"},
        ],
        "creator": [{"name": "Harold G. Grasmick"}, {"name": "Robert J. Bursik"}],
        "dnetIdentifier": "50|doi_________::816648c63de74835ec2b0a753a68f037",
        "identifier": [{"identifier": "10.2307/3053861", "schema": "doi", "url": "https://doi.org/10.2307/3053861"}],
        "objectSubType": "Article",
        "objectType": "publication",
        "publicationDate": "1990-01-01",
        "publisher": [{"name": "JSTOR"}],
        "title": "Conscience, significant others, and rational choice: Extending the deterrence model.",
    },
}

# Fixture: an entity in the shape handed to csv_creator, i.e. with the
# "identifier" field already replaced by a dict of 'valid' / 'not_valid' /
# 'to_be_val' lists and with a "redis_validity_lists" entry added.
# Shared by several tests: copy it before modifying it.
SAMPLE_ENTITY_FOR_CSV_CREATOR = {
    'collectedFrom': [
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::0a836ef43dcb67bb7cbd4dd509b11b73', 'schema': 'DNET Identifier'}], 'name': 'CORE (RIOXX-UK Aggregator)'}, 'provisionMode': 'collected'},
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::eda80a3d5b344bc40f3bc04f65b7a357', 'schema': 'DNET Identifier'}], 'name': 'PubMed Central'}, 'provisionMode': 'collected'},
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c', 'schema': 'DNET Identifier'}], 'name': 'Europe PubMed Central'}, 'provisionMode': 'collected'},
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|driver______::bee53aa31dc2cbb538c10c2b65fa5824', 'schema': 'DNET Identifier'}], 'name': 'DOAJ-Articles'}, 'provisionMode': 'collected'},
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|opendoar____::566a9968b43628588e76be5a85a0f9e8', 'schema': 'DNET Identifier'}], 'name': "King's Research Portal"}, 'provisionMode': 'collected'},
        {'completionStatus': 'complete', 'provider': {'identifiers': [{'identifier': '10|openaire____::c2cdfa5866e03cdd07d313cbc8fb8311', 'schema': 'DNET Identifier'}], 'name': 'Multidisciplinary Digital Publishing Institute'}, 'provisionMode': 'collected'},
    ],
    'creator': [
        {'name': 'Smith, Lee'},
        {'name': 'Sawyer, Alexia'},
        {'name': 'Gardner, Benjamin'},
        {'name': 'Seppala, Katri'},
        {'name': 'Ucci, Marcella'},
        {'name': 'Marmot, Alexi'},
        {'name': 'Lally, Pippa'},
        {'name': 'Fisher, Abi'},
    ],
    'dnetIdentifier': '50|pmid_dedup__::a1a8687c2378a0d68314566dec29dafb',
    'objectSubType': 'Article',
    'objectType': 'publication',
    'publicationDate': '2018-06-09',
    'publisher': [{'name': 'MDPI'}],
    'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study',
    'identifier': {
        'valid': [],
        'not_valid': [],
        'to_be_val': [{'schema': 'pmid', 'identifier': 'pmid:29890726', 'valid': None}],
    },
    "redis_validity_lists": [[], []],
}

26class TestOpenaireProcessing(unittest.TestCase):

def delete_storege(self, storage_type=None, specific_path=None):
    """
    Remove a storage file from disk, if it exists.

    :param storage_type: only read when ``specific_path`` is falsy. "sqlite"
        targets ``<cwd>/storage/id_valid_dict.db`` (or, when that file is
        absent, the same path with a literal "?mode=rw" suffix); any other
        value targets ``<cwd>/storage/id_value.json``.
    :param specific_path: explicit path of the file to remove; when truthy it
        takes precedence over ``storage_type``.
    :return: None. A missing file is silently ignored.
    """
    if specific_path:
        target = specific_path
    else:
        storage_dir = os.path.join(os.getcwd(), "storage")
        if storage_type == "sqlite":
            target = os.path.join(storage_dir, "id_valid_dict.db")
            if not os.path.exists(target):
                # Fall back to the file name carrying the URI-style suffix.
                target = target + "?mode=rw"
        else:
            target = os.path.join(storage_dir, "id_value.json")

    if os.path.exists(target):
        os.remove(target)

def test_get_all_ids(self):
    """
    Check that extract_all_ids, given a citation entity, returns the work
    identifiers (pmid, pmcid, doi) at index 0 and the ORCID identifiers at
    index 1. Run on a processor built with default arguments.
    """
    opp = OpenaireProcessing()
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(opp.storage_manager.delete_storage)

    allids = opp.extract_all_ids(SAMPLE_ENT2)

    self.assertCountEqual(['pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'], allids[0])
    self.assertCountEqual(['orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'], allids[1])

def test_get_all_ids_redis(self):
    """
    Check that extract_all_ids, given a citation entity, returns the work
    identifiers (pmid, pmcid, doi) at index 0 and the ORCID identifiers at
    index 1. Run on a processor built with ``testing=True``.
    """
    opp = OpenaireProcessing(testing=True)
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(opp.storage_manager.delete_storage)

    allids = opp.extract_all_ids(SAMPLE_ENT2)

    self.assertCountEqual(['pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'], allids[0])
    self.assertCountEqual(['orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'], allids[1])

def test_get_redis_validity_list(self):
    """
    Check that get_redis_validity_list returns an empty list for both the
    "br" and the "ra" id sets when nothing was added to the redis lookups
    beforehand. Run on a processor built with default arguments.
    """
    opp = OpenaireProcessing()
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(opp.storage_manager.delete_storage)

    br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    br_valid_list = opp.get_redis_validity_list(br, "br")
    ra_valid_list = opp.get_redis_validity_list(ra, "ra")

    self.assertEqual(ra_valid_list, [])
    self.assertEqual(br_valid_list, [])

def test_get_redis_validity_list_redis(self):
    """
    Check that get_redis_validity_list returns an empty list for both the
    "br" and the "ra" id sets when nothing was added to the redis lookups
    beforehand. Run on a processor built with ``testing=True``.
    """
    opp = OpenaireProcessing(testing=True)
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(opp.storage_manager.delete_storage)

    br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    br_valid_list = opp.get_redis_validity_list(br, "br")
    ra_valid_list = opp.get_redis_validity_list(ra, "ra")

    self.assertEqual(ra_valid_list, [])
    self.assertEqual(br_valid_list, [])

def test_get_reids_validity_dict_w_fakeredis_db_values_sqlite(self):
    """
    Check that get_redis_validity_list returns exactly the ids that were
    previously added to BR_redis / RA_redis, and none of the others.
    Run on a processor built with default arguments.
    """
    opp = OpenaireProcessing()

    # Cleanups run in reverse registration order, so this reproduces the
    # sequence: delete storage, then drop the seeded redis keys. Registering
    # them before seeding guarantees the keys never leak into other tests,
    # even when an assertion below fails.
    self.addCleanup(opp.RA_redis.delete, 'orcid:0000-0002-8090-6886')
    self.addCleanup(opp.BR_redis.delete, 'pmcid:PMC3928621')
    self.addCleanup(opp.BR_redis.delete, 'pmid:24484640')
    self.addCleanup(opp.storage_manager.delete_storage)

    # Seed one work id and one ORCID into the redis lookups.
    opp.BR_redis.sadd('pmid:24484640', "omid:1")
    opp.RA_redis.sadd('orcid:0000-0002-8090-6886', "omid:2")

    br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    br_valid_list = opp.get_redis_validity_list(br, "br")
    ra_valid_list = opp.get_redis_validity_list(ra, "ra")

    self.assertEqual(br_valid_list, ["pmid:24484640"])
    self.assertEqual(ra_valid_list, ['orcid:0000-0002-8090-6886'])

105

def test_get_reids_validity_dict_w_fakeredis_db_values_redis(self):
    """
    Check that get_redis_validity_list returns exactly the ids that were
    previously added to BR_redis / RA_redis, and none of the others.
    Run on a processor built with ``testing=True``.
    """
    opp = OpenaireProcessing(testing=True)

    # Cleanups run in reverse registration order, so this reproduces the
    # sequence: delete storage, then drop the seeded redis keys. Registering
    # them before seeding guarantees the keys never leak into other tests,
    # even when an assertion below fails.
    self.addCleanup(opp.RA_redis.delete, 'orcid:0000-0002-8090-6886')
    self.addCleanup(opp.BR_redis.delete, 'pmcid:PMC3928621')
    self.addCleanup(opp.BR_redis.delete, 'pmid:24484640')
    self.addCleanup(opp.storage_manager.delete_storage)

    # Seed one work id and one ORCID into the redis lookups.
    opp.BR_redis.sadd('pmid:24484640', "omid:1")
    opp.RA_redis.sadd('orcid:0000-0002-8090-6886', "omid:2")

    br = {'pmid:24484640', 'pmcid:PMC3928621', 'doi:10.2307/3053861'}
    ra = {'orcid:0000-0002-8090-6886', 'orcid:0000-0002-6491-0754'}

    br_valid_list = opp.get_redis_validity_list(br, "br")
    ra_valid_list = opp.get_redis_validity_list(ra, "ra")

    self.assertEqual(br_valid_list, ["pmid:24484640"])
    self.assertEqual(ra_valid_list, ['orcid:0000-0002-8090-6886'])

126

def test_validated_as_default(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value:
    string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested
    - With default storage manager (sqlite) without a pre-existent db associated
    Only the "None" outcome is exercised here, since nothing was stored beforehand.
    """
    opp = OpenaireProcessing()
    # Registered up front so the storage is removed even if the assertion fails.
    self.addCleanup(opp.storage_manager.delete_storage)

    outcome = opp.validated_as({"schema": "pmid", "identifier": "pmid:23483834"})

    self.assertIsNone(outcome)

142

def test_validated_as_default_redis(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value:
    string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested
    - With redis storage manager without a pre-existent db associated
    Only the "None" outcome is exercised here, since nothing was stored beforehand.
    """
    opp = OpenaireProcessing(testing=True)
    # Registered up front so the storage is removed even if the assertion fails.
    self.addCleanup(opp.storage_manager.delete_storage)

    outcome = opp.validated_as({"schema": "pmid", "identifier": "pmid:23483834"})

    self.assertIsNone(outcome)

158

def test_validated_as_redis_with_preexistent_data(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value:
    string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested
    - With redis storage manager and pre-existent data associated
    """
    valid_pmid_not_in_db = {"identifier": "pmid:2938", "schema": "pmid"}
    valid_pmid_in_db = {"identifier": "pmid:23483834", "schema": "pmid"}
    invalid_pmid_in_db = {"identifier": "pmid:18328387372097", "schema": "pmid"}

    opp_redis = OpenaireProcessing(testing=True)
    pmid_storage = opp_redis.pmid_m.storage_manager
    # Registered before seeding so the stored values never outlive the test.
    self.addCleanup(pmid_storage.delete_storage)

    # Seed the validation outcomes directly on the pmid manager's storage.
    pmid_storage.set_value(valid_pmid_in_db["identifier"], True)
    pmid_storage.set_value(invalid_pmid_in_db["identifier"], False)

    validated_as_true = opp_redis.validated_as(valid_pmid_in_db)
    validated_as_false = opp_redis.validated_as(invalid_pmid_in_db)
    not_validated = opp_redis.validated_as(valid_pmid_not_in_db)

    self.assertEqual(validated_as_true, True)
    self.assertEqual(validated_as_false, False)
    self.assertIsNone(not_validated)

186

187

def test_validated_as_inmemory(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value:
    string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested
    - With in Memory + Json storage manager and a pre-existent db associated
    - With in Memory + Json storage manager without a pre-existent db associated
    """
    # NOTE(review): despite the name and the description above, the processor
    # is built with testing=True, exactly like the redis tests; no in-memory
    # storage manager is selected here. Confirm the intended setup.
    valid_pmid_not_in_db = {"identifier": "pmid:2938", "schema": "pmid"}
    valid_pmid_in_db = {"identifier": "pmid:23483834", "schema": "pmid"}
    invalid_pmid_in_db = {"identifier": "pmid:18328387372097", "schema": "pmid"}

    opp = OpenaireProcessing(testing=True)
    pmid_storage = opp.pmid_m.storage_manager
    # Registered before seeding so the stored values never outlive the test.
    self.addCleanup(pmid_storage.delete_storage)

    # Seed the validation outcomes directly on the pmid manager's storage.
    pmid_storage.set_value(valid_pmid_in_db["identifier"], True)
    pmid_storage.set_value(invalid_pmid_in_db["identifier"], False)

    validated_as_true = opp.validated_as(valid_pmid_in_db)
    validated_as_false = opp.validated_as(invalid_pmid_in_db)
    not_validated = opp.validated_as(valid_pmid_not_in_db)

    self.assertEqual(validated_as_true, True)
    self.assertEqual(validated_as_false, False)
    self.assertIsNone(not_validated)

217

218

def test_validated_as_redis(self):
    """
    Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value:
    string of the identifier), the method "validated_as" returns:
    - True if the id was already validated as valid
    - False if the id was already validated as invalid
    - None if the id was not validated before
    The procedure is tested
    - With REDIS storage manager and a pre-existent db associated
    - With REDIS storage manager without a pre-existent db associated
    """
    # NOTE(review): this body is the same scenario as
    # test_validated_as_redis_with_preexistent_data; consider merging them.
    valid_pmid_not_in_db = {"identifier": "pmid:2938", "schema": "pmid"}
    valid_pmid_in_db = {"identifier": "pmid:23483834", "schema": "pmid"}
    invalid_pmid_in_db = {"identifier": "pmid:18328387372097", "schema": "pmid"}

    opp_redis = OpenaireProcessing(testing=True)
    pmid_storage = opp_redis.pmid_m.storage_manager
    # Registered before seeding so the stored values never outlive the test.
    self.addCleanup(pmid_storage.delete_storage)

    # Seed the validation outcomes directly on the pmid manager's storage.
    pmid_storage.set_value(valid_pmid_in_db["identifier"], True)
    pmid_storage.set_value(invalid_pmid_in_db["identifier"], False)

    validated_as_true = opp_redis.validated_as(valid_pmid_in_db)
    validated_as_false = opp_redis.validated_as(invalid_pmid_in_db)
    not_validated = opp_redis.validated_as(valid_pmid_not_in_db)

    self.assertEqual(validated_as_true, True)
    self.assertEqual(validated_as_false, False)
    self.assertIsNone(not_validated)

247

def test_get_id_manager(self):
    """Check that, given in input the string of a schema (e.g.:'pmid') or an id with a prefix (e.g.: 'pmid:12334')
    and a dictionary mapping the strings of the schemas to their id managers, the method returns the correct
    id manager. Note that each instance of the Preprocessing class needs its own instances of the id managers,
    in order to avoid conflicts while validating data.

    Run on a processor built with default arguments."""
    op = OpenaireProcessing()
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(op.storage_manager.delete_storage)
    id_man_dict = op._id_man_dict

    # (schema string, prefixed id expected to be valid for that schema)
    cases = (
        ("pmid", "pmid:12345"),
        ("doi", "doi:10.1103/physrevd.84.084046"),
        ("pmcid", "pmcid:PMC5555555"),
        ("arxiv", "arxiv:1509.08217"),
    )
    for schema, prefixed_id in cases:
        with self.subTest(schema=schema):
            manager_from_id = op.get_id_manager(prefixed_id, id_man_dict)
            manager_from_schema = op.get_id_manager(schema, id_man_dict)

            # The returned manager must be the one able to validate this kind of id.
            self.assertTrue(manager_from_id.is_valid(prefixed_id))
            self.assertTrue(manager_from_schema.is_valid(prefixed_id))

294

def test_get_id_manager_redis(self):
    """Check that, given in input the string of a schema (e.g.:'pmid') or an id with a prefix (e.g.: 'pmid:12334')
    and a dictionary mapping the strings of the schemas to their id managers, the method returns the correct
    id manager. Note that each instance of the Preprocessing class needs its own instances of the id managers,
    in order to avoid conflicts while validating data.

    Run on a processor built with ``testing=True``."""
    op = OpenaireProcessing(testing=True)
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(op.storage_manager.delete_storage)
    id_man_dict = op._id_man_dict

    # (schema string, prefixed id expected to be valid for that schema)
    cases = (
        ("pmid", "pmid:12345"),
        ("doi", "doi:10.1103/physrevd.84.084046"),
        ("pmcid", "pmcid:PMC5555555"),
        ("arxiv", "arxiv:1509.08217"),
    )
    for schema, prefixed_id in cases:
        with self.subTest(schema=schema):
            manager_from_id = op.get_id_manager(prefixed_id, id_man_dict)
            manager_from_schema = op.get_id_manager(schema, id_man_dict)

            # The returned manager must be the one able to validate this kind of id.
            self.assertTrue(manager_from_id.is_valid(prefixed_id))
            self.assertTrue(manager_from_schema.is_valid(prefixed_id))

341

342

def test_normalise_any_id(self):
    """
    Check that, given an id with a prefix, any doi, pmid, pmcid and arxiv id is correctly normalised.
    Run on a processor built with default arguments.
    """
    op = OpenaireProcessing()
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(op.storage_manager.delete_storage)

    pmid_id = "pmid:12345"
    doi_id = "doi:10.1103/physrevd.84.084046"
    arxiv_id = "arxiv:1509.08217"
    pmc_id = "pmcid:PMC5555555"

    # pmid: trailing letters are dropped.
    self.assertEqual(pmid_id, op.normalise_any_id(pmid_id + "abc"))
    # doi: an upper-cased DOI comes back lower-cased.
    self.assertEqual(doi_id, op.normalise_any_id("doi:" + doi_id.split(":")[1].upper()))
    # arxiv: repeated dots collapse and a "v1" version suffix is appended.
    self.assertEqual(arxiv_id + "v1", op.normalise_any_id(arxiv_id.replace(".", "....")))
    # pmcid: trailing whitespace is stripped.
    self.assertEqual(pmc_id, op.normalise_any_id(pmc_id + " "))

360

def test_normalise_any_id_redis(self):
    """
    Check that, given an id with a prefix, any doi, pmid, pmcid and arxiv id is correctly normalised.
    Run on a processor built with ``testing=True``.
    """
    op = OpenaireProcessing(testing=True)
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(op.storage_manager.delete_storage)

    pmid_id = "pmid:12345"
    doi_id = "doi:10.1103/physrevd.84.084046"
    arxiv_id = "arxiv:1509.08217"
    pmc_id = "pmcid:PMC5555555"

    # pmid: trailing letters are dropped.
    self.assertEqual(pmid_id, op.normalise_any_id(pmid_id + "abc"))
    # doi: an upper-cased DOI comes back lower-cased.
    self.assertEqual(doi_id, op.normalise_any_id("doi:" + doi_id.split(":")[1].upper()))
    # arxiv: repeated dots collapse and a "v1" version suffix is appended.
    self.assertEqual(arxiv_id + "v1", op.normalise_any_id(arxiv_id.replace(".", "....")))
    # pmcid: trailing whitespace is stripped.
    self.assertEqual(pmc_id, op.normalise_any_id(pmc_id + " "))

378

def test_get_norm_ids(self):
    """
    Check that, given a list of dictionaries representing the ids of an entity, the method returns a reduced version
    of the same list, containing only the normalised version of the ids of the schemas managed by opencitations.
    Each reduced dictionary only contains two key-value pairs, i.e.: "identifier" and "schema".
    Run on a processor built with default arguments.
    """
    op = OpenaireProcessing()
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(op.storage_manager.delete_storage)

    # Case 1: the same DOI appears twice (different letter case); one entry is expected back.
    list_of_ids_to_norm_with_duplicates = [
        {'identifier': '10.1103/PHYSREVD.84.084046', 'schema': 'doi',
         'url': 'https://doi.org/10.1103/physrevd.84.084046'},
        {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi',
         'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'},
        {'identifier': '1107.5979', 'schema': 'arXiv', 'url': 'http://arxiv.org/abs/1107.5979'},
    ]
    exp_norm_ids = [
        {'identifier': 'doi:10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': 'doi:10.48550/arxiv.1107.5979', 'schema': 'doi'},
        {'identifier': 'arxiv:1107.5979v1', 'schema': 'arxiv'},
    ]
    norm_ids = op.get_norm_ids(list_of_ids_to_norm_with_duplicates)

    # Case 2: an id of a schema that is not managed (handle) is dropped.
    list_of_ids_w_not_managed_schema = [
        {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '21887584', 'schema': 'pmid', 'url': 'https://pubmed.ncbi.nlm.nih.gov/21887584'},
        {'identifier': '10.1007/s12160-011-9282-0', 'schema': 'doi', 'url': 'https://doi.org/10.1007/s12160-011-9282-0'},
    ]
    exp_norm_ids_2 = [
        {'identifier': 'pmid:21887584', 'schema': 'pmid'},
        {'identifier': 'doi:10.1007/s12160-011-9282-0', 'schema': 'doi'},
    ]
    norm_ids_2 = op.get_norm_ids(list_of_ids_w_not_managed_schema)

    # Case 3: only an unmanaged schema and a DOI that cannot be normalised: nothing is kept.
    list_of_ids_not_managed_and_not_normalisable_only = [
        {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '20.ABC/s12160-011-9282-FAKEID', 'schema': 'doi', 'url': 'https://doi.org/10.1007/s12160-011-9282-0'},
    ]
    exp_norm_ids_3 = []
    norm_ids_3 = op.get_norm_ids(list_of_ids_not_managed_and_not_normalisable_only)

    self.assertEqual(norm_ids, exp_norm_ids)
    self.assertEqual(norm_ids_2, exp_norm_ids_2)
    self.assertEqual(norm_ids_3, exp_norm_ids_3)

417

418

def test_get_norm_ids_redis(self):
    """
    Check that, given a list of dictionaries representing the ids of an entity, the method returns a reduced version
    of the same list, containing only the normalised version of the ids of the schemas managed by opencitations.
    Each reduced dictionary only contains two key-value pairs, i.e.: "identifier" and "schema".
    Run on a processor built with ``testing=True``.
    """
    op = OpenaireProcessing(testing=True)
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(op.storage_manager.delete_storage)

    # Case 1: the same DOI appears twice (different letter case); one entry is expected back.
    list_of_ids_to_norm_with_duplicates = [
        {'identifier': '10.1103/PHYSREVD.84.084046', 'schema': 'doi',
         'url': 'https://doi.org/10.1103/physrevd.84.084046'},
        {'identifier': '10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': '10.48550/arxiv.1107.5979', 'schema': 'doi',
         'url': 'https://dx.doi.org/10.48550/arxiv.1107.5979'},
        {'identifier': '1107.5979', 'schema': 'arXiv', 'url': 'http://arxiv.org/abs/1107.5979'},
    ]
    exp_norm_ids = [
        {'identifier': 'doi:10.1103/physrevd.84.084046', 'schema': 'doi'},
        {'identifier': 'doi:10.48550/arxiv.1107.5979', 'schema': 'doi'},
        {'identifier': 'arxiv:1107.5979v1', 'schema': 'arxiv'},
    ]
    norm_ids = op.get_norm_ids(list_of_ids_to_norm_with_duplicates)

    # Case 2: an id of a schema that is not managed (handle) is dropped.
    list_of_ids_w_not_managed_schema = [
        {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '21887584', 'schema': 'pmid', 'url': 'https://pubmed.ncbi.nlm.nih.gov/21887584'},
        {'identifier': '10.1007/s12160-011-9282-0', 'schema': 'doi', 'url': 'https://doi.org/10.1007/s12160-011-9282-0'},
    ]
    exp_norm_ids_2 = [
        {'identifier': 'pmid:21887584', 'schema': 'pmid'},
        {'identifier': 'doi:10.1007/s12160-011-9282-0', 'schema': 'doi'},
    ]
    norm_ids_2 = op.get_norm_ids(list_of_ids_w_not_managed_schema)

    # Case 3: only an unmanaged schema and a DOI that cannot be normalised: nothing is kept.
    list_of_ids_not_managed_and_not_normalisable_only = [
        {'identifier': '11245/1.357137', 'schema': 'handle', 'url': 'https://hdl.handle.net/11245/1.357137'},
        {'identifier': '20.ABC/s12160-011-9282-FAKEID', 'schema': 'doi', 'url': 'https://doi.org/10.1007/s12160-011-9282-0'},
    ]
    exp_norm_ids_3 = []
    norm_ids_3 = op.get_norm_ids(list_of_ids_not_managed_and_not_normalisable_only)

    self.assertEqual(norm_ids, exp_norm_ids)
    self.assertEqual(norm_ids_2, exp_norm_ids_2)
    self.assertEqual(norm_ids_3, exp_norm_ids_3)

457

def test_dict_to_cache(self):
    """
    Check that dict_to_cache creates the file at the given path, and that
    the delete_storege helper removes it again.
    """
    op = OpenaireProcessing()
    # Registered up front so the storage is removed even if an assertion fails.
    self.addCleanup(op.storage_manager.delete_storage)
    sample_dict = {"dict_type": "sample"}

    # Start from a clean slate: a leftover cache file would mask a failure to create it.
    if os.path.exists(MEMO_JSON_PATH):
        os.remove(MEMO_JSON_PATH)
    self.assertFalse(os.path.exists(MEMO_JSON_PATH))

    op.dict_to_cache(sample_dict, MEMO_JSON_PATH)
    self.assertTrue(os.path.exists(MEMO_JSON_PATH))

    self.delete_storege(specific_path=MEMO_JSON_PATH)
    self.assertFalse(os.path.exists(MEMO_JSON_PATH))

469

470

def test_csv_creator_base(self):
    """
    Check that, given an updated openaire entity (i.e.: where the "identifier" field was modified
    after having checked the presence of the given identifiers in the storage memory) a meta csv
    table for the entity is created.
    Run on a processor built with default arguments.
    """
    op = OpenaireProcessing()
    # Registered up front so the storage is removed even if the assertion fails.
    self.addCleanup(op.storage_manager.delete_storage)

    expected_row = {
        'id': 'pmid:29890726',
        'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study',
        'author': 'Smith Lee; Sawyer Alexia; Gardner Benjamin; Seppala Katri; Ucci Marcella; Marmot Alexi; Lally Pippa; Fisher Abi',
        'pub_date': '2018-06-09',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'journal article',
        'publisher': 'MDPI',
        'editor': '',
    }

    csv_row = op.csv_creator(SAMPLE_ENTITY_FOR_CSV_CREATOR)

    self.assertEqual(csv_row, expected_row)

496

def test_csv_creator_base_redis(self):
    """
    Check that, given an updated openaire entity (i.e.: where the "identifier" field was modified
    after having checked the presence of the given identifiers in the storage memory) a meta csv
    table for the entity is created.
    Run on a processor built with ``testing=True``.
    """
    op = OpenaireProcessing(testing=True)
    # Registered up front so the storage is removed even if the assertion fails.
    self.addCleanup(op.storage_manager.delete_storage)

    expected_row = {
        'id': 'pmid:29890726',
        'title': 'Occupational physical activity habits of UK office workers: cross-sectional data from the Active Buildings Study',
        'author': 'Smith Lee; Sawyer Alexia; Gardner Benjamin; Seppala Katri; Ucci Marcella; Marmot Alexi; Lally Pippa; Fisher Abi',
        'pub_date': '2018-06-09',
        'venue': '',
        'volume': '',
        'issue': '',
        'page': '',
        'type': 'journal article',
        'publisher': 'MDPI',
        'editor': '',
    }

    csv_row = op.csv_creator(SAMPLE_ENTITY_FOR_CSV_CREATOR)

    self.assertEqual(csv_row, expected_row)

522

def test_csv_creator_not_accepted_id(self):
    '''
    Check that, given an updated openaire entity whose ids to be validated are replaced
    by an id with a schema NOT managed by opencitations (here: a handle id), no meta csv
    row is created.
    '''
    from copy import deepcopy  # local import keeps this test self-contained

    op = OpenaireProcessing()
    try:
        replaced_entity = {'schema': 'handle', 'identifier': 'handle:11245/1.357137', 'valid': None}

        # Deep-copy the fixture: a shallow copy shares the nested "identifier" dict, so
        # assigning "to_be_val" would rewrite SAMPLE_ENTITY_FOR_CSV_CREATOR itself and
        # leak into every other test that reads it.
        modified_entity = deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
        modified_entity["identifier"]["to_be_val"] = [replaced_entity]

        csv_row = op.csv_creator(modified_entity)
        expected_row = {}  # because there is no ID accepted in opencitations for this entity
        self.assertEqual(csv_row, expected_row)
    finally:
        # Remove the storage even when the assertion above fails.
        op.storage_manager.delete_storage()

540

def test_csv_creator_not_accepted_id_redis(self):
    '''
    Check, with the processor built with testing=True (the Redis test configuration,
    judging by the test name), that an updated openaire entity whose ids to be validated
    are replaced by an id with a schema NOT managed by opencitations (here: a handle id)
    produces no meta csv row.
    '''
    from copy import deepcopy  # local import keeps this test self-contained

    op = OpenaireProcessing(testing=True)
    try:
        replaced_entity = {'schema': 'handle', 'identifier': 'handle:11245/1.357137', 'valid': None}

        # Deep-copy the fixture: a shallow copy shares the nested "identifier" dict, so
        # assigning "to_be_val" would rewrite SAMPLE_ENTITY_FOR_CSV_CREATOR itself and
        # leak into every other test that reads it.
        modified_entity = deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
        modified_entity["identifier"]["to_be_val"] = [replaced_entity]

        csv_row = op.csv_creator(modified_entity)
        expected_row = {}  # because there is no ID accepted in opencitations for this entity
        self.assertEqual(csv_row, expected_row)
    finally:
        # Remove the storage even when the assertion above fails.
        op.storage_manager.delete_storage()

558

def test_csv_creator_invalid_id(self):
    '''
    Check that, given an updated openaire entity whose ids to be validated are replaced
    by a single fake DOI (accepted schema, but an id expected to be invalid), no meta
    csv row is created.
    '''
    from copy import deepcopy  # local import keeps this test self-contained

    op = OpenaireProcessing()
    try:
        replaced_entity = {'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}

        # Deep-copy the fixture: a shallow copy shares the nested "identifier" dict, so
        # assigning "to_be_val" would rewrite SAMPLE_ENTITY_FOR_CSV_CREATOR itself and
        # leak into every other test that reads it.
        modified_entity = deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
        modified_entity["identifier"]["to_be_val"] = [replaced_entity]

        csv_row = op.csv_creator(modified_entity)
        expected_row = {}  # because there is no ID accepted in opencitations for this entity
        self.assertEqual(csv_row, expected_row)
    finally:
        # Remove the storage even when the assertion above fails.
        op.storage_manager.delete_storage()

576

577

def test_csv_creator_invalid_id_redis(self):
    '''
    Check, with the processor built with testing=True (the Redis test configuration,
    judging by the test name), that an updated openaire entity whose ids to be validated
    are replaced by a single fake DOI (accepted schema, but an id expected to be
    invalid) produces no meta csv row.
    '''
    from copy import deepcopy  # local import keeps this test self-contained

    op = OpenaireProcessing(testing=True)
    try:
        replaced_entity = {'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}

        # Deep-copy the fixture: a shallow copy shares the nested "identifier" dict, so
        # assigning "to_be_val" would rewrite SAMPLE_ENTITY_FOR_CSV_CREATOR itself and
        # leak into every other test that reads it.
        modified_entity = deepcopy(SAMPLE_ENTITY_FOR_CSV_CREATOR)
        modified_entity["identifier"]["to_be_val"] = [replaced_entity]

        csv_row = op.csv_creator(modified_entity)
        expected_row = {}  # because there is no ID accepted in opencitations for this entity
        self.assertEqual(csv_row, expected_row)
    finally:
        # Remove the storage even when the assertion above fails.
        op.storage_manager.delete_storage()

595

def test_get_publisher_name_base(self):
    '''
    get_publisher_name receives the entity's DOIs and a dictionary with the publisher's
    data and must return the publisher's normalised name (possibly with its crossref ID).

    Base functionality: no publishers mapping is given to the processor, so only the
    publisher name coming from the datasource dump is expected back.
    '''
    processor = OpenaireProcessing()

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}

    # Lookups are issued in this order: empty DOI first, then the two real DOIs.
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_for_frontiers = processor.get_publisher_name(["10.3389/fnana.2012.00034"], frontiers)
    name_for_oup = processor.get_publisher_name(["10.2527/1995.7392834x"], oup)

    self.assertEqual(name_for_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_for_oup, "Oxford University Press (OUP)")

    processor.storage_manager.delete_storage()

621

def test_get_publisher_name_base_redis(self):
    '''
    Variant of the base get_publisher_name test with the processor built with
    testing=True (the Redis test configuration, judging by the test name).

    Base functionality: no publishers mapping is given to the processor, so only the
    publisher name coming from the datasource dump is expected back.
    '''
    processor = OpenaireProcessing(testing=True)

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}

    # Lookups are issued in this order: empty DOI first, then the two real DOIs.
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_for_frontiers = processor.get_publisher_name(["10.3389/fnana.2012.00034"], frontiers)
    name_for_oup = processor.get_publisher_name(["10.2527/1995.7392834x"], oup)

    self.assertEqual(name_for_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_for_oup, "Oxford University Press (OUP)")

    processor.storage_manager.delete_storage()

647

def test_get_publisher_name_publishers_mapping(self):
    '''
    get_publisher_name receives the entity's DOIs and a dictionary with the publisher's
    data and must return the publisher's normalised name (possibly with its crossref ID).

    A publishers mapping is provided here. The crossref member is appended to the name
    only if:
        - the doi prefix is a crossref doi prefix,
        - the prefix is present in the mapping,
        - the publisher name given by the datasource corresponds to the one in the mapping.
    All three lookups below are expected to return the plain publisher name.
    '''
    processor = OpenaireProcessing(publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}

    # Lookups are issued in this order: empty DOI first, then the two real DOIs.
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_for_frontiers = processor.get_publisher_name(["10.3389/fnana.2012.00034"], frontiers)
    name_for_oup = processor.get_publisher_name(["10.2527/1995.7392834x"], oup)

    self.assertEqual(name_for_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_for_oup, "Oxford University Press (OUP)")

    processor.storage_manager.delete_storage()

679

def test_get_publisher_name_publishers_mapping_redis(self):
    '''
    Variant of the publishers-mapping get_publisher_name test with the processor built
    with testing=True (the Redis test configuration, judging by the test name).

    A publishers mapping is provided here. The crossref member is appended to the name
    only if:
        - the doi prefix is a crossref doi prefix,
        - the prefix is present in the mapping,
        - the publisher name given by the datasource corresponds to the one in the mapping.
    All three lookups below are expected to return the plain publisher name.
    '''
    processor = OpenaireProcessing(testing=True,publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")

    blackwell = {'name': 'Blackwell Publishing Ltd'}
    frontiers = {'name': 'Frontiers Media SA'}
    oup = {'name': 'Oxford University Press (OUP)'}

    # Lookups are issued in this order: empty DOI first, then the two real DOIs.
    name_without_doi = processor.get_publisher_name([""], blackwell)
    name_for_frontiers = processor.get_publisher_name(["10.3389/fnana.2012.00034"], frontiers)
    name_for_oup = processor.get_publisher_name(["10.2527/1995.7392834x"], oup)

    self.assertEqual(name_for_frontiers, "Frontiers Media SA")
    self.assertEqual(name_without_doi, "Blackwell Publishing Ltd")
    self.assertEqual(name_for_oup, "Oxford University Press (OUP)")

    processor.storage_manager.delete_storage()

711

def test_get_publisher_name_publishers_mapping_multi_dois(self):
    '''
    Check that, given a list of dois and a dictionary representing a publisher's data,
    the string of the publisher's normalised name (and possibly its crossref ID) is returned.

    Mapping provided: publisher name retrieved + crossref member returned, only if:
        - the doi prefix is a crossref doi prefix,
        - it is present in the mapping,
        - the name of the publisher provided by the datasource corresponds to the one in the mapping.

    The storage is deleted exactly once, after the last case: the processor instance is
    used by all four cases, so it must not lose its storage halfway through the test.
    '''
    op = OpenaireProcessing(publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")
    try:
        doi_mapped = "10.1152/sample_doi"  # this prefix is in the mapping and corresponds to American Physiological Society
        doi_other = "10.1153/sample_doi"

        # CASE 1: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of the
        # entity's dois prefixes in the prefix-to-publisher-data mapping in input
        # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
        pub_output_1 = op.get_publisher_name([doi_mapped, doi_other], {'name': 'American Physiological Society'})
        self.assertEqual(pub_output_1, "American Physiological Society [crossref:24]")

        # CASE 2: The Publisher Name provided by OPENAIRE does not correspond to the Publisher Name mapped to one of
        # the entity's dois prefixes in the prefix-to-publisher-data mapping in input
        # EXPECTED OUTPUT: The publisher name provided by Openaire is retrieved without any crossref member
        pub_output_2 = op.get_publisher_name([doi_mapped, doi_other], {'name': 'Sample Publisher Name'})
        self.assertEqual(pub_output_2, "Sample Publisher Name")

        # CASE 3: As CASE 1, BUT the doi with the mapped prefix is not the first doi of the list
        # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
        pub_output_3 = op.get_publisher_name([doi_other, doi_mapped], {'name': 'American Physiological Society'})
        self.assertEqual(pub_output_3, "American Physiological Society [crossref:24]")

        # CASE 4: OPENAIRE does not provide a publisher name but one of the entity's DOI prefixes is in the
        # prefix-to-publisher-data mapping in input
        # EXPECTED OUTPUT: empty string, whatever shape the "missing" publisher takes
        for missing_publisher in ({'name': ''}, {}, ''):
            pub_output_4 = op.get_publisher_name([doi_mapped, doi_other], missing_publisher)
            self.assertEqual(pub_output_4, "")
    finally:
        # Single cleanup, executed even when one of the assertions fails.
        op.storage_manager.delete_storage()

782

def test_get_publisher_name_publishers_mapping_multi_dois_redis(self):
    '''
    Variant of the multi-dois publishers-mapping test with the processor built with
    testing=True (the Redis test configuration, judging by the test name).

    Mapping provided: publisher name retrieved + crossref member returned, only if:
        - the doi prefix is a crossref doi prefix,
        - it is present in the mapping,
        - the name of the publisher provided by the datasource corresponds to the one in the mapping.

    The storage is deleted exactly once, after the last case: the processor instance is
    used by all four cases, so it must not lose its storage halfway through the test.
    '''
    op = OpenaireProcessing(testing=True, publishers_filepath_openaire="test/openaire_processing/support_material/publishers.json")
    try:
        doi_mapped = "10.1152/sample_doi"  # this prefix is in the mapping and corresponds to American Physiological Society
        doi_other = "10.1153/sample_doi"

        # CASE 1: The Publisher Name provided by OPENAIRE corresponds to the Publisher Name mapped to one of the
        # entity's dois prefixes in the prefix-to-publisher-data mapping in input
        # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
        pub_output_1 = op.get_publisher_name([doi_mapped, doi_other], {'name': 'American Physiological Society'})
        self.assertEqual(pub_output_1, "American Physiological Society [crossref:24]")

        # CASE 2: The Publisher Name provided by OPENAIRE does not correspond to the Publisher Name mapped to one of
        # the entity's dois prefixes in the prefix-to-publisher-data mapping in input
        # EXPECTED OUTPUT: The publisher name provided by Openaire is retrieved without any crossref member
        pub_output_2 = op.get_publisher_name([doi_mapped, doi_other], {'name': 'Sample Publisher Name'})
        self.assertEqual(pub_output_2, "Sample Publisher Name")

        # CASE 3: As CASE 1, BUT the doi with the mapped prefix is not the first doi of the list
        # EXPECTED OUTPUT: The publisher name is retrieved with its crossref member
        pub_output_3 = op.get_publisher_name([doi_other, doi_mapped], {'name': 'American Physiological Society'})
        self.assertEqual(pub_output_3, "American Physiological Society [crossref:24]")

        # CASE 4: OPENAIRE does not provide a publisher name but one of the entity's DOI prefixes is in the
        # prefix-to-publisher-data mapping in input
        # EXPECTED OUTPUT: empty string, whatever shape the "missing" publisher takes
        for missing_publisher in ({'name': ''}, {}, ''):
            pub_output_4 = op.get_publisher_name([doi_mapped, doi_other], missing_publisher)
            self.assertEqual(pub_output_4, "")
    finally:
        # Single cleanup, executed even when one of the assertions fails.
        op.storage_manager.delete_storage()

853

def test_manage_arxiv_single_id(self):
    '''Check how manage_arxiv_single_id treats an entity that has a single ID.
    An arxiv DOI is replaced by the corresponding normalised arxiv id. For both an arxiv
    id and an arxiv DOI the versioned arxiv id is returned: the given version when there
    is one (never the case for an arxiv DOI), version 1 otherwise.
    A DOI that is not an arxiv DOI is returned as it was given.'''
    any_doi = [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}]
    arxiv_doi = [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]
    arxiv_unversioned = [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217', 'valid': None}]
    arxiv_versioned = [{'schema': 'arxiv', 'identifier': 'arxiv:1509...08217v3', 'valid': None}]

    processor = OpenaireProcessing()

    # CASE 1: the only id is a non-arxiv doi -> the input list comes back
    got_any_doi = processor.manage_arxiv_single_id(any_doi)
    self.assertEqual(got_any_doi, [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}])

    # CASE 2: the only id is an arxiv doi -> swapped for the corresponding arxiv id, v1
    got_arxiv_doi = processor.manage_arxiv_single_id(arxiv_doi)
    self.assertEqual(got_arxiv_doi, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}])

    # CASE 3: the only id is an arxiv id with no version -> swapped for its v1
    got_unversioned = processor.manage_arxiv_single_id(arxiv_unversioned)
    self.assertEqual(got_unversioned, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}])

    # CASE 4: the only id is an arxiv id with a version -> the id is only normalised
    got_versioned = processor.manage_arxiv_single_id(arxiv_versioned)
    self.assertEqual(got_versioned, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v3'}])

    processor.storage_manager.delete_storage()

886

def test_manage_arxiv_single_id_redis(self):
    '''Check how manage_arxiv_single_id treats an entity that has a single ID, with the
    processor built with testing=True (the Redis test configuration, judging by the
    test name).
    An arxiv DOI is replaced by the corresponding normalised arxiv id. For both an arxiv
    id and an arxiv DOI the versioned arxiv id is returned: the given version when there
    is one (never the case for an arxiv DOI), version 1 otherwise.
    A DOI that is not an arxiv DOI is returned as it was given.'''
    any_doi = [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}]
    arxiv_doi = [{'schema': 'doi', 'identifier': 'doi:10.48550/arXiv.1509.08217', 'valid': None}]
    arxiv_unversioned = [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217', 'valid': None}]
    arxiv_versioned = [{'schema': 'arxiv', 'identifier': 'arxiv:1509...08217v3', 'valid': None}]

    processor = OpenaireProcessing(testing=True)

    # CASE 1: the only id is a non-arxiv doi -> the input list comes back
    got_any_doi = processor.manage_arxiv_single_id(any_doi)
    self.assertEqual(got_any_doi, [{'schema': 'doi', 'identifier': 'doi:10.1000/FAKE_ID', 'valid': None}])

    # CASE 2: the only id is an arxiv doi -> swapped for the corresponding arxiv id, v1
    got_arxiv_doi = processor.manage_arxiv_single_id(arxiv_doi)
    self.assertEqual(got_arxiv_doi, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}])

    # CASE 3: the only id is an arxiv id with no version -> swapped for its v1
    got_unversioned = processor.manage_arxiv_single_id(arxiv_unversioned)
    self.assertEqual(got_unversioned, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v1'}])

    # CASE 4: the only id is an arxiv id with a version -> the id is only normalised
    got_versioned = processor.manage_arxiv_single_id(arxiv_versioned)
    self.assertEqual(got_versioned, [{'schema': 'arxiv', 'identifier': 'arxiv:1509.08217v3'}])

    processor.storage_manager.delete_storage()

919

def test_manage_doi_prefixes_priorities(self):
    '''
    Walk manage_doi_prefixes_priorities through its DOI-selection scenarios: figshare
    (10.6084), arxiv (10.48550) and zenodo (10.5281) DOIs, plus the fallback on the
    first valid DOI in order of prefix priority.
    '''
    def doi(identifier):
        # A DOI entry in the "still to be validated" shape used by every case below.
        return {'schema': 'doi', 'identifier': identifier, 'valid': None}

    arxiv_v1 = {'identifier': 'arxiv:1234.1234v1', 'schema': 'arxiv'}

    processor = OpenaireProcessing()

    # CASE1: one figshare doi (priority 1) that carries a version --> handed back untouched
    given_1 = [doi('doi:10.6084/1234.1234v3')]
    got_1 = processor.manage_doi_prefixes_priorities(given_1)
    self.assertEqual(got_1, given_1)

    # CASE2: one figshare doi (priority 1) with no version --> handed back as version v1
    given_2 = [doi('doi:10.6084/1234.1234')]
    want_2 = [doi('doi:10.6084/1234.1234v1')]
    got_2 = processor.manage_doi_prefixes_priorities(given_2)
    self.assertEqual(want_2, got_2)

    # CASE3: one arxiv doi (never versioned) --> handed back as the corresponding arxiv id, version v1
    given_3 = [doi('doi:10.48550/1234.1234')]
    got_3 = processor.manage_doi_prefixes_priorities(given_3)
    want_3 = [arxiv_v1]
    self.assertEqual(want_3, got_3)

    # CASE4: several arxiv/figshare dois, at least one versioned --> the versioned one(s) are returned
    given_4 = [doi('doi:10.48550/1234.1234'), doi('doi:10.6084/5678v3')]
    got_4 = processor.manage_doi_prefixes_priorities(given_4)
    want_4 = [doi('doi:10.6084/5678v3')]
    self.assertEqual(want_4, got_4)

    # CASE5: several arxiv/figshare dois, none versioned --> first choice is the arxiv id v1
    # of the first arxiv doi encountered
    given_5 = [doi('doi:10.6084/5678'), doi('doi:10.48550/1234.1234')]
    got_5 = processor.manage_doi_prefixes_priorities(given_5)
    want_5 = [arxiv_v1]
    self.assertEqual(want_5, got_5)

    # CASE6: several figshare dois, none versioned --> version v1 of the first figshare doi encountered
    given_6 = [doi('doi:10.6084/5678'), doi('doi:10.6084/1234')]
    got_6 = processor.manage_doi_prefixes_priorities(given_6)
    want_6 = [doi('doi:10.6084/5678v1')]
    self.assertEqual(want_6, got_6)

    # CASE7: several zenodo dois --> the one with the highest number is returned: it was assigned last,
    # so it is a version doi and not the collector doi (the first one assigned when a publication is
    # uploaded on zenodo). Checked both without and with the "doi:" prefix.
    given_7 = [doi('10.5281/zenodo.111'), doi('10.5281/zenodo.112')]
    given_7_prefixed = [doi('doi:10.5281/zenodo.111'), doi('doi:10.5281/zenodo.112')]
    got_7 = processor.manage_doi_prefixes_priorities(given_7)
    got_7_prefixed = processor.manage_doi_prefixes_priorities(given_7_prefixed)
    want_7 = [doi('10.5281/zenodo.112')]
    want_7_prefixed = [doi('doi:10.5281/zenodo.112')]
    self.assertEqual(want_7, got_7)
    self.assertEqual(want_7_prefixed, got_7_prefixed)

    # CASE8: none of the previous cases --> the first VALID doi with the highest priority prefix.
    # Here none of the ids is valid, so an empty list is expected.
    given_8 = [
        doi('doi:10.5281/zenodo.111'),
        doi('doi:10.1184/abc'),
        doi('doi:10.25384/efg'),
    ]
    got_8 = processor.manage_doi_prefixes_priorities(given_8)
    want_8 = []
    self.assertEqual(want_8, got_8)

    # CASE8_1: no valid id among those with a max priority prefix --> the first valid id in order of
    # prefix priority is returned
    given_8_1 = [
        doi('10.5281/zenodo.4725899'),
        doi('doi:10.1184/abc'),
        doi('doi:10.25384/efg'),
    ]
    got_8_1 = processor.manage_doi_prefixes_priorities(given_8_1)
    want_8_1 = [doi('10.5281/zenodo.4725899')]
    self.assertEqual(want_8_1, got_8_1)

    # CASE8_2: several valid ids among those with a max priority prefix --> the first one encountered
    given_8_2 = [
        doi('doi:10.5281/zenodo.4725899'),
        doi('doi:10.1184/R1/12841247.v1'),
        doi('doi:10.25384/sage.c.4112909'),
    ]
    got_8_2 = processor.manage_doi_prefixes_priorities(given_8_2)
    want_8_2 = [doi('doi:10.1184/R1/12841247.v1')]
    self.assertEqual(want_8_2, got_8_2)

    processor.storage_manager.delete_storage()

1006

def test_manage_doi_prefixes_priorities_redis(self):
    '''
    Walk manage_doi_prefixes_priorities through its DOI-selection scenarios with the
    processor built with testing=True (the Redis test configuration, judging by the
    test name): figshare (10.6084), arxiv (10.48550) and zenodo (10.5281) DOIs, plus
    the fallback on the first valid DOI in order of prefix priority.
    '''
    def doi(identifier):
        # A DOI entry in the "still to be validated" shape used by every case below.
        return {'schema': 'doi', 'identifier': identifier, 'valid': None}

    arxiv_v1 = {'identifier': 'arxiv:1234.1234v1', 'schema': 'arxiv'}

    processor = OpenaireProcessing(testing=True)

    # CASE1: one figshare doi (priority 1) that carries a version --> handed back untouched
    given_1 = [doi('doi:10.6084/1234.1234v3')]
    got_1 = processor.manage_doi_prefixes_priorities(given_1)
    self.assertEqual(got_1, given_1)

    # CASE2: one figshare doi (priority 1) with no version --> handed back as version v1
    given_2 = [doi('doi:10.6084/1234.1234')]
    want_2 = [doi('doi:10.6084/1234.1234v1')]
    got_2 = processor.manage_doi_prefixes_priorities(given_2)
    self.assertEqual(want_2, got_2)

    # CASE3: one arxiv doi (never versioned) --> handed back as the corresponding arxiv id, version v1
    given_3 = [doi('doi:10.48550/1234.1234')]
    got_3 = processor.manage_doi_prefixes_priorities(given_3)
    want_3 = [arxiv_v1]
    self.assertEqual(want_3, got_3)

    # CASE4: several arxiv/figshare dois, at least one versioned --> the versioned one(s) are returned
    given_4 = [doi('doi:10.48550/1234.1234'), doi('doi:10.6084/5678v3')]
    got_4 = processor.manage_doi_prefixes_priorities(given_4)
    want_4 = [doi('doi:10.6084/5678v3')]
    self.assertEqual(want_4, got_4)

    # CASE5: several arxiv/figshare dois, none versioned --> first choice is the arxiv id v1
    # of the first arxiv doi encountered
    given_5 = [doi('doi:10.6084/5678'), doi('doi:10.48550/1234.1234')]
    got_5 = processor.manage_doi_prefixes_priorities(given_5)
    want_5 = [arxiv_v1]
    self.assertEqual(want_5, got_5)

    # CASE6: several figshare dois, none versioned --> version v1 of the first figshare doi encountered
    given_6 = [doi('doi:10.6084/5678'), doi('doi:10.6084/1234')]
    got_6 = processor.manage_doi_prefixes_priorities(given_6)
    want_6 = [doi('doi:10.6084/5678v1')]
    self.assertEqual(want_6, got_6)

    # CASE7: several zenodo dois --> the one with the highest number is returned: it was assigned last,
    # so it is a version doi and not the collector doi (the first one assigned when a publication is
    # uploaded on zenodo). Checked both without and with the "doi:" prefix.
    given_7 = [doi('10.5281/zenodo.111'), doi('10.5281/zenodo.112')]
    given_7_prefixed = [doi('doi:10.5281/zenodo.111'), doi('doi:10.5281/zenodo.112')]
    got_7 = processor.manage_doi_prefixes_priorities(given_7)
    got_7_prefixed = processor.manage_doi_prefixes_priorities(given_7_prefixed)
    want_7 = [doi('10.5281/zenodo.112')]
    want_7_prefixed = [doi('doi:10.5281/zenodo.112')]
    self.assertEqual(want_7, got_7)
    self.assertEqual(want_7_prefixed, got_7_prefixed)

    # CASE8: none of the previous cases --> the first VALID doi with the highest priority prefix.
    # Here none of the ids is valid, so an empty list is expected.
    given_8 = [
        doi('doi:10.5281/zenodo.111'),
        doi('doi:10.1184/abc'),
        doi('doi:10.25384/efg'),
    ]
    got_8 = processor.manage_doi_prefixes_priorities(given_8)
    want_8 = []
    self.assertEqual(want_8, got_8)

    # CASE8_1: no valid id among those with a max priority prefix --> the first valid id in order of
    # prefix priority is returned
    given_8_1 = [
        doi('10.5281/zenodo.4725899'),
        doi('doi:10.1184/abc'),
        doi('doi:10.25384/efg'),
    ]
    got_8_1 = processor.manage_doi_prefixes_priorities(given_8_1)
    want_8_1 = [doi('10.5281/zenodo.4725899')]
    self.assertEqual(want_8_1, got_8_1)

    # CASE8_2: several valid ids among those with a max priority prefix --> the first one encountered
    given_8_2 = [
        doi('doi:10.5281/zenodo.4725899'),
        doi('doi:10.1184/R1/12841247.v1'),
        doi('doi:10.25384/sage.c.4112909'),
    ]
    got_8_2 = processor.manage_doi_prefixes_priorities(given_8_2)
    want_8_2 = [doi('doi:10.1184/R1/12841247.v1')]
    self.assertEqual(want_8_2, got_8_2)

    processor.storage_manager.delete_storage()

1093

def test_to_validated_id_list(self):
    """Check ``to_validated_id_list`` on ``OpenaireProcessing()`` instances.

    Each case creates its own ``OpenaireProcessing()`` instance, feeds it a
    dictionary with empty ``valid`` / ``not_valid`` lists and the ids of the
    case under ``to_be_val``, compares the returned list with the expected
    one and finally deletes the storage of that instance.
    """
    # NOTE: in tests using the sqlite storage method it must be avoided to delete the storage
    # while using the same OpenaireProcessing() instance, otherwise the process would try to
    # store data in a filepath that has just been deleted, with no new connection created after it.
    #
    # 2 OPTIONS: 1) instantiate OpenaireProcessing only once at the beginning and delete the
    # storage only at the end; 2) create a new OpenaireProcessing instance at every check and
    # delete the storage each time after the check is done.
    #
    # This test follows option 2 for every case. The storage is deleted in a
    # ``finally`` clause so that a failing assertion does not leave it behind.

    def entry(schema, identifier):
        # Build a fresh dictionary on every call, so that cases never share objects.
        return {'schema': schema, 'identifier': identifier, 'valid': None}

    # (label, ids to be validated, expected output, compare regardless of order)
    cases = [
        # No already validated ids + 1 id to be validated, which is valid
        ('CASE1_1', [entry('pmid', 'pmid:20662931')], ['pmid:20662931'], False),
        # No already validated ids + 1 id to be validated, which is invalid
        ('CASE1_2', [entry('pmid', 'pmid:999920662931')], [], False),
        # No already validated ids + 1 id to be validated, which is a valid arxiv doi
        ('CASE1_3', [entry('doi', 'doi:10.48550/arXiv.1509.08217')], ['arxiv:1509.08217v1'], False),
        # No already validated ids + 1 id to be validated, which hasn't a valid schema
        ('CASE1_4', [entry("0", 'doi:10.48550/arXiv.1509.08217')], [], False),
        # No already validated ids + 1 id to be validated, which is not valid
        ('CASE1_5', [entry("doi", 'doi:10.0000/fake_id')], [], False),
        # No already validated ids + 1 id to be validated, which is a valid PMC
        ('CASE1_9', [entry("pmcid", 'pmcid:PMC2873764')], ['pmcid:PMC2873764'], False),
        # No already validated ids + >1 id to be validated, both valid and with accepted schemas
        ('CASE2_1',
         [entry('pmid', 'pmid:20662931'), entry('doi', 'doi:10.1007/s12160-011-9282-0')],
         ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'],
         True),
        # No already validated ids + >1 id to be validated, both valid, one of the two is an arxiv id
        ('CASE2_2',
         [entry('pmid', 'pmid:20662931'), entry('arxiv', 'arxiv:1107.5979')],
         ['pmid:20662931'],
         False),
        # No already validated ids + >1 id to be validated, both valid, one of the two is an arxiv doi
        ('CASE2_3',
         [entry('pmid', 'pmid:20662931'), entry("doi", 'doi:10.48550/arXiv.1509.08217')],
         ['pmid:20662931'],
         False),
        # No already validated ids + >1 id to be validated, both valid, one of the two is a PMC
        ('CASE2_4',
         [entry('pmid', 'pmid:20662931'), entry("pmcid", 'pmcid:PMC2873764')],
         ['pmid:20662931'],
         False),
        # No already validated ids + >1 id to be validated, 1 valid pmid, 1 valid doi,
        # 1 valid doi with a "critic" prefix for opencitations entities management
        ('CASE2_5',
         [entry('pmid', 'pmid:20662931'),
          entry('doi', 'doi:10.1007/s12160-011-9282-0'),
          entry('doi', 'doi:10.48550/arXiv.1509.08217')],
         ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'],
         True),
        # No already validated ids + >1 id to be validated, one doi with a "critic" prefix
        # for opencitations entities management and a PMCID
        ('CASE2_6',
         [entry('pmcid', 'pmcid:PMC5555555'), entry('doi', 'doi:10.48550/arXiv.1509.08217')],
         ['pmcid:PMC5555555'],
         False),
        # No already validated ids + >1 id to be validated, one doi with a "critic" prefix
        # for opencitations ingestion workflow and an ARXIV
        ('CASE2_7',
         [entry('arxiv', 'arxiv:1107.5979v1'), entry('doi', 'doi:10.1184/R1/12841247.v1')],
         ['arxiv:1107.5979v1'],
         False),
        # No already validated ids and more dois with "critic" prefixes for opencitations
        # ingestion workflow
        ('CASE2_8',
         [entry('doi', 'doi:10.5281/zenodo.4725899'), entry('doi', 'doi:10.1184/r1/12841247.v1')],
         ['doi:10.1184/r1/12841247.v1'],
         False),
        # More dois with "critic" prefixes for opencitations ingestion workflow, one further
        # doi and an empty entry (the 'valid' list is empty in this case too)
        ('CASE3',
         [entry('doi', 'doi:10.5281/zenodo.4725899'),
          entry('doi', 'doi:10.1184/r1/12841247.v1'),
          entry('doi', 'doi:10.7557/5.5607'),
          {}],
         ['doi:10.7557/5.5607'],
         False),
    ]

    for label, to_be_val, expected, any_order in cases:
        with self.subTest(case=label):
            op = OpenaireProcessing()
            try:
                out = op.to_validated_id_list({'valid': [], 'not_valid': [], 'to_be_val': to_be_val})
                if any_order:
                    # Same elements as expected, regardless of their order
                    self.assertCountEqual(out, expected)
                else:
                    self.assertEqual(out, expected)
            finally:
                op.storage_manager.delete_storage()

1255

def test_to_validated_id_list_redis(self):
    """Check ``to_validated_id_list`` on ``OpenaireProcessing(testing=True)`` instances.

    Every check creates a new instance, validates one input dictionary,
    compares the result with the expected list and then deletes the storage
    of that instance.
    """
    # NOTE: in tests using the sqlite storage method it must be avoided to delete the storage
    # while using the same OpenaireProcessing() instance, otherwise the process would try to
    # store data in a filepath that has just been deleted, with no new connection created after it.
    # (NOTE(review): this note mentions sqlite although the test name refers to redis -- confirm
    # which storage ``testing=True`` selects.)

    # 2 OPTIONS: 1) instantiate OpenaireProcessing only once at the beginning and delete the
    # storage only at the end; 2) create a new OpenaireProcessing instance at every check and
    # delete the storage each time after the check is done.

    def an_id(schema, identifier):
        return {'schema': schema, 'identifier': identifier, 'valid': None}

    def check(to_be_val, expected, compare=self.assertEqual):
        processing = OpenaireProcessing(testing=True)
        ids_dict = {'valid': [], 'not_valid': [], 'to_be_val': to_be_val}
        compare(processing.to_validated_id_list(ids_dict), expected)
        processing.storage_manager.delete_storage()

    # CASE1_1: No already validated ids + 1 id to be validated, which is valid
    check([an_id('pmid', 'pmid:20662931')], ['pmid:20662931'])

    # CASE1_2: No already validated ids + 1 id to be validated, which is invalid
    check([an_id('pmid', 'pmid:999920662931')], [])

    # CASE1_3: No already validated ids + 1 id to be validated, which is a valid arxiv doi
    check([an_id('doi', 'doi:10.48550/arXiv.1509.08217')], ['arxiv:1509.08217v1'])

    # CASE1_4: No already validated ids + 1 id to be validated, which hasn't a valid schema
    check([an_id("0", 'doi:10.48550/arXiv.1509.08217')], [])

    # CASE1_5: No already validated ids + 1 id to be validated, which is not valid
    check([an_id("doi", 'doi:10.0000/fake_id')], [])

    # CASE1_9: No already validated ids + 1 id to be validated, which is a valid PMC
    check([an_id("pmcid", 'pmcid:PMC2873764')], ['pmcid:PMC2873764'])

    # CASE2_1: No already validated ids + >1 id to be validated, both valid and with accepted schemas
    # (assertCountEqual: same elements, regardless of their order)
    check(
        [an_id('pmid', 'pmid:20662931'), an_id('doi', 'doi:10.1007/s12160-011-9282-0')],
        ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'],
        compare=self.assertCountEqual,
    )

    # CASE2_2: No already validated ids + >1 id to be validated, both valid, one of the two is an arxiv id
    check(
        [an_id('pmid', 'pmid:20662931'), an_id('arxiv', 'arxiv:1107.5979')],
        ['pmid:20662931'],
    )

    # CASE2_3: No already validated ids + >1 id to be validated, both valid, one of the two is an arxiv doi
    check(
        [an_id('pmid', 'pmid:20662931'), an_id("doi", 'doi:10.48550/arXiv.1509.08217')],
        ['pmid:20662931'],
    )

    # CASE2_4: No already validated ids + >1 id to be validated, both valid, one of the two is a PMC
    check(
        [an_id('pmid', 'pmid:20662931'), an_id("pmcid", 'pmcid:PMC2873764')],
        ['pmid:20662931'],
    )

    # CASE2_5: No already validated ids + >1 id to be validated, 1 valid pmid, 1 valid doi,
    # 1 valid doi with a "critic" prefix for opencitations entities management
    # (assertCountEqual: same elements, regardless of their order)
    check(
        [
            an_id('pmid', 'pmid:20662931'),
            an_id('doi', 'doi:10.1007/s12160-011-9282-0'),
            an_id('doi', 'doi:10.48550/arXiv.1509.08217'),
        ],
        ['pmid:20662931', 'doi:10.1007/s12160-011-9282-0'],
        compare=self.assertCountEqual,
    )

    # CASE2_6: No already validated ids + >1 id to be validated, one doi with a "critic" prefix
    # for opencitations entities management and a PMCID
    check(
        [an_id('pmcid', 'pmcid:PMC5555555'), an_id('doi', 'doi:10.48550/arXiv.1509.08217')],
        ['pmcid:PMC5555555'],
    )

    # CASE2_7: no already validated ids + >1 id to be validated, one doi with a "critic" prefix
    # for opencitations ingestion workflow and an ARXIV
    check(
        [an_id('arxiv', 'arxiv:1107.5979v1'), an_id('doi', 'doi:10.1184/R1/12841247.v1')],
        ['arxiv:1107.5979v1'],
    )

    # CASE2_8: no already validated ids and more dois with "critic" prefixes for opencitations
    # ingestion workflow
    check(
        [an_id('doi', 'doi:10.5281/zenodo.4725899'), an_id('doi', 'doi:10.1184/r1/12841247.v1')],
        ['doi:10.1184/r1/12841247.v1'],
    )

    # CASE3: an already validated id and more dois with "critic" prefixes for opencitations
    # ingestion workflow (the last entry of the input list is an empty dictionary)
    check(
        [
            an_id('doi', 'doi:10.5281/zenodo.4725899'),
            an_id('doi', 'doi:10.1184/r1/12841247.v1'),
            an_id('doi', 'doi:10.7557/5.5607'),
            {},
        ],
        ['doi:10.7557/5.5607'],
    )

1417

1418

def test_add_authors_to_agent_list(self):
    """Three creators (the last one carrying an ORCID) become three author agents."""
    processing = OpenaireProcessing()
    entity = {
        'creator': [
            {'name': 'Carlos Hoyos'},
            {'name': 'Yaron Oz'},
            {
                'identifiers': [
                    {
                        'identifier': '0000-0001-6946-5074',
                        'schema': 'ORCID',
                        'url': 'https://orcid.org/0000-0001-6946-5074',
                    }
                ],
                'name': 'Bom Soo Kim',
            },
        ]
    }
    expected = [
        {'role': 'author', 'name': 'Carlos Hoyos', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Yaron Oz', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Bom Soo Kim', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'},
    ]
    # The agent list passed in starts empty
    actual = processing.add_authors_to_agent_list(entity, [])
    self.assertEqual(expected, actual)
    processing.storage_manager.delete_storage()

1426

1427

def test_add_authors_to_agent_list_redis(self):
    """Same check as ``test_add_authors_to_agent_list``, on an instance built with ``testing=True``."""
    processing = OpenaireProcessing(testing=True)
    orcid_identifier = {
        'identifier': '0000-0001-6946-5074',
        'schema': 'ORCID',
        'url': 'https://orcid.org/0000-0001-6946-5074',
    }
    entity = {
        'creator': [
            {'name': 'Carlos Hoyos'},
            {'name': 'Yaron Oz'},
            {'identifiers': [orcid_identifier], 'name': 'Bom Soo Kim'},
        ]
    }
    actual = processing.add_authors_to_agent_list(entity, [])
    expected = [
        {'role': 'author', 'name': 'Carlos Hoyos', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Yaron Oz', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Bom Soo Kim', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'},
    ]
    self.assertEqual(expected, actual)
    processing.storage_manager.delete_storage()

1435

def test_add_authors_to_agent_list_no_creator(self):
    """An empty 'creator' list leaves the (initially empty) agent list empty."""
    processing = OpenaireProcessing()
    actual = processing.add_authors_to_agent_list({'creator': []}, [])
    self.assertEqual([], actual)
    processing.storage_manager.delete_storage()

1443

1444

def test_add_authors_to_agent_list_no_creator_redis(self):
    """Empty 'creator' list on an instance built with ``testing=True``: no agent is added."""
    processing = OpenaireProcessing(testing=True)
    entity_without_creators = {'creator': []}
    expected = []
    actual = processing.add_authors_to_agent_list(entity_without_creators, [])
    self.assertEqual(expected, actual)
    processing.storage_manager.delete_storage()

1452

def test_get_agents_strings_list(self):
    """Agents named 'Family, Given' are rendered as strings, with the ORCID in square brackets."""
    doi = "doi:10.1007/jhep03(2014)050"
    agents = [
        {'role': 'author', 'name': 'Hoyos, Carlos', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Oz, Yaron', 'family': '', 'given': ''},
        {'role': 'author', 'name': 'Kim, Bom Soo', 'family': '', 'given': '', 'orcid': 'orcid:0000-0001-6946-5074'},
    ]
    processing = OpenaireProcessing()
    result = processing.get_agents_strings_list(doi, agents)
    # The method returns a pair of lists; the second one is expected to be empty here
    expected = (['Hoyos Carlos', 'Oz Yaron', 'Kim Bom Soo [orcid:0000-0001-6946-5074]'], [])
    self.assertEqual(result, expected)
    processing.storage_manager.delete_storage()

1460

def test_get_agents_strings_list_redis(self):
    """Same check as ``test_get_agents_strings_list``, on an instance built with ``testing=True``."""
    doi = "doi:10.1007/jhep03(2014)050"
    plain_authors = [
        {'role': 'author', 'name': author_name, 'family': '', 'given': ''}
        for author_name in ('Hoyos, Carlos', 'Oz, Yaron')
    ]
    author_with_orcid = {
        'role': 'author',
        'name': 'Kim, Bom Soo',
        'family': '',
        'given': '',
        'orcid': 'orcid:0000-0001-6946-5074',
    }
    agents = plain_authors + [author_with_orcid]
    processing = OpenaireProcessing(testing=True)
    result = processing.get_agents_strings_list(doi, agents)
    self.assertEqual(result, (['Hoyos Carlos', 'Oz Yaron', 'Kim Bom Soo [orcid:0000-0001-6946-5074]'], []))
    processing.storage_manager.delete_storage()

1468

def test_find_openaire_orcid(self):
    """Check ``find_openaire_orcid`` with and without values preloaded in the ORCID storage."""
    # NOTE(review): this test builds its instances with testing=True exactly like
    # test_find_openaire_orcid_redis -- confirm whether a plain OpenaireProcessing() was intended.

    def identifiers(identifier, schema='ORCID'):
        # The url is the same in every input of this test, also for the invalid identifier
        return [{'identifier': identifier, 'schema': schema, 'url': 'https://orcid.org/0000-0001-9759-3938'}]

    processing = OpenaireProcessing(testing=True)

    # ORCID schema and identifier 0000-0001-9759-3938: the prefixed id is returned
    found = processing.find_openaire_orcid(identifiers('0000-0001-9759-3938'))
    self.assertEqual(found, "orcid:0000-0001-9759-3938")

    # Same identifier declared under a schema other than ORCID: empty string
    found_wrong_schema = processing.find_openaire_orcid(identifiers('0000-0001-9759-3938', schema='fake_schema'))
    self.assertEqual(found_wrong_schema, "")

    # ORCID schema with identifier 5500-0001-9759-3938: empty string
    found_invalid_id = processing.find_openaire_orcid(identifiers('5500-0001-9759-3938'))
    self.assertEqual(found_invalid_id, "")

    processing.orcid_m.storage_manager.delete_storage()

    # set a valid id as invalid in storage, so to check that the api check is
    # avoided if the info is already in storage
    processing = OpenaireProcessing(testing=True)
    processing.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", False)

    found = processing.find_openaire_orcid(identifiers('0000-0001-9759-3938'))
    self.assertEqual(found, "")

    processing.orcid_m.storage_manager.delete_storage()

    # Same id stored as valid: the prefixed id is returned
    processing = OpenaireProcessing(testing=True)
    processing.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", True)
    found = processing.find_openaire_orcid(identifiers('0000-0001-9759-3938'))
    self.assertEqual(found, "orcid:0000-0001-9759-3938")
    processing.orcid_m.storage_manager.delete_storage()

1506

1507

def test_find_openaire_orcid_redis(self):
    """Check ``find_openaire_orcid`` on instances built with ``testing=True``.

    First part: three lookups on a single instance (matching ORCID, wrong
    schema, identifier 5500-0001-9759-3938). Second part: the same ORCID is
    pre-stored as False and then as True, each time on a new instance.
    """
    orcid_url = 'https://orcid.org/0000-0001-9759-3938'
    good_orcid = [{'identifier': '0000-0001-9759-3938', 'schema': 'ORCID', 'url': orcid_url}]
    wrong_schema = [{'identifier': '0000-0001-9759-3938', 'schema': 'fake_schema', 'url': orcid_url}]
    bad_identifier = [{'identifier': '5500-0001-9759-3938', 'schema': 'ORCID', 'url': orcid_url}]

    processing = OpenaireProcessing(testing=True)
    for candidate, expected in (
        (good_orcid, "orcid:0000-0001-9759-3938"),
        (wrong_schema, ""),
        (bad_identifier, ""),
    ):
        self.assertEqual(processing.find_openaire_orcid(candidate), expected)
    processing.orcid_m.storage_manager.delete_storage()

    # set a valid id as invalid in storage, so to check that the api check is
    # avoided if the info is already in storage; then store it as valid
    for stored_validity, expected in ((False, ""), (True, "orcid:0000-0001-9759-3938")):
        processing = OpenaireProcessing(testing=True)
        processing.orcid_m.storage_manager.set_value("orcid:0000-0001-9759-3938", stored_validity)
        lookup = [{'identifier': '0000-0001-9759-3938', 'schema': 'ORCID', 'url': orcid_url}]
        self.assertEqual(processing.find_openaire_orcid(lookup), expected)
        processing.orcid_m.storage_manager.delete_storage()

1545

def test_update_redis_values(self):
    """``update_redis_values`` stores the two lists in ``_redis_values_br`` and ``_redis_values_ra``."""
    processing = OpenaireProcessing(testing=True)
    br_ids = ["pmid:2", "pmid:3"]
    ra_ids = ["orcid:0000-0003-0530-4305"]
    processing.update_redis_values(br_ids, ra_ids)
    for attribute, expected in (('_redis_values_br', br_ids), ('_redis_values_ra', ra_ids)):
        self.assertEqual(getattr(processing, attribute), expected)

1553

1554

def test_find_openaire_orcid_with_index(self):
    """Test ORCID validation using ORCID index before API validation"""
    doi = "10.1234/test123"
    indexed_orcid = "0000-0002-1234-5678"
    author_name = "Smith, John"
    indexed_orcid_id = f"orcid:{indexed_orcid}"

    def as_orcid(identifier):
        return [{'identifier': identifier, 'schema': 'ORCID'}]

    # Register the author string under the DOI in the ORCID index
    # (format for add_value: id_string -> value)
    processing = OpenaireProcessing()
    processing.orcid_index.add_value(doi, f"{author_name} [orcid:{indexed_orcid}]")

    # Test Case 1: ORCID found in index
    self.assertEqual(processing.find_openaire_orcid(as_orcid(indexed_orcid), doi), indexed_orcid_id)
    # ... and the temporary storage now holds a truthy value for it
    self.assertTrue(processing.tmp_orcid_m.storage_manager.get_value(indexed_orcid_id))

    # Test Case 2: ORCID not in index but valid via API
    self.assertEqual(
        processing.find_openaire_orcid(as_orcid('0000-0003-4082-1500'), doi),
        "orcid:0000-0003-4082-1500",
    )

    # Test Case 3: ORCID not in index and invalid
    self.assertEqual(processing.find_openaire_orcid(as_orcid('0000-0000-0000-0000'), doi), "")

    # Test Case 4: same ORCID as case 1, but no DOI provided.
    # NOTE(review): the original comment says this "should still validate via API";
    # the id was already put in the temporary storage by case 1 -- confirm which path is hit.
    self.assertEqual(processing.find_openaire_orcid(as_orcid(indexed_orcid)), indexed_orcid_id)

    # Cleanup
    processing.storage_manager.delete_storage()

1595

1596

def test_validated_as_with_storage_manager(storage_manager):
    """``validated_as`` returns the stored validity of a DOI, or None when nothing is stored.

    ``storage_manager`` is passed straight to ``OpenaireProcessing``
    (presumably provided as a pytest fixture -- confirm in the test configuration).
    """
    stored_as_valid = {"identifier": "doi:10.1001/2012.jama.10368", "schema": "doi"}
    stored_as_invalid = {"identifier": "doi:10.1001/2012.jama.1036", "schema": "doi"}
    never_stored = {"identifier": "doi:10.1001/2012.jama.10158", "schema": "doi"}

    processing = OpenaireProcessing(storage_manager=storage_manager, testing=True)
    doi_storage = processing.doi_m.storage_manager
    doi_storage.set_value(stored_as_valid["identifier"], True)
    doi_storage.set_value(stored_as_invalid["identifier"], False)

    assert processing.validated_as(stored_as_valid) is True
    assert processing.validated_as(stored_as_invalid) is False
    assert processing.validated_as(never_stored) is None

1609

1610

# Run the unittest test cases of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

Coverage for test / processing_oroci_test.py: 99%

879 statements