Coverage for test/curator_test.py: 99%

770 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-12-20 08:55 +0000

1import csv 

2import shutil 

3import unittest 

4 

5import redis 

6from oc_meta.core.creator import Creator 

7from oc_meta.core.curator import * 

8from oc_meta.lib.file_manager import get_csv_data 

9from oc_meta.lib.finder import ResourceFinder 

10from oc_meta.plugins.multiprocess.resp_agents_curator import RespAgentsCurator 

11from oc_ocdm import Storer 

12from oc_ocdm.counter_handler.redis_counter_handler import RedisCounterHandler 

13from rdflib import Dataset, Graph 

14from sparqlite import SPARQLClient 

15 

# Endpoints and base IRI used by every test in this module.
SERVER = 'http://127.0.0.1:8805/sparql'
BASE_IRI = 'https://w3id.org/oc/meta/'

# Fixture and output locations, all relative to the ``test`` directory.
BASE_DIR = os.path.join('test')
MANUAL_DATA_CSV = f'{BASE_DIR}/manual_data.csv'
MANUAL_DATA_RDF = f'{BASE_DIR}/testcases/ts/testcase_ts-13.ttl'
REAL_DATA_CSV = os.path.join(BASE_DIR, 'real_data.csv')
REAL_DATA_RDF = f'{BASE_DIR}/testcases/ts/real_data.nt'
REAL_DATA_RDF_WITH_PROV = f'{BASE_DIR}/testcases/ts/real_data_with_prov.nq'
CURATOR_COUNTER_DIR = f'{BASE_DIR}/curator_counter'
OUTPUT_DIR = f'{BASE_DIR}/output'
PROV_CONFIG = f'{BASE_DIR}/prov_config.json'

# Redis instance that backs the counter handler used by the tests.
REDIS_HOST = 'localhost'
REDIS_PORT = 6381
REDIS_DB = 5

32 

def get_path(path:str) -> str:
    """Return ``path`` with every backslash replaced by a forward slash.

    The path is not resolved or made absolute; only the separator changes.
    """
    return '/'.join(path.split('\\'))

37 

def reset_redis_counters():
    """Flush the Redis database configured by REDIS_HOST/REDIS_PORT/REDIS_DB.

    A new client is created on each call; it is closed afterwards so that
    repeated calls (one per test setUp/tearDown) do not accumulate open
    connections.
    """
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB)
    try:
        redis_client.flushdb()
    finally:
        # Release the connection even when the flush itself fails.
        redis_client.close()

41 

def get_counter_handler():
    """Return a RedisCounterHandler pointed at the test Redis database."""
    connection_settings = {'host': REDIS_HOST, 'port': REDIS_PORT, 'db': REDIS_DB}
    return RedisCounterHandler(**connection_settings)

44 

def reset():
    """Reset the test state; currently this only flushes the Redis counters."""
    reset_redis_counters()

47 

def reset_server(server:str=SERVER) -> None:
    """Issue a ``CLEAR GRAPH`` update for each named graph used by the tests.

    :param server: SPARQL endpoint URL to send the updates to.
    """
    graphs_to_clear = {'https://w3id.org/oc/meta/br/', 'https://w3id.org/oc/meta/ra/', 'https://w3id.org/oc/meta/re/', 'https://w3id.org/oc/meta/id/', 'https://w3id.org/oc/meta/ar/', 'http://default.graph/'}
    with SPARQLClient(server, timeout=60) as sparql:
        for graph_iri in graphs_to_clear:
            sparql.update(f'CLEAR GRAPH <{graph_iri}>')

52 

def add_data_ts(server:str=SERVER, data_path:str=os.path.abspath(os.path.join('test', 'testcases', 'ts', 'real_data.nt')).replace('\\', '/'), batch_size:int=100, default_graph_uri=URIRef("http://default.graph/")):
    """Clear the triplestore and load an RDF file into it in batches.

    :param server: SPARQL endpoint URL.
    :param data_path: file to load; the extension selects the parser
        (``.nt``, ``.nq`` or ``.ttl``).
    :param batch_size: number of statements sent per ``INSERT DATA`` update.
    :param default_graph_uri: graph assigned to every triple of an ``.nt`` or
        ``.ttl`` file; ``.nq`` files keep the context of each quad.
    :raises ValueError: if the file extension is not one of the three above.
    """
    reset_server(server)
    source_file = get_path(data_path)
    suffix = os.path.splitext(source_file)[1].lower()

    # Parse the file and normalise the content to (s, p, o, graph) tuples.
    if suffix == '.nq':
        store = Dataset(default_union=True)
        store.parse(location=source_file, format='nquads')
        quads = [(s, p, o, graph) for s, p, o, graph in store.quads()]
    elif suffix in {'.nt', '.ttl'}:
        store = Graph()
        store.parse(location=source_file, format='nt' if suffix == '.nt' else 'turtle')
        quads = [(s, p, o, default_graph_uri) for s, p, o in store]
    else:
        raise ValueError(f"Unsupported file extension: {suffix}")

    with SPARQLClient(server, timeout=60) as client:
        for start in range(0, len(quads), batch_size):
            statements = []
            for s, p, o, graph in quads[start:start + batch_size]:
                if graph:
                    # Square brackets are stripped from the graph's n3() form
                    # before it is placed in the GRAPH clause.
                    graph_ref = graph.n3().replace('[', '').replace(']', '')
                    statements.append(f"GRAPH {graph_ref} {{ {s.n3()} {p.n3()} {o.n3()} }} ")
                else:
                    statements.append(f"{s.n3()} {p.n3()} {o.n3()} . ")

            payload = "".join(statements)
            client.update(f"INSERT DATA {{ {payload} }}")

91 

def store_curated_data(curator_obj:Curator, server:str) -> None:
    """Build entities from a curator's output and upload them to ``server``.

    :param curator_obj: curator whose ``data``, ``finder`` and index
        attributes are handed to ``Creator``.
    :param server: SPARQL endpoint URL passed to ``Storer.upload_all``.
    """
    entity_builder = Creator(
        curator_obj.data, curator_obj.finder, BASE_IRI, None, None,
        'https://orcid.org/0000-0002-8420-0696',
        curator_obj.index_id_ra, curator_obj.index_id_br, curator_obj.re_index,
        curator_obj.ar_index, curator_obj.VolIss)
    created = entity_builder.creator(source=None)
    Storer(created).upload_all(server, base_dir=None, batch_size=100)

99 

def prepare_to_test(data, name):
    """Curate ``data`` and load the expected fixtures of test case ``name``.

    Redis counters and the triplestore are reset first; for test cases
    numbered above 12 the ``testcase_ts-13.ttl`` fixture is loaded into the
    triplestore as well.

    :param data: rows handed to ``Curator``.
    :param name: test case number as a string; it selects the fixture files.
    :return: ``(data_curated, testcase)``, two parallel lists holding the
        curated data, the id/br, id/ra, ar and re indexes and the
        volume-issue structure, first as produced and then as expected.
    """
    reset_redis_counters()

    reset_server(SERVER)
    if float(name) > 12:
        add_data_ts(SERVER, os.path.abspath(os.path.join('test', 'testcases', 'ts', 'testcase_ts-13.ttl')).replace('\\', '/'))

    testcase_csv = get_path('test/testcases/testcase_data/testcase_' + name + '_data.csv')
    testcase_id_br = get_path('test/testcases/testcase_data/indices/' + name + '/index_id_br_' + name + '.csv')
    testcase_id_ra = get_path('test/testcases/testcase_data/indices/' + name + '/index_id_ra_' + name + '.csv')
    testcase_ar = get_path('test/testcases/testcase_data/indices/' + name + '/index_ar_' + name + '.csv')
    testcase_re = get_path('test/testcases/testcase_data/indices/' + name + '/index_re_' + name + '.csv')
    testcase_vi = get_path('test/testcases/testcase_data/indices/' + name + '/index_vi_' + name + '.json')

    counter_handler = get_counter_handler()
    settings = {'normalize_titles': True}
    curator_obj = Curator(data, SERVER, prov_config=PROV_CONFIG, counter_handler=counter_handler, settings=settings)
    curator_obj.curator()
    testcase_csv = get_csv_data(testcase_csv)
    # Identifiers are space separated; sort them so that their order does not
    # affect the comparison between expected and curated rows.
    for rows in [testcase_csv, curator_obj.data]:
        for row in rows:
            row['id'] = sorted(row['id'].split())
    testcase_id_br = get_csv_data(testcase_id_br)
    testcase_id_ra = get_csv_data(testcase_id_ra)
    testcase_ar = get_csv_data(testcase_ar)
    testcase_re = get_csv_data(testcase_re)
    # Sort each index by the first of 'id', 'meta' or 'br' that its rows have.
    for table in [testcase_id_br, testcase_id_ra, testcase_ar, testcase_re, curator_obj.index_id_br, curator_obj.index_id_ra, curator_obj.ar_index, curator_obj.re_index]:
        try:
            table.sort(key=lambda x:x['id'])
        except KeyError:
            try:
                table.sort(key=lambda x:x['meta'])
            except KeyError:
                table.sort(key=lambda x:x['br'])
    # Read the JSON fixture as UTF-8 rather than the platform default encoding.
    with open(testcase_vi, encoding='utf-8') as json_file:
        testcase_vi = json.load(json_file)
    testcase = [testcase_csv, testcase_id_br, testcase_id_ra, testcase_ar, testcase_re, testcase_vi]
    data_curated = [curator_obj.data, curator_obj.index_id_br, curator_obj.index_id_ra, curator_obj.ar_index,
                    curator_obj.re_index, curator_obj.VolIss]
    return data_curated, testcase

140 

def prepareCurator(data:list, server:str=SERVER, resp_agents_only:bool=False) -> Curator:
    """Reset the Redis counters and return a curator built on ``data``.

    :param data: rows handed to the curator.
    :param server: SPARQL endpoint URL.
    :param resp_agents_only: when true a ``RespAgentsCurator`` is built
        (without the ``settings`` argument); otherwise a ``Curator`` with
        title normalisation enabled.
    """
    reset_redis_counters()
    counter_handler = get_counter_handler()
    if resp_agents_only:
        return RespAgentsCurator(data, server, prov_config=PROV_CONFIG, counter_handler=counter_handler)
    return Curator(data, server, prov_config=PROV_CONFIG, counter_handler=counter_handler, settings={'normalize_titles': True})

150 

151 

152class test_Curator(unittest.TestCase): 

@classmethod
def setUpClass(cls):
    """Load the default dataset of ``add_data_ts`` once for the whole class."""
    add_data_ts()

156 

def setUp(self):
    """Flush the Redis counters before each test."""
    reset_redis_counters()

159 

def tearDown(self):
    """Flush the Redis counters after each test."""
    reset_redis_counters()

162 

def test_merge_entities_in_csv(self):
    """A new DOI is attached to entity '0601' and registered in the id map."""
    title = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
    doi = 'doi:10.1787/eco_outlook-v2011-2-graph138-en'
    curator = prepareCurator([])
    # The 'id' counter is preset to 4; the expected id of the DOI is '0605'.
    curator.counter_handler.set_counter(4, 'id', supplier_prefix='060')
    entity_dict = {'0601': {'ids': [], 'title': title, 'others': []}}
    id_dict = {}
    curator.merge_entities_in_csv([doi], '0601', title, entity_dict, id_dict)
    expected_output = (
        {'0601': {'ids': [doi], 'title': title, 'others': []}},
        {doi: '0605'}
    )
    self.assertEqual((entity_dict, id_dict), expected_output)

174 

def test_clean_id_list(self):
    """The OMID is split off as the metaid; the remaining ids are returned as a list."""
    raw_ids = ['doi:10.001/B-1', 'wikidata:B1111111', 'OMID:br/060101']
    expected_output = (['doi:10.001/b-1', 'wikidata:B1111111'], '060101')
    output = Curator.clean_id_list(raw_ids, br=True)
    self.assertEqual(output, expected_output)

180 

def test_equalizer(self):
    """equalizer fills an empty row with the data of an existing entity."""
    curator = prepareCurator([])
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)

    # Step 1: resolve the DOI of the row to the metaid of an existing entity.
    row = {'id': 'doi:10.1001/archderm.104.1.106', 'title': '', 'author': '', 'pub_date': '1972-12-01', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}
    metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
    curator.finder.get_everything_about_res(metavals=metavals, identifiers=identifiers, vvis=vvis)
    curator.log[0] = {'id': {}}
    curator.clean_id(row)
    extracted_metaval = row['id']
    self.assertEqual(extracted_metaval, '3757')

    # Step 2: run equalizer on a fresh row that only carries a publication date.
    row = {'id': '', 'title': '', 'author': '', 'pub_date': '1972-12-01', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}
    curator.rowcnt = 0
    log_fields = ('id', 'author', 'venue', 'editor', 'publisher', 'page', 'volume', 'issue', 'pub_date', 'type', 'title')
    curator.log[0] = {field: {} for field in log_fields}
    curator.equalizer(row, extracted_metaval)
    output = (curator.log, row)

    expected_log = {0: {'id': {'status': 'Entity already exists'}, 'author': {}, 'venue': {}, 'editor': {}, 'publisher': {}, 'page': {}, 'volume': {}, 'issue': {}, 'pub_date': {'status': 'New value proposed'}, 'type': {}, 'title': {}}}
    expected_row = {'id': '', 'title': '', 'author': 'Curth, W. [omid:ra/6033]', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416 issn:0003-987X]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': 'American Medical Association (ama) [omid:ra/3309 crossref:10]', 'editor': ''}
    expected_output = (expected_log, expected_row)
    self.assertEqual(output, expected_output)

220 

def test_clean_id_metaid_not_in_ts(self):
    """An OMID that is not in the triplestore is replaced by a wannabe id."""
    row = {'id': 'omid:br/131313', 'title': 'Multiple Keloids', 'author': '', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}
    # Only the 'id' field is expected to change.
    expected_output = dict(row, id='wannabe_0')
    curator = prepareCurator([])
    curator.log[0] = {'id': {}}
    curator.clean_id(row)
    self.assertEqual(row, expected_output)

229 

def test_clean_id(self):
    """A known DOI resolves to metaid '3757' and the stored title replaces the row's."""
    row = {'id': 'doi:10.1001/archderm.104.1.106', 'title': 'Multiple Blasto', 'author': '', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}
    # Only 'id' and 'title' are expected to change.
    expected_output = dict(row, id='3757', title='Multiple Keloids')
    curator = prepareCurator([])
    curator.log[0] = {'id': {}}
    curator.finder.get_everything_about_res(metavals=set(), identifiers={'doi:10.1001/archderm.104.1.106'}, vvis=set())
    curator.clean_id(row)
    self.assertEqual(row, expected_output)

238 

def test_merge_duplicate_entities(self):
    """Rows linked to an existing entity all take over that entity's data."""
    data = [
        {'id': 'doi:10.1001/archderm.104.1.106', 'title': 'Multiple Keloids', 'author': '', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''},
        {'id': '', 'title': 'Multiple Keloids', 'author': '', 'pub_date': '1971-07-02', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''},
        {'id': '', 'title': 'Multiple Keloids', 'author': '', 'pub_date': '1971-07-03', 'venue': 'Archives Of Blast [omid:br/4416]', 'volume': '105', 'issue': '2', 'page': '106-108', 'type': 'journal volume', 'publisher': '', 'editor': ''},
    ]
    curator = prepareCurator([])
    curator.data = data
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)

    # Gather what every row refers to, then query the finder once.
    all_metavals, all_identifiers, all_vvis = set(), set(), set()
    for row in data:
        metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
        all_metavals |= metavals
        all_identifiers |= identifiers
        all_vvis |= vvis
    curator.finder.get_everything_about_res(metavals=all_metavals, identifiers=all_identifiers, vvis=all_vvis)

    # clean_id replaces each row's 'id' with its metaval.
    for position, row in enumerate(data):
        curator.log[position] = {'id': {}}
        curator.rowcnt = position
        curator.clean_id(row)

    # Start merge_duplicate_entities from an empty log for all three rows.
    log_fields = ('id', 'author', 'venue', 'editor', 'publisher', 'page', 'volume', 'issue', 'pub_date', 'type')
    for position in range(len(data)):
        curator.log[position] = {field: {} for field in log_fields}

    # The first row must have resolved to the existing entity.
    first_row_metaval = curator.data[0]['id']
    self.assertEqual(first_row_metaval, '3757')

    # Declare the two wannabes as duplicates of the existing entity.
    if first_row_metaval in curator.brdict:
        curator.brdict[first_row_metaval]['others'].extend(['wannabe_0', 'wannabe_1'])

    curator.merge_duplicate_entities()
    output = (curator.data, curator.log)

    merged_row = {'id': '3757', 'title': 'Multiple Keloids', 'author': 'Curth, W. [omid:ra/6033]', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [issn:0003-987X omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': 'American Medical Association (ama) [omid:ra/3309 crossref:10]', 'editor': ''}
    expected_log = {
        0: {'id': {'status': 'Entity already exists'}, 'author': {}, 'venue': {}, 'editor': {}, 'publisher': {}, 'page': {}, 'volume': {}, 'issue': {}, 'pub_date': {}, 'type': {}},
        1: {'id': {'status': 'Entity already exists'}, 'author': {}, 'venue': {'status': 'New value proposed'}, 'editor': {}, 'publisher': {}, 'page': {}, 'volume': {}, 'issue': {}, 'pub_date': {'status': 'New value proposed'}, 'type': {}},
        2: {'id': {'status': 'Entity already exists'}, 'author': {}, 'venue': {'status': 'New value proposed'}, 'editor': {}, 'publisher': {}, 'page': {'status': 'New value proposed'}, 'volume': {'status': 'New value proposed'}, 'issue': {'status': 'New value proposed'}, 'pub_date': {'status': 'New value proposed'}, 'type': {'status': 'New value proposed'}},
    }
    expected_output = ([dict(merged_row) for _ in range(3)], expected_log)
    self.assertEqual(output, expected_output)

309 

def test_clean_vvi_all_data_on_ts(self):
    """Venue, volume and issue already in the triplestore are retrieved with their ids."""
    curator = prepareCurator([])
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)
    row = {'id': 'doi:10.1001/archderm.104.1.106', 'title': 'Multiple Keloids', 'author': '', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}

    metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
    curator.finder.get_everything_about_res(metavals=metavals, identifiers=identifiers, vvis=vvis)

    curator.log[0] = {'id': {}}
    curator.clean_id(row)
    curator.clean_vvi(row)

    expected_output = {"4416": {"issue": {}, "volume": {"104": {"id": "4712", "issue": {"1": {"id": "4713"}}}}}}
    self.assertEqual(curator.vvi, expected_output)

339 

def test_clean_vvi_new_venue(self):
    """A venue without identifiers gets wannabe ids for venue, volume and issue."""
    curator = prepareCurator([])
    row = {'id': 'wannabe_1', 'title': 'Money growth, interest rates, inflation and raw materials prices: China', 'author': '', 'pub_date': '2011-11-28', 'venue': 'OECD Economic Outlook', 'volume': '2011', 'issue': '2', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}
    curator.clean_vvi(row)
    expected_output = {
        'wannabe_0': {
            'volume': {'2011': {'id': 'wannabe_1', 'issue': {'2': {'id': 'wannabe_2'}}}},
            'issue': {},
        }
    }
    self.assertEqual(curator.vvi, expected_output)

347 

def test_clean_vvi_volume_with_title(self):
    """A titled journal volume keeps its venue, but volume and issue come out empty."""
    rows = [{'id': '', 'title': 'The volume title', 'author': '', 'pub_date': '', 'venue': 'OECD Economic Outlook', 'volume': '2011', 'issue': '2', 'page': '', 'type': 'journal volume', 'publisher': '', 'editor': ''}]
    expected_output = [{'id': 'omid:br/0601', 'title': 'The Volume Title', 'author': '', 'pub_date': '', 'venue': 'OECD Economic Outlook [omid:br/0602]', 'volume': '', 'issue': '', 'page': '', 'type': 'journal volume', 'publisher': '', 'editor': ''}]
    curator = prepareCurator(rows)
    curator.curator()
    self.assertEqual(curator.data, expected_output)

355 

def test_clean_vvi_invalid_volume(self):
    """A 'journal volume' row that also names an issue registers no volume or issue."""
    curator = prepareCurator([])
    row = {'id': 'wannabe_1', 'title': '', 'author': '', 'pub_date': '', 'venue': 'OECD Economic Outlook', 'volume': '2011', 'issue': '2', 'page': '', 'type': 'journal volume', 'publisher': '', 'editor': ''}
    curator.clean_vvi(row)
    # Only the venue entry is created; its volume and issue maps stay empty.
    expected_output = {'wannabe_0': {'volume': {}, 'issue': {}}}
    self.assertEqual(curator.vvi, expected_output)

363 

def test_clean_vvi_invalid_venue(self):
    """A 'journal' row that also names a volume registers no volume or issue."""
    curator = prepareCurator([])
    row = {'id': 'wannabe_1', 'title': '', 'author': '', 'pub_date': '', 'venue': 'OECD Economic Outlook', 'volume': '2011', 'issue': '', 'page': '', 'type': 'journal', 'publisher': '', 'editor': ''}
    curator.clean_vvi(row)
    # Only the venue entry is created; its volume and issue maps stay empty.
    expected_output = {'wannabe_0': {'volume': {}, 'issue': {}}}
    self.assertEqual(curator.vvi, expected_output)

371 

def test_clean_vvi_new_volume_and_issue(self):
    """A row with venue, volume and issue but no id of its own fills the vvi structure."""
    curator = prepareCurator([])
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)
    row = {'id': '', 'title': '', 'author': '', 'pub_date': '', 'venue': 'Archives Of Surgery [omid:br/4480]', 'volume': '147', 'issue': '11', 'page': '', 'type': 'journal article', 'publisher': '', 'editor': ''}

    metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
    curator.finder.get_everything_about_res(metavals=metavals, identifiers=identifiers, vvis=vvis)
    curator.clean_id(row)
    curator.clean_vvi(row)

    expected_output = {"4480": {"issue": {}, "volume": {"147": {"id": "4481", "issue": {"11": {"id": "4482"}}}}}}
    self.assertEqual(curator.vvi, expected_output)

398 

def test_clean_ra_overlapping_surnames(self):
    """Authors stay distinct even when one surname is contained in another."""
    title = 'Giant Oyster Mushroom Pleurotus giganteus (Agaricomycetes) Enhances Adipocyte Differentiation and Glucose Uptake via Activation of PPARγ and Glucose Transporters 1 and 4 in 3T3-L1 Cells'
    row = {'id': 'wannabe_0', 'title': title, 'author': 'Paravamsivam, Puvaneswari; Heng, Chua Kek; Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]; Sabaratnam, Vikineswary; M, Ravishankar Ram; Kuppusamy, Umah Rani', 'pub_date': '2016', 'venue': 'International Journal of Medicinal Mushrooms [issn:1521-9437]', 'volume': '18', 'issue': '9', 'page': '821-831', 'type': 'journal article', 'publisher': 'Begell House [crossref:613]', 'editor': ''}
    curator = prepareCurator([])
    curator.brdict = {'wannabe_0': {'ids': ['doi:10.1615/intjmedmushrooms.v18.i9.60'], 'title': title, 'others': []}}
    curator.clean_ra(row, 'author')
    output = (curator.ardict, curator.radict, curator.idra)

    expected_ardict = {'wannabe_0': {'author': [('0601', 'wannabe_0'), ('0602', 'wannabe_1'), ('0603', 'wannabe_2'), ('0604', 'wannabe_3'), ('0605', 'wannabe_4'), ('0606', 'wannabe_5')], 'editor': [], 'publisher': []}}
    expected_radict = {
        'wannabe_0': {'ids': [], 'others': [], 'title': 'Paravamsivam, Puvaneswari'},
        'wannabe_1': {'ids': [], 'others': [], 'title': 'Heng, Chua Kek'},
        'wannabe_2': {'ids': ['orcid:0000-0001-6278-8559'], 'others': [], 'title': 'Malek, Sri Nurestri Abdul'},
        'wannabe_3': {'ids': [], 'others': [], 'title': 'Sabaratnam, Vikineswary'},
        'wannabe_4': {'ids': [], 'others': [], 'title': 'M, Ravishankar Ram'},
        'wannabe_5': {'ids': [], 'others': [], 'title': 'Kuppusamy, Umah Rani'},
    }
    expected_idra = {'orcid:0000-0001-6278-8559': '0601'}
    expected_output = (expected_ardict, expected_radict, expected_idra)
    self.assertEqual(output, expected_output)

412 

def test_clean_ra_with_br_metaid(self):
    """One author is already in the triplestore and one is new with two ids.

    The bibliographic resource resolves to a metaid rather than a wannabe.
    """
    curator = prepareCurator([])
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)
    row = {'id': 'doi:10.1001/archderm.104.1.106', 'title': 'Multiple Keloids', 'author': 'Curth, W.; McSorley, J. [orcid:0000-0003-0530-4305 schema:12345]', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}
    metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
    curator.finder.get_everything_about_res(metavals=metavals, identifiers=identifiers, vvis=vvis)

    curator.log[0] = {'id': {}}
    curator.clean_id(row)
    resolved_metaval = row['id']
    self.assertEqual(resolved_metaval, '3757')

    curator.brdict = {resolved_metaval: {'ids': ['doi:10.1001/archderm.104.1.106'], 'title': 'Multiple Keloids', 'others': []}}
    curator.clean_ra(row, 'author')
    output = (curator.ardict, curator.radict, curator.idra)

    expected_ardict = {'3757': {'author': [('9445', '6033'), ('0601', 'wannabe_0')], 'editor': [], 'publisher': []}}
    expected_radict = {
        '6033': {'ids': [], 'others': [], 'title': 'Curth, W.'},
        'wannabe_0': {'ids': ['orcid:0000-0003-0530-4305', 'schema:12345'], 'others': [], 'title': 'McSorley, J.'},
    }
    expected_idra = {'orcid:0000-0003-0530-4305': '0601', 'schema:12345': '0602'}
    expected_output = (expected_ardict, expected_radict, expected_idra)
    self.assertEqual(output, expected_output)

438 

def test_clean_ra_with_br_wannabe(self):
    """Both authors are new and the bibliographic resource is a wannabe."""
    row = {'id': 'wannabe_0', 'title': 'Multiple Keloids', 'author': 'Curth, W. [orcid:0000-0002-8420-0696] ; McSorley, J. [orcid:0000-0003-0530-4305]', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}
    curator = prepareCurator([])
    curator.brdict = {'wannabe_0': {'ids': ['doi:10.1001/archderm.104.1.106'], 'title': 'Multiple Keloids', 'others': []}}
    # The wannabe counter starts at 1; the authors are expected as wannabe_1 and wannabe_2.
    curator.wnb_cnt = 1
    curator.clean_ra(row, 'author')
    output = (curator.ardict, curator.radict, curator.idra)

    expected_ardict = {'wannabe_0': {'author': [('0601', 'wannabe_1'), ('0602', 'wannabe_2')], 'editor': [], 'publisher': []}}
    expected_radict = {
        'wannabe_1': {'ids': ['orcid:0000-0002-8420-0696'], 'others': [], 'title': 'Curth, W.'},
        'wannabe_2': {'ids': ['orcid:0000-0003-0530-4305'], 'others': [], 'title': 'McSorley, J.'},
    }
    expected_idra = {'orcid:0000-0002-8420-0696': '0601', 'orcid:0000-0003-0530-4305': '0602'}
    expected_output = (expected_ardict, expected_radict, expected_idra)
    self.assertEqual(output, expected_output)

454 

def test_clean_ra_with_empty_square_brackets(self):
    """An author followed by empty square brackets is stored without identifiers."""
    curator = prepareCurator([])
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)
    row = {'id': 'doi:10.1001/archderm.104.1.106', 'title': 'Multiple Keloids', 'author': 'Bernacki, Edward J. [ ]', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}

    metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
    curator.finder.get_everything_about_res(metavals=metavals, identifiers=identifiers, vvis=vvis)

    curator.log[0] = {'id': {}}
    curator.clean_id(row)
    resolved_metaval = row['id']
    self.assertEqual(resolved_metaval, '3757')

    curator.brdict = {resolved_metaval: {'ids': ['doi:10.1001/archderm.104.1.106'], 'title': 'Multiple Keloids', 'others': []}}
    curator.clean_ra(row, 'author')
    output = (curator.ardict, curator.radict, curator.idra)

    expected_ardict = {'3757': {'author': [('9445', '6033'), ('0601', 'wannabe_0')], 'editor': [], 'publisher': []}}
    expected_radict = {
        '6033': {'ids': [], 'others': [], 'title': 'Curth, W.'},
        'wannabe_0': {'ids': [], 'others': [], 'title': 'Bernacki, Edward J.'},
    }
    expected_output = (expected_ardict, expected_radict, {})
    self.assertEqual(output, expected_output)

479 

def test_meta_maker(self):
    """meta_maker adds OMIDs to the ids and replaces wannabe keys with metaids."""
    curator = prepareCurator([])
    curator.brdict = {
        '3757': {'ids': ['doi:10.1001/archderm.104.1.106', 'pmid:29098884'], 'title': 'Multiple Keloids', 'others': []},
        '4416': {'ids': ['issn:0003-987X'], 'title': 'Archives Of Dermatology', 'others': []},
    }
    curator.radict = {
        '6033': {'ids': [], 'others': [], 'title': 'Curth, W.'},
        'wannabe_0': {'ids': ['orcid:0000-0003-0530-4305', 'schema:12345'], 'others': [], 'title': 'Mcsorley, J.'},
    }
    curator.ardict = {'3757': {'author': [('9445', '6033'), ('0601', 'wannabe_0')], 'editor': [], 'publisher': []}}
    curator.vvi = {'4416': {'issue': {}, 'volume': {
        '107': {'id': '4733', 'issue': {'1': {'id': '4734'}, '2': {'id': '4735'}, '3': {'id': '4736'}, '4': {'id': '4737'}, '5': {'id': '4738'}, '6': {'id': '4739'}}},
        '108': {'id': '4740', 'issue': {'1': {'id': '4741'}, '2': {'id': '4742'}, '3': {'id': '4743'}, '4': {'id': '4744'}}},
        '104': {'id': '4712', 'issue': {'1': {'id': '4713'}, '2': {'id': '4714'}, '3': {'id': '4715'}, '4': {'id': '4716'}, '5': {'id': '4717'}, '6': {'id': '4718'}}},
        '148': {'id': '4417', 'issue': {'12': {'id': '4418'}, '11': {'id': '4419'}}},
        '100': {'id': '4684', 'issue': {'1': {'id': '4685'}, '2': {'id': '4686'}, '3': {'id': '4687'}, '4': {'id': '4688'}, '5': {'id': '4689'}, '6': {'id': '4690'}}},
        '101': {'id': '4691', 'issue': {'1': {'id': '4692'}, '2': {'id': '4693'}, '3': {'id': '4694'}, '4': {'id': '4695'}, '5': {'id': '4696'}, '6': {'id': '4697'}}},
        '102': {'id': '4698', 'issue': {'1': {'id': '4699'}, '2': {'id': '4700'}, '3': {'id': '4701'}, '4': {'id': '4702'}, '5': {'id': '4703'}, '6': {'id': '4704'}}},
        '103': {'id': '4705', 'issue': {'1': {'id': '4706'}, '2': {'id': '4707'}, '3': {'id': '4708'}, '4': {'id': '4709'}, '5': {'id': '4710'}, '6': {'id': '4711'}}},
        '105': {'id': '4719', 'issue': {'1': {'id': '4720'}, '2': {'id': '4721'}, '3': {'id': '4722'}, '4': {'id': '4723'}, '5': {'id': '4724'}, '6': {'id': '4725'}}},
        '106': {'id': '4726', 'issue': {'6': {'id': '4732'}, '1': {'id': '4727'}, '2': {'id': '4728'}, '3': {'id': '4729'}, '4': {'id': '4730'}, '5': {'id': '4731'}}},
    }}}
    curator.meta_maker()
    output = (curator.brmeta, curator.rameta, curator.armeta)

    expected_brmeta = {
        '3757': {'ids': ['doi:10.1001/archderm.104.1.106', 'pmid:29098884', 'omid:br/3757'], 'title': 'Multiple Keloids', 'others': []},
        '4416': {'ids': ['issn:0003-987X', 'omid:br/4416'], 'title': 'Archives Of Dermatology', 'others': []},
    }
    expected_rameta = {
        '6033': {'ids': ['omid:ra/6033'], 'others': [], 'title': 'Curth, W.'},
        '0601': {'ids': ['orcid:0000-0003-0530-4305', 'schema:12345', 'omid:ra/0601'], 'others': ['wannabe_0'], 'title': 'Mcsorley, J.'},
    }
    expected_armeta = {'3757': {'author': [('9445', '6033'), ('0601', '0601')], 'editor': [], 'publisher': []}}
    expected_output = (expected_brmeta, expected_rameta, expected_armeta)
    self.assertEqual(output, expected_output)

494 

def test_enricher(self):
    """enrich() rewrites wannabe references in the row using the *meta dictionaries."""
    title = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
    curator = prepareCurator([])
    curator.data = [{'id': 'wannabe_0', 'title': title, 'author': '', 'pub_date': '2011-11-28', 'venue': 'wannabe_1', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': 'OECD [crossref:1963]', 'editor': ''}]
    curator.brmeta = {
        '0601': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en', 'omid:br/0601'], 'others': ['wannabe_0'], 'title': title},
        '0602': {'ids': ['omid:br/0604'], 'others': ['wannabe_1'], 'title': 'OECD Economic Outlook'}
    }
    curator.armeta = {'0601': {'author': [], 'editor': [], 'publisher': [('0601', '0601')]}}
    curator.rameta = {'0601': {'ids': ['crossref:1963', 'omid:ra/0601'], 'others': ['wannabe_2'], 'title': 'Oecd'}}
    curator.remeta = {}
    curator.meta_maker()
    curator.enrich()
    output = curator.data
    expected_output = [{'id': 'doi:10.1787/eco_outlook-v2011-2-graph138-en omid:br/0601', 'title': title, 'author': '', 'pub_date': '2011-11-28', 'venue': 'OECD Economic Outlook [omid:br/0604]', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': 'Oecd [crossref:1963 omid:ra/0601]', 'editor': ''}]
    self.assertEqual(output, expected_output)

510 

511 def test_indexer(self): 

512 """Test that indexer() correctly transforms internal dicts to list-of-dicts.""" 

513 curator = prepareCurator(list()) 

514 curator.filename = '0.csv' 

515 curator.idra = {'orcid:0000-0003-0530-4305': '0601', 'schema:12345': '0602'} 

516 curator.idbr = {'doi:10.1001/2013.jamasurg.270': '2585'} 

517 curator.armeta = {'2585': {'author': [('9445', '0602'), ('0601', '0601')], 'editor': [], 'publisher': []}} 

518 curator.remeta = dict() 

519 curator.brmeta = { 

520 '0601': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en', 'omid:br/0601'], 'others': ['wannabe_0'], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'}, 

521 '0602': {'ids': ['omid:br/0602'], 'others': ['wannabe_1'], 'title': 'OECD Economic Outlook'} 

522 } 

523 curator.vvi = { 

524 'wannabe_1': { 

525 'issue': {}, 

526 'volume': { 

527 '107': {'id': '4733', 'issue': {'1': {'id': '4734'}, '2': {'id': '4735'}, '3': {'id': '4736'}, '4': {'id': '4737'}, '5': {'id': '4738'}, '6': {'id': '4739'}}}, 

528 '108': {'id': '4740', 'issue': {'1': {'id': '4741'}, '2': {'id': '4742'}, '3': {'id': '4743'}, '4': {'id': '4744'}}}, 

529 '104': {'id': '4712', 'issue': {'1': {'id': '4713'}, '2': {'id': '4714'}, '3': {'id': '4715'}, '4': {'id': '4716'}, '5': {'id': '4717'}, '6': {'id': '4718'}}}, 

530 '148': {'id': '4417', 'issue': {'12': {'id': '4418'}, '11': {'id': '4419'}}}, 

531 '100': {'id': '4684', 'issue': {'1': {'id': '4685'}, '2': {'id': '4686'}, '3': {'id': '4687'}, '4': {'id': '4688'}, '5': {'id': '4689'}, '6': {'id': '4690'}}}, 

532 '101': {'id': '4691', 'issue': {'1': {'id': '4692'}, '2': {'id': '4693'}, '3': {'id': '4694'}, '4': {'id': '4695'}, '5': {'id': '4696'}, '6': {'id': '4697'}}}, 

533 '102': {'id': '4698', 'issue': {'1': {'id': '4699'}, '2': {'id': '4700'}, '3': {'id': '4701'}, '4': {'id': '4702'}, '5': {'id': '4703'}, '6': {'id': '4704'}}}, 

534 '103': {'id': '4705', 'issue': {'1': {'id': '4706'}, '2': {'id': '4707'}, '3': {'id': '4708'}, '4': {'id': '4709'}, '5': {'id': '4710'}, '6': {'id': '4711'}}}, 

535 '105': {'id': '4719', 'issue': {'1': {'id': '4720'}, '2': {'id': '4721'}, '3': {'id': '4722'}, '4': {'id': '4723'}, '5': {'id': '4724'}, '6': {'id': '4725'}}}, 

536 '106': {'id': '4726', 'issue': {'6': {'id': '4732'}, '1': {'id': '4727'}, '2': {'id': '4728'}, '3': {'id': '4729'}, '4': {'id': '4730'}, '5': {'id': '4731'}}} 

537 } 

538 } 

539 } 

540 curator.meta_maker() 

541 curator.indexer() 

542 # Test in-memory data structures 

543 expected_index_ar = [{'meta': '2585', 'author': '9445, 0602; 0601, 0601', 'editor': '', 'publisher': ''}] 

544 expected_index_id_br = [{'id': 'doi:10.1001/2013.jamasurg.270', 'meta': '2585'}] 

545 expected_index_id_ra = [{'id': 'orcid:0000-0003-0530-4305', 'meta': '0601'}, {'id': 'schema:12345', 'meta': '0602'}] 

546 expected_index_re = [{'br': '', 're': ''}] 

547 expected_index_vi = {'0602': {'issue': {}, 'volume': {'107': {'id': '4733', 'issue': {'1': {'id': '4734'}, '2': {'id': '4735'}, '3': {'id': '4736'}, '4': {'id': '4737'}, '5': {'id': '4738'}, '6': {'id': '4739'}}}, '108': {'id': '4740', 'issue': {'1': {'id': '4741'}, '2': {'id': '4742'}, '3': {'id': '4743'}, '4': {'id': '4744'}}}, '104': {'id': '4712', 'issue': {'1': {'id': '4713'}, '2': {'id': '4714'}, '3': {'id': '4715'}, '4': {'id': '4716'}, '5': {'id': '4717'}, '6': {'id': '4718'}}}, '148': {'id': '4417', 'issue': {'12': {'id': '4418'}, '11': {'id': '4419'}}}, '100': {'id': '4684', 'issue': {'1': {'id': '4685'}, '2': {'id': '4686'}, '3': {'id': '4687'}, '4': {'id': '4688'}, '5': {'id': '4689'}, '6': {'id': '4690'}}}, '101': {'id': '4691', 'issue': {'1': {'id': '4692'}, '2': {'id': '4693'}, '3': {'id': '4694'}, '4': {'id': '4695'}, '5': {'id': '4696'}, '6': {'id': '4697'}}}, '102': {'id': '4698', 'issue': {'1': {'id': '4699'}, '2': {'id': '4700'}, '3': {'id': '4701'}, '4': {'id': '4702'}, '5': {'id': '4703'}, '6': {'id': '4704'}}}, '103': {'id': '4705', 'issue': {'1': {'id': '4706'}, '2': {'id': '4707'}, '3': {'id': '4708'}, '4': {'id': '4709'}, '5': {'id': '4710'}, '6': {'id': '4711'}}}, '105': {'id': '4719', 'issue': {'1': {'id': '4720'}, '2': {'id': '4721'}, '3': {'id': '4722'}, '4': {'id': '4723'}, '5': {'id': '4724'}, '6': {'id': '4725'}}}, '106': {'id': '4726', 'issue': {'6': {'id': '4732'}, '1': {'id': '4727'}, '2': {'id': '4728'}, '3': {'id': '4729'}, '4': {'id': '4730'}, '5': {'id': '4731'}}}}}} 

548 # Sort for comparison (order may vary due to dict iteration) 

549 curator.index_id_ra.sort(key=lambda x: x['id']) 

550 expected_index_id_ra.sort(key=lambda x: x['id']) 

551 self.assertEqual(curator.ar_index, expected_index_ar) 

552 self.assertEqual(curator.index_id_br, expected_index_id_br) 

553 self.assertEqual(curator.index_id_ra, expected_index_id_ra) 

554 self.assertEqual(curator.re_index, expected_index_re) 

555 self.assertEqual(curator.VolIss, expected_index_vi) 

556 

557 def test_is_a_valid_row(self): 

558 rows = [ 

559 {'id': '', 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}, 

560 {'id': '', 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '1', 'issue': '', 'page': '', 'type': 'journal volume', 'publisher': '', 'editor': ''}, 

561 {'id': '', 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '', 'issue': '1', 'page': '', 'type': 'journal issue', 'publisher': '', 'editor': ''}, 

562 {'id': 'doi:10.1001/2013.jamasurg.270', 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}, 

563 {'id': '', 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China', 'author': 'Deckert, Ron J. [orcid:0000-0003-2100-6412]', 'pub_date': '', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}, 

564 {'id': '', 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China', 'author': 'Deckert, Ron J. [orcid:0000-0003-2100-6412]', 'pub_date': '03-01-2020', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': 'book'}, 

565 {'id': 'doi:10.1001/2013.jamasurg.270', 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '5', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''} 

566 ] 

567 output = [] 

568 for row in rows: 

569 output.append(is_a_valid_row(row)) 

570 expected_output = [False, False, False, True, False, True, False] 

571 self.assertEqual(output, expected_output) 

572 

573 def test_get_preexisting_entities(self): 

574 row = {'id': 'omid:br/2715', 'title': 'Image Of The Year For 2012', 'author': '', 'pub_date': '', 'venue': 'Archives Of Surgery [omid:br/4480]', 'volume': '99', 'issue': '1', 'page': '', 'type': 'journal article', 'publisher': '', 'editor': ''} 

575 curator = prepareCurator(data=[row]) 

576 curator.curator() 

577 expected_output = ( 

578 {'id/4270', 'ra/3309', 'ar/7240', 'br/4481', 'br/2715', 'br/4480', 'id/4274', 'id/2581', 'br/4487', 're/2350'}, 

579 [{'id': 'doi:10.1001/2013.jamasurg.202 omid:br/2715', 'title': 'Image Of The Year For 2012', 'author': '', 'pub_date': '2012-12-01', 'venue': 'Archives Of Surgery [issn:0004-0010 omid:br/4480]', 'volume': '147', 'issue': '12', 'page': '1140-1140', 'type': 'journal article', 'publisher': 'American Medical Association (ama) [crossref:10 omid:ra/3309]', 'editor': ''}] 

580 ) 

581 self.assertEqual((curator.preexisting_entities, curator.data), expected_output) 

582 

583 

class test_RespAgentsCurator(unittest.TestCase):
    """Tests for the curator obtained with ``resp_agents_only=True``.

    Every test feeds rows that carry only a responsible agent (publisher or
    author) and then inspects ``data``, ``radict``, ``idra`` and ``rameta``.
    """

    @staticmethod
    def _row(**filled):
        """Return a CSV row whose columns are all empty except those in *filled*."""
        blank = dict.fromkeys(
            ('id', 'title', 'author', 'pub_date', 'venue', 'volume',
             'issue', 'page', 'type', 'publisher', 'editor'),
            '',
        )
        blank.update(filled)
        return blank

    def test_curator_publishers(self):
        """Curate three publisher-only rows and check the resulting agent structures."""
        reset()
        data = [
            self._row(publisher='American Medical Association (AMA) [crossref:10 crossref:9999]'),
            self._row(publisher='Elsevier BV [crossref:78]'),
            self._row(publisher='Wiley [crossref:311]'),
        ]
        agents_curator = prepareCurator(data=data, server=SERVER, resp_agents_only=True)
        agents_curator.curator(filename=None, path_csv=None)

        ama = {
            'ids': ['crossref:10', 'crossref:9999', 'omid:ra/3309'],
            'others': [],
            'title': 'American Medical Association (ama)',
        }
        elsevier = {'ids': ['crossref:78', 'omid:ra/0601'], 'others': ['wannabe_0'], 'title': 'Elsevier Bv'}
        wiley = {'ids': ['crossref:311', 'omid:ra/0602'], 'others': ['wannabe_1'], 'title': 'Wiley'}

        wanted_rows = [
            self._row(publisher='American Medical Association (ama) [crossref:10 crossref:9999 omid:ra/3309]'),
            self._row(publisher='Elsevier Bv [crossref:78 omid:ra/0601]'),
            self._row(publisher='Wiley [crossref:311 omid:ra/0602]'),
        ]
        # radict is keyed by temporary ids for the two new agents ...
        wanted_radict = {'3309': ama, 'wannabe_0': elsevier, 'wannabe_1': wiley}
        wanted_idra = {'crossref:10': '4274', 'crossref:9999': '0601', 'crossref:78': '0602', 'crossref:311': '0603'}
        # ... while rameta is keyed by their final meta ids.
        wanted_rameta = {'3309': ama, '0601': elsevier, '0602': wiley}

        actual = (agents_curator.data, agents_curator.radict, agents_curator.idra, agents_curator.rameta)
        self.assertEqual(actual, (wanted_rows, wanted_radict, wanted_idra, wanted_rameta))

    def test_curator(self):
        """Curate three author-only rows, none of which is expected to match an existing agent."""
        reset()
        data = [
            self._row(author='Deckert, Ron J. [orcid:0000-0003-2100-6412]'),
            self._row(author='Ruso, Juan M. [orcid:0000-0001-5909-6754]'),
            self._row(author='Sarmiento, Félix [orcid:0000-0002-4487-6894]'),
        ]
        agents_curator = prepareCurator(data=data, server=SERVER, resp_agents_only=True)
        agents_curator.curator(filename='resp_agents_curator_output', path_csv='test/testcases/testcase_data')

        deckert = {'ids': ['orcid:0000-0003-2100-6412', 'omid:ra/0601'], 'others': ['wannabe_0'], 'title': 'Deckert, Ron J.'}
        ruso = {'ids': ['orcid:0000-0001-5909-6754', 'omid:ra/0602'], 'others': ['wannabe_1'], 'title': 'Ruso, Juan M.'}
        sarmiento = {'ids': ['orcid:0000-0002-4487-6894', 'omid:ra/0603'], 'others': ['wannabe_2'], 'title': 'Sarmiento, Félix'}

        wanted_rows = [
            self._row(author='Deckert, Ron J. [orcid:0000-0003-2100-6412 omid:ra/0601]'),
            self._row(author='Ruso, Juan M. [orcid:0000-0001-5909-6754 omid:ra/0602]'),
            self._row(author='Sarmiento, Félix [orcid:0000-0002-4487-6894 omid:ra/0603]'),
        ]
        wanted_radict = {'wannabe_0': deckert, 'wannabe_1': ruso, 'wannabe_2': sarmiento}
        wanted_idra = {
            'orcid:0000-0003-2100-6412': '0601',
            'orcid:0000-0001-5909-6754': '0602',
            'orcid:0000-0002-4487-6894': '0603',
        }
        wanted_rameta = {'0601': deckert, '0602': ruso, '0603': sarmiento}

        actual = (agents_curator.data, agents_curator.radict, agents_curator.idra, agents_curator.rameta)
        self.assertEqual(actual, (wanted_rows, wanted_radict, wanted_idra, wanted_rameta))

    def test_curator_ra_on_ts(self):
        """Curate three authors after loading real_data.nt; one is already on the triplestore."""
        self.maxDiff = None
        # A responsible agent is already on the triplestore
        dump_path = os.path.abspath(os.path.join('test', 'testcases', 'ts', 'real_data.nt'))
        add_data_ts(server=SERVER, data_path=dump_path.replace('\\', '/'))

        data = [
            self._row(author='Deckert, Ron J. [orcid:0000-0003-2100-6412]'),
            self._row(author='Mehrotra, Ateev [orcid:0000-0003-2223-1582]'),
            self._row(author='Sarmiento, Félix [orcid:0000-0002-4487-6894]'),
        ]
        agents_curator = prepareCurator(data=data, server=SERVER, resp_agents_only=True)
        agents_curator.curator()

        deckert = {'ids': ['orcid:0000-0003-2100-6412', 'omid:ra/0601'], 'others': ['wannabe_0'], 'title': 'Deckert, Ron J.'}
        mehrotra = {'ids': ['orcid:0000-0003-2223-1582', 'omid:ra/3976'], 'others': [], 'title': 'Mehrotra, Ateev'}
        sarmiento = {'ids': ['orcid:0000-0002-4487-6894', 'omid:ra/0602'], 'others': ['wannabe_1'], 'title': 'Sarmiento, Félix'}

        wanted_rows = [
            self._row(author='Deckert, Ron J. [orcid:0000-0003-2100-6412 omid:ra/0601]'),
            self._row(author='Mehrotra, Ateev [orcid:0000-0003-2223-1582 omid:ra/3976]'),
            self._row(author='Sarmiento, Félix [orcid:0000-0002-4487-6894 omid:ra/0602]'),
        ]
        wanted_radict = {'wannabe_0': deckert, '3976': mehrotra, 'wannabe_1': sarmiento}
        wanted_idra = {
            'orcid:0000-0003-2100-6412': '0601',
            'orcid:0000-0003-2223-1582': '4351',
            'orcid:0000-0002-4487-6894': '0602',
        }
        wanted_rameta = {'0601': deckert, '3976': mehrotra, '0602': sarmiento}

        actual = (agents_curator.data, agents_curator.radict, agents_curator.idra, agents_curator.rameta)
        self.assertEqual(actual, (wanted_rows, wanted_radict, wanted_idra, wanted_rameta))

666 

667 

class test_id_worker(unittest.TestCase):
    """Exercise the curator's id_worker() and conflict() against preloaded data.

    ``setUpClass`` loads ``real_data.nt`` into the triplestore at ``SERVER``
    and builds one ``ResourceFinder`` that most tests attach to their curator.

    Tests whose outcome can legitimately be one of two values use
    ``assertIn`` so that a failure prints the actual output instead of the
    uninformative "False is not true".
    """

    @classmethod
    def setUpClass(cls):
        """Load the data dump and ask the shared finder for the entities the tests use."""
        dump_path = os.path.abspath(os.path.join('test', 'testcases', 'ts', 'real_data.nt'))
        add_data_ts(SERVER, dump_path.replace('\\', '/'))
        cls.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)
        cls.finder.get_everything_about_res(
            metavals={'omid:br/3309', 'omid:br/2438', 'omid:br/0601'},
            identifiers={
                'doi:10.1001/2013.jamasurg.270',
                'doi:10.1787/eco_outlook-v2011-2-graph138-en',
                'orcid:0000-0001-6994-8412',
                'doi:10.1001/archderm.104.1.106',
                'pmid:29098884',
            },
            vvis=set(),
        )

    @staticmethod
    def _ama_expected():
        """Return the (id, brdict, radict, idbr, idra, log) tuple shared by the AMA tests."""
        return (
            '3309',
            {},
            {'3309': {'ids': ['crossref:10'], 'others': [], 'title': 'American Medical Association (ama)'}},
            {},
            {'crossref:10': '4274'},
            {},
        )

    def test_id_worker_1(self):
        """1 EntityA is a new one: it gets a wannabe id and its DOI is registered."""
        curator = prepareCurator([])
        name = 'βέβαιος, α, ον'
        idslist = ['doi:10.1163/2214-8655_lgo_lgo_02_0074_ger']
        wannabe_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        expected_output = (
            'wannabe_0',
            {'wannabe_0': {'ids': ['doi:10.1163/2214-8655_lgo_lgo_02_0074_ger'], 'others': [], 'title': 'βέβαιος, α, ον'}},
            {},
            {'doi:10.1163/2214-8655_lgo_lgo_02_0074_ger': '0601'},
            {},
            {},
        )
        self.assertEqual(output, expected_output)

    def test_id_worker_1_no_id(self):
        """1 EntityA is a new one and has no ids: only the wannabe entry is created."""
        curator = prepareCurator([])
        name = 'βέβαιος, α, ον'
        idslist = []
        wannabe_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        expected_output = (
            'wannabe_0',
            {'wannabe_0': {'ids': [], 'others': [], 'title': 'βέβαιος, α, ον'}},
            {},
            {},
            {},
            {},
        )
        self.assertEqual(output, expected_output)

    def test_id_worker_2_id_ts(self):
        """2 Retrieve EntityA data in triplestore to update EntityA inside CSV (ID only)."""
        curator = prepareCurator([])
        curator.finder = self.finder
        name = 'American Medical Association (AMA)'  # *(ama) on the ts. The name on the ts must prevail
        idslist = ['crossref:10']
        wannabe_id = curator.id_worker('editor', name, idslist, '', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=True)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        self.assertEqual(output, self._ama_expected())

    def test_id_worker_2_metaid_ts(self):
        """2 Retrieve EntityA data in triplestore to update EntityA inside CSV (MetaID only)."""
        curator = prepareCurator([])
        curator.finder = self.finder
        name = 'American Medical Association (AMA)'  # *(ama) on the ts. The name on the ts must prevail
        # MetaID only
        wannabe_id = curator.id_worker('editor', name, [], '3309', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=True)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        self.assertEqual(output, self._ama_expected())

    def test_id_worker_2_id_metaid_ts(self):
        """2 Retrieve EntityA data in triplestore to update EntityA inside CSV (ID and MetaID)."""
        curator = prepareCurator([])
        name = 'American Medical Association (AMA)'  # *(ama) on the ts. The name on the ts must prevail
        curator.finder = self.finder
        # ID and MetaID
        wannabe_id = curator.id_worker('publisher', name, ['crossref:10'], '3309', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=True)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        self.assertEqual(output, self._ama_expected())

    def test_id_worker_3(self):
        """Retrieve EntityA data in triplestore to update EntityA inside CSV. MetaID on ts has precedence."""
        curator = prepareCurator([])
        name = 'American Medical Association (AMA)'  # *(ama) on the ts. The name on the ts must prevail
        curator.finder = self.finder
        # ID and MetaID, but it's omid:ra/3309 on ts
        wannabe_id = curator.id_worker('publisher', name, ['crossref:10'], '33090', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=True)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        self.assertEqual(output, self._ama_expected())

    def test_id_worker_conflict(self):
        """Call conflict() directly: a new entity is created and the conflict is logged.

        There's no meta or there was one but it didn't exist.  There are other
        ids that already exist, but refer to multiple entities on ts.
        Conflict!
        """
        idslist = ['doi:10.1001/2013.jamasurg.270']
        name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
        curator = prepareCurator([])
        curator.finder = self.finder
        curator.log[0] = {'id': {}}
        id_dict = {}
        # Only the conflict function is tested here, not id_worker
        metaval = curator.conflict(idslist, name, id_dict, 'id')
        output = (metaval, curator.brdict, curator.log, id_dict)
        expected_output = (
            'wannabe_0',
            {'wannabe_0': {'ids': ['doi:10.1001/2013.jamasurg.270'], 'others': [], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'}},
            {0: {'id': {'Conflict entity': 'wannabe_0'}}},
            {'doi:10.1001/2013.jamasurg.270': '2585'},
        )
        self.assertEqual(output, expected_output)

    def test_conflict_br(self):
        """No MetaId, an identifier to which two separate br point.

        Either of the two bibliographic resources ('2719' or '2720') is
        accepted as the outcome.
        """
        curator = prepareCurator([])
        curator.log[0] = {'id': {}}
        name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
        idslist = ['doi:10.1001/2013.jamasurg.270']
        curator.finder = self.finder
        meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.brdict, curator.log)

        def outcome(chosen_id, ts_title):
            # Expected tuple when the triplestore entity *chosen_id* is picked.
            return (
                chosen_id,
                {'doi:10.1001/2013.jamasurg.270': '2585'},
                {},
                {chosen_id: {'ids': ['doi:10.1001/2013.jamasurg.270'], 'others': [], 'title': ts_title}},
                {0: {'id': {}}},
            )

        acceptable = (
            outcome('2719', 'Patient Satisfaction As A Possible Indicator Of Quality Surgical Care'),
            outcome('2720', 'Pediatric Injury Outcomes In Racial/Ethnic Minorities In California'),
        )
        # assertIn reports the actual output on failure, unlike assertTrue(a == b or a == c).
        self.assertIn(output, acceptable)

    def test_conflict_ra(self):
        """No MetaId, an identifier to which two separate ra point.

        Either '4940' or '1000000' is accepted as the resulting agent id.
        """
        idslist = ['orcid:0000-0001-6994-8412']
        name = 'Alarcon, Louis H.'
        curator = prepareCurator([])
        curator.finder = self.finder
        curator.log[0] = {'author': {}}
        meta_id = curator.id_worker('author', name, idslist, '', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.brdict, curator.radict, curator.log)

        def outcome(chosen_id):
            # Expected tuple when the triplestore agent *chosen_id* is picked.
            return (
                chosen_id,
                {},
                {'orcid:0000-0001-6994-8412': '4475'},
                {},
                {chosen_id: {'ids': ['orcid:0000-0001-6994-8412'], 'others': [], 'title': 'Alarcon, Louis H.'}},
                {0: {'author': {}}},
            )

        # assertIn reports the actual output on failure, unlike assertTrue(a == b or a == c).
        self.assertIn(output, (outcome('4940'), outcome('1000000')))

    def test_conflict_suspect_id_among_existing(self):
        """ID already in entity_dict under a MetaID, plus another ID that conflicts on ts.

        ID already exist in entity_dict and refer to one entity having a
        MetaID, but there is another ID not in entity_dict that highlights a
        conflict on ts.  The expected result is a new ``wannabe_0`` entity and
        a 'Conflict entity' log entry.
        """
        def seeded_brdict():
            # Fresh copy per call: the curator receives one, the expectation
            # is built from another.
            return {
                'omid:br/0601': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'], 'others': [], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'},
                'omid:br/0602': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'], 'others': [], 'title': 'Contributions To GDP Growth And Inflation: South Africa'},
                'omid:br/0603': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph18-en'], 'others': [], 'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal'},
            }

        # NOTE(review): the original comment here said the first title (China)
        # must take precedence, but the expected value below keeps this
        # incoming 'Japan' title on the new conflict entity -- confirm intent.
        name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: Japan'
        idslist = ['doi:10.1787/eco_outlook-v2011-2-graph138-en', 'doi:10.1001/2013.jamasurg.270']
        curator = prepareCurator(get_csv_data(REAL_DATA_CSV))
        curator.log[0] = {'id': {}}
        curator.brdict = seeded_brdict()
        curator.finder = self.finder
        meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.brdict, curator.radict, curator.log)

        expected_brdict = seeded_brdict()
        expected_brdict['wannabe_0'] = {
            'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en', 'doi:10.1001/2013.jamasurg.270'],
            'others': [],
            'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: Japan',
        }
        expected_output = (
            'wannabe_0',
            {
                'doi:10.1787/eco_outlook-v2011-2-graph138-en': '0601',
                'doi:10.1001/2013.jamasurg.270': '2585',
            },
            {},
            expected_brdict,
            {},
            {0: {'id': {'Conflict entity': 'wannabe_0'}}},
        )
        self.assertEqual(output, expected_output)

    def test_conflict_suspect_id_among_wannabe(self):
        """ID already in entity_dict under a temporary id, plus another ID that conflicts on ts.

        ID already exist in entity_dict and refer to one temporary, but there
        is another ID not in entity_dict that highlights a conflict on ts.
        Either '2720' or '2719' is accepted; in both cases ``wannabe_0`` is
        expected to be absorbed (listed under 'others') and no conflict logged.
        """
        br_dict = {
            'wannabe_0': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'], 'others': [], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'},
            'wannabe_2': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'], 'others': [], 'title': 'Contributions To GDP Growth And Inflation: South Africa'},
            'wannabe_3': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph18-en'], 'others': [], 'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal'},
        }
        # NOTE(review): the original comment here said the first title (China)
        # must take precedence, but both accepted outcomes carry a different
        # title ('Pediatric ...' / 'Patient ...') -- confirm intent.
        name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: Japan'
        idslist = ['doi:10.1787/eco_outlook-v2011-2-graph138-en', 'doi:10.1001/2013.jamasurg.270']
        curator = prepareCurator(get_csv_data(REAL_DATA_CSV))
        curator.log[0] = {'id': {}}
        curator.brdict = br_dict
        curator.finder = self.finder
        meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.brdict, curator.radict, curator.log)

        def outcome(chosen_id, ts_title):
            # Expected tuple when the triplestore entity *chosen_id* is picked.
            return (
                chosen_id,
                {
                    'doi:10.1787/eco_outlook-v2011-2-graph138-en': '0601',
                    'doi:10.1001/2013.jamasurg.270': '2585',
                },
                {},
                {
                    chosen_id: {
                        'ids': ['doi:10.1001/2013.jamasurg.270', 'doi:10.1787/eco_outlook-v2011-2-graph138-en'],
                        'others': ['wannabe_0'],
                        'title': ts_title,
                    },
                    'wannabe_2': {
                        'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'],
                        'others': [],
                        'title': 'Contributions To GDP Growth And Inflation: South Africa',
                    },
                    'wannabe_3': {
                        'ids': ['doi:10.1787/eco_outlook-v2011-2-graph18-en'],
                        'others': [],
                        'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal',
                    },
                },
                {},
                {0: {'id': {}}},
            )

        acceptable = (
            outcome('2720', 'Pediatric Injury Outcomes In Racial/Ethnic Minorities In California'),
            outcome('2719', 'Patient Satisfaction As A Possible Indicator Of Quality Surgical Care'),
        )
        # assertIn reports the actual output on failure, unlike assertTrue(a == b or a == c).
        self.assertIn(output, acceptable)

    def test_id_worker_4(self):
        """4 Merge data from EntityA (CSV) with data from EntityX (CSV), update both with data from EntityA (RDF)."""
        br_dict = {
            'wannabe_0': {'ids': ['doi:10.1001/archderm.104.1.106'], 'others': [], 'title': 'Multiple eloids'},
            'wannabe_1': {'ids': ['doi:10.1001/archderm.104.1.106'], 'others': [], 'title': 'Multiple Blastoids'},
        }
        name = 'Multiple Palloids'
        idslist = ['doi:10.1001/archderm.104.1.106', 'pmid:29098884']
        curator = prepareCurator([])
        curator.brdict = br_dict
        # Two wannabe entities are already seeded above.
        curator.wnb_cnt = 2
        curator.finder = self.finder
        meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.log)
        expected_output = (
            '3757',
            {'doi:10.1001/archderm.104.1.106': '3624', 'pmid:29098884': '2000000'},
            {},
            {},
        )
        self.assertEqual(output, expected_output)

953 

class test_id_worker_with_reset(unittest.TestCase):
    # id_worker scenarios that each begin by calling reset_server().

    def test_id_worker_2_meta_in_entity_dict(self):
        # MetaID exists among data.
        # MetaID already in entity_dict (no care about conflicts, we have a MetaID specified)
        # 2 Retrieve EntityA data to update EntityA inside CSV
        reset_server()
        seeded_curator = prepareCurator(get_csv_data(REAL_DATA_CSV))
        seeded_curator.curator()
        store_curated_data(seeded_curator, SERVER)
        title = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
        doi = 'doi:10.1787/eco_outlook-v2011-2-graph138-en'
        entity_flags = dict(ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        curator_empty = prepareCurator(list())
        curator_empty.finder.get_everything_about_res(metavals=set(), identifiers={doi}, vvis=set())
        # First call: puts the metaval in entity_dict
        curator_empty.id_worker('id', title, [], '0601', **entity_flags)
        # Second call: the metaval is already in entity_dict
        meta_id = curator_empty.id_worker('id', title, [], '0601', **entity_flags)
        output = (
            meta_id,
            curator_empty.brdict,
            curator_empty.radict,
            curator_empty.idbr,
            curator_empty.idra,
            curator_empty.log,
        )
        expected_output = (
            '0601',
            {'0601': {'ids': [doi], 'title': title, 'others': []}},
            {},
            {doi: '0601'},
            {},
            {},
        )
        self.assertEqual(output, expected_output)

    def test_conflict_existing(self):
        # ID already exist in entity_dict but refer to multiple entities having a MetaID
        reset_server()
        china_title = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
        africa_title = 'Contributions To GDP Growth And Inflation: South Africa'
        loans_title = 'Official Loans To The Governments Of Greece, Ireland And Portugal'
        shared_doi = 'doi:10.1787/eco_outlook-v2011-2-graph138-en'
        other_doi = 'doi:10.1787/eco_outlook-v2011-2-graph150-en'
        # Two MetaID-bearing entities (0601 and 0603) share the same DOI.
        initial_brdict = {
            'omid:br/0601': {'ids': [shared_doi], 'others': [], 'title': china_title},
            'omid:br/0602': {'ids': [other_doi], 'others': [], 'title': africa_title},
            'omid:br/0603': {'ids': [shared_doi], 'others': [], 'title': loans_title},
        }
        curator = prepareCurator(list())
        curator.log[0] = {'id': {}}
        curator.brdict = initial_brdict
        meta_id = curator.id_worker(
            'id', china_title, [shared_doi], '',
            ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.brdict, curator.radict, curator.log)
        # The expected brdict keeps the three original entries and gains a
        # 'wannabe_0' entry; the log records it as the conflict entity.
        expected_brdict = {
            'omid:br/0601': {'ids': [shared_doi], 'others': [], 'title': china_title},
            'omid:br/0602': {'ids': [other_doi], 'others': [], 'title': africa_title},
            'omid:br/0603': {'ids': [shared_doi], 'others': [], 'title': loans_title},
            'wannabe_0': {'ids': [shared_doi], 'others': [], 'title': china_title},
        }
        expected_output = (
            'wannabe_0',
            {shared_doi: '0601'},
            {},
            expected_brdict,
            {},
            {0: {'id': {'Conflict entity': 'wannabe_0'}}},
        )
        self.assertEqual(output, expected_output)

    def test_id_worker_5(self):
        # ID already exist in entity_dict and refer to one or more temporary entities -> collective merge
        reset_server()
        china_title = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
        shared_doi = 'doi:10.1787/eco_outlook-v2011-2-graph138-en'
        temporary_entities = {
            'wannabe_0': {
                'ids': [shared_doi],
                'others': [],
                'title': china_title,
            },
            'wannabe_1': {
                'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'],
                'others': [],
                'title': 'Contributions To GDP Growth And Inflation: South Africa',
            },
            'wannabe_2': {
                'ids': [shared_doi],
                'others': [],
                'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal',
            },
        }
        curator = prepareCurator(list())
        curator.brdict = temporary_entities
        curator.wnb_cnt = 2
        meta_id = curator.id_worker(
            'id', china_title, [shared_doi], '',
            ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.log)
        expected_output = ('wannabe_0', {shared_doi: '0601'}, {}, {})
        self.assertEqual(output, expected_output)

    def test_no_conflict_existing(self):
        # ID already exist in entity_dict and refer to one entity
        reset_server()
        shared_doi = 'doi:10.1787/eco_outlook-v2011-2-graph138-en'
        single_owner_brdict = {
            'omid:br/0601': {
                'ids': [shared_doi],
                'others': [],
                'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China',
            },
            'omid:br/0602': {
                'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'],
                'others': [],
                'title': 'Contributions To GDP Growth And Inflation: South Africa',
            },
            'omid:br/0603': {
                'ids': ['doi:10.1787/eco_outlook-v2011-2-graph18-en'],
                'others': [],
                'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal',
            },
        }
        # The first title must have precedence (China, not Japan)
        incoming_title = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: Japan'
        curator = prepareCurator(list())
        curator.log[0] = {'id': {}}
        curator.brdict = single_owner_brdict
        meta_id = curator.id_worker(
            'id', incoming_title, [shared_doi], '',
            ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.log)
        expected_output = ('omid:br/0601', {shared_doi: '0601'}, {}, {0: {'id': {}}})
        self.assertEqual(output, expected_output)

    def test_metaid_in_prov(self):
        # MetaID not found in data, but found in the provenance metadata.
        reset_server()
        relative_dump = os.path.join('test', 'testcases', 'ts', 'real_data_with_prov.nq')
        # Absolute path with forward slashes regardless of platform separator.
        dump_path = os.path.abspath(relative_dump).replace('\\', '/')
        add_data_ts(server=SERVER, data_path=dump_path)
        curator = prepareCurator(list())
        meta_id = curator.id_worker('id', '', [], '4321', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=False)
        self.assertEqual(meta_id, '38013')

1069 

1070 

class testcase_01(unittest.TestCase):
    def test(self):
        # testcase1: 2 different issues of the same venue (no volume)
        # Uses rows 0 and 5 of the manual CSV.
        name = '01'
        data = get_csv_data(MANUAL_DATA_CSV)
        partial_data = [data[0], data[5]]
        data_curated, testcase = prepare_to_test(partial_data, name)
        # Compare the collections as a whole, like the sibling testcases do.
        # An element-by-element loop over data_curated would pass vacuously
        # when data_curated is shorter than the expected fixture (or empty)
        # and raise IndexError instead of a test failure when it is longer.
        self.assertEqual(data_curated, testcase)

1082 

1083 

class testcase_02(unittest.TestCase):
    def test(self):
        # testcase2: 2 different volumes of the same venue (no issue)
        # Built from rows 1 and 3 of the manual CSV.
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = [rows[1], rows[3]]
        curated, expected = prepare_to_test(selected, '02')
        self.assertEqual(curated, expected)

1094 

1095 

class testcase_03(unittest.TestCase):
    def test(self):
        # testcase3: 2 different issues of the same volume
        # Built from rows 2 and 4 of the manual CSV.
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = [rows[2], rows[4]]
        curated, expected = prepare_to_test(selected, '03')
        self.assertEqual(curated, expected)

1106 

1107 

class testcase_04(unittest.TestCase):
    def test(self):
        # testcase4: 2 new IDS and different date format (yyyy-mm and yyyy-mm-dd)
        # Uses rows 6 and 7 of the manual CSV.
        name = '04'
        data = get_csv_data(MANUAL_DATA_CSV)
        partial_data = [data[6], data[7]]
        data_curated, testcase = prepare_to_test(partial_data, name)
        # Compare the collections as a whole, like the sibling testcases do.
        # An element-by-element loop over data_curated would pass vacuously
        # when data_curated is shorter than the expected fixture (or empty)
        # and raise IndexError instead of a test failure when it is longer.
        self.assertEqual(data_curated, testcase)

1119 

1120 

class testcase_05(unittest.TestCase):
    def test(self):
        # testcase5: NO ID scenario
        # Built from row 8 of the manual CSV only.
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = [rows[8]]
        curated, expected = prepare_to_test(selected, '05')
        self.assertEqual(curated, expected)

1130 

1131 

class testcase_06(unittest.TestCase):
    def test(self):
        # testcase6: ALL types test
        # Rows 9..32 of the manual CSV.
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[9:33], '06')
        self.assertEqual(curated, expected)

1140 

1141 

class testcase_07(unittest.TestCase):
    def test(self):
        # testcase7: all journal related types with an editor
        # Rows 34..39 of the manual CSV.
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[34:40], '07')
        self.assertEqual(curated, expected)

1150 

1151 

class testcase_08(unittest.TestCase):
    def test(self):
        # testcase8: all book related types with an editor
        # Rows 40..42 of the manual CSV.
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[40:43], '08')
        self.assertEqual(curated, expected)

1160 

1161 

class testcase_09(unittest.TestCase):
    def test(self):
        # testcase09: all proceeding related types with an editor
        # Rows 43..44 of the manual CSV.
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[43:45], '09')
        self.assertEqual(curated, expected)

1170 

1171 

class testcase_10(unittest.TestCase):
    def test(self):
        # testcase10: a book inside a book series and a book inside a book set
        # Rows 45..48 of the manual CSV.
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[45:49], '10')
        self.assertEqual(curated, expected)

1180 

1181 

class testcase_11(unittest.TestCase):
    def test(self):
        # testcase11: real time entity update
        # Rows 49..51 of the manual CSV.
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[49:52], '11')
        self.assertEqual(curated, expected)

1190 

1191 

class testcase_12(unittest.TestCase):
    def test(self):
        # testcase12: clean name, title, ids
        # Row 52 of the manual CSV, kept as a one-element list.
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[52:53], '12')
        self.assertEqual(curated, expected)

1200 

1201 

class testcase_13(unittest.TestCase):
    # testcase13: ID_clean massive test

    def test1(self):
        # 1--- three rows: a br with a specified meta, a wannabe with a new
        # id, and a br with a specified meta whose id is related to the wannabe
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[53:56], '13.1')
        self.assertEqual(curated, expected)

    def test2(self):
        # 2--- conflict with META precedence: a br has a meta_id and an id
        # related to another meta_id; the first specified meta has precedence
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[56:57], '13.2')
        self.assertEqual(curated, expected)

    def test3(self):
        # 3--- conflict: br with id shared with 2 meta
        # Either of two expected fixtures ('13.3' or '13.31') is accepted.
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[57:58]
        curated, first_expected = prepare_to_test(selected, '13.3')
        # Second call is made only for its expected fixture.
        _, second_expected = prepare_to_test(selected, '13.31')
        acceptable = (first_expected, second_expected)
        self.assertTrue(any(curated == candidate for candidate in acceptable))

1232 

1233 

class testcase_14(unittest.TestCase):

    def test1(self):
        # Update an existing sequence: a new author plus an existing author
        # that has no existing id and is matched through a badly written
        # "surname, name"
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[58:59], '14.1')
        self.assertEqual(curated, expected)

    def test2(self):
        # same sequence different order, with new ids
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[59:60], '14.2')
        self.assertEqual(curated, expected)

    def test3(self):
        # RA
        # Author with two different ids
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[60:61], '14.3')
        self.assertEqual(curated, expected)

    def test4(self):
        # three rows: an ra with a specified meta, a wannabe ra with a new
        # id, and an ra with a specified meta whose id is related to the wannabe
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[61:64], '14.4')
        self.assertEqual(curated, expected)

1270 

1271 

class testcase_15(unittest.TestCase):

    def test1(self):
        # venue volume issue already exists in ts
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[64:65], '15.1')
        self.assertEqual(curated, expected)

    def test2(self):
        # venue conflict
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[65:66], '15.2')
        self.assertEqual(curated, expected)

    def test3(self):
        # venue in ts is now the br
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[66:67], '15.3')
        self.assertEqual(curated, expected)

    def test4(self):
        # br in ts is now the venue
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[67:68], '15.4')
        self.assertEqual(curated, expected)

    def test5(self):
        # volume in ts is now the br
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[71:72], '15.5')
        self.assertEqual(curated, expected)

    def test6(self):
        # br is a volume
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[72:73], '15.6')
        self.assertEqual(curated, expected)

    def test7(self):
        # issue in ts is now the br
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[73:74], '15.7')
        self.assertEqual(curated, expected)

    def test8(self):
        # br is an issue
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[74:75], '15.8')
        self.assertEqual(curated, expected)

1338 

1339 

class testcase_16(unittest.TestCase):

    def test1(self):
        # Date cleaning 2019-02-29
        # wrong date (2019/02/29)
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[75:76], '16.1')
        self.assertEqual(curated, expected)

    def test2(self):
        # existing re
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[76:77], '16.2')
        self.assertEqual(curated, expected)

    def test3(self):
        # given name for an RA with only a family name in TS
        rows = get_csv_data(MANUAL_DATA_CSV)
        curated, expected = prepare_to_test(rows[77:78], '16.3')
        self.assertEqual(curated, expected)

1367 

1368 

# Run every TestCase in this module when the file is executed directly.
if __name__ == '__main__': # pragma: no cover
    unittest.main()