Coverage for test/curator

1import csv

2import shutil

3import unittest

5from oc_ocdm import Storer

6from SPARQLWrapper import POST, SPARQLWrapper

7from rdflib import Graph, ConjunctiveGraph

8import redis

9from oc_ocdm.counter_handler.redis_counter_handler import RedisCounterHandler

11from oc_meta.core.creator import Creator

12from oc_meta.core.curator import *

13from oc_meta.lib.file_manager import get_csv_data

14from oc_meta.lib.finder import ResourceFinder

15from oc_meta.plugins.multiprocess.resp_agents_curator import RespAgentsCurator

# SPARQL endpoint used as the default target by the helpers below.
SERVER = 'http://127.0.0.1:8805/sparql'

# Base IRI handed to the finder/creator objects.
BASE_IRI = 'https://w3id.org/oc/meta/'

# Fixture and output locations, all rooted at BASE_DIR.
BASE_DIR = 'test'
MANUAL_DATA_CSV = f'{BASE_DIR}/manual_data.csv'
MANUAL_DATA_RDF = f'{BASE_DIR}/testcases/ts/testcase_ts-13.ttl'
REAL_DATA_CSV = os.path.join(BASE_DIR, 'real_data.csv')
REAL_DATA_RDF = f'{BASE_DIR}/testcases/ts/real_data.nt'
REAL_DATA_RDF_WITH_PROV = f'{BASE_DIR}/testcases/ts/real_data_with_prov.nq'
CURATOR_COUNTER_DIR = f'{BASE_DIR}/curator_counter'
OUTPUT_DIR = f'{BASE_DIR}/output'
PROV_CONFIG = f'{BASE_DIR}/prov_config.json'

# Redis configuration
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DB = 5

def get_path(path:str) -> str:
    """Return ``path`` with every backslash turned into a forward slash."""
    return '/'.join(path.split('\\'))

def reset_redis_counters():
    """Flush the Redis database selected by REDIS_HOST, REDIS_PORT and REDIS_DB."""
    redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB).flushdb()

def get_counter_handler():
    """Build a RedisCounterHandler pointing at the Redis database used by the tests."""
    connection_settings = {'host': REDIS_HOST, 'port': REDIS_PORT, 'db': REDIS_DB}
    return RedisCounterHandler(**connection_settings)

def reset():
    """Reset the shared test state; this only delegates to reset_redis_counters()."""
    reset_redis_counters()

def reset_server(server:str=SERVER) -> None:
    """Empty the graphs the tests write to on the SPARQL endpoint ``server``.

    One ``CLEAR GRAPH`` update is sent via POST for each of the br, ra, re, id
    and ar graphs under ``https://w3id.org/oc/meta/`` and for
    ``http://default.graph/``.
    """
    graphs_to_clear = {
        'https://w3id.org/oc/meta/br/',
        'https://w3id.org/oc/meta/ra/',
        'https://w3id.org/oc/meta/re/',
        'https://w3id.org/oc/meta/id/',
        'https://w3id.org/oc/meta/ar/',
        'http://default.graph/',
    }
    endpoint = SPARQLWrapper(server)
    for graph_iri in graphs_to_clear:
        endpoint.setQuery(f'CLEAR GRAPH <{graph_iri}>')
        endpoint.setMethod(POST)
        endpoint.query()

def add_data_ts(server:str=SERVER, data_path:str=os.path.abspath(os.path.join('test', 'testcases', 'ts', 'real_data.nt')).replace('\\', '/'), batch_size:int=100, default_graph_uri=URIRef("http://default.graph/")):
    """Reset the triplestore and load an RDF file into it in batches.

    Args:
        server: SPARQL endpoint that receives the ``INSERT DATA`` updates.
        data_path: file to load; ``.nt``, ``.ttl`` and ``.nq`` are supported.
        batch_size: maximum number of statements sent per update.
        default_graph_uri: graph used for files without named graphs
            (``.nt`` and ``.ttl``). A falsy value inserts plain triples.

    Raises:
        ValueError: if the file extension is not supported. The check runs
            before the triplestore is reset, so a bad path does not wipe it.
    """
    # extension -> (graph class, rdflib parser format, file carries named graphs)
    loaders = {
        '.nt': (Graph, 'nt', False),
        '.ttl': (Graph, 'turtle', False),
        '.nq': (ConjunctiveGraph, 'nquads', True),
    }
    f_path = get_path(data_path)

    # Determine the file format from its extension.
    file_extension = os.path.splitext(f_path)[1].lower()
    if file_extension not in loaders:
        raise ValueError(f"Unsupported file extension: {file_extension}")
    graph_class, rdf_format, has_named_graphs = loaders[file_extension]

    reset_server(server)
    g = graph_class()
    g.parse(location=f_path, format=rdf_format)

    if has_named_graphs:
        quads = list(g.quads((None, None, None, None)))
    else:
        quads = [(subj, pred, obj, default_graph_uri) for subj, pred, obj in g]

    # One wrapper is enough: only the query text changes between batches.
    ts = SPARQLWrapper(server)
    for start in range(0, len(quads), batch_size):
        statements = ''.join(
            _quad_to_insert_statement(subj, pred, obj, ctx)
            for subj, pred, obj, ctx in quads[start:start + batch_size]
        )
        ts.setQuery(f"INSERT DATA {{ {statements} }}")
        ts.setMethod(POST)
        ts.query()


def _quad_to_insert_statement(subj, pred, obj, ctx) -> str:
    """Render one quad as a fragment of an ``INSERT DATA`` body."""
    triple = f"{subj.n3()} {pred.n3()} {obj.n3()}"
    if ctx:
        # Square brackets are stripped from the context's n3 form; presumably
        # a context returned as a Graph renders as ``[<iri>]`` -- TODO confirm.
        graph_term = ctx.n3().replace('[', '').replace(']', '')
        return f"GRAPH {graph_term} {{ {triple} }} "
    return f"{triple} . "

def store_curated_data(curator_obj:Curator, server:str) -> None:
    """Turn the curated data into RDF and upload it to ``server``.

    Args:
        curator_obj: a Curator that has already been run; its ``data``,
            indexes and ``VolIss`` structure feed the Creator.
        server: SPARQL endpoint used both by the Creator and for the upload.
            The Creator was previously given the module-level ``SERVER``
            regardless of this argument, so the two steps could target
            different endpoints.
    """
    creator_obj = Creator(
        curator_obj.data, server, BASE_IRI, None, None, 'https://orcid.org/0000-0002-8420-0696',
        curator_obj.index_id_ra, curator_obj.index_id_br, curator_obj.re_index,
        curator_obj.ar_index, curator_obj.VolIss, preexisting_entities=set(), everything_everywhere_allatonce=Graph())
    creator = creator_obj.creator(source=None)
    res_storer = Storer(creator)
    res_storer.upload_all(server, base_dir=None, batch_size=100)

106

def prepare_to_test(data, name):
    """Curate ``data`` and load the expected results of testcase ``name``.

    Redis counters and the triplestore are reset first. When ``float(name)``
    is greater than 12, the ``testcase_ts-13.ttl`` fixture is loaded into the
    triplestore before curating.

    Args:
        data: rows handed to the Curator.
        name: testcase number as a string; it selects the fixture files.

    Returns:
        A ``(data_curated, testcase)`` pair of parallel lists holding, in
        order: the rows, the br-id index, the ra-id index, the ar index, the
        re index and the volume/issue structure. The first list comes from
        the Curator, the second from the fixture files. The ``id`` cell of
        each row is turned into a sorted list, and the index lists are sorted,
        so both sides can be compared directly.
    """
    reset_redis_counters()

    reset_server(SERVER)
    if float(name) > 12:
        add_data_ts(SERVER, os.path.abspath(os.path.join('test', 'testcases', 'ts', 'testcase_ts-13.ttl')).replace('\\', '/'))

    indices_dir = 'test/testcases/testcase_data/indices/' + name
    csv_path = get_path('test/testcases/testcase_data/testcase_' + name + '_data.csv')
    id_br_path = get_path(indices_dir + '/index_id_br_' + name + '.csv')
    id_ra_path = get_path(indices_dir + '/index_id_ra_' + name + '.csv')
    ar_path = get_path(indices_dir + '/index_ar_' + name + '.csv')
    re_path = get_path(indices_dir + '/index_re_' + name + '.csv')
    vi_path = get_path(indices_dir + '/index_vi_' + name + '.json')

    counter_handler = get_counter_handler()
    settings = {'normalize_titles': True}
    curator_obj = Curator(data, SERVER, prov_config=PROV_CONFIG, counter_handler=counter_handler, settings=settings)
    curator_obj.curator()

    testcase_csv = get_csv_data(csv_path)
    # Compare the identifiers of a row as a sorted list, so their order in
    # the cell does not matter.
    for rows in (testcase_csv, curator_obj.data):
        for row in rows:
            row['id'] = sorted(row['id'].split())

    testcase_id_br = get_csv_data(id_br_path)
    testcase_id_ra = get_csv_data(id_ra_path)
    testcase_ar = get_csv_data(ar_path)
    testcase_re = get_csv_data(re_path)
    for rows in (testcase_id_br, testcase_id_ra, testcase_ar, testcase_re,
                 curator_obj.index_id_br, curator_obj.index_id_ra, curator_obj.ar_index, curator_obj.re_index):
        _sort_rows_by_first_available_key(rows)

    with open(vi_path, encoding='utf-8') as json_file:
        testcase_vi = json.load(json_file)

    testcase = [testcase_csv, testcase_id_br, testcase_id_ra, testcase_ar, testcase_re, testcase_vi]
    data_curated = [curator_obj.data, curator_obj.index_id_br, curator_obj.index_id_ra, curator_obj.ar_index,
                    curator_obj.re_index, curator_obj.VolIss]
    return data_curated, testcase


def _sort_rows_by_first_available_key(rows) -> None:
    """Sort ``rows`` in place by 'id', falling back to 'meta' and then 'br'.

    A KeyError raised by the final 'br' attempt is propagated to the caller.
    """
    for field in ('id', 'meta'):
        try:
            rows.sort(key=lambda row: row[field])
        except KeyError:
            continue
        return
    rows.sort(key=lambda row: row['br'])

147

def prepareCurator(data:list, server:str=SERVER, resp_agents_only:bool=False) -> Curator:
    """Reset the Redis counters and return a fresh curator for ``data``.

    A RespAgentsCurator is built when ``resp_agents_only`` is true; otherwise
    a Curator with title normalisation switched on.
    """
    reset_redis_counters()
    handler = get_counter_handler()
    if resp_agents_only:
        return RespAgentsCurator(data, server, prov_config=PROV_CONFIG, counter_handler=handler)
    return Curator(data, server, prov_config=PROV_CONFIG, counter_handler=handler, settings={'normalize_titles': True})

157

158

159class test_Curator(unittest.TestCase):

@classmethod
def setUpClass(cls):
    """Load the default dataset of add_data_ts() into the triplestore once for the class."""
    add_data_ts()

163

def setUp(self):
    """Flush the Redis counters before each test."""
    reset_redis_counters()

166

def tearDown(self):
    """Flush the Redis counters after each test."""
    reset_redis_counters()

169

def test_merge_entities_in_csv(self):
    """merge_entities_in_csv adds the DOI to the entity and registers it in the id dict."""
    doi = 'doi:10.1787/eco_outlook-v2011-2-graph138-en'
    title = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
    curator = prepareCurator([])
    # The 'id' counter is preset to 4; the test expects the DOI to get '0605'.
    curator.counter_handler.set_counter(4, 'id', supplier_prefix='060')
    entity_dict = {'0601': {'ids': [], 'title': title, 'others': []}}
    id_dict = {}
    curator.merge_entities_in_csv([doi], '0601', title, entity_dict, id_dict)
    expected_output = (
        {'0601': {'ids': [doi], 'title': title, 'others': []}},
        {doi: '0605'},
    )
    self.assertEqual((entity_dict, id_dict), expected_output)

181

def test_clean_id_list(self):
    """clean_id_list is expected to lowercase the DOI and return the OMID value separately."""
    raw_ids = ['doi:10.001/B-1', 'wikidata:B1111111', 'OMID:br/060101']
    cleaned = Curator.clean_id_list(raw_ids, br=True)
    self.assertEqual(cleaned, (['doi:10.001/b-1', 'wikidata:B1111111'], '060101'))

187

def test_equalizer(self):
    """equalizer is expected to fill a row with the data stored for the resolved entity."""
    log_fields = ('id', 'author', 'venue', 'editor', 'publisher', 'page', 'volume', 'issue', 'pub_date', 'type', 'title')
    curator = prepareCurator([])
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)

    # Step 1: resolve an ID that belongs to an entity already in the triplestore.
    row = {'id': 'doi:10.1001/archderm.104.1.106', 'title': '', 'author': '', 'pub_date': '1972-12-01', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}
    metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
    curator.finder.get_everything_about_res(metavals=metavals, identifiers=identifiers, vvis=vvis)
    curator.log[0] = {'id': {}}
    curator.clean_id(row)
    extracted_metaval = row['id']
    self.assertEqual(extracted_metaval, '3757')

    # Step 2: run equalizer on a fresh row against the resolved entity.
    row = {'id': '', 'title': '', 'author': '', 'pub_date': '1972-12-01', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}
    curator.rowcnt = 0
    curator.log[0] = {field: {} for field in log_fields}
    curator.equalizer(row, extracted_metaval)

    expected_log = {0: {'id': {'status': 'Entity already exists'}, 'author': {}, 'venue': {}, 'editor': {}, 'publisher': {}, 'page': {}, 'volume': {}, 'issue': {}, 'pub_date': {'status': 'New value proposed'}, 'type': {}, 'title': {}}}
    expected_row = {'id': '', 'title': '', 'author': 'Curth, W. [omid:ra/6033]', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416 issn:0003-987X]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': 'American Medical Association (ama) [omid:ra/3309 crossref:10]', 'editor': ''}
    self.assertEqual((curator.log, row), (expected_log, expected_row))

227

def test_clean_id_metaid_not_in_ts(self):
    """A MetaId that is not in the triplestore is invalid: the row gets a wannabe id."""
    row = {'id': 'omid:br/131313', 'title': 'Multiple Keloids', 'author': '', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}
    # Only the id is expected to change; snapshot the expectation before clean_id mutates the row.
    expected_output = dict(row, id='wannabe_0')
    curator = prepareCurator([])
    curator.log[0] = {'id': {}}
    curator.clean_id(row)
    self.assertEqual(row, expected_output)

236

def test_clean_id(self):
    """clean_id is expected to resolve the DOI to '3757' and replace the title with the stored one."""
    doi = 'doi:10.1001/archderm.104.1.106'
    row = {'id': doi, 'title': 'Multiple Blasto', 'author': '', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': '', 'editor': ''}
    # Snapshot the expectation before clean_id mutates the row in place.
    expected_output = dict(row, id='3757', title='Multiple Keloids')
    curator = prepareCurator([])
    curator.log[0] = {'id': {}}
    curator.finder.get_everything_about_res(metavals=set(), identifiers={doi}, vvis=set())
    curator.clean_id(row)
    self.assertEqual(row, expected_output)

245

def test_merge_duplicate_entities(self):
    """merge_duplicate_entities is expected to align all three rows on the stored entity '3757'."""
    log_fields = ('id', 'author', 'venue', 'editor', 'publisher', 'page', 'volume', 'issue', 'pub_date', 'type')
    # Realistic data: the first row carries an ID that resolves to an existing entity.
    data = [
        {'id': 'doi:10.1001/archderm.104.1.106', 'title': 'Multiple Keloids', 'author': '', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''},
        {'id': '', 'title': 'Multiple Keloids', 'author': '', 'pub_date': '1971-07-02', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''},
        {'id': '', 'title': 'Multiple Keloids', 'author': '', 'pub_date': '1971-07-03', 'venue': 'Archives Of Blast [omid:br/4416]', 'volume': '105', 'issue': '2', 'page': '106-108', 'type': 'journal volume', 'publisher': '', 'editor': ''},
    ]
    curator = prepareCurator([])
    curator.data = data
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)

    # Gather the metavals, identifiers and vvis of every row, then query once.
    all_metavals, all_identifiers, all_vvis = set(), set(), set()
    for row in data:
        metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
        all_metavals.update(metavals)
        all_identifiers.update(identifiers)
        all_vvis.update(vvis)
    curator.finder.get_everything_about_res(metavals=all_metavals, identifiers=all_identifiers, vvis=all_vvis)

    # Run clean_id on each row to obtain the actual metavals.
    for index, row in enumerate(data):
        curator.log[index] = {'id': {}}
        curator.rowcnt = index
        curator.clean_id(row)

    # Start merge_duplicate_entities from an empty log for the three rows.
    for index in range(3):
        curator.log[index] = {field: {} for field in log_fields}

    # The first row should have resolved to '3757'; the other rows are wannabes.
    first_row_metaval = curator.data[0]['id']
    self.assertEqual(first_row_metaval, '3757')

    # Declare the wannabes as duplicates of the existing entity.
    if first_row_metaval in curator.brdict:
        curator.brdict[first_row_metaval]['others'].extend(['wannabe_0', 'wannabe_1'])

    curator.merge_duplicate_entities()

    merged_row = {'id': '3757', 'title': 'Multiple Keloids', 'author': 'Curth, W. [omid:ra/6033]', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [issn:0003-987X omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': 'American Medical Association (ama) [omid:ra/3309 crossref:10]', 'editor': ''}
    expected_data = [dict(merged_row) for _ in range(3)]
    expected_log = {
        0: {'id': {'status': 'Entity already exists'}, 'author': {}, 'venue': {}, 'editor': {}, 'publisher': {}, 'page': {}, 'volume': {}, 'issue': {}, 'pub_date': {}, 'type': {}},
        1: {'id': {'status': 'Entity already exists'}, 'author': {}, 'venue': {'status': 'New value proposed'}, 'editor': {}, 'publisher': {}, 'page': {}, 'volume': {}, 'issue': {}, 'pub_date': {'status': 'New value proposed'}, 'type': {}},
        2: {'id': {'status': 'Entity already exists'}, 'author': {}, 'venue': {'status': 'New value proposed'}, 'editor': {}, 'publisher': {}, 'page': {'status': 'New value proposed'}, 'volume': {'status': 'New value proposed'}, 'issue': {'status': 'New value proposed'}, 'pub_date': {'status': 'New value proposed'}, 'type': {'status': 'New value proposed'}},
    }
    self.assertEqual((curator.data, curator.log), (expected_data, expected_log))

316

def test_clean_vvi_all_data_on_ts(self):
    """All data are already on the triplestore: clean_vvi must retrieve and organise them."""
    curator = prepareCurator([])
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)
    row = {'id': 'doi:10.1001/archderm.104.1.106', 'title': 'Multiple Keloids', 'author': '', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}

    metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
    curator.finder.get_everything_about_res(metavals=metavals, identifiers=identifiers, vvis=vvis)

    curator.log[0] = {'id': {}}
    curator.clean_id(row)
    curator.clean_vvi(row)

    expected_issue = {'1': {'id': '4713'}}
    expected_volume = {'104': {'id': '4712', 'issue': expected_issue}}
    expected_output = {'4416': {'issue': {}, 'volume': expected_volume}}
    self.assertEqual(curator.vvi, expected_output)

346

def test_clean_vvi_new_venue(self):
    """A new venue: venue, volume and issue are all expected to get wannabe ids."""
    curator = prepareCurator([])
    row = {'id': 'wannabe_1', 'title': 'Money growth, interest rates, inflation and raw materials prices: China', 'author': '', 'pub_date': '2011-11-28', 'venue': 'OECD Economic Outlook', 'volume': '2011', 'issue': '2', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}
    curator.clean_vvi(row)
    expected_volume = {'2011': {'id': 'wannabe_1', 'issue': {'2': {'id': 'wannabe_2'}}}}
    self.assertEqual(curator.vvi, {'wannabe_0': {'volume': expected_volume, 'issue': {}}})

354

def test_clean_vvi_volume_with_title(self):
    """A journal volume having a title, run through the whole curator pipeline."""
    rows = [{'id': '', 'title': 'The volume title', 'author': '', 'pub_date': '', 'venue': 'OECD Economic Outlook', 'volume': '2011', 'issue': '2', 'page': '', 'type': 'journal volume', 'publisher': '', 'editor': ''}]
    curator = prepareCurator(rows)
    curator.curator()
    expected_row = {'id': 'omid:br/0601', 'title': 'The Volume Title', 'author': '', 'pub_date': '', 'venue': 'OECD Economic Outlook [omid:br/0602]', 'volume': '', 'issue': '', 'page': '', 'type': 'journal volume', 'publisher': '', 'editor': ''}
    self.assertEqual(curator.data, [expected_row])

362

def test_clean_vvi_invalid_volume(self):
    """The data must be invalidated: the resource is a journal volume but an issue is also given."""
    curator = prepareCurator([])
    row = {'id': 'wannabe_1', 'title': '', 'author': '', 'pub_date': '', 'venue': 'OECD Economic Outlook', 'volume': '2011', 'issue': '2', 'page': '', 'type': 'journal volume', 'publisher': '', 'editor': ''}
    curator.clean_vvi(row)
    self.assertEqual(curator.vvi, {'wannabe_0': {'volume': {}, 'issue': {}}})

370

def test_clean_vvi_invalid_venue(self):
    """The data must be invalidated: the resource is a journal but a volume is also given."""
    curator = prepareCurator([])
    row = {'id': 'wannabe_1', 'title': '', 'author': '', 'pub_date': '', 'venue': 'OECD Economic Outlook', 'volume': '2011', 'issue': '', 'page': '', 'type': 'journal', 'publisher': '', 'editor': ''}
    curator.clean_vvi(row)
    self.assertEqual(curator.vvi, {'wannabe_0': {'volume': {}, 'issue': {}}})

378

def test_clean_vvi_new_volume_and_issue(self):
    """A row with venue, volume and issue but no ids of its own."""
    curator = prepareCurator([])
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)
    row = {'id': '', 'title': '', 'author': '', 'pub_date': '', 'venue': 'Archives Of Surgery [omid:br/4480]', 'volume': '147', 'issue': '11', 'page': '', 'type': 'journal article', 'publisher': '', 'editor': ''}

    metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
    curator.finder.get_everything_about_res(metavals=metavals, identifiers=identifiers, vvis=vvis)
    curator.clean_id(row)
    curator.clean_vvi(row)

    expected_issue = {'11': {'id': '4482'}}
    expected_volume = {'147': {'id': '4481', 'issue': expected_issue}}
    expected_output = {'4480': {'issue': {}, 'volume': expected_volume}}
    self.assertEqual(curator.vvi, expected_output)

405

def test_clean_ra_overlapping_surnames(self):
    """The surname of one author is included in the surname of another."""
    title = 'Giant Oyster Mushroom Pleurotus giganteus (Agaricomycetes) Enhances Adipocyte Differentiation and Glucose Uptake via Activation of PPARγ and Glucose Transporters 1 and 4 in 3T3-L1 Cells'
    row = {'id': 'wannabe_0', 'title': title, 'author': 'Paravamsivam, Puvaneswari; Heng, Chua Kek; Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]; Sabaratnam, Vikineswary; M, Ravishankar Ram; Kuppusamy, Umah Rani', 'pub_date': '2016', 'venue': 'International Journal of Medicinal Mushrooms [issn:1521-9437]', 'volume': '18', 'issue': '9', 'page': '821-831', 'type': 'journal article', 'publisher': 'Begell House [crossref:613]', 'editor': ''}
    curator = prepareCurator([])
    curator.brdict = {'wannabe_0': {'ids': ['doi:10.1615/intjmedmushrooms.v18.i9.60'], 'title': title, 'others': []}}
    curator.clean_ra(row, 'author')

    expected_ardict = {'wannabe_0': {'author': [('0601', 'wannabe_0'), ('0602', 'wannabe_1'), ('0603', 'wannabe_2'), ('0604', 'wannabe_3'), ('0605', 'wannabe_4'), ('0606', 'wannabe_5')], 'editor': [], 'publisher': []}}
    expected_radict = {
        'wannabe_0': {'ids': [], 'others': [], 'title': 'Paravamsivam, Puvaneswari'},
        'wannabe_1': {'ids': [], 'others': [], 'title': 'Heng, Chua Kek'},
        'wannabe_2': {'ids': ['orcid:0000-0001-6278-8559'], 'others': [], 'title': 'Malek, Sri Nurestri Abdul'},
        'wannabe_3': {'ids': [], 'others': [], 'title': 'Sabaratnam, Vikineswary'},
        'wannabe_4': {'ids': [], 'others': [], 'title': 'M, Ravishankar Ram'},
        'wannabe_5': {'ids': [], 'others': [], 'title': 'Kuppusamy, Umah Rani'},
    }
    expected_idra = {'orcid:0000-0001-6278-8559': '0601'}
    self.assertEqual((curator.ardict, curator.radict, curator.idra), (expected_ardict, expected_radict, expected_idra))

419

def test_clean_ra_with_br_metaid(self):
    """One author is in the triplestore, the other is not.

    br_metaval is a MetaID and one author carries two ids.
    """
    doi = 'doi:10.1001/archderm.104.1.106'
    row = {'id': doi, 'title': 'Multiple Keloids', 'author': 'Curth, W.; McSorley, J. [orcid:0000-0003-0530-4305 schema:12345]', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}
    curator = prepareCurator([])
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)
    metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
    curator.finder.get_everything_about_res(metavals=metavals, identifiers=identifiers, vvis=vvis)

    curator.log[0] = {'id': {}}
    curator.clean_id(row)

    resolved_metaval = row['id']
    self.assertEqual(resolved_metaval, '3757')
    curator.brdict = {resolved_metaval: {'ids': [doi], 'title': 'Multiple Keloids', 'others': []}}

    curator.clean_ra(row, 'author')

    expected_ardict = {'3757': {'author': [('9445', '6033'), ('0601', 'wannabe_0')], 'editor': [], 'publisher': []}}
    expected_radict = {
        '6033': {'ids': [], 'others': [], 'title': 'Curth, W.'},
        'wannabe_0': {'ids': ['orcid:0000-0003-0530-4305', 'schema:12345'], 'others': [], 'title': 'McSorley, J.'},
    }
    expected_idra = {'orcid:0000-0003-0530-4305': '0601', 'schema:12345': '0602'}
    self.assertEqual((curator.ardict, curator.radict, curator.idra), (expected_ardict, expected_radict, expected_idra))

445

def test_clean_ra_with_br_wannabe(self):
    """Authors are not on the triplestore and br_metaval is a wannabe."""
    row = {'id': 'wannabe_0', 'title': 'Multiple Keloids', 'author': 'Curth, W. [orcid:0000-0002-8420-0696] ; McSorley, J. [orcid:0000-0003-0530-4305]', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}
    curator = prepareCurator([])
    curator.brdict = {'wannabe_0': {'ids': ['doi:10.1001/archderm.104.1.106'], 'title': 'Multiple Keloids', 'others': []}}
    # wnb_cnt is preset to 1; the authors are expected to become wannabe_1 and wannabe_2.
    curator.wnb_cnt = 1
    curator.clean_ra(row, 'author')

    expected_ardict = {'wannabe_0': {'author': [('0601', 'wannabe_1'), ('0602', 'wannabe_2')], 'editor': [], 'publisher': []}}
    expected_radict = {
        'wannabe_1': {'ids': ['orcid:0000-0002-8420-0696'], 'others': [], 'title': 'Curth, W.'},
        'wannabe_2': {'ids': ['orcid:0000-0003-0530-4305'], 'others': [], 'title': 'McSorley, J.'},
    }
    expected_idra = {'orcid:0000-0002-8420-0696': '0601', 'orcid:0000-0003-0530-4305': '0602'}
    self.assertEqual((curator.ardict, curator.radict, curator.idra), (expected_ardict, expected_radict, expected_idra))

461

def test_clean_ra_with_empty_square_brackets(self):
    """An author's name is followed by square brackets that contain only a space."""
    doi = 'doi:10.1001/archderm.104.1.106'
    row = {'id': doi, 'title': 'Multiple Keloids', 'author': 'Bernacki, Edward J. [ ]', 'pub_date': '1971-07-01', 'venue': 'Archives Of Dermatology [omid:br/4416]', 'volume': '104', 'issue': '1', 'page': '106-107', 'type': 'journal article', 'publisher': '', 'editor': ''}
    curator = prepareCurator([])
    curator.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)

    metavals, identifiers, vvis = curator.extract_identifiers_and_metavals(row, valid_dois_cache=set())
    curator.finder.get_everything_about_res(metavals=metavals, identifiers=identifiers, vvis=vvis)

    curator.log[0] = {'id': {}}
    curator.clean_id(row)

    resolved_metaval = row['id']
    self.assertEqual(resolved_metaval, '3757')
    curator.brdict = {resolved_metaval: {'ids': [doi], 'title': 'Multiple Keloids', 'others': []}}

    curator.clean_ra(row, 'author')

    expected_ardict = {'3757': {'author': [('9445', '6033'), ('0601', 'wannabe_0')], 'editor': [], 'publisher': []}}
    expected_radict = {
        '6033': {'ids': [], 'others': [], 'title': 'Curth, W.'},
        'wannabe_0': {'ids': [], 'others': [], 'title': 'Bernacki, Edward J.'},
    }
    self.assertEqual((curator.ardict, curator.radict, curator.idra), (expected_ardict, expected_radict, {}))

486

def test_meta_maker(self):
    """meta_maker is expected to add omid ids and give the wannabe agent the id '0601'."""
    curator = prepareCurator([])
    curator.brdict = {
        '3757': {'ids': ['doi:10.1001/archderm.104.1.106', 'pmid:29098884'], 'title': 'Multiple Keloids', 'others': []},
        '4416': {'ids': ['issn:0003-987X'], 'title': 'Archives Of Dermatology', 'others': []},
    }
    curator.radict = {
        '6033': {'ids': [], 'others': [], 'title': 'Curth, W.'},
        'wannabe_0': {'ids': ['orcid:0000-0003-0530-4305', 'schema:12345'], 'others': [], 'title': 'Mcsorley, J.'},
    }
    curator.ardict = {'3757': {'author': [('9445', '6033'), ('0601', 'wannabe_0')], 'editor': [], 'publisher': []}}
    curator.vvi = {
        '4416': {
            'issue': {},
            'volume': {
                '107': {'id': '4733', 'issue': {'1': {'id': '4734'}, '2': {'id': '4735'}, '3': {'id': '4736'}, '4': {'id': '4737'}, '5': {'id': '4738'}, '6': {'id': '4739'}}},
                '108': {'id': '4740', 'issue': {'1': {'id': '4741'}, '2': {'id': '4742'}, '3': {'id': '4743'}, '4': {'id': '4744'}}},
                '104': {'id': '4712', 'issue': {'1': {'id': '4713'}, '2': {'id': '4714'}, '3': {'id': '4715'}, '4': {'id': '4716'}, '5': {'id': '4717'}, '6': {'id': '4718'}}},
                '148': {'id': '4417', 'issue': {'12': {'id': '4418'}, '11': {'id': '4419'}}},
                '100': {'id': '4684', 'issue': {'1': {'id': '4685'}, '2': {'id': '4686'}, '3': {'id': '4687'}, '4': {'id': '4688'}, '5': {'id': '4689'}, '6': {'id': '4690'}}},
                '101': {'id': '4691', 'issue': {'1': {'id': '4692'}, '2': {'id': '4693'}, '3': {'id': '4694'}, '4': {'id': '4695'}, '5': {'id': '4696'}, '6': {'id': '4697'}}},
                '102': {'id': '4698', 'issue': {'1': {'id': '4699'}, '2': {'id': '4700'}, '3': {'id': '4701'}, '4': {'id': '4702'}, '5': {'id': '4703'}, '6': {'id': '4704'}}},
                '103': {'id': '4705', 'issue': {'1': {'id': '4706'}, '2': {'id': '4707'}, '3': {'id': '4708'}, '4': {'id': '4709'}, '5': {'id': '4710'}, '6': {'id': '4711'}}},
                '105': {'id': '4719', 'issue': {'1': {'id': '4720'}, '2': {'id': '4721'}, '3': {'id': '4722'}, '4': {'id': '4723'}, '5': {'id': '4724'}, '6': {'id': '4725'}}},
                '106': {'id': '4726', 'issue': {'6': {'id': '4732'}, '1': {'id': '4727'}, '2': {'id': '4728'}, '3': {'id': '4729'}, '4': {'id': '4730'}, '5': {'id': '4731'}}},
            },
        },
    }
    curator.meta_maker()

    expected_brmeta = {
        '3757': {'ids': ['doi:10.1001/archderm.104.1.106', 'pmid:29098884', 'omid:br/3757'], 'title': 'Multiple Keloids', 'others': []},
        '4416': {'ids': ['issn:0003-987X', 'omid:br/4416'], 'title': 'Archives Of Dermatology', 'others': []},
    }
    expected_rameta = {
        '6033': {'ids': ['omid:ra/6033'], 'others': [], 'title': 'Curth, W.'},
        '0601': {'ids': ['orcid:0000-0003-0530-4305', 'schema:12345', 'omid:ra/0601'], 'others': ['wannabe_0'], 'title': 'Mcsorley, J.'},
    }
    expected_armeta = {'3757': {'author': [('9445', '6033'), ('0601', '0601')], 'editor': [], 'publisher': []}}
    self.assertEqual((curator.brmeta, curator.rameta, curator.armeta), (expected_brmeta, expected_rameta, expected_armeta))

501

def test_enricher(self):
    """enrich is expected to replace wannabe references in the rows with titles and identifiers."""
    title = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
    curator = prepareCurator([])
    curator.data = [{'id': 'wannabe_0', 'title': title, 'author': '', 'pub_date': '2011-11-28', 'venue': 'wannabe_1', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': 'OECD [crossref:1963]', 'editor': ''}]
    curator.brmeta = {
        '0601': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en', 'omid:br/0601'], 'others': ['wannabe_0'], 'title': title},
        '0602': {'ids': ['omid:br/0604'], 'others': ['wannabe_1'], 'title': 'OECD Economic Outlook'},
    }
    curator.armeta = {'0601': {'author': [], 'editor': [], 'publisher': [('0601', '0601')]}}
    curator.rameta = {'0601': {'ids': ['crossref:1963', 'omid:ra/0601'], 'others': ['wannabe_2'], 'title': 'Oecd'}}
    curator.remeta = {}
    curator.meta_maker()
    curator.enrich()

    expected_row = {'id': 'doi:10.1787/eco_outlook-v2011-2-graph138-en omid:br/0601', 'title': title, 'author': '', 'pub_date': '2011-11-28', 'venue': 'OECD Economic Outlook [omid:br/0604]', 'volume': '', 'issue': '', 'page': '', 'type': '', 'publisher': 'Oecd [crossref:1963 omid:ra/0601]', 'editor': ''}
    self.assertEqual(curator.data, [expected_row])

517

def test_indexer(self):
    # Populate the curator's in-memory indexes by hand, run meta_maker() and
    # indexer(), then read the files written under path_index back from disk
    # and compare them with the expected content.
    path_index = f'{OUTPUT_DIR}/index'
    path_csv = f'{OUTPUT_DIR}'
    # Registered up front so the output directory is removed even when
    # indexer() or one of the reads below raises before the final assertion.
    self.addCleanup(shutil.rmtree, OUTPUT_DIR, ignore_errors=True)

    def read_csv_index(filename):
        # Load one CSV index file as a list of row dictionaries.
        with open(os.path.join(path_index, filename), 'r', encoding='utf-8') as f:
            return list(csv.DictReader(f))

    def venue_volumes():
        # Fresh copy of the volume/issue tree: given to the curator under the
        # temporary key 'wannabe_1' and expected back under the key '0602'.
        return {
            'issue': {},
            'volume': {
                '107': {'id': '4733', 'issue': {'1': {'id': '4734'}, '2': {'id': '4735'}, '3': {'id': '4736'}, '4': {'id': '4737'}, '5': {'id': '4738'}, '6': {'id': '4739'}}},
                '108': {'id': '4740', 'issue': {'1': {'id': '4741'}, '2': {'id': '4742'}, '3': {'id': '4743'}, '4': {'id': '4744'}}},
                '104': {'id': '4712', 'issue': {'1': {'id': '4713'}, '2': {'id': '4714'}, '3': {'id': '4715'}, '4': {'id': '4716'}, '5': {'id': '4717'}, '6': {'id': '4718'}}},
                '148': {'id': '4417', 'issue': {'12': {'id': '4418'}, '11': {'id': '4419'}}},
                '100': {'id': '4684', 'issue': {'1': {'id': '4685'}, '2': {'id': '4686'}, '3': {'id': '4687'}, '4': {'id': '4688'}, '5': {'id': '4689'}, '6': {'id': '4690'}}},
                '101': {'id': '4691', 'issue': {'1': {'id': '4692'}, '2': {'id': '4693'}, '3': {'id': '4694'}, '4': {'id': '4695'}, '5': {'id': '4696'}, '6': {'id': '4697'}}},
                '102': {'id': '4698', 'issue': {'1': {'id': '4699'}, '2': {'id': '4700'}, '3': {'id': '4701'}, '4': {'id': '4702'}, '5': {'id': '4703'}, '6': {'id': '4704'}}},
                '103': {'id': '4705', 'issue': {'1': {'id': '4706'}, '2': {'id': '4707'}, '3': {'id': '4708'}, '4': {'id': '4709'}, '5': {'id': '4710'}, '6': {'id': '4711'}}},
                '105': {'id': '4719', 'issue': {'1': {'id': '4720'}, '2': {'id': '4721'}, '3': {'id': '4722'}, '4': {'id': '4723'}, '5': {'id': '4724'}, '6': {'id': '4725'}}},
                '106': {'id': '4726', 'issue': {'6': {'id': '4732'}, '1': {'id': '4727'}, '2': {'id': '4728'}, '3': {'id': '4729'}, '4': {'id': '4730'}, '5': {'id': '4731'}}},
            },
        }

    curator = prepareCurator(list())
    curator.filename = '0.csv'
    curator.idra = {'orcid:0000-0003-0530-4305': '0601', 'schema:12345': '0602'}
    curator.idbr = {'doi:10.1001/2013.jamasurg.270': '2585'}
    curator.armeta = {'2585': {'author': [('9445', '0602'), ('0601', '0601')], 'editor': [], 'publisher': []}}
    curator.remeta = dict()
    curator.brmeta = {
        '0601': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en', 'omid:br/0601'], 'others': ['wannabe_0'], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'},
        '0602': {'ids': ['omid:br/0602'], 'others': ['wannabe_1'], 'title': 'OECD Economic Outlook'},
    }
    curator.vvi = {'wannabe_1': venue_volumes()}
    curator.meta_maker()
    curator.indexer(path_index, path_csv)

    index_ar = read_csv_index('index_ar.csv')
    index_id_br = read_csv_index('index_id_br.csv')
    index_id_ra = read_csv_index('index_id_ra.csv')
    with open(os.path.join(path_index, 'index_vi.json'), 'r', encoding='utf-8') as f:
        index_vi = json.load(f)
    index_re = read_csv_index('index_re.csv')

    expected_index_ar = [{'meta': '2585', 'author': '9445, 0602; 0601, 0601', 'editor': '', 'publisher': ''}]
    expected_index_id_br = [{'id': 'doi:10.1001/2013.jamasurg.270', 'meta': '2585'}]
    expected_index_id_ra = [{'id': 'orcid:0000-0003-0530-4305', 'meta': '0601'}, {'id': 'schema:12345', 'meta': '0602'}]
    expected_index_re = [{'br': '', 're': ''}]
    expected_index_vi = {'0602': venue_volumes()}

    output = (index_ar, index_id_br, index_id_ra, index_re, index_vi)
    expected_output = (expected_index_ar, expected_index_id_br, expected_index_id_ra, expected_index_re, expected_index_vi)
    self.assertEqual(output, expected_output)

569

def test_is_a_valid_row(self):
    # Run is_a_valid_row() over a set of rows that differ from an all-empty
    # row only in the fields listed for each case, and compare the resulting
    # booleans with the expected verdicts, in order.
    empty_row = dict.fromkeys(
        ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type', 'publisher', 'editor'),
        '')
    title = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
    author = 'Deckert, Ron J. [orcid:0000-0003-2100-6412]'
    # Each entry pairs the fields set on the row with the expected verdict.
    cases = [
        ({}, False),
        ({'volume': '1', 'type': 'journal volume'}, False),
        ({'issue': '1', 'type': 'journal issue'}, False),
        ({'id': 'doi:10.1001/2013.jamasurg.270'}, True),
        ({'title': title, 'author': author}, False),
        ({'title': title, 'author': author, 'pub_date': '03-01-2020', 'editor': 'book'}, True),
        ({'id': 'doi:10.1001/2013.jamasurg.270', 'volume': '5'}, False),
    ]
    verdicts = [is_a_valid_row({**empty_row, **fields}) for fields, _ in cases]
    expected_verdicts = [expected for _, expected in cases]
    self.assertEqual(verdicts, expected_verdicts)

585

def test_get_preexisting_entities(self):
    # Curate a single row that references entities by OMID and check both
    # the set collected in curator.preexisting_entities and the curated row.
    input_row = {
        'id': 'omid:br/2715',
        'title': 'Image Of The Year For 2012',
        'author': '',
        'pub_date': '',
        'venue': 'Archives Of Surgery [omid:br/4480]',
        'volume': '99',
        'issue': '1',
        'page': '',
        'type': 'journal article',
        'publisher': '',
        'editor': '',
    }
    curator = prepareCurator(data=[input_row])
    curator.curator()
    expected_entities = {
        'id/4270', 'ra/3309', 'ar/7240', 'br/4481', 'br/2715',
        'br/4480', 'id/4274', 'id/2581', 'br/4487', 're/2350',
    }
    expected_rows = [{
        'id': 'doi:10.1001/2013.jamasurg.202 omid:br/2715',
        'title': 'Image Of The Year For 2012',
        'author': '',
        'pub_date': '2012-12-01',
        'venue': 'Archives Of Surgery [issn:0004-0010 omid:br/4480]',
        'volume': '147',
        'issue': '12',
        'page': '1140-1140',
        'type': 'journal article',
        'publisher': 'American Medical Association (ama) [crossref:10 omid:ra/3309]',
        'editor': '',
    }]
    self.assertEqual((curator.preexisting_entities, curator.data), (expected_entities, expected_rows))

595

596

class test_RespAgentsCurator(unittest.TestCase):
    """Tests for the curator built with prepareCurator(..., resp_agents_only=True)."""

    def test_curator_publishers(self):
        # Rows carrying only a publisher: check the curated rows together
        # with the radict, idra and rameta structures left on the curator.
        reset()
        blank_row = dict.fromkeys(
            ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type', 'publisher', 'editor'),
            '')
        data = [
            {**blank_row, 'publisher': 'American Medical Association (AMA) [crossref:10 crossref:9999]'},
            {**blank_row, 'publisher': 'Elsevier BV [crossref:78]'},
            {**blank_row, 'publisher': 'Wiley [crossref:311]'},
        ]
        resp_agents_curator = prepareCurator(data=data, server=SERVER, resp_agents_only=True)
        resp_agents_curator.curator(filename=None, path_csv=None, path_index=None)
        output = (
            resp_agents_curator.data,
            resp_agents_curator.radict,
            resp_agents_curator.idra,
            resp_agents_curator.rameta,
        )
        ama = {'ids': ['crossref:10', 'crossref:9999', 'omid:ra/3309'], 'others': [], 'title': 'American Medical Association (ama)'}
        elsevier = {'ids': ['crossref:78', 'omid:ra/0601'], 'others': ['wannabe_0'], 'title': 'Elsevier Bv'}
        wiley = {'ids': ['crossref:311', 'omid:ra/0602'], 'others': ['wannabe_1'], 'title': 'Wiley'}
        expected_output = (
            [
                {**blank_row, 'publisher': 'American Medical Association (ama) [crossref:10 crossref:9999 omid:ra/3309]'},
                {**blank_row, 'publisher': 'Elsevier Bv [crossref:78 omid:ra/0601]'},
                {**blank_row, 'publisher': 'Wiley [crossref:311 omid:ra/0602]'},
            ],
            {'3309': ama, 'wannabe_0': elsevier, 'wannabe_1': wiley},
            {'crossref:10': '4274', 'crossref:9999': '0601', 'crossref:78': '0602', 'crossref:311': '0603'},
            {'3309': ama, '0601': elsevier, '0602': wiley},
        )
        self.assertEqual(output, expected_output)

    def test_curator(self):
        # Rows carrying only an author with an ORCID: the expected output
        # gives each of the three authors a new OMID.
        reset()
        blank_row = dict.fromkeys(
            ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type', 'publisher', 'editor'),
            '')
        data = [
            {**blank_row, 'author': 'Deckert, Ron J. [orcid:0000-0003-2100-6412]'},
            {**blank_row, 'author': 'Ruso, Juan M. [orcid:0000-0001-5909-6754]'},
            {**blank_row, 'author': 'Sarmiento, Félix [orcid:0000-0002-4487-6894]'},
        ]
        resp_agents_curator = prepareCurator(data=data, server=SERVER, resp_agents_only=True)
        resp_agents_curator.curator(
            filename='resp_agents_curator_output',
            path_csv='test/testcases/testcase_data',
            path_index='test/testcases/testcase_data/indices')
        output = (
            resp_agents_curator.data,
            resp_agents_curator.radict,
            resp_agents_curator.idra,
            resp_agents_curator.rameta,
        )
        deckert = {'ids': ['orcid:0000-0003-2100-6412', 'omid:ra/0601'], 'others': ['wannabe_0'], 'title': 'Deckert, Ron J.'}
        ruso = {'ids': ['orcid:0000-0001-5909-6754', 'omid:ra/0602'], 'others': ['wannabe_1'], 'title': 'Ruso, Juan M.'}
        sarmiento = {'ids': ['orcid:0000-0002-4487-6894', 'omid:ra/0603'], 'others': ['wannabe_2'], 'title': 'Sarmiento, Félix'}
        expected_output = (
            [
                {**blank_row, 'author': 'Deckert, Ron J. [orcid:0000-0003-2100-6412 omid:ra/0601]'},
                {**blank_row, 'author': 'Ruso, Juan M. [orcid:0000-0001-5909-6754 omid:ra/0602]'},
                {**blank_row, 'author': 'Sarmiento, Félix [orcid:0000-0002-4487-6894 omid:ra/0603]'},
            ],
            {'wannabe_0': deckert, 'wannabe_1': ruso, 'wannabe_2': sarmiento},
            {'orcid:0000-0003-2100-6412': '0601', 'orcid:0000-0001-5909-6754': '0602', 'orcid:0000-0002-4487-6894': '0603'},
            {'0601': deckert, '0602': ruso, '0603': sarmiento},
        )
        self.assertEqual(output, expected_output)

    def test_curator_ra_on_ts(self):
        # A responsible agent is already on the triplestore
        ts_data_path = os.path.abspath(os.path.join('test', 'testcases', 'ts', 'real_data.nt')).replace('\\', '/')
        add_data_ts(server=SERVER, data_path=ts_data_path)
        self.maxDiff = None
        blank_row = dict.fromkeys(
            ('id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type', 'publisher', 'editor'),
            '')
        data = [
            {**blank_row, 'author': 'Deckert, Ron J. [orcid:0000-0003-2100-6412]'},
            {**blank_row, 'author': 'Mehrotra, Ateev [orcid:0000-0003-2223-1582]'},
            {**blank_row, 'author': 'Sarmiento, Félix [orcid:0000-0002-4487-6894]'},
        ]
        resp_agents_curator = prepareCurator(data=data, server=SERVER, resp_agents_only=True)
        resp_agents_curator.curator()
        output = (
            resp_agents_curator.data,
            resp_agents_curator.radict,
            resp_agents_curator.idra,
            resp_agents_curator.rameta,
        )
        deckert = {'ids': ['orcid:0000-0003-2100-6412', 'omid:ra/0601'], 'others': ['wannabe_0'], 'title': 'Deckert, Ron J.'}
        mehrotra = {'ids': ['orcid:0000-0003-2223-1582', 'omid:ra/3976'], 'others': [], 'title': 'Mehrotra, Ateev'}
        sarmiento = {'ids': ['orcid:0000-0002-4487-6894', 'omid:ra/0602'], 'others': ['wannabe_1'], 'title': 'Sarmiento, Félix'}
        expected_output = (
            [
                {**blank_row, 'author': 'Deckert, Ron J. [orcid:0000-0003-2100-6412 omid:ra/0601]'},
                {**blank_row, 'author': 'Mehrotra, Ateev [orcid:0000-0003-2223-1582 omid:ra/3976]'},
                {**blank_row, 'author': 'Sarmiento, Félix [orcid:0000-0002-4487-6894 omid:ra/0602]'},
            ],
            {'wannabe_0': deckert, '3976': mehrotra, 'wannabe_1': sarmiento},
            {'orcid:0000-0003-2100-6412': '0601', 'orcid:0000-0003-2223-1582': '4351', 'orcid:0000-0002-4487-6894': '0602'},
            {'0601': deckert, '3976': mehrotra, '0602': sarmiento},
        )
        self.assertEqual(output, expected_output)

679

680

class test_id_worker(unittest.TestCase):
    """Tests for Curator.id_worker() and Curator.conflict().

    setUpClass loads test/testcases/ts/real_data.nt into the triplestore and
    builds one ResourceFinder that the tests assign to curator.finder.
    """

    @classmethod
    def setUpClass(cls):
        # Load the fixture data once for the whole class and pre-fetch the
        # entities and identifiers the tests below look up.
        add_data_ts(SERVER, os.path.abspath(os.path.join('test', 'testcases', 'ts', 'real_data.nt')).replace('\\', '/'))
        cls.finder = ResourceFinder(ts_url=SERVER, base_iri=BASE_IRI)
        cls.finder.get_everything_about_res(
            metavals={'omid:br/3309', 'omid:br/2438', 'omid:br/0601'},
            identifiers={
                'doi:10.1001/2013.jamasurg.270',
                'doi:10.1787/eco_outlook-v2011-2-graph138-en',
                'orcid:0000-0001-6994-8412',
                'doi:10.1001/archderm.104.1.106',
                'pmid:29098884',
            },
            vvis=set())

    def test_id_worker_1(self):
        # 1 EntityA is a new one
        curator = prepareCurator(list())
        name = 'βέβαιος, α, ον'
        idslist = ['doi:10.1163/2214-8655_lgo_lgo_02_0074_ger']
        wannabe_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        expected_output = (
            'wannabe_0',
            {'wannabe_0': {'ids': ['doi:10.1163/2214-8655_lgo_lgo_02_0074_ger'], 'others': [], 'title': 'βέβαιος, α, ον'}},
            {},
            {'doi:10.1163/2214-8655_lgo_lgo_02_0074_ger': '0601'},
            {},
            {}
        )
        self.assertEqual(output, expected_output)

    def test_id_worker_1_no_id(self):
        # 1 EntityA is a new one and has no ids
        curator = prepareCurator(list())
        name = 'βέβαιος, α, ον'
        idslist = []
        wannabe_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        expected_output = (
            'wannabe_0',
            {'wannabe_0': {'ids': [], 'others': [], 'title': 'βέβαιος, α, ον'}},
            {},
            {},
            {},
            {}
        )
        self.assertEqual(output, expected_output)

    def test_id_worker_2_id_ts(self):
        # 2 Retrieve EntityA data in triplestore to update EntityA inside CSV
        curator = prepareCurator(list())
        curator.finder = self.finder
        name = 'American Medical Association (AMA)'  # *(ama) on the ts. The name on the ts must prevail
        idslist = ['crossref:10']
        wannabe_id = curator.id_worker('editor', name, idslist, '', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=True)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        expected_output = ('3309', {}, {'3309': {'ids': ['crossref:10'], 'others': [], 'title': 'American Medical Association (ama)'}}, {}, {'crossref:10': '4274'}, {})
        self.assertEqual(output, expected_output)

    def test_id_worker_2_metaid_ts(self):
        # 2 Retrieve EntityA data in triplestore to update EntityA inside CSV
        curator = prepareCurator(list())
        curator.finder = self.finder
        name = 'American Medical Association (AMA)'  # *(ama) on the ts. The name on the ts must prevail
        # MetaID only
        wannabe_id = curator.id_worker('editor', name, [], '3309', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=True)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        expected_output = ('3309', {}, {'3309': {'ids': ['crossref:10'], 'others': [], 'title': 'American Medical Association (ama)'}}, {}, {'crossref:10': '4274'}, {})
        self.assertEqual(output, expected_output)

    def test_id_worker_2_id_metaid_ts(self):
        # 2 Retrieve EntityA data in triplestore to update EntityA inside CSV
        curator = prepareCurator(list())
        name = 'American Medical Association (AMA)'  # *(ama) on the ts. The name on the ts must prevail
        curator.finder = self.finder
        # ID and MetaID
        wannabe_id = curator.id_worker('publisher', name, ['crossref:10'], '3309', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=True)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        expected_output = ('3309', {}, {'3309': {'ids': ['crossref:10'], 'others': [], 'title': 'American Medical Association (ama)'}}, {}, {'crossref:10': '4274'}, {})
        self.assertEqual(output, expected_output)

    def test_id_worker_3(self):
        # 2 Retrieve EntityA data in triplestore to update EntityA inside CSV. MetaID on ts has precedence
        curator = prepareCurator(list())
        name = 'American Medical Association (AMA)'  # *(ama) on the ts. The name on the ts must prevail
        curator.finder = self.finder
        # ID and MetaID, but it's omid:ra/3309 on ts
        wannabe_id = curator.id_worker('publisher', name, ['crossref:10'], '33090', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=True)
        output = (wannabe_id, curator.brdict, curator.radict, curator.idbr, curator.idra, curator.log)
        expected_output = ('3309', {}, {'3309': {'ids': ['crossref:10'], 'others': [], 'title': 'American Medical Association (ama)'}}, {}, {'crossref:10': '4274'}, {})
        self.assertEqual(output, expected_output)

    def test_id_worker_conflict(self):
        # There is no MetaID, or there was one but it did not exist.
        # There are other ids that already exist, but refer to multiple entities on ts.
        # Conflict!
        idslist = ['doi:10.1001/2013.jamasurg.270']
        name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
        curator = prepareCurator(list())
        curator.finder = self.finder
        curator.log[0] = {'id': {}}
        id_dict = dict()
        metaval = curator.conflict(idslist, name, id_dict, 'id')  # Only the conflict function is tested here, not id_worker
        output = (metaval, curator.brdict, curator.log, id_dict)
        expected_output = (
            'wannabe_0',
            {'wannabe_0': {'ids': ['doi:10.1001/2013.jamasurg.270'], 'others': [], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'}},
            {0: {'id': {'Conflict entity': 'wannabe_0'}}},
            {'doi:10.1001/2013.jamasurg.270': '2585'}
        )
        self.assertEqual(output, expected_output)

    def test_conflict_br(self):
        # No MetaID, and an identifier to which two separate br point.
        # Either of the two br may be returned, so both outcomes are accepted.
        curator = prepareCurator(list())
        curator.log[0] = {'id': {}}
        name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
        idslist = ['doi:10.1001/2013.jamasurg.270']
        curator.finder = self.finder
        meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.brdict, curator.log)

        def expected_for(metaval, title):
            # Expected tuple when the br identified by metaval is selected.
            return (
                metaval,
                {'doi:10.1001/2013.jamasurg.270': '2585'},
                {},
                {metaval: {'ids': ['doi:10.1001/2013.jamasurg.270'], 'others': [], 'title': title}},
                {0: {'id': {}}},
            )

        accepted_outputs = (
            expected_for('2719', 'Patient Satisfaction As A Possible Indicator Of Quality Surgical Care'),
            expected_for('2720', 'Pediatric Injury Outcomes In Racial/Ethnic Minorities In California'),
        )
        # assertIn reports the actual output on failure, unlike assertTrue(a == b or a == c).
        self.assertIn(output, accepted_outputs)

    def test_conflict_ra(self):
        # No MetaID, and an identifier to which two separate ra point.
        # Either of the two ra may be returned, so both outcomes are accepted.
        idslist = ['orcid:0000-0001-6994-8412']
        name = 'Alarcon, Louis H.'
        curator = prepareCurator(list())
        curator.finder = self.finder
        curator.log[0] = {'author': {}}
        meta_id = curator.id_worker('author', name, idslist, '', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.brdict, curator.radict, curator.log)

        def expected_for(metaval):
            # Expected tuple when the ra identified by metaval is selected.
            return (
                metaval,
                {},
                {'orcid:0000-0001-6994-8412': '4475'},
                {},
                {metaval: {'ids': ['orcid:0000-0001-6994-8412'], 'others': [], 'title': 'Alarcon, Louis H.'}},
                {0: {'author': {}}},
            )

        # assertIn reports the actual output on failure, unlike assertTrue(a == b or a == c).
        self.assertIn(output, (expected_for('4940'), expected_for('1000000')))

    def test_conflict_suspect_id_among_existing(self):
        # An ID already exists in entity_dict and refers to one entity having a MetaID,
        # but there is another ID not in entity_dict that highlights a conflict on ts
        def existing_entities():
            # Fresh copy of the entities placed in curator.brdict before the call.
            return {
                'omid:br/0601': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'], 'others': [], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'},
                'omid:br/0602': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'], 'others': [], 'title': 'Contributions To GDP Growth And Inflation: South Africa'},
                'omid:br/0603': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph18-en'], 'others': [], 'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal'},
            }

        # Per the expected output below, the new conflict entity carries this name.
        name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: Japan'
        idslist = ['doi:10.1787/eco_outlook-v2011-2-graph138-en', 'doi:10.1001/2013.jamasurg.270']
        curator = prepareCurator(get_csv_data(REAL_DATA_CSV))
        curator.log[0] = {'id': {}}
        curator.brdict = existing_entities()
        curator.finder = self.finder
        meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.brdict, curator.radict, curator.log)
        expected_output = (
            'wannabe_0',
            {
                'doi:10.1787/eco_outlook-v2011-2-graph138-en': '0601',
                'doi:10.1001/2013.jamasurg.270': '2585'
            },
            {},
            {
                **existing_entities(),
                'wannabe_0': {
                    'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en', 'doi:10.1001/2013.jamasurg.270'],
                    'others': [],
                    'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: Japan'},
            },
            {},
            {0: {'id': {'Conflict entity': 'wannabe_0'}}}
        )
        self.assertEqual(output, expected_output)

    def test_conflict_suspect_id_among_wannabe(self):
        # An ID already exists in entity_dict and refers to one temporary entity,
        # but there is another ID not in entity_dict that highlights a conflict on ts
        br_dict = {
            'wannabe_0': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'], 'others': [], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'},
            'wannabe_2': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'], 'others': [], 'title': 'Contributions To GDP Growth And Inflation: South Africa'},
            'wannabe_3': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph18-en'], 'others': [], 'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal'},
        }
        # Per the expected outputs below, neither this name nor the 'wannabe_0'
        # title survives: the resulting entity carries another title.
        name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: Japan'
        idslist = ['doi:10.1787/eco_outlook-v2011-2-graph138-en', 'doi:10.1001/2013.jamasurg.270']
        curator = prepareCurator(get_csv_data(REAL_DATA_CSV))
        curator.log[0] = {'id': {}}
        curator.brdict = br_dict
        curator.finder = self.finder
        meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.brdict, curator.radict, curator.log)

        def expected_for(metaval, title):
            # Expected tuple when 'wannabe_0' is replaced by the br identified by metaval.
            return (
                metaval,
                {
                    'doi:10.1787/eco_outlook-v2011-2-graph138-en': '0601',
                    'doi:10.1001/2013.jamasurg.270': '2585'
                },
                {},
                {
                    metaval: {
                        'ids': ['doi:10.1001/2013.jamasurg.270', 'doi:10.1787/eco_outlook-v2011-2-graph138-en'],
                        'others': ['wannabe_0'],
                        'title': title},
                    'wannabe_2': {
                        'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'],
                        'others': [],
                        'title': 'Contributions To GDP Growth And Inflation: South Africa'},
                    'wannabe_3': {
                        'ids': ['doi:10.1787/eco_outlook-v2011-2-graph18-en'],
                        'others': [],
                        'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal'},
                },
                {},
                {0: {'id': {}}},
            )

        accepted_outputs = (
            expected_for('2720', 'Pediatric Injury Outcomes In Racial/Ethnic Minorities In California'),
            expected_for('2719', 'Patient Satisfaction As A Possible Indicator Of Quality Surgical Care'),
        )
        # assertIn reports the actual output on failure, unlike assertTrue(a == b or a == c).
        self.assertIn(output, accepted_outputs)

    def test_id_worker_4(self):
        # 4 Merge data from EntityA (CSV) with data from EntityX (CSV), update both with data from EntityA (RDF)
        br_dict = {
            'wannabe_0': {'ids': ['doi:10.1001/archderm.104.1.106'], 'others': [], 'title': 'Multiple eloids'},
            'wannabe_1': {'ids': ['doi:10.1001/archderm.104.1.106'], 'others': [], 'title': 'Multiple Blastoids'},
        }
        name = 'Multiple Palloids'
        idslist = ['doi:10.1001/archderm.104.1.106', 'pmid:29098884']
        curator = prepareCurator(list())
        curator.brdict = br_dict
        curator.wnb_cnt = 2
        curator.finder = self.finder
        meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
        output = (meta_id, curator.idbr, curator.idra, curator.log)
        expected_output = (
            '3757',
            {'doi:10.1001/archderm.104.1.106': '3624', 'pmid:29098884': '2000000'},
            {},
            {}
        )
        self.assertEqual(output, expected_output)

966

967class test_id_worker_with_reset(unittest.TestCase):

def test_id_worker_2_meta_in_entity_dict(self):
    # MetaID exists among data.
    # MetaID already in entity_dict (no care about conflicts, we have a MetaID specified)
    # 2 Retrieve EntityA data to update EntityA inside CSV
    reset_server()
    # Curate the real data and store the result on the triplestore.
    populated_curator = prepareCurator(get_csv_data(REAL_DATA_CSV))
    populated_curator.curator()
    store_curated_data(populated_curator, SERVER)

    # A second, empty curator whose local graph is filled through the finder.
    curator_empty = prepareCurator(list())
    finder = ResourceFinder(
        ts_url=SERVER,
        base_iri=BASE_IRI,
        local_g=curator_empty.everything_everywhere_allatonce)
    finder.get_everything_about_res(
        metavals=set(),
        identifiers={'doi:10.1787/eco_outlook-v2011-2-graph138-en'},
        vvis=set())

    name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
    # The first call puts the metaval in entity_dict; the second call runs
    # with the metaval already there, and its result is the one checked.
    for _ in range(2):
        meta_id = curator_empty.id_worker('id', name, [], '0601', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)

    output = (
        meta_id,
        curator_empty.brdict,
        curator_empty.radict,
        curator_empty.idbr,
        curator_empty.idra,
        curator_empty.log,
    )
    expected_output = (
        '0601',
        {'0601': {
            'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'],
            'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China',
            'others': []}},
        {},
        {'doi:10.1787/eco_outlook-v2011-2-graph138-en': '0601'},
        {},
        {},
    )
    self.assertEqual(output, expected_output)

988

def test_conflict_existing(self):
    # ID already exist in entity_dict but refer to multiple entities having a MetaID
    reset_server()

    def entities_with_metaid():
        # Fresh copy of the starting brdict; 'omid:br/0601' and 'omid:br/0603'
        # share the same DOI.
        return {
            'omid:br/0601': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'], 'others': [], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'},
            'omid:br/0602': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'], 'others': [], 'title': 'Contributions To GDP Growth And Inflation: South Africa'},
            'omid:br/0603': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'], 'others': [], 'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal'},
        }

    name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'
    idslist = ['doi:10.1787/eco_outlook-v2011-2-graph138-en']
    curator = prepareCurator(list())
    curator.log[0] = {'id': {}}
    curator.brdict = entities_with_metaid()
    meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)
    output = (meta_id, curator.idbr, curator.idra, curator.brdict, curator.radict, curator.log)

    # The starting entities are expected unchanged, plus one conflict entity.
    expected_brdict = entities_with_metaid()
    expected_brdict['wannabe_0'] = {
        'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'],
        'others': [],
        'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China',
    }
    expected_output = (
        'wannabe_0',
        {'doi:10.1787/eco_outlook-v2011-2-graph138-en': '0601'},
        {},
        expected_brdict,
        {},
        {0: {'id': {'Conflict entity': 'wannabe_0'}}},
    )
    self.assertEqual(output, expected_output)

1028

1029 def test_id_worker_5(self):

1030 # ID already exist in entity_dict and refer to one or more temporary entities -> collective merge

1031 reset_server()

1032 br_dict = {

1033 'wannabe_0': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'], 'others': [], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'},

1034 'wannabe_1': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'], 'others': [], 'title': 'Contributions To GDP Growth And Inflation: South Africa'},

1035 'wannabe_2': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'], 'others': [], 'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal'},

1036 }

1037 name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'

1038 idslist = ['doi:10.1787/eco_outlook-v2011-2-graph138-en']

1039 curator = prepareCurator(list())

1040 curator.brdict = br_dict

1041 curator.wnb_cnt = 2

1042 meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)

1043 output = (meta_id, curator.idbr, curator.idra, curator.log)

1044 expected_output = (

1045 'wannabe_0',

1046 {'doi:10.1787/eco_outlook-v2011-2-graph138-en': '0601'},

1047 {},

1048 {}

1049 )

1050 self.assertEqual(output, expected_output)

1051

1052 def test_no_conflict_existing(self):

1053 # ID already exist in entity_dict and refer to one entity

1054 reset_server()

1055 br_dict = {

1056 'omid:br/0601': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph138-en'], 'others': [], 'title': 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: China'},

1057 'omid:br/0602': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph150-en'], 'others': [], 'title': 'Contributions To GDP Growth And Inflation: South Africa'},

1058 'omid:br/0603': {'ids': ['doi:10.1787/eco_outlook-v2011-2-graph18-en'], 'others': [], 'title': 'Official Loans To The Governments Of Greece, Ireland And Portugal'},

1059 }

1060 name = 'Money Growth, Interest Rates, Inflation And Raw Materials Prices: Japan' # The first title must have precedence (China, not Japan)

1061 idslist = ['doi:10.1787/eco_outlook-v2011-2-graph138-en']

1062 curator = prepareCurator(list())

1063 curator.log[0] = {'id': {}}

1064 curator.brdict = br_dict

1065 meta_id = curator.id_worker('id', name, idslist, '', ra_ent=False, br_ent=True, vvi_ent=False, publ_entity=False)

1066 output = (meta_id, curator.idbr, curator.idra, curator.log)

1067 expected_output = (

1068 'omid:br/0601',

1069 {'doi:10.1787/eco_outlook-v2011-2-graph138-en': '0601'},

1070 {},

1071 {0: {'id': {}}}

1072 )

1073 self.assertEqual(output, expected_output)

1074

1075 def test_metaid_in_prov(self):

1076 # MetaID not found in data, but found in the provenance metadata.

1077 reset_server()

1078 add_data_ts(server=SERVER, data_path=os.path.abspath(os.path.join('test', 'testcases', 'ts', 'real_data_with_prov.nq')).replace('\\', '/'))

1079 name = ''

1080 curator = prepareCurator(list())

1081 meta_id = curator.id_worker('id', name, [], '4321', ra_ent=True, br_ent=False, vvi_ent=False, publ_entity=False)

1082 self.assertEqual(meta_id, '38013')

1083

1084

class testcase_01(unittest.TestCase):
    def test(self):
        # testcase1: 2 different issues of the same venue (no volume)
        name = '01'
        data = get_csv_data(MANUAL_DATA_CSV)
        partial_data = [data[0], data[5]]
        data_curated, testcase = prepare_to_test(partial_data, name)
        # Compare lengths first: an element-wise loop alone passes vacuously
        # when the curated output is empty or shorter than the expectation,
        # and raises IndexError instead of failing when it is longer.
        self.assertEqual(len(data_curated), len(testcase))
        # One sub-test per position so every mismatching element is reported.
        for pos, (element, expected) in enumerate(zip(data_curated, testcase)):
            with self.subTest(position=pos):
                self.assertEqual(element, expected)

1096

1097

class testcase_02(unittest.TestCase):
    def test(self):
        # testcase2: 2 different volumes of the same venue (no issue)
        rows = get_csv_data(MANUAL_DATA_CSV)
        # Only rows 1 and 3 of the manual CSV are fed to the curator.
        selected = [rows[1], rows[3]]
        data_curated, testcase = prepare_to_test(selected, '02')
        self.assertEqual(data_curated, testcase)

1108

1109

class testcase_03(unittest.TestCase):
    def test(self):
        # testcase3: 2 different issues of the same volume
        rows = get_csv_data(MANUAL_DATA_CSV)
        # Only rows 2 and 4 of the manual CSV are fed to the curator.
        selected = [rows[2], rows[4]]
        data_curated, testcase = prepare_to_test(selected, '03')
        self.assertEqual(data_curated, testcase)

1120

1121

class testcase_04(unittest.TestCase):
    def test(self):
        # testcase4: 2 new IDS and different date format (yyyy-mm and yyyy-mm-dd)
        name = '04'
        data = get_csv_data(MANUAL_DATA_CSV)
        partial_data = [data[6], data[7]]
        data_curated, testcase = prepare_to_test(partial_data, name)
        # Compare lengths first: an element-wise loop alone passes vacuously
        # when the curated output is empty or shorter than the expectation,
        # and raises IndexError instead of failing when it is longer.
        self.assertEqual(len(data_curated), len(testcase))
        # One sub-test per position so every mismatching element is reported.
        for pos, (element, expected) in enumerate(zip(data_curated, testcase)):
            with self.subTest(position=pos):
                self.assertEqual(element, expected)

1133

1134

class testcase_05(unittest.TestCase):
    def test(self):
        # testcase5: NO ID scenario
        rows = get_csv_data(MANUAL_DATA_CSV)
        # A single row (index 8) is fed to the curator.
        selected = [rows[8]]
        data_curated, testcase = prepare_to_test(selected, '05')
        self.assertEqual(data_curated, testcase)

1144

1145

class testcase_06(unittest.TestCase):
    def test(self):
        # testcase6: ALL types test
        rows = get_csv_data(MANUAL_DATA_CSV)
        # Rows 9..32 of the manual CSV are fed to the curator.
        selected = rows[9:33]
        data_curated, testcase = prepare_to_test(selected, '06')
        self.assertEqual(data_curated, testcase)

1154

1155

class testcase_07(unittest.TestCase):
    def test(self):
        # testcase7: all journal related types with an editor
        rows = get_csv_data(MANUAL_DATA_CSV)
        # Rows 34..39 of the manual CSV are fed to the curator.
        selected = rows[34:40]
        data_curated, testcase = prepare_to_test(selected, '07')
        self.assertEqual(data_curated, testcase)

1164

1165

class testcase_08(unittest.TestCase):
    def test(self):
        # testcase8: all book related types with an editor
        rows = get_csv_data(MANUAL_DATA_CSV)
        # Rows 40..42 of the manual CSV are fed to the curator.
        selected = rows[40:43]
        data_curated, testcase = prepare_to_test(selected, '08')
        self.assertEqual(data_curated, testcase)

1174

1175

class testcase_09(unittest.TestCase):
    def test(self):
        # testcase09: all proceeding related types with an editor
        rows = get_csv_data(MANUAL_DATA_CSV)
        # Rows 43..44 of the manual CSV are fed to the curator.
        selected = rows[43:45]
        data_curated, testcase = prepare_to_test(selected, '09')
        self.assertEqual(data_curated, testcase)

1184

1185

class testcase_10(unittest.TestCase):
    def test(self):
        # testcase10: a book inside a book series and a book inside a book set
        rows = get_csv_data(MANUAL_DATA_CSV)
        # Rows 45..48 of the manual CSV are fed to the curator.
        selected = rows[45:49]
        data_curated, testcase = prepare_to_test(selected, '10')
        self.assertEqual(data_curated, testcase)

1194

1195

class testcase_11(unittest.TestCase):
    def test(self):
        # testcase11: real time entity update
        rows = get_csv_data(MANUAL_DATA_CSV)
        # Rows 49..51 of the manual CSV are fed to the curator.
        selected = rows[49:52]
        data_curated, testcase = prepare_to_test(selected, '11')
        self.assertEqual(data_curated, testcase)

1204

1205

class testcase_12(unittest.TestCase):
    def test(self):
        # testcase12: clean name, title, ids
        rows = get_csv_data(MANUAL_DATA_CSV)
        # A one-row slice (index 52) is fed to the curator.
        selected = rows[52:53]
        data_curated, testcase = prepare_to_test(selected, '12')
        self.assertEqual(data_curated, testcase)

1214

1215

class testcase_13(unittest.TestCase):
    # testcase13: ID_clean massive test

    def test1(self):
        # 1--- three rows: a br with a MetaID specified, a wannabe with a new
        # id, and a MetaID-specified br whose id is related to the wannabe.
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[53:56]
        data_curated, testcase = prepare_to_test(selected, '13.1')
        self.assertEqual(data_curated, testcase)

    def test2(self):
        # 2--- Conflict with META precedence: a br has a meta_id and an id
        # related to another meta_id; the first specified meta has precedence.
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[56:57]
        data_curated, testcase = prepare_to_test(selected, '13.2')
        self.assertEqual(data_curated, testcase)

    def test3(self):
        # 3--- conflict: br with id shared with 2 meta
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[57:58]
        data_curated, testcase_1 = prepare_to_test(selected, '13.3')
        # Second run only supplies the alternative expected fixture; its
        # curated output is discarded.
        _, testcase_2 = prepare_to_test(selected, '13.31')
        # Either of the two expected fixtures is accepted.
        matches_either = data_curated == testcase_1 or data_curated == testcase_2
        self.assertTrue(matches_either)

1246

1247

class testcase_14(unittest.TestCase):

    def test1(self):
        # Update an existing sequence: one new author plus one existing author
        # that has no existing id (matched through a badly written
        # "surname, name").
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[58:59]
        data_curated, testcase = prepare_to_test(selected, '14.1')
        self.assertEqual(data_curated, testcase)

    def test2(self):
        # same sequence different order, with new ids
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[59:60]
        data_curated, testcase = prepare_to_test(selected, '14.2')
        self.assertEqual(data_curated, testcase)

    def test3(self):
        # RA
        # Author with two different ids
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[60:61]
        data_curated, testcase_1 = prepare_to_test(selected, '14.3')
        self.assertEqual(data_curated, testcase_1)

    def test4(self):
        # Three rows: an ra with a MetaID specified, a wannabe ra with a new
        # id, and a MetaID-specified ra whose id is related to the wannabe.
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[61:64]
        data_curated, testcase = prepare_to_test(selected, '14.4')
        self.assertEqual(data_curated, testcase)

1284

1285

class testcase_15(unittest.TestCase):

    def test1(self):
        # venue volume issue already exists in ts
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[64:65]
        data_curated, testcase = prepare_to_test(selected, '15.1')
        self.assertEqual(data_curated, testcase)

    def test2(self):
        # venue conflict
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[65:66]
        data_curated, testcase = prepare_to_test(selected, '15.2')
        self.assertEqual(data_curated, testcase)

    def test3(self):
        # venue in ts is now the br
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[66:67]
        data_curated, testcase = prepare_to_test(selected, '15.3')
        self.assertEqual(data_curated, testcase)

    def test4(self):
        # br in ts is now the venue
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[67:68]
        data_curated, testcase = prepare_to_test(selected, '15.4')
        self.assertEqual(data_curated, testcase)

    def test5(self):
        # volume in ts is now the br
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[71:72]
        data_curated, testcase = prepare_to_test(selected, '15.5')
        self.assertEqual(data_curated, testcase)

    def test6(self):
        # br is a volume
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[72:73]
        data_curated, testcase = prepare_to_test(selected, '15.6')
        self.assertEqual(data_curated, testcase)

    def test7(self):
        # issue in ts is now the br
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[73:74]
        data_curated, testcase = prepare_to_test(selected, '15.7')
        self.assertEqual(data_curated, testcase)

    def test8(self):
        # br is an issue
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[74:75]
        data_curated, testcase = prepare_to_test(selected, '15.8')
        self.assertEqual(data_curated, testcase)

1352

1353

class testcase_16(unittest.TestCase):

    def test1(self):
        # Date cleaning: the row carries the wrong date 2019-02-29
        # (2019 is not a leap year).
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[75:76]
        data_curated, testcase = prepare_to_test(selected, '16.1')
        self.assertEqual(data_curated, testcase)

    def test2(self):
        # existing re
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[76:77]
        data_curated, testcase = prepare_to_test(selected, '16.2')
        self.assertEqual(data_curated, testcase)

    def test3(self):
        # given name for an RA with only a family name in TS
        rows = get_csv_data(MANUAL_DATA_CSV)
        selected = rows[77:78]
        data_curated, testcase = prepare_to_test(selected, '16.3')
        self.assertEqual(data_curated, testcase)

1381

1382

if __name__ == '__main__':  # pragma: no cover
    # Script entry point: run the tests of this module when it is executed
    # directly rather than imported.
    unittest.main()

Coverage for test/curator_test.py: 99%

784 statements