# Coverage for test/meta_process_test.py: 96% (562 statements)



import csv
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import unittest
from datetime import datetime

from test.test_utils import (PROV_SERVER, SERVER, VIRTUOSO_CONTAINER,
                             VIRTUOSO_PROV_CONTAINER, execute_sparql_query,
                             reset_redis_counters, reset_server)

import yaml
from oc_meta.lib.file_manager import get_csv_data, write_csv
from oc_meta.run.meta_process import run_meta_process
from oc_ocdm.counter_handler.redis_counter_handler import RedisCounterHandler
from rdflib import ConjunctiveGraph, Graph, Literal, URIRef
from SPARQLWrapper import JSON, POST, XML, SPARQLWrapper

BASE_DIR = os.path.join("test", "meta_process")


def delete_output_zip(base_dir: str, start_time: datetime) -> None:
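    """Remove any meta_output_*.zip archive created after ``start_time``.

    The archive creation time is parsed from the timestamp embedded in the
    file name (format ``meta_output_%Y-%m-%dT%H_%M_%S_%f.zip``).
    """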

    for file in os.listdir(base_dir):
        if file.startswith("meta_output") and file.endswith(".zip"):
            file_creation_time = file.split("meta_output_")[1].replace(".zip", "")
            file_creation_time = datetime.strptime(
                file_creation_time, "%Y-%m-%dT%H_%M_%S_%f"
            )
            was_created_after_time = file_creation_time > start_time
            if was_created_after_time:
                os.remove(os.path.join(base_dir, file))


class test_ProcessTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Initial setup, executed once for the whole test class"""
        # Wait for Virtuoso to be ready
        max_wait = 30  # seconds
        start_time = time.time()
        while time.time() - start_time < max_wait:
            try:
                # Try a simple query
                sparql = SPARQLWrapper(SERVER)
                sparql.setQuery("SELECT * WHERE { ?s ?p ?o } LIMIT 1")
                sparql.setReturnFormat(JSON)
                sparql.query()
                break
            except Exception:
                time.sleep(2)
        else:
            raise TimeoutError(f"Virtuoso not ready after {max_wait} seconds")

    def setUp(self):
        """Setup executed before each test"""
        # Create a temporary directory for cache files
        self.temp_dir = tempfile.mkdtemp()
        self.cache_file = os.path.join(self.temp_dir, "ts_upload_cache.json")
        self.failed_file = os.path.join(self.temp_dir, "failed_queries.txt")
        self.stop_file = os.path.join(self.temp_dir, ".stop_upload")

        # Reset the database
        reset_server()
        reset_redis_counters()

    def tearDown(self):
        reset_redis_counters()
        # Remove the temporary directory and its contents
        if hasattr(self, "temp_dir") and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def test_run_meta_process(self):
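        """End-to-end run on the default input: process the CSV files and
        compare the CSV records produced by Meta with the expected ones."""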

        output_folder = os.path.join(BASE_DIR, "output_1")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_1.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        now = datetime.now()
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/0601",
                "title": "",
                "author": "",
                "pub_date": "",
                "venue": "Scientometrics [issn:0138-9130 issn:1588-2861 omid:br/0603]",
                "volume": "26",
                "issue": "",
                "page": "",
                "type": "journal article",
                "publisher": "Consulting Company Ucom [crossref:6623 omid:ra/0601]",
                "editor": "Naimi, Elmehdi [orcid:0000-0002-4126-8519 omid:ra/0602]",
            },
            {
                "id": "issn:1524-4539 issn:0009-7322 omid:br/0602",
                "title": "Circulation",
                "author": "",
                "pub_date": "",
                "venue": "",
                "volume": "",
                "issue": "",
                "page": "",
                "type": "journal",
                "publisher": "",
                "editor": "",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.069 omid:br/0605",
                "title": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                "author": "Cheigh, Chan-Ick [orcid:0000-0003-2542-5788 omid:ra/0603]; Mun, Ji-Hye [omid:ra/0604]; Chung, Myong-Soo [omid:ra/0605]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/0608]",
                "volume": "25",
                "issue": "1",
                "page": "69-76",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/0606]",
                "editor": "Chung, Myong-Soo [orcid:0000-0002-9666-2513 omid:ra/0607]",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.077 omid:br/0606",
                "title": "Properties Of Immature Green Cherry Tomato Pickles",
                "author": "Koh, Jong-Ho [omid:ra/0608]; Shin, Hae-Hun [omid:ra/0609]; Kim, Young-Shik [orcid:0000-0001-5673-6314 omid:ra/06010]; Kook, Moo-Chang [omid:ra/06011]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/0608]",
                "volume": "",
                "issue": "2",
                "page": "77-82",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/0606]",
                "editor": "",
            },
            {
                "id": "doi:10.1097/01.rct.0000185385.35389.cd omid:br/0607",
                "title": "Comprehensive Assessment Of Lung CT Attenuation Alteration At Perfusion Defects Of Acute Pulmonary Thromboembolism With Breath-Hold SPECT-CT Fusion Images",
                "author": "Suga, Kazuyoshi [omid:ra/06012]; Kawakami, Yasuhiko [omid:ra/06013]; Iwanaga, Hideyuki [omid:ra/06014]; Hayashi, Noriko [omid:ra/06015]; Seto, Aska [omid:ra/06016]; Matsunaga, Naofumi [omid:ra/06017]",
                "pub_date": "2006-01",
                "venue": "Journal Of Computer Assisted Tomography [issn:0363-8715 omid:br/06012]",
                "volume": "30",
                "issue": "1",
                "page": "83-91",
                "type": "journal article",
                "publisher": "Ovid Technologies (Wolters Kluwer Health) [crossref:276 omid:ra/06018]",
                "editor": "",
            },
        ]
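        # Normalise for an order-insensitive comparison: each record becomes a
        # sorted list of (key, value) pairs, and the rows themselves are sorted.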

        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.maxDiff = None
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(output, expected_output)

    def test_run_meta_process_ids_only(self):
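        """Same pipeline on an input that provides agents and venue only as
        identifiers: the expected records contain bracketed IDs and no
        display names."""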

        output_folder = os.path.join(BASE_DIR, "output_5")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_5.yaml")
        now = datetime.now()
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/0601",
                "title": "Some Aspects Of The Evolution Of Chernozems Under The Influence Of Natural And Anthropogenic Factors",
                "author": "[orcid:0000-0002-4126-8519 omid:ra/0601]; [orcid:0000-0003-0530-4305 omid:ra/0602]",
                "pub_date": "2015-08-22",
                "venue": "[issn:1225-4339 omid:br/0602]",
                "volume": "26",
                "issue": "",
                "page": "50",
                "type": "journal article",
                "publisher": "[crossref:6623 omid:ra/0603]",
                "editor": "[orcid:0000-0002-4126-8519 omid:ra/0601]; [orcid:0000-0002-8420-0696 omid:ra/0604]",
            }
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.maxDiff = None
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(output, expected_output)

    def test_run_meta_process_two_workers(self):
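        """Run the process with two workers; each worker mints OMIDs under its
        own supplier prefix (e.g. br/0610x vs br/0620x in the expected
        output)."""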

        output_folder = os.path.join(BASE_DIR, "output_2")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_2.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
                "workers_number": 2,
            }
        )

        now = datetime.now()
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/06101",
                "title": "",
                "author": "",
                "pub_date": "",
                "venue": "Scientometrics [issn:0138-9130 issn:1588-2861 omid:br/06103]",
                "volume": "26",
                "issue": "",
                "page": "",
                "type": "journal article",
                "publisher": "Consulting Company Ucom [crossref:6623 omid:ra/06101]",
                "editor": "Naimi, Elmehdi [orcid:0000-0002-4126-8519 omid:ra/06102]",
            },
            {
                "id": "issn:1524-4539 issn:0009-7322 omid:br/06102",
                "title": "Circulation",
                "author": "",
                "pub_date": "",
                "venue": "",
                "volume": "",
                "issue": "",
                "page": "",
                "type": "journal",
                "publisher": "",
                "editor": "",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.069 omid:br/06201",
                "title": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                "author": "Cheigh, Chan-Ick [orcid:0000-0003-2542-5788 omid:ra/06201]; Mun, Ji-Hye [omid:ra/06202]; Chung, Myong-Soo [omid:ra/06203]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/06204]",
                "volume": "25",
                "issue": "1",
                "page": "69-76",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/06204]",
                "editor": "Chung, Myong-Soo [orcid:0000-0002-9666-2513 omid:ra/06205]",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.077 omid:br/06202",
                "title": "Properties Of Immature Green Cherry Tomato Pickles",
                "author": "Koh, Jong-Ho [omid:ra/06206]; Shin, Hae-Hun [omid:ra/06207]; Kim, Young-Shik [orcid:0000-0001-5673-6314 omid:ra/06208]; Kook, Moo-Chang [omid:ra/06209]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/06204]",
                "volume": "",
                "issue": "2",
                "page": "77-82",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/06204]",
                "editor": "",
            },
            {
                "id": "doi:10.1097/01.rct.0000185385.35389.cd omid:br/06203",
                "title": "Comprehensive Assessment Of Lung CT Attenuation Alteration At Perfusion Defects Of Acute Pulmonary Thromboembolism With Breath-Hold SPECT-CT Fusion Images",
                "author": "Suga, Kazuyoshi [omid:ra/062010]; Kawakami, Yasuhiko [omid:ra/062011]; Iwanaga, Hideyuki [omid:ra/062012]; Hayashi, Noriko [omid:ra/062013]; Seto, Aska [omid:ra/062014]; Matsunaga, Naofumi [omid:ra/062015]",
                "pub_date": "2006-01",
                "venue": "Journal Of Computer Assisted Tomography [issn:0363-8715 omid:br/06208]",
                "volume": "30",
                "issue": "1",
                "page": "83-91",
                "type": "journal article",
                "publisher": "Ovid Technologies (Wolters Kluwer Health) [crossref:276 omid:ra/062016]",
                "editor": "",
            },
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.assertEqual(output, expected_output)

    def test_provenance(self):
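        """Check the provenance snapshots produced over three runs: the first
        creates the entities (snapshot se/1), the second processes a modified
        input and updates br/0601 (snapshot se/2, carrying an update query),
        and the third re-processes the original input, which should not add
        further snapshots."""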

        output_folder = os.path.join(BASE_DIR, "output_3")
        now = datetime.now()
        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        meta_config_path = os.path.join(BASE_DIR, "meta_config_3.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        reset_server()

        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input_2")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        output = dict()

        entity_types = ['ar', 'br', 'id', 'ra', 're']
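        # OCDM short names: ar = agent role, br = bibliographic resource,
        # id = identifier, ra = responsible agent, re = resource embodiment.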


        for entity_type in entity_types:
            query = f"""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX prov: <http://www.w3.org/ns/prov#>
            PREFIX oco: <https://w3id.org/oc/ontology/>

            CONSTRUCT {{
                ?s ?p ?o .
            }}
            WHERE {{
                ?s ?p ?o .
                FILTER(REGEX(STR(?s), "https://w3id.org/oc/meta/{entity_type}/[0-9]+/prov/se/[0-9]+"))
            }}
            """

            result = execute_sparql_query(PROV_SERVER, query, return_format=XML)

            g = Graph()
            for s, p, o in result:
                g.add((s, p, o))

            entities = {}
            for s, p, o in g:
                s_str = str(s)
                if s_str not in entities:
                    entities[s_str] = {'@id': s_str, '@type': []}

                p_str = str(p)
                if p == URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'):
                    entities[s_str]['@type'].append(str(o))
                else:
                    if p_str not in entities[s_str]:
                        entities[s_str][p_str] = []

                    if isinstance(o, URIRef):
                        entities[s_str][p_str].append({'@id': str(o)})
                    elif isinstance(o, Literal):
                        if o.datatype:
                            entities[s_str][p_str].append({
                                '@value': str(o),
                                '@type': str(o.datatype)
                            })
                        else:
                            entities[s_str][p_str].append({'@value': str(o)})

            # Group entities by their parent entity (e.g., br/0601/prov/se/1 -> br/0601)
            grouped_entities = {}
            for entity_id, entity_data in entities.items():
                # Extract the parent entity ID from the provenance entity ID
                parent_id = re.match(r'https://w3id.org/oc/meta/([^/]+/[0-9]+)', entity_id).group(0)

                if parent_id not in grouped_entities:
                    grouped_entities[parent_id] = []

                # Filter out properties we don't need for comparison
                filtered_entity_data = {
                    '@id': entity_data['@id'],
                }

                # Keep the required properties for comparison
                properties_to_keep = [
                    'http://www.w3.org/ns/prov#specializationOf',
                    'http://www.w3.org/ns/prov#wasDerivedFrom'
                ]

                for prop in properties_to_keep:
                    if prop in entity_data:
                        filtered_entity_data[prop] = entity_data[prop]

                # Handle hasUpdateQuery specially
                if 'https://w3id.org/oc/ontology/hasUpdateQuery' in entity_data:
                    # Extract the value from the hasUpdateQuery property
                    update_query_value = entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'][0].get('@value', '')

                    # Split the query into individual statements
                    if update_query_value:
                        # Extract the part between the INSERT DATA { GRAPH <...> { and } }
                        try:
                            query_content = update_query_value.split(
                                "INSERT DATA { GRAPH <https://w3id.org/oc/meta/br/> { "
                            )[1].split(" } }")[0]

                            # Split by dot and space to get individual statements
                            statements = set(query_content.split(" ."))

                            # Add to filtered entity data
                            filtered_entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'] = statements
                        except IndexError:
                            # If the format is different, just use the original value
                            filtered_entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'] = \
                                entity_data['https://w3id.org/oc/ontology/hasUpdateQuery']

                # Add this filtered entity to its parent's group
                grouped_entities[parent_id].append(filtered_entity_data)

            # Format the output to match the expected structure
            entity_list = []
            for parent_id, entities_list in sorted(grouped_entities.items()):
                entity_list.append({
                    '@graph': sorted(entities_list, key=lambda x: x['@id'])
                })

            output[entity_type] = entity_list
        expected_output = {
            "ar": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"}
                            ],
                        }
                    ]
                },
            ],
            "br": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0601"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601/prov/se/2",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0601"}
                            ],
                            "http://www.w3.org/ns/prov#wasDerivedFrom": [
                                {"@id": "https://w3id.org/oc/meta/br/0601/prov/se/1"}
                            ],
                            "https://w3id.org/oc/ontology/hasUpdateQuery": {
                                "",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0601>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0603>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0602>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0603>",
                                '<https://w3id.org/oc/meta/br/0601> <http://prismstandard.org/namespaces/basic/2.0/publicationDate> "2015-08-22"^^<http://www.w3.org/2001/XMLSchema#date>',
                                '<https://w3id.org/oc/meta/br/0601> <http://purl.org/dc/terms/title> "Some Aspects Of The Evolution Of Chernozems Under The Influence Of Natural And Anthropogenic Factors"^^<http://www.w3.org/2001/XMLSchema#string>',
                            },
                        },
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0603"}
                            ],
                        }
                    ]
                },
            ],
            "id": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0603"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0604/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0604"}
                            ],
                        }
                    ]
                },
            ],
            "ra": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ra/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ra/0602"}
                            ],
                        }
                    ]
                },
            ],
            "re": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/re/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/re/0601"}
                            ],
                        }
                    ]
                }
            ],
        }
        shutil.rmtree(output_folder)
        self.maxDiff = None
        self.assertEqual(output, expected_output)


    def test_run_meta_process_thread_safe(self):
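        """Process the same input twice, first in-process and then through a
        subprocess, and verify that the triplestore still matches the expected
        graphs and that no Reader/Storer errors were printed."""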

        output_folder = os.path.join(BASE_DIR, "output_4")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_4.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)
        original_input_csv_dir = settings["input_csv_dir"]
        settings["input_csv_dir"] = os.path.join(original_input_csv_dir, "preprocess")
        settings["workers_number"] = 1

        reset_server()

        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Run it again to test thread safety
        proc = subprocess.run(
            [sys.executable, "-m", "oc_meta.run.meta_process", "-c", meta_config_path],
            capture_output=True,
            text=True,
        )

        output = dict()

        entity_types = ['ar', 'br', 'id', 'ra', 're']

        for entity_type in entity_types:
            query = f"""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX pro: <http://purl.org/spar/pro/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            PREFIX frbr: <http://purl.org/vocab/frbr/core#>
            PREFIX foaf: <http://xmlns.com/foaf/0.1/>
            PREFIX prism: <http://prismstandard.org/namespaces/basic/2.0/>
            PREFIX dcterms: <http://purl.org/dc/terms/>
            PREFIX oco: <https://w3id.org/oc/ontology/>

            CONSTRUCT {{
                ?s ?p ?o .
            }}
            WHERE {{
                ?s ?p ?o .
                FILTER(STRSTARTS(STR(?s), "https://w3id.org/oc/meta/{entity_type}/"))
            }}
            """

            result = execute_sparql_query(SERVER, query, return_format=XML)

            g = Graph()
            for s, p, o in result:
                g.add((s, p, o))

            entities = {}
            for s, p, o in g:
                s_str = str(s)
                if s_str not in entities:
                    entities[s_str] = {'@id': s_str, '@type': []}

                p_str = str(p)
                if p == URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'):
                    entities[s_str]['@type'].append(str(o))
                else:
                    if p_str not in entities[s_str]:
                        entities[s_str][p_str] = []

                    if isinstance(o, URIRef):
                        entities[s_str][p_str].append({'@id': str(o)})
                    elif isinstance(o, Literal):
                        if o.datatype:
                            entities[s_str][p_str].append({
                                '@value': str(o),
                                '@type': str(o.datatype)
                            })
                        else:
                            entities[s_str][p_str].append({'@value': str(o)})

            entity_list = list(entities.values())

            output[entity_type] = [
                {
                    '@graph': entity_list,
                    '@id': f"https://w3id.org/oc/meta/{entity_type}/"
                }
            ]

        expected_output = {
            "ar": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0604",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0604"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/publisher"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0602",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0602"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                            "https://w3id.org/oc/ontology/hasNext": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0603",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0603"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0605",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0605"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/editor"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0601",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0601"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                            "https://w3id.org/oc/ontology/hasNext": [
                                {"@id": "https://w3id.org/oc/meta/ar/0602"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/ar/",
                }
            ],
            "br": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalArticle",
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                                {
                                    "@type": "http://www.w3.org/2001/XMLSchema#date",
                                    "@value": "2012-03-31",
                                }
                            ],
                            "http://purl.org/dc/terms/title": [
                                {
                                    "@value": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                                    "@type": "http://www.w3.org/2001/XMLSchema#string",
                                }
                            ],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0601"}
                            ],
                            "http://purl.org/spar/pro/isDocumentContextFor": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"},
                                {"@id": "https://w3id.org/oc/meta/ar/0601"},
                                {"@id": "https://w3id.org/oc/meta/ar/0604"},
                                {"@id": "https://w3id.org/oc/meta/ar/0602"},
                                {"@id": "https://w3id.org/oc/meta/ar/0605"},
                            ],
                            "http://purl.org/vocab/frbr/core#embodiment": [
                                {"@id": "https://w3id.org/oc/meta/re/0601"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0604"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0604",
                            "@type": [
                                "http://purl.org/spar/fabio/JournalIssue",
                                "http://purl.org/spar/fabio/Expression",
                            ],
                            "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                                {"@value": "1", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0603"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0602",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/Journal",
                            ],
                            "http://purl.org/dc/terms/title": [
                                {"@value": "The Korean Journal Of Food And Nutrition", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0602"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0603",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalVolume",
                            ],
                            "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                                {"@value": "25", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0602"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/br/",
                }
            ],
            "id": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0605",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/orcid"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "0000-0002-9666-2513", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0601",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/doi"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "10.9799/ksfan.2012.25.1.069", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0603",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/orcid"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "0000-0003-2542-5788", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0604",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/crossref"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "4768", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0602",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/issn"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "1225-4339", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/id/",
                }
            ],
            "ra": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0605",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0605"}
                            ],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Chung", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Myong-Soo", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0602",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Mun", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Ji-Hye", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0604",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0604"}
                            ],
                            "http://xmlns.com/foaf/0.1/name": [
                                {"@value": "The Korean Society Of Food And Nutrition", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0603",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Chung", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Myong-Soo", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0601",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0603"}
                            ],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Cheigh", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Chan-Ick", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/ra/",
                }
            ],
            "re": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/re/0601",
                            "@type": ["http://purl.org/spar/fabio/Manifestation"],
                            "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
                                {"@value": "76", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
                                {"@value": "69", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        }
                    ],
                    "@id": "https://w3id.org/oc/meta/re/",
                }
            ],
        }

        processed_output = {}
        for entity_type, entity_data in output.items():
            processed_output[entity_type] = []
            for graph_container in entity_data:
                filtered_graph = []
                for entity in graph_container['@graph']:
                    filtered_entity = {
                        '@id': entity['@id']
                    }
                    for pred, obj in entity.items():
                        if pred != '@id':  # Only exclude @id since we already added it
                            filtered_entity[pred] = obj

                    if len(filtered_entity) > 1:  # Only include if it has predicates beyond @id
                        filtered_graph.append(filtered_entity)

                # Sort the graph by @id
                filtered_graph = sorted(filtered_graph, key=lambda x: x['@id'])

                processed_output[entity_type].append({
                    '@graph': filtered_graph,
                    '@id': graph_container['@id']
                })
        # For each entity type in the expected output, verify that all expected entities exist
        # with their expected properties in the actual output from the triplestore
        for entity_type, expected_graphs in expected_output.items():
            self.assertIn(entity_type, processed_output, f"Entity type {entity_type} missing from triplestore output")

            for expected_graph in expected_graphs:
                expected_entities = expected_graph['@graph']

                # Find the corresponding graph in the processed output
                actual_graph = None
                for graph in processed_output[entity_type]:
                    if graph['@id'] == expected_graph['@id']:
                        actual_graph = graph
                        break

                self.assertIsNotNone(actual_graph, f"Graph {expected_graph['@id']} not found in triplestore output")

                # For each expected entity, verify it exists with all expected properties
                for expected_entity in expected_entities:
                    entity_id = expected_entity['@id']

                    # Find the entity in the actual graph
                    actual_entity = None
                    for entity in actual_graph['@graph']:
                        if entity['@id'] == entity_id:
                            actual_entity = entity
                            break

                    self.assertIsNotNone(actual_entity, f"Entity {entity_id} not found in triplestore output")

                    # Check that all expected predicates and objects exist
                    for pred, expected_objects in expected_entity.items():
                        if pred != '@id':
                            self.assertIn(pred, actual_entity, f"Predicate {pred} missing for entity {entity_id}")

                            # For each expected object, verify it exists in the actual objects
                            for expected_obj in expected_objects:
                                found = False
                                for actual_obj in actual_entity[pred]:
                                    # Require exact matches for all objects
                                    if expected_obj == actual_obj:
                                        found = True
                                        break

                                self.assertTrue(found, f"Object {expected_obj} not found for predicate {pred} of entity {entity_id}\nActual values: {actual_entity[pred]}")

        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)

        self.assertFalse(
            "Reader: ERROR" in proc.stdout or "Storer: ERROR" in proc.stdout
        )



    def test_silencer_on(self):
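        """Re-process the same article with different authors while the
        silencer is active: the agent list of br/0601 should stay at its
        original three roles (compare with test_silencer_off, which expects
        six)."""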

        output_folder = os.path.join(BASE_DIR, "output_6")
        now = datetime.now()
        meta_config_path = os.path.join(BASE_DIR, "meta_config_6.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(
            BASE_DIR, "same_as_input_2_with_other_authors"
        )
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_agents = """
            PREFIX pro: <http://purl.org/spar/pro/>
            SELECT (COUNT (?agent) AS ?agent_count)
            WHERE {
                <https://w3id.org/oc/meta/br/0601> pro:isDocumentContextFor ?agent.
            }
        """
        result = execute_sparql_query(SERVER, query_agents)
        expected_result = {
            "head": {"link": [], "vars": ["agent_count"]},
            "results": {
                "distinct": False,
                "ordered": True,
                "bindings": [
                    {
                        "agent_count": {
                            "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                            "type": "typed-literal",
                            "value": "3",
                        }
                    }
                ],
            },
        }
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(result, expected_result)

    def test_silencer_off(self):
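        """Without the silencer, the second run on the variant input adds the
        three new agents, so br/0601 is expected to end up with six roles."""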

        output_folder = os.path.join(BASE_DIR, "output_7")
        now = datetime.now()
        meta_config_path = os.path.join(BASE_DIR, "meta_config_7.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(
            BASE_DIR, "same_as_input_2_with_other_authors"
        )
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_agents = """
            PREFIX pro: <http://purl.org/spar/pro/>
            SELECT (COUNT (?agent) AS ?agent_count)
            WHERE {
                <https://w3id.org/oc/meta/br/0601> pro:isDocumentContextFor ?agent.
            }
        """
        result = execute_sparql_query(SERVER, query_agents)
        expected_result = {
            "head": {"link": [], "vars": ["agent_count"]},
            "results": {
                "distinct": False,
                "ordered": True,
                "bindings": [
                    {
                        "agent_count": {
                            "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                            "type": "typed-literal",
                            "value": "6",
                        }
                    }
                ],
            },
        }
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(result, expected_result)

    def test_omid_in_input_data(self):
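        """End-to-end check for inputs whose rows already carry OMIDs: after
        two runs (one configuration without and one with OpenAlex identifiers),
        the article graph is compared against a JSON-LD fixture; the provenance
        graphs are loaded and stripped of volatile timestamps as well."""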

        output_folder = os.path.join(BASE_DIR, "output_8")
        now = datetime.now()
        meta_config_path_without_openalex = os.path.join(BASE_DIR, "meta_config_8.yaml")
        meta_config_path_with_openalex = os.path.join(BASE_DIR, "meta_config_9.yaml")
        with open(meta_config_path_without_openalex, encoding="utf-8") as file:
            settings_without_openalex = yaml.full_load(file)
        with open(meta_config_path_with_openalex, encoding="utf-8") as file:
            settings_with_openalex = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        cache_settings = {
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }
        settings_without_openalex.update(cache_settings)
        settings_with_openalex.update(cache_settings)

        run_meta_process(
            settings=settings_without_openalex,
            meta_config_path=meta_config_path_without_openalex,
        )
        run_meta_process(
            settings=settings_with_openalex,
            meta_config_path=meta_config_path_with_openalex,
        )
        query_all = """
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            CONSTRUCT {?s ?p ?o. ?id ?id_p ?id_o.}
            WHERE {
                ?s a fabio:JournalArticle;
                    ?p ?o.
                ?s datacite:hasIdentifier ?id.
                ?id ?id_p ?id_o.
            }
        """
        result = execute_sparql_query(SERVER, query_all, return_format=XML)
        expected_result = Graph()
        expected_result.parse(
            location=os.path.join(BASE_DIR, "test_omid_in_input_data.json"),
            format="json-ld",
        )
        prov_graph = ConjunctiveGraph()
        for dirpath, dirnames, filenames in os.walk(os.path.join(output_folder, "rdf")):
            if "br" in dirpath and "prov" in dirpath:
                for filename in filenames:
                    prov_graph.parse(
                        source=os.path.join(dirpath, filename), format="json-ld"
                    )

        expected_prov_graph = ConjunctiveGraph()
        expected_prov_graph.parse(
            os.path.join(BASE_DIR, "test_omid_in_input_data_prov.json"),
            format="json-ld",
        )
        prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#generatedAtTime"), None)
        )
        expected_prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#generatedAtTime"), None)
        )
        prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#invalidatedAtTime"), None)
        )
        expected_prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#invalidatedAtTime"), None)
        )
        shutil.rmtree(output_folder)
        self.assertTrue(
            normalize_graph(result).isomorphic(normalize_graph(expected_result))
        )

    def test_publishers_sequence(self):
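        """Starting from the DOI 10.17117/na.2015.08.1067, walk the surrounding
        graph (the bibliographic resource, its roles, agents and identifiers)
        and compare the result against the test_publishers_sequence.json
        fixture."""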

        output_folder = os.path.join(BASE_DIR, "output_9")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_10.yaml")
        now = datetime.now()
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_all = """
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            CONSTRUCT {?br ?p ?o. ?o ?op ?oo. ?oo ?oop ?ooo. ?ooo ?ooop ?oooo.}
            WHERE {
                ?id literal:hasLiteralValue "10.17117/na.2015.08.1067"^^<http://www.w3.org/2001/XMLSchema#string>;
                    datacite:usesIdentifierScheme datacite:doi;
                    ^datacite:hasIdentifier ?br.
                ?br ?p ?o.
                ?o ?op ?oo.
                ?oo ?oop ?ooo.
                ?ooo ?ooop ?oooo.
            }
        """
        result = execute_sparql_query(SERVER, query_all, return_format=XML)
        expected_result = Graph()
        expected_result.parse(
            os.path.join(BASE_DIR, "test_publishers_sequence.json"), format="json-ld"
        )
        shutil.rmtree(output_folder)
        self.assertTrue(
            normalize_graph(result).isomorphic(normalize_graph(expected_result))
        )

    def test_duplicate_omids_with_datatype(self):
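        """Regression test for the exact problematic row from production: the
        ISSNs already exist in the triplestore as plain literals, and the run
        must match them even when literals carry an explicit xsd:string
        datatype, so that each ISSN ends up with a single identifier entity."""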

        output_folder = os.path.join(BASE_DIR, "output_duplicate_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_duplicate.yaml")

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_duplicate"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "workers_number": 1,
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        # Setup: create test data
        os.makedirs(os.path.join(BASE_DIR, "input_duplicate"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_duplicate", "test.csv"), "w", encoding="utf-8"
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "issn:2543-3288 issn:2078-7685",  # Exact problematic row from production
                    "Journal of Diabetology",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "journal",
                    "Medknow [crossref:2581]",
                    "",
                ]
            )

        # Setup: Insert pre-existing identifiers and BRs in triplestore
        sparql = SPARQLWrapper(SERVER)
        sparql.setMethod(POST)
        sparql.setQuery(
            """
            INSERT DATA {
                GRAPH <https://w3id.org/oc/meta/br/> {
                    <https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> .
                    <https://w3id.org/oc/meta/br/0602> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0602> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> .
                }
                GRAPH <https://w3id.org/oc/meta/id/> {
                    <https://w3id.org/oc/meta/id/0601> <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "2078-7685" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    <https://w3id.org/oc/meta/id/0602> <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "2543-3288" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                }
            }
            """
        )
        sparql.query()

        # Update Redis counters to match the inserted data
        redis_handler = RedisCounterHandler(db=5)  # Use test db
        redis_handler.set_counter(
            2, "br", supplier_prefix="060"
        )  # BR counter for two BRs
        redis_handler.set_counter(
            2, "id", supplier_prefix="060"
        )  # ID counter for two IDs

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Check for errors
        errors_file = os.path.join(output_folder, "errors.txt")
        if os.path.exists(errors_file):
            with open(errors_file, "r") as f:
                errors = f.read()
                print(f"Errors found:\n{errors}")

        # Query to check for duplicates
        query = """
            SELECT DISTINCT ?id ?value
            WHERE {
                GRAPH <https://w3id.org/oc/meta/id/> {
                    ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    FILTER(?value IN ("2078-7685"^^<http://www.w3.org/2001/XMLSchema#string>, "2078-7685",
                                      "2543-3288"^^<http://www.w3.org/2001/XMLSchema#string>, "2543-3288"))
                }
            }
        """
        result = execute_sparql_query(SERVER, query, return_format=JSON)
        # Group IDs by value to check for duplicates
        ids_by_value = {}
        for binding in result["results"]["bindings"]:
            value = binding["value"]["value"]
            id = binding["id"]["value"]
            if value not in ids_by_value:
                ids_by_value[value] = []
            ids_by_value[value].append(id)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(os.path.join(BASE_DIR, "input_duplicate"), ignore_errors=True)
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)

        # Check that we have both ISSNs and no duplicates
        for issn_value, ids in ids_by_value.items():
            self.assertEqual(
                len(ids), 1, f"Found multiple IDs for ISSN {issn_value}: {ids}"
            )

        self.assertEqual(
            len(ids_by_value),
            2,
            f"Expected 2 ISSNs, found {len(ids_by_value)}: {list(ids_by_value.keys())}",
        )

1442 

1443 def test_duplicate_omids_with_venue_datatype(self): 

1444 """Test to verify that identifiers are not duplicated when merging previously unconnected venues""" 

1445 output_folder = os.path.join(BASE_DIR, "output_duplicate_venue_test") 

1446 meta_config_path = os.path.join(BASE_DIR, "meta_config_duplicate_venue.yaml") 

1447 

1448 # Setup: create test data 

1449 os.makedirs(os.path.join(BASE_DIR, "input_duplicate_venue"), exist_ok=True) 

1450 with open( 

1451 os.path.join(BASE_DIR, "input_duplicate_venue", "test.csv"), 

1452 "w", 

1453 encoding="utf-8", 

1454 ) as f: 

1455 writer = csv.writer(f) 

1456 writer.writerow( 

1457 [ 

1458 "id", 

1459 "title", 

1460 "author", 

1461 "pub_date", 

1462 "venue", 

1463 "volume", 

1464 "issue", 

1465 "page", 

1466 "type", 

1467 "publisher", 

1468 "editor", 

1469 ] 

1470 ) 

1471 writer.writerow( 

1472 [ 

1473 "issn:1756-1833", 

1474 "BMJ", 

1475 "", 

1476 "", 

1477 "", 

1478 "", 

1479 "", 

1480 "", 

1481 "journal", 

1482 "BMJ [crossref:239]", 

1483 "", 

1484 ] 

1485 ) 

1486 writer.writerow( 

1487 [ 

1488 "", # id 

1489 "", # title 

1490 "", # author 

1491 "", # pub_date 

1492 "BMJ [issn:0267-0623 issn:0959-8138 issn:1468-5833 issn:0007-1447]", # venue 

1493 "283", # volume 

1494 "", # issue 

1495 "", # page 

1496 "journal volume", # type 

1497 "BMJ [crossref:239]", # publisher 

1498 "", # editor 

1499 ] 

1500 ) 

1501 

1502 # Setup: Insert pre-existing data - aggiungiamo gli identificatori iniziali 

1503 sparql = SPARQLWrapper(SERVER) 

1504 sparql.setMethod(POST) 

1505 sparql.setQuery( 

1506 """ 

1507 INSERT DATA { 

1508 GRAPH <https://w3id.org/oc/meta/br/> { 

1509 # First venue - BMJ with initial ISSNs 

1510 <https://w3id.org/oc/meta/br/0601>  

1511 <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601>, <https://w3id.org/oc/meta/id/0602> ; 

1512 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ; 

1513 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1514 <http://purl.org/dc/terms/title> "BMJ" . 

1515 

1516 # Second venue 

1517 <https://w3id.org/oc/meta/br/0602>  

1518 <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0603> ; 

1519 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ; 

1520 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1521 <http://purl.org/dc/terms/title> "British Medical Journal" . 

1522 } 

1523 GRAPH <https://w3id.org/oc/meta/id/> { 

1524 # First venue's ISSNs 

1525 <https://w3id.org/oc/meta/id/0601>  

1526 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ; 

1527 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> . 

1528 <https://w3id.org/oc/meta/id/0602> 

1529 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0959-8138" ; 

1530 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> . 

1531 # Second venue's ISSN 

1532 <https://w3id.org/oc/meta/id/0603>  

1533 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0267-0623" ; 

1534 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> . 

1535 } 

1536 } 

1537 """ 

1538 ) 

1539 sparql.query() 

1540 

1541 # Update Redis counters for the pre-existing entities 

1542 redis_handler = RedisCounterHandler(db=5) 

1543 redis_handler.set_counter( 

1544 6, "br", supplier_prefix="060" 

1545 ) # Updated to account for 6 entities (2 venues + 4 volumes) 

1546 redis_handler.set_counter( 

1547 3, "id", supplier_prefix="060" 

1548 ) # Corretto: 3 IDs (1756-1833, 0959-8138, 0267-0623) 

1549 

1550 # Create test settings 

1551 settings = { 

1552 "triplestore_url": SERVER, 

1553 "provenance_triplestore_url": PROV_SERVER, 

1554 "input_csv_dir": os.path.join(BASE_DIR, "input_duplicate_venue"), 

1555 "base_output_dir": output_folder, 

1556 "output_rdf_dir": output_folder, 

1557 "resp_agent": "test", 

1558 "base_iri": "https://w3id.org/oc/meta/", 

1559 "context_path": None, 

1560 "dir_split_number": 10000, 

1561 "items_per_file": 1000, 

1562 "default_dir": "_", 

1563 "rdf_output_in_chunks": False, 

1564 "zip_output_rdf": True, 

1565 "source": None, 

1566 "supplier_prefix": "060", 

1567 "workers_number": 1, 

1568 "use_doi_api_service": False, 

1569 "blazegraph_full_text_search": False, 

1570 "virtuoso_full_text_search": True, 

1571 "fuseki_full_text_search": False, 

1572 "graphdb_connector_name": None, 

1573 "cache_endpoint": None, 

1574 "cache_update_endpoint": None, 

1575 "silencer": [], 

1576 "redis_cache_db": 2, 

1577 "ts_upload_cache": self.cache_file, 

1578 "ts_failed_queries": self.failed_file, 

1579 "ts_stop_file": self.stop_file, 

1580 } 

1581 

1582 with open(meta_config_path, "w") as f: 

1583 yaml.dump(settings, f) 

1584 

1585 # Run the process 

1586 run_meta_process(settings=settings, meta_config_path=meta_config_path) 

1587 

1588 # Query to check for duplicates - check all ISSNs 

1589 query = """ 

1590 SELECT DISTINCT ?id ?value 

1591 WHERE { 

1592 ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ; 

1593 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> . 

1594 FILTER(STR(?value) IN ("1756-1833", "0959-8138", "0267-0623")) 

1595 } 

1596 """ 

1597 result = execute_sparql_query(SERVER, query, return_format=JSON) 

1598 # Group IDs by value to check for duplicates 

1599 ids_by_value = {} 

1600 for binding in result["results"]["bindings"]: 

1601 value = binding["value"]["value"] 

1602 id = binding["id"]["value"] 

1603 if value not in ids_by_value: 

1604 ids_by_value[value] = [] 

1605 ids_by_value[value].append(id) 

1606 
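# If deduplication worked, each ISSN maps to exactly one identifier URI,
# e.g. (expected shape, URIs from the INSERT above):
# {"1756-1833": ["https://w3id.org/oc/meta/id/0601"],
#  "0959-8138": ["https://w3id.org/oc/meta/id/0602"],
#  "0267-0623": ["https://w3id.org/oc/meta/id/0603"]}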

1607 # Cleanup 

1608 shutil.rmtree(output_folder, ignore_errors=True) 

1609 shutil.rmtree( 

1610 os.path.join(BASE_DIR, "input_duplicate_venue"), ignore_errors=True 

1611 ) 

1612 if os.path.exists(meta_config_path): 

1613 os.remove(meta_config_path) 

1614 

1615 # Check that we don't have duplicate IDs for any ISSN 

1616 for issn_value, ids in ids_by_value.items(): 

1617 self.assertEqual( 

1618 len(ids), 1, f"Found multiple IDs for ISSN {issn_value} in venue: {ids}" 

1619 ) 

1620 

1621 # Verify that pre-existing IDs were reused 

1622 self.assertTrue( 

1623 any("0601" in id for ids in ids_by_value.values() for id in ids) 

1624 and any("0602" in id for ids in ids_by_value.values() for id in ids), 

1625 "Pre-existing IDs were not reused", 

1626 ) 

1627 

1628 def test_doi_with_multiple_slashes(self): 

1629 """Test handling of DOIs containing multiple forward slashes""" 

1630 output_folder = os.path.join(BASE_DIR, "output_doi_test") 

1631 meta_config_path = os.path.join(BASE_DIR, "meta_config_doi.yaml") 

1632 

1633 # Setup: create test data with problematic DOI 

1634 os.makedirs(os.path.join(BASE_DIR, "input_doi"), exist_ok=True) 

1635 with open( 

1636 os.path.join(BASE_DIR, "input_doi", "test.csv"), "w", encoding="utf-8" 

1637 ) as f: 

1638 writer = csv.writer(f) 

1639 writer.writerow( 

1640 [ 

1641 "id", 

1642 "title", 

1643 "author", 

1644 "pub_date", 

1645 "venue", 

1646 "volume", 

1647 "issue", 

1648 "page", 

1649 "type", 

1650 "publisher", 

1651 "editor", 

1652 ] 

1653 ) 

1654 writer.writerow( 

1655 [ 

1656 "doi:10.1093/acprof:oso/9780199230723.001.0001", # Problematic DOI with multiple slashes 

1657 "Test Book", 

1658 "", 

1659 "", 

1660 "", 

1661 "", 

1662 "", 

1663 "", 

1664 "book", 

1665 "", 

1666 "", 

1667 ] 

1668 ) 

1669 

1670 # Create test settings 

1671 settings = { 

1672 "triplestore_url": SERVER, 

1673 "provenance_triplestore_url": PROV_SERVER, 

1674 "input_csv_dir": os.path.join(BASE_DIR, "input_doi"), 

1675 "base_output_dir": output_folder, 

1676 "output_rdf_dir": output_folder, 

1677 "resp_agent": "test", 

1678 "base_iri": "https://w3id.org/oc/meta/", 

1679 "context_path": None, 

1680 "dir_split_number": 10000, 

1681 "items_per_file": 1000, 

1682 "default_dir": "_", 

1683 "rdf_output_in_chunks": False, 

1684 "zip_output_rdf": True, 

1685 "source": None, 

1686 "supplier_prefix": "060", 

1687 "workers_number": 1, 

1688 "use_doi_api_service": False, 

1689 "blazegraph_full_text_search": False, 

1690 "virtuoso_full_text_search": True, 

1691 "fuseki_full_text_search": False, 

1692 "graphdb_connector_name": None, 

1693 "cache_endpoint": None, 

1694 "cache_update_endpoint": None, 

1695 "silencer": [], 

1696 "redis_cache_db": 2, 

1697 "ts_upload_cache": self.cache_file, 

1698 "ts_failed_queries": self.failed_file, 

1699 "ts_stop_file": self.stop_file, 

1700 } 

1701 

1702 with open(meta_config_path, "w") as f: 

1703 yaml.dump(settings, f) 

1704 

1705 now = datetime.now() 

1706 

1707 # Run the process 

1708 run_meta_process(settings=settings, meta_config_path=meta_config_path) 

1709 

1710 # Query to verify DOI was processed correctly 

1711 query = """ 

1712 SELECT ?br ?id ?value 

1713 WHERE { 

1714 ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "10.1093/acprof:oso/9780199230723.001.0001"^^<http://www.w3.org/2001/XMLSchema#string> ; 

1715 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/doi> ; 

1716 ^<http://purl.org/spar/datacite/hasIdentifier> ?br . 

1717 } 

1718 """ 

1719 result = execute_sparql_query(SERVER, query, return_format=JSON) 

1720 

1721 # Cleanup 

1722 shutil.rmtree(output_folder, ignore_errors=True) 

1723 shutil.rmtree(os.path.join(BASE_DIR, "input_doi"), ignore_errors=True) 

1724 if os.path.exists(meta_config_path): 

1725 os.remove(meta_config_path) 

1726 delete_output_zip(".", now) 

1727 

1728 # Verify results 

1729 self.assertTrue( 

1730 len(result["results"]["bindings"]) > 0, 

1731 "DOI with multiple slashes was not processed correctly", 

1732 ) 

1733 

1734 # Check that we got exactly one result 

1735 self.assertEqual( 

1736 len(result["results"]["bindings"]), 

1737 1, 

1738 f"Expected 1 result, got {len(result['results']['bindings'])}", 

1739 ) 

1740 

1741 def test_volume_issue_deduplication(self): 

1742 """Test to verify that volumes and issues are properly deduplicated""" 

1743 output_folder = os.path.join(BASE_DIR, "output_vvi_test") 

1744 meta_config_path = os.path.join(BASE_DIR, "meta_config_vvi.yaml") 

1745 

1746 # Setup: create test data 

1747 os.makedirs(os.path.join(BASE_DIR, "input_vvi"), exist_ok=True) 

1748 with open( 

1749 os.path.join(BASE_DIR, "input_vvi", "test.csv"), "w", encoding="utf-8" 

1750 ) as f: 

1751 writer = csv.writer(f) 

1752 writer.writerow( 

1753 [ 

1754 "id", 

1755 "title", 

1756 "author", 

1757 "pub_date", 

1758 "venue", 

1759 "volume", 

1760 "issue", 

1761 "page", 

1762 "type", 

1763 "publisher", 

1764 "editor", 

1765 ] 

1766 ) 

1767 # First article in volume 1, issue 1 

1768 writer.writerow( 

1769 [ 

1770 "doi:10.1234/test.1", 

1771 "First Article", 

1772 "", 

1773 "2023", 

1774 "Test Journal [issn:1756-1833]", 

1775 "1", 

1776 "1", 

1777 "1-10", 

1778 "journal article", 

1779 "", 

1780 "", 

1781 ] 

1782 ) 

1783 # Second article in same volume and issue 

1784 writer.writerow( 

1785 [ 

1786 "doi:10.1234/test.2", 

1787 "Second Article", 

1788 "", 

1789 "2023", 

1790 "Test Journal [issn:1756-1833]", 

1791 "1", 

1792 "1", 

1793 "11-20", 

1794 "journal article", 

1795 "", 

1796 "", 

1797 ] 

1798 ) 

1799 

1800 # Create test settings 

1801 settings = { 

1802 "triplestore_url": SERVER, 

1803 "provenance_triplestore_url": PROV_SERVER, 

1804 "input_csv_dir": os.path.join(BASE_DIR, "input_vvi"), 

1805 "base_output_dir": output_folder, 

1806 "output_rdf_dir": output_folder, 

1807 "resp_agent": "test", 

1808 "base_iri": "https://w3id.org/oc/meta/", 

1809 "context_path": None, 

1810 "dir_split_number": 10000, 

1811 "items_per_file": 1000, 

1812 "default_dir": "_", 

1813 "rdf_output_in_chunks": False, 

1814 "zip_output_rdf": True, 

1815 "source": None, 

1816 "supplier_prefix": "060", 

1817 "workers_number": 1, 

1818 "use_doi_api_service": False, 

1819 "blazegraph_full_text_search": False, 

1820 "virtuoso_full_text_search": True, 

1821 "fuseki_full_text_search": False, 

1822 "graphdb_connector_name": None, 

1823 "cache_endpoint": None, 

1824 "cache_update_endpoint": None, 

1825 "silencer": [], 

1826 "redis_cache_db": 2, 

1827 "ts_upload_cache": self.cache_file, 

1828 "ts_failed_queries": self.failed_file, 

1829 "ts_stop_file": self.stop_file, 

1830 } 

1831 

1832 with open(meta_config_path, "w") as f: 

1833 yaml.dump(settings, f) 

1834 

1835 # Run the process 

1836 run_meta_process(settings=settings, meta_config_path=meta_config_path) 

1837 

1838 # Query to check volume and issue structure 

1839 query = """ 

1840 PREFIX fabio: <http://purl.org/spar/fabio/> 

1841 PREFIX frbr: <http://purl.org/vocab/frbr/core#> 

1842 PREFIX prism: <http://prismstandard.org/namespaces/basic/2.0/> 

1843  

1844 SELECT ?article ?volume ?issue ?seq_id 

1845 WHERE { 

1846 ?article a fabio:JournalArticle ; 

1847 frbr:partOf ?issue . 

1848 ?issue a fabio:JournalIssue ; 

1849 fabio:hasSequenceIdentifier ?seq_id ; 

1850 frbr:partOf ?volume . 

1851 ?volume a fabio:JournalVolume . 

1852 } 

1853 ORDER BY ?article 

1854 """ 

1855 

1856 result = execute_sparql_query(SERVER, query) 

1857 

1858 # Cleanup 

1859 shutil.rmtree(output_folder, ignore_errors=True) 

1860 shutil.rmtree(os.path.join(BASE_DIR, "input_vvi"), ignore_errors=True) 

1861 if os.path.exists(meta_config_path): 

1862 os.remove(meta_config_path) 

1863 

1864 # Verify results 

1865 bindings = result["results"]["bindings"] 

1866 

1867 # Should have 2 articles 

1868 self.assertEqual(len(bindings), 2, "Expected 2 articles") 

1869 

1870 # Both articles should reference the same volume and issue 

1871 first_volume = bindings[0]["volume"]["value"] 

1872 first_issue = bindings[0]["issue"]["value"] 

1873 

1874 for binding in bindings[1:]: 

1875 self.assertEqual( 

1876 binding["volume"]["value"], 

1877 first_volume, 

1878 "Articles reference different volumes", 

1879 ) 

1880 self.assertEqual( 

1881 binding["issue"]["value"], 

1882 first_issue, 

1883 "Articles reference different issues", 

1884 ) 

1885 

1886 def test_volume_issue_deduplication_with_triplestore(self): 

1887 """Test that volumes and issues are properly deduplicated when they already exist in the triplestore""" 

1888 output_folder = os.path.join(BASE_DIR, "output_vvi_triplestore_test") 

1889 meta_config_path = os.path.join(BASE_DIR, "meta_config_vvi_triplestore.yaml") 

1890 

1891 # Setup: Insert pre-existing venue with duplicate volumes and issues (with/without datatype) 

1892 sparql = SPARQLWrapper(SERVER) 

1893 sparql.setMethod(POST) 

1894 sparql.setQuery( 

1895 """ 

1896 INSERT DATA { 

1897 GRAPH <https://w3id.org/oc/meta/br/> { 

1898 # Venue 

1899 <https://w3id.org/oc/meta/br/0601>  

1900 <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601> ; 

1901 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ; 

1902 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1903 <http://purl.org/dc/terms/title> "Test Journal" . 

1904  

1905 # Volume 1 (without datatype) 

1906 <https://w3id.org/oc/meta/br/0602> 

1907 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalVolume> ; 

1908 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1909 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0601> ; 

1910 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1" . 

1911 

1912 # Volume 1 (with datatype) 

1913 <https://w3id.org/oc/meta/br/0604> 

1914 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalVolume> ; 

1915 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1916 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0601> ; 

1917 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1"^^<http://www.w3.org/2001/XMLSchema#string> . 

1918  

1919 # Issue 1 (without datatype) 

1920 <https://w3id.org/oc/meta/br/0603> 

1921 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalIssue> ; 

1922 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1923 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0602> ; 

1924 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1" . 

1925 

1926 # Issue 1 (with datatype) 

1927 <https://w3id.org/oc/meta/br/0605> 

1928 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalIssue> ; 

1929 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1930 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0604> ; 

1931 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1"^^<http://www.w3.org/2001/XMLSchema#string> . 

1932 } 

1933 GRAPH <https://w3id.org/oc/meta/id/> { 

1934 <https://w3id.org/oc/meta/id/0601> 

1935 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ; 

1936 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> . 

1937 } 

1938 } 

1939 """ 

1940 ) 

1941 sparql.query() 

1942 
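# In RDF 1.1 a plain literal such as "1" is the same term as
# "1"^^xsd:string, but stores with pre-1.1 semantics may index the two
# spellings as distinct terms; inserting both forms lets the test confirm
# that deduplication matches either one.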

1943 # Update Redis counters for pre-existing entities 

1944 redis_handler = RedisCounterHandler(db=5) 

1945 redis_handler.set_counter( 

1946 5, "br", supplier_prefix="060" 

1947 ) # 5 entities: venue, 2 volumes, 2 issues 

1948 redis_handler.set_counter( 

1949 1, "id", supplier_prefix="060" 

1950 ) # 1 identifier for venue 

1951 

1952 # Create test data - article that should use existing volume and issue 

1953 os.makedirs(os.path.join(BASE_DIR, "input_vvi_triplestore"), exist_ok=True) 

1954 with open( 

1955 os.path.join(BASE_DIR, "input_vvi_triplestore", "test.csv"), 

1956 "w", 

1957 encoding="utf-8", 

1958 ) as f: 

1959 writer = csv.writer(f) 

1960 writer.writerow( 

1961 [ 

1962 "id", 

1963 "title", 

1964 "author", 

1965 "pub_date", 

1966 "venue", 

1967 "volume", 

1968 "issue", 

1969 "page", 

1970 "type", 

1971 "publisher", 

1972 "editor", 

1973 ] 

1974 ) 

1975 writer.writerow( 

1976 [ 

1977 "doi:10.1234/test.1", 

1978 "Test Article", 

1979 "", 

1980 "2023", 

1981 "Test Journal [issn:1756-1833]", 

1982 "1", # Should match existing volume 

1983 "1", # Should match existing issue 

1984 "1-10", 

1985 "journal article", 

1986 "", 

1987 "", 

1988 ] 

1989 ) 

1990 

1991 # Create test settings 

1992 settings = { 

1993 "triplestore_url": SERVER, 

1994 "provenance_triplestore_url": PROV_SERVER, 

1995 "input_csv_dir": os.path.join(BASE_DIR, "input_vvi_triplestore"), 

1996 "base_output_dir": output_folder, 

1997 "output_rdf_dir": output_folder, 

1998 "resp_agent": "test", 

1999 "base_iri": "https://w3id.org/oc/meta/", 

2000 "context_path": None, 

2001 "dir_split_number": 10000, 

2002 "items_per_file": 1000, 

2003 "default_dir": "_", 

2004 "rdf_output_in_chunks": False, 

2005 "zip_output_rdf": True, 

2006 "source": None, 

2007 "supplier_prefix": "060", 

2008 "workers_number": 1, 

2009 "use_doi_api_service": False, 

2010 "blazegraph_full_text_search": False, 

2011 "virtuoso_full_text_search": True, 

2012 "fuseki_full_text_search": False, 

2013 "graphdb_connector_name": None, 

2014 "cache_endpoint": None, 

2015 "cache_update_endpoint": None, 

2016 "silencer": [], 

2017 "redis_cache_db": 2, 

2018 "ts_upload_cache": self.cache_file, 

2019 "ts_failed_queries": self.failed_file, 

2020 "ts_stop_file": self.stop_file, 

2021 } 

2022 

2023 with open(meta_config_path, "w") as f: 

2024 yaml.dump(settings, f) 

2025 

2026 # Run the process 

2027 run_meta_process(settings=settings, meta_config_path=meta_config_path) 

2028 

2029 # Check if new volumes/issues were created 

2030 to_be_uploaded_dir = os.path.join(output_folder, "rdf", "to_be_uploaded") 

2031 new_entities_created = False 

2032 if os.path.exists(to_be_uploaded_dir): 

2033 for dirpath, _, filenames in os.walk(to_be_uploaded_dir): 

2034 for f in filenames: 

2035 if f.endswith(".sparql"): 

2036 with open(os.path.join(dirpath, f)) as file: 

2037 content = file.read() 

2038 if any( 

2039 "JournalVolume" in line or "JournalIssue" in line 

2040 for line in content.splitlines() 

2041 ): 

2042 print(f"\nFound new volume/issue creation in {f}:") 

2043 new_entities_created = True 

2044 

2045 # Query to get all entities and their relationships 

2046 query = """ 

2047 PREFIX fabio: <http://purl.org/spar/fabio/> 

2048 PREFIX frbr: <http://purl.org/vocab/frbr/core#> 

2049 PREFIX datacite: <http://purl.org/spar/datacite/> 

2050  

2051 SELECT DISTINCT ?article ?venue ?volume ?issue ?issn 

2052 WHERE { 

2053 ?article a fabio:JournalArticle ; 

2054 frbr:partOf ?issue . 

2055 ?issue a fabio:JournalIssue ; 

2056 frbr:partOf ?volume . 

2057 ?volume a fabio:JournalVolume ; 

2058 frbr:partOf ?venue . 

2059 ?venue datacite:hasIdentifier ?id . 

2060 ?id datacite:usesIdentifierScheme datacite:issn ; 

2061 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?issn . 

2062 } 

2063 """ 

2064 

2065 result = execute_sparql_query(SERVER, query) 

2066 

2067 # Cleanup 

2068 shutil.rmtree(output_folder, ignore_errors=True) 

2069 shutil.rmtree( 

2070 os.path.join(BASE_DIR, "input_vvi_triplestore"), ignore_errors=True 

2071 ) 

2072 if os.path.exists(meta_config_path): 

2073 os.remove(meta_config_path) 

2074 

2075 # Verify results 

2076 bindings = result["results"]["bindings"] 

2077 self.assertEqual(len(bindings), 1, "Expected exactly one article") 

2078 

2079 # Get the URIs from the result 

2080 venue_uri = bindings[0]["venue"]["value"] 

2081 volume_uri = bindings[0]["volume"]["value"] 

2082 issue_uri = bindings[0]["issue"]["value"] 

2083 issn = bindings[0]["issn"]["value"] 

2084 

2085 # Check if venue was deduplicated (should use existing venue) 

2086 self.assertEqual( 

2087 venue_uri, 

2088 "https://w3id.org/oc/meta/br/0601", 

2089 "Venue was not deduplicated correctly", 

2090 ) 

2091 

2092 # Check if volume was deduplicated - either version is valid 

2093 self.assertIn( 

2094 volume_uri, 

2095 ["https://w3id.org/oc/meta/br/0602", "https://w3id.org/oc/meta/br/0604"], 

2096 "Volume was not deduplicated correctly - should use one of the existing volumes", 

2097 ) 

2098 

2099 # Check if issue was deduplicated - either version is valid 

2100 self.assertIn( 

2101 issue_uri, 

2102 ["https://w3id.org/oc/meta/br/0603", "https://w3id.org/oc/meta/br/0605"], 

2103 "Issue was not deduplicated correctly - should use one of the existing issues", 

2104 ) 

2105 

2106 # Check ISSN 

2107 self.assertEqual(issn, "1756-1833", "ISSN does not match") 

2108 

2109 # Verify no new volumes/issues were created 

2110 self.assertFalse( 

2111 new_entities_created, 

2112 "New volumes/issues were created when they should have been deduplicated", 

2113 ) 

2114 

2115 def test_temporary_identifiers(self): 

2116 """Test that temporary identifiers are used for deduplication but not saved, and an OMID is generated""" 

2117 output_folder = os.path.join(BASE_DIR, "output_temp_id_test") 

2118 meta_config_path = os.path.join(BASE_DIR, "meta_config_temp.yaml") 

2119 

2120 # Setup: create test data with only temporary identifier 

2121 os.makedirs(os.path.join(BASE_DIR, "input_temp"), exist_ok=True) 

2122 with open( 

2123 os.path.join(BASE_DIR, "input_temp", "test.csv"), "w", encoding="utf-8" 

2124 ) as f: 

2125 writer = csv.writer(f) 

2126 writer.writerow( 

2127 [ 

2128 "id", 

2129 "title", 

2130 "author", 

2131 "pub_date", 

2132 "venue", 

2133 "volume", 

2134 "issue", 

2135 "page", 

2136 "type", 

2137 "publisher", 

2138 "editor", 

2139 ] 

2140 ) 

2141 writer.writerow( 

2142 [ 

2143 "temp:567", # Only temporary identifier 

2144 "Test Article", 

2145 "", 

2146 "2023", 

2147 "", 

2148 "", 

2149 "", 

2150 "", 

2151 "journal article", 

2152 "", 

2153 "", 

2154 ] 

2155 ) 

2156 

2157 # Create test settings 

2158 settings = { 

2159 "triplestore_url": SERVER, 

2160 "provenance_triplestore_url": PROV_SERVER, 

2161 "input_csv_dir": os.path.join(BASE_DIR, "input_temp"), 

2162 "base_output_dir": output_folder, 

2163 "output_rdf_dir": output_folder, 

2164 "resp_agent": "test", 

2165 "base_iri": "https://w3id.org/oc/meta/", 

2166 "context_path": None, 

2167 "dir_split_number": 10000, 

2168 "items_per_file": 1000, 

2169 "default_dir": "_", 

2170 "rdf_output_in_chunks": False, 

2171 "zip_output_rdf": True, 

2172 "source": None, 

2173 "supplier_prefix": "060", 

2174 "workers_number": 1, 

2175 "use_doi_api_service": False, 

2176 "blazegraph_full_text_search": False, 

2177 "virtuoso_full_text_search": True, 

2178 "fuseki_full_text_search": False, 

2179 "graphdb_connector_name": None, 

2180 "cache_endpoint": None, 

2181 "cache_update_endpoint": None, 

2182 "silencer": [], 

2183 "redis_cache_db": 2, 

2184 "ts_upload_cache": self.cache_file, 

2185 "ts_failed_queries": self.failed_file, 

2186 "ts_stop_file": self.stop_file, 

2187 } 

2188 

2189 with open(meta_config_path, "w") as f: 

2190 yaml.dump(settings, f) 

2191 

2192 now = datetime.now() 

2193 

2194 # Run the process 

2195 run_meta_process(settings=settings, meta_config_path=meta_config_path) 

2196 

2197 # Query to verify an OMID was generated and no temporary identifier was saved 

2198 query = """ 

2199 PREFIX fabio: <http://purl.org/spar/fabio/> 

2200 PREFIX datacite: <http://purl.org/spar/datacite/> 

2201 PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/> 

2202  

2203 SELECT ?br ?id ?value ?scheme 

2204 WHERE { 

2205 ?br a fabio:JournalArticle . 

2206 OPTIONAL { 

2207 ?br datacite:hasIdentifier ?id . 

2208 ?id datacite:usesIdentifierScheme ?scheme ; 

2209 literal:hasLiteralValue ?value . 

2210 } 

2211 } 

2212 """ 

2213 result = execute_sparql_query(SERVER, query, return_format=JSON) 

2214 

2215 # Cleanup 

2216 shutil.rmtree(output_folder, ignore_errors=True) 

2217 shutil.rmtree(os.path.join(BASE_DIR, "input_temp"), ignore_errors=True) 

2218 if os.path.exists(meta_config_path): 

2219 os.remove(meta_config_path) 

2220 delete_output_zip(".", now) 

2221 

2222 # Verify results 

2223 bindings = result["results"]["bindings"] 

2224 

2225 # Should find exactly one article 

2226 self.assertEqual(len(bindings), 1, "Expected exactly one article") 

2227 

2228 # The article should have a br/ URI (OMID) 

2229 br_uri = bindings[0]["br"]["value"] 

2230 self.assertTrue( 

2231 "br/" in br_uri, 

2232 f"Article URI {br_uri} does not contain expected OMID pattern 'br/'", 

2233 ) 

2234 

2235 # Should not have any saved identifiers 

2236 self.assertNotIn( 

2237 "id", 

2238 bindings[0], 

2239 "Found unexpected identifier when only temporary ID was provided", 

2240 ) 

2241 

2242 def test_temporary_identifiers_deduplication(self): 

2243 """Test that multiple rows with the same temporary identifier are correctly deduplicated""" 

2244 # Create test data with two rows using the same temporary identifier 

2245 test_data = [ 

2246 { 

2247 "id": "temp:789", 

2248 "title": "Test Article 1", 

2249 "author": "Smith, John [orcid:0000-0002-1234-5678]", 

2250 "pub_date": "2020", 

2251 "venue": "", 

2252 "volume": "", 

2253 "issue": "", 

2254 "page": "", 

2255 "type": "journal article", 

2256 "publisher": "", 

2257 "editor": "", 

2258 }, 

2259 { 

2260 "id": "temp:789", # Same temporary ID 

2261 "title": "Test Article 1", # Same title 

2262 "author": "Smith, John [orcid:0000-0002-1234-5678]", 

2263 "pub_date": "2020", 

2264 "venue": "", 

2265 "volume": "", 

2266 "issue": "", 

2267 "page": "", 

2268 "type": "journal article", 

2269 "publisher": "", 

2270 "editor": "", 

2271 }, 

2272 ] 

2273 

2274 # Write test data to CSV 

2275 input_dir = os.path.join(BASE_DIR, "input_temp_dedup") 

2276 os.makedirs(input_dir, exist_ok=True) 

2277 csv_path = os.path.join(input_dir, "test.csv") 

2278 write_csv(csv_path, test_data) 

2279 

2280 # Run meta process 

2281 output_dir = os.path.join(BASE_DIR, "output_temp_dedup") 

2282 os.makedirs(output_dir, exist_ok=True) 

2283 config = { 

2284 "input_csv_dir": input_dir, 

2285 "base_output_dir": output_dir, 

2286 "output_rdf_dir": output_dir, 

2287 "triplestore_url": SERVER, 

2288 "provenance_triplestore_url": PROV_SERVER, 

2289 "resp_agent": "https://w3id.org/oc/meta/prov/pa/1", 

2290 "base_iri": "https://w3id.org/oc/meta/", 

2291 "context_path": "https://w3id.org/oc/meta/context.json", 

2292 "supplier_prefix": "060", 

2293 "dir_split_number": 10000, 

2294 "items_per_file": 1000, 

2295 "default_dir": "_", 

2296 "rdf_output_in_chunks": True, 

2297 "zip_output_rdf": False, 

2298 "source": None, 

2299 "use_doi_api_service": False, 

2300 "workers_number": 1, 

2301 "silencer": [], 

2302 "redis_host": "localhost", 

2303 "redis_port": 6379, 

2304 "redis_db": 5, 

2305 "redis_cache_db": 2, 

2306 "ts_upload_cache": self.cache_file, 

2307 "ts_failed_queries": self.failed_file, 

2308 "ts_stop_file": self.stop_file, 

2309 "graphdb_connector_name": None, 

2310 "blazegraph_full_text_search": False, 

2311 "fuseki_full_text_search": False, 

2312 "virtuoso_full_text_search": False, 

2313 "provenance_endpoints": [], 

2314 "cache_endpoint": None, 

2315 "cache_update_endpoint": None, 

2316 "normalize_titles": True, 

2317 } 

2318 config_path = os.path.join(output_dir, "config.yaml") 

2319 with open(config_path, "w") as f: 

2320 yaml.dump(config, f) 

2321 

2322 # Run the process 

2323 run_meta_process(settings=config, meta_config_path=config_path) 

2324 

2325 # Query the triplestore to verify that only one OMID was generated 

2326 # for both rows (temporary identifiers are never written to the store, 

2327 # as test_temporary_identifiers above already checks) 

2328 query = """ 

2329 SELECT DISTINCT ?br 

2330 WHERE { 

2331 ?br a <http://purl.org/spar/fabio/JournalArticle> . 

2332 } 

2333 """ 

2334 results = execute_sparql_query(SERVER, query) 

2335 

2336 # Clean up 

2337 shutil.rmtree(input_dir) 

2338 shutil.rmtree(output_dir) 

2339 

2340 # Should only be one article 

2341 articles = [ 

2342 str(result["br"]["value"]) for result in results["results"]["bindings"] 

2343 ] 

2344 self.assertEqual( 

2345 len(articles), 1, "Should only be one article after deduplication" 

2346 ) 

2347 

2348 

2349def normalize_graph(graph): 

2350 """ 

2351 Normalize literals in the graph by removing explicit datatypes. 

2352 """ 

2353 normalized_graph = Graph() 

2354 for subject, predicate, obj in graph: 

2355 if isinstance(obj, Literal) and obj.datatype is not None: 

2356 normalized_obj = Literal(obj.toPython()) 

2357 normalized_graph.add((subject, predicate, normalized_obj)) 

2358 else: 

2359 normalized_graph.add((subject, predicate, obj)) 

2360 return normalized_graph 

2361 
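# A minimal usage sketch (not invoked by the tests above): after
# normalization, graphs that differ only in explicit xsd:string
# datatypes compare as isomorphic.
#
#     from rdflib.compare import isomorphic
#     g1 = Graph().parse(data='<urn:s> <urn:p> "1" .', format="nt")
#     g2 = Graph().parse(
#         data='<urn:s> <urn:p> "1"^^<http://www.w3.org/2001/XMLSchema#string> .',
#         format="nt",
#     )
#     assert isomorphic(normalize_graph(g1), normalize_graph(g2))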

2362 

2363if __name__ == "__main__": # pragma: no cover 

2364 unittest.main()