Coverage for test/meta_process_test.py: 95% (537 statements; coverage.py v6.5.0, created at 2025-12-20 08:55 +0000)

import csv
import glob
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import unittest
from datetime import datetime
from test.test_utils import (PROV_SERVER, SERVER, execute_sparql_construct,
                             execute_sparql_query, reset_redis_counters,
                             reset_server, wait_for_virtuoso)

import yaml
from oc_meta.lib.file_manager import get_csv_data, write_csv
from oc_meta.run.meta_process import run_meta_process
from oc_ocdm.counter_handler.redis_counter_handler import RedisCounterHandler
from rdflib import Dataset, Graph, Literal, URIRef
from sparqlite import SPARQLClient

BASE_DIR = os.path.join("test", "meta_process")


def delete_output_zip(base_dir: str, start_time: datetime) -> None:
    for file in os.listdir(base_dir):
        if file.startswith("meta_output") and file.endswith(".zip"):
            file_creation_time = file.split("meta_output_")[1].replace(".zip", "")
            file_creation_time = datetime.strptime(
                file_creation_time, "%Y-%m-%dT%H_%M_%S_%f"
            )
            was_created_after_time = file_creation_time > start_time
            if was_created_after_time:
                os.remove(os.path.join(base_dir, file))

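# Illustrative usage (a sketch, not part of the original suite): snapshot a
# timestamp before a run, then clear only the zips created after it, e.g.
#     checkpoint = datetime.now()
#     run_meta_process(settings=settings, meta_config_path=path)
#     delete_output_zip(".", checkpoint)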

class test_ProcessTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Setup executed once for the whole test class"""
        if not wait_for_virtuoso(SERVER, max_wait=30):
            raise TimeoutError("Virtuoso not ready after 30 seconds")

    def setUp(self):
        """Setup executed before every test"""
        # Create temporary directory for cache files
        self.temp_dir = tempfile.mkdtemp()
        self.cache_file = os.path.join(self.temp_dir, "ts_upload_cache.json")
        self.failed_file = os.path.join(self.temp_dir, "failed_queries.txt")
        self.stop_file = os.path.join(self.temp_dir, ".stop_upload")

        # Reset the database
        reset_server()
        reset_redis_counters()

    def tearDown(self):
        reset_redis_counters()
        # Remove temporary directory and its contents
        if hasattr(self, "temp_dir") and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

        # Clean up bulk load files
        bulk_load_dirs = [
            "test/test_virtuoso_db/bulk_load",
            "test/test_virtuoso_db_prov/bulk_load"
        ]
        for bulk_dir in bulk_load_dirs:
            if os.path.exists(bulk_dir):
                for file in glob.glob(os.path.join(bulk_dir, "*.nq.gz")):
                    os.remove(file)
                for file in glob.glob(os.path.join(bulk_dir, "*.backup")):
                    os.remove(file)

        for i in range(1, 11):
            output_dir = os.path.join(BASE_DIR, f"output_{i}")
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

    def test_run_meta_process(self):
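        """Run the full Meta process on meta_config_1.yaml and check that the
        produced CSV output matches the expected bibliographic records."""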

        output_folder = os.path.join(BASE_DIR, "output_1")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_1.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        now = datetime.now()
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/0601",
                "title": "",
                "author": "",
                "pub_date": "",
                "venue": "Scientometrics [issn:0138-9130 issn:1588-2861 omid:br/0603]",
                "volume": "26",
                "issue": "",
                "page": "",
                "type": "journal article",
                "publisher": "Consulting Company Ucom [crossref:6623 omid:ra/0601]",
                "editor": "Naimi, Elmehdi [orcid:0000-0002-4126-8519 omid:ra/0602]",
            },
            {
                "id": "issn:1524-4539 issn:0009-7322 omid:br/0602",
                "title": "Circulation",
                "author": "",
                "pub_date": "",
                "venue": "",
                "volume": "",
                "issue": "",
                "page": "",
                "type": "journal",
                "publisher": "",
                "editor": "",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.069 omid:br/0605",
                "title": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                "author": "Cheigh, Chan-Ick [orcid:0000-0003-2542-5788 omid:ra/0603]; Mun, Ji-Hye [omid:ra/0604]; Chung, Myong-Soo [omid:ra/0605]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/0608]",
                "volume": "25",
                "issue": "1",
                "page": "69-76",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/0606]",
                "editor": "Chung, Myong-Soo [orcid:0000-0002-9666-2513 omid:ra/0607]",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.077 omid:br/0606",
                "title": "Properties Of Immature Green Cherry Tomato Pickles",
                "author": "Koh, Jong-Ho [omid:ra/0608]; Shin, Hae-Hun [omid:ra/0609]; Kim, Young-Shik [orcid:0000-0001-5673-6314 omid:ra/06010]; Kook, Moo-Chang [omid:ra/06011]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/0608]",
                "volume": "",
                "issue": "2",
                "page": "77-82",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/0606]",
                "editor": "",
            },
            {
                "id": "doi:10.1097/01.rct.0000185385.35389.cd omid:br/0607",
                "title": "Comprehensive Assessment Of Lung CT Attenuation Alteration At Perfusion Defects Of Acute Pulmonary Thromboembolism With Breath-Hold SPECT-CT Fusion Images",
                "author": "Suga, Kazuyoshi [omid:ra/06012]; Kawakami, Yasuhiko [omid:ra/06013]; Iwanaga, Hideyuki [omid:ra/06014]; Hayashi, Noriko [omid:ra/06015]; Seto, Aska [omid:ra/06016]; Matsunaga, Naofumi [omid:ra/06017]",
                "pub_date": "2006-01",
                "venue": "Journal Of Computer Assisted Tomography [issn:0363-8715 omid:br/06012]",
                "volume": "30",
                "issue": "1",
                "page": "83-91",
                "type": "journal article",
                "publisher": "Ovid Technologies (Wolters Kluwer Health) [crossref:276 omid:ra/06018]",
                "editor": "",
            },
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.maxDiff = None
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(output, expected_output)

    def test_run_meta_process_ids_only(self):
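        """Run the Meta process on meta_config_5.yaml, whose input provides
        identifiers without author/editor names, and check that OMIDs are
        still assigned and written to the CSV output."""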

        output_folder = os.path.join(BASE_DIR, "output_5")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_5.yaml")
        now = datetime.now()
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/0601",
                "title": "Some Aspects Of The Evolution Of Chernozems Under The Influence Of Natural And Anthropogenic Factors",
                "author": "[orcid:0000-0002-4126-8519 omid:ra/0601]; [orcid:0000-0003-0530-4305 omid:ra/0602]",
                "pub_date": "2015-08-22",
                "venue": "[issn:1225-4339 omid:br/0602]",
                "volume": "26",
                "issue": "",
                "page": "50",
                "type": "journal article",
                "publisher": "[crossref:6623 omid:ra/0603]",
                "editor": "[orcid:0000-0002-4126-8519 omid:ra/0601]; [orcid:0000-0002-8420-0696 omid:ra/0604]",
            }
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.maxDiff = None
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(output, expected_output)

    def test_provenance(self):
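        """Run the Meta process three times (input, input_2, input again) and
        verify the provenance snapshots generated for each entity type,
        including derivation links and update queries."""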

        # Bulk load disabled in meta_config_3.yaml
        output_folder = os.path.join(BASE_DIR, "output_3")
        now = datetime.now()
        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        meta_config_path = os.path.join(BASE_DIR, "meta_config_3.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        reset_server()

        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input_2")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        output = dict()

        entity_types = ['ar', 'br', 'id', 'ra', 're']

        for entity_type in entity_types:
            query = f"""
                SELECT ?s ?p ?o
                WHERE {{
                    ?s ?p ?o .
                    FILTER(REGEX(STR(?s), "https://w3id.org/oc/meta/{entity_type}/[0-9]+/prov/se/[0-9]+"))
                }}
            """

            result = execute_sparql_query(PROV_SERVER, query)

            entities = {}
            for binding in result['results']['bindings']:
                s_str = binding['s']['value']
                p_str = binding['p']['value']
                o_data = binding['o']

                if s_str not in entities:
                    entities[s_str] = {'@id': s_str, '@type': []}

                if p_str == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type':
                    entities[s_str]['@type'].append(o_data['value'])
                else:
                    if p_str not in entities[s_str]:
                        entities[s_str][p_str] = []

                    if o_data['type'] == 'uri':
                        entities[s_str][p_str].append({'@id': o_data['value']})
                    elif o_data.get('datatype'):
                        entities[s_str][p_str].append({
                            '@value': o_data['value'],
                            '@type': o_data['datatype']
                        })
                    else:
                        entities[s_str][p_str].append({'@value': o_data['value']})

            # Group entities by their parent entity (e.g., br/0601/prov/se/1 -> br/0601)
            grouped_entities = {}
            for entity_id, entity_data in entities.items():
                # Extract the parent entity ID from the provenance entity ID
                parent_id = re.match(r'https://w3id.org/oc/meta/([^/]+/[0-9]+)', entity_id).group(0)

                if parent_id not in grouped_entities:
                    grouped_entities[parent_id] = []

                # Filter out properties we don't need for comparison
                filtered_entity_data = {
                    '@id': entity_data['@id'],
                }

                # Keep the required properties for comparison
                properties_to_keep = [
                    'http://www.w3.org/ns/prov#specializationOf',
                    'http://www.w3.org/ns/prov#wasDerivedFrom'
                ]

                for prop in properties_to_keep:
                    if prop in entity_data:
                        filtered_entity_data[prop] = entity_data[prop]

                # Handle hasUpdateQuery specially
                if 'https://w3id.org/oc/ontology/hasUpdateQuery' in entity_data:
                    # Extract the value from the hasUpdateQuery property
                    update_query_value = entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'][0].get('@value', '')

                    # Split the query into individual statements
                    if update_query_value:
                        # Extract the part between the INSERT DATA { GRAPH <...> { and } }
                        try:
                            query_content = update_query_value.split(
                                "INSERT DATA { GRAPH <https://w3id.org/oc/meta/br/> { "
                            )[1].split(" } }")[0]

                            # Split by dot and space to get individual statements
                            statements = set(query_content.split(" ."))

                            # Add to filtered entity data
                            filtered_entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'] = statements
                        except IndexError:
                            # If the format is different, just use the original value
                            filtered_entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'] = \
                                entity_data['https://w3id.org/oc/ontology/hasUpdateQuery']

                # Add this filtered entity to its parent's group
                grouped_entities[parent_id].append(filtered_entity_data)

            # Format the output to match the expected structure
            entity_list = []
            for parent_id, entities_list in sorted(grouped_entities.items()):
                entity_list.append({
                    '@graph': sorted(entities_list, key=lambda x: x['@id'])
                })

            output[entity_type] = entity_list
        expected_output = {
            "ar": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"}
                            ],
                        }
                    ]
                },
            ],
            "br": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0601"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601/prov/se/2",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0601"}
                            ],
                            "http://www.w3.org/ns/prov#wasDerivedFrom": [
                                {"@id": "https://w3id.org/oc/meta/br/0601/prov/se/1"}
                            ],
                            "https://w3id.org/oc/ontology/hasUpdateQuery": {
                                "",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0601>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0603>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0602>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0603>",
                                '<https://w3id.org/oc/meta/br/0601> <http://prismstandard.org/namespaces/basic/2.0/publicationDate> "2015-08-22"^^<http://www.w3.org/2001/XMLSchema#date>',
                                '<https://w3id.org/oc/meta/br/0601> <http://purl.org/dc/terms/title> "Some Aspects Of The Evolution Of Chernozems Under The Influence Of Natural And Anthropogenic Factors"^^<http://www.w3.org/2001/XMLSchema#string>',
                            },
                        },
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0603"}
                            ],
                        }
                    ]
                },
            ],
            "id": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0603"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0604/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0604"}
                            ],
                        }
                    ]
                },
            ],
            "ra": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ra/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ra/0602"}
                            ],
                        }
                    ]
                },
            ],
            "re": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/re/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/re/0601"}
                            ],
                        }
                    ]
                }
            ],
        }
        shutil.rmtree(output_folder)
        self.maxDiff = None
        self.assertEqual(output, expected_output)

    def test_run_meta_process_thread_safe(self):
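        """Run the Meta process twice on the same data, the second time in a
        subprocess, and verify that all expected entities exist exactly once
        in the triplestore and no Reader/Storer errors are reported."""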

        output_folder = os.path.join(BASE_DIR, "output_4")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_4.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)
        original_input_csv_dir = settings["input_csv_dir"]
        settings["input_csv_dir"] = os.path.join(original_input_csv_dir, "preprocess")

        # Use temporary cache files to avoid corruption
        settings["ts_upload_cache"] = self.cache_file
        settings["ts_failed_queries"] = self.failed_file
        settings["ts_stop_file"] = self.stop_file

        reset_server()

        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Create a temporary config file with updated settings for subprocess
        temp_config_path = os.path.join(self.temp_dir, "temp_meta_config.yaml")
        with open(temp_config_path, "w") as f:
            yaml.dump(settings, f)

        # Run it again to test thread safety
        proc = subprocess.run(
            [sys.executable, "-m", "oc_meta.run.meta_process", "-c", temp_config_path],
            capture_output=True,
            text=True,
        )

        output = dict()

        entity_types = ['ar', 'br', 'id', 'ra', 're']

        for entity_type in entity_types:
            query = f"""
                SELECT ?s ?p ?o
                WHERE {{
                    ?s ?p ?o .
                    FILTER(STRSTARTS(STR(?s), "https://w3id.org/oc/meta/{entity_type}/"))
                }}
            """

            result = execute_sparql_query(SERVER, query)

            entities = {}
            for binding in result['results']['bindings']:
                s_str = binding['s']['value']
                p_str = binding['p']['value']
                o_data = binding['o']

                if s_str not in entities:
                    entities[s_str] = {'@id': s_str, '@type': []}

                if p_str == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type':
                    entities[s_str]['@type'].append(o_data['value'])
                else:
                    if p_str not in entities[s_str]:
                        entities[s_str][p_str] = []

                    if o_data['type'] == 'uri':
                        entities[s_str][p_str].append({'@id': o_data['value']})
                    elif o_data.get('datatype'):
                        entities[s_str][p_str].append({
                            '@value': o_data['value'],
                            '@type': o_data['datatype']
                        })
                    else:
                        entities[s_str][p_str].append({'@value': o_data['value']})

            entity_list = list(entities.values())

            output[entity_type] = [
                {
                    '@graph': entity_list,
                    '@id': f"https://w3id.org/oc/meta/{entity_type}/"
                }
            ]

        expected_output = {
            "ar": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0604",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0604"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/publisher"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0602",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0602"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                            "https://w3id.org/oc/ontology/hasNext": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0603",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0603"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0605",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0605"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/editor"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0601",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0601"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                            "https://w3id.org/oc/ontology/hasNext": [
                                {"@id": "https://w3id.org/oc/meta/ar/0602"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/ar/",
                }
            ],
            "br": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalArticle",
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                                {
                                    "@type": "http://www.w3.org/2001/XMLSchema#date",
                                    "@value": "2012-03-31",
                                }
                            ],
                            "http://purl.org/dc/terms/title": [
                                {
                                    "@value": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                                    "@type": "http://www.w3.org/2001/XMLSchema#string"
                                }
                            ],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0601"}
                            ],
                            "http://purl.org/spar/pro/isDocumentContextFor": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"},
                                {"@id": "https://w3id.org/oc/meta/ar/0601"},
                                {"@id": "https://w3id.org/oc/meta/ar/0604"},
                                {"@id": "https://w3id.org/oc/meta/ar/0602"},
                                {"@id": "https://w3id.org/oc/meta/ar/0605"},
                            ],
                            "http://purl.org/vocab/frbr/core#embodiment": [
                                {"@id": "https://w3id.org/oc/meta/re/0601"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0604"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0604",
                            "@type": [
                                "http://purl.org/spar/fabio/JournalIssue",
                                "http://purl.org/spar/fabio/Expression",
                            ],
                            "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                                {"@value": "1", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0603"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0602",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/Journal",
                            ],
                            "http://purl.org/dc/terms/title": [
                                {"@value": "The Korean Journal Of Food And Nutrition", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0602"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0603",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalVolume",
                            ],
                            "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                                {"@value": "25", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0602"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/br/",
                }
            ],
            "id": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0605",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/orcid"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "0000-0002-9666-2513", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0601",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/doi"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "10.9799/ksfan.2012.25.1.069", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0603",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/orcid"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "0000-0003-2542-5788", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0604",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/crossref"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "4768", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0602",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/issn"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "1225-4339", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/id/",
                }
            ],
            "ra": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0605",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0605"}
                            ],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Chung", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Myong-Soo", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0602",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Mun", "@type": "http://www.w3.org/2001/XMLSchema#string"}],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Ji-Hye", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0604",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0604"}
                            ],
                            "http://xmlns.com/foaf/0.1/name": [
                                {"@value": "The Korean Society Of Food And Nutrition", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0603",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Chung", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Myong-Soo", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0601",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0603"}
                            ],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Cheigh", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Chan-Ick", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/ra/",
                }
            ],
            "re": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/re/0601",
                            "@type": ["http://purl.org/spar/fabio/Manifestation"],
                            "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
                                {"@value": "76", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
                                {"@value": "69", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        }
                    ],
                    "@id": "https://w3id.org/oc/meta/re/",
                }
            ],
        }

        processed_output = {}
        for entity_type, entity_data in output.items():
            processed_output[entity_type] = []
            for graph_container in entity_data:
                filtered_graph = []
                for entity in graph_container['@graph']:
                    filtered_entity = {
                        '@id': entity['@id']
                    }
                    for pred, obj in entity.items():
                        if pred != '@id':  # Only exclude @id since we already added it
                            filtered_entity[pred] = obj

                    if len(filtered_entity) > 1:  # Only include if it has predicates beyond @id
                        filtered_graph.append(filtered_entity)

                # Sort the graph by @id
                filtered_graph = sorted(filtered_graph, key=lambda x: x['@id'])

                processed_output[entity_type].append({
                    '@graph': filtered_graph,
                    '@id': graph_container['@id']
                })
        # For each entity type in the expected output, verify that all expected entities exist
        # with their expected properties in the actual output from the triplestore
        for entity_type, expected_graphs in expected_output.items():
            self.assertIn(entity_type, processed_output, f"Entity type {entity_type} missing from triplestore output")

            for expected_graph in expected_graphs:
                expected_entities = expected_graph['@graph']

                # Find the corresponding graph in the processed output
                actual_graph = None
                for graph in processed_output[entity_type]:
                    if graph['@id'] == expected_graph['@id']:
                        actual_graph = graph
                        break

                self.assertIsNotNone(actual_graph, f"Graph {expected_graph['@id']} not found in triplestore output")

                # For each expected entity, verify it exists with all expected properties
                for expected_entity in expected_entities:
                    entity_id = expected_entity['@id']

                    # Find the entity in the actual graph
                    actual_entity = None
                    for entity in actual_graph['@graph']:
                        if entity['@id'] == entity_id:
                            actual_entity = entity
                            break

                    self.assertIsNotNone(actual_entity, f"Entity {entity_id} not found in triplestore output")

                    # Check that all expected predicates and objects exist
                    for pred, expected_objects in expected_entity.items():
                        if pred != '@id':
                            self.assertIn(pred, actual_entity, f"Predicate {pred} missing for entity {entity_id}")

                            # For each expected object, verify it exists in the actual objects
                            for expected_obj in expected_objects:
                                found = False
                                for actual_obj in actual_entity[pred]:
                                    # Require exact matches for all objects
                                    if expected_obj == actual_obj:
                                        found = True
                                        break

                                self.assertTrue(found, f"Object {expected_obj} not found for predicate {pred} of entity {entity_id}\nActual values: {actual_entity[pred]}")

        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)

        self.assertFalse(
            "Reader: ERROR" in proc.stdout or "Storer: ERROR" in proc.stdout
        )


    def test_silencer_on(self):
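        """With the silencer enabled, re-running the process on input that
        carries extra authors must not add new agent roles: br/0601 keeps
        exactly 3 agents."""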

        output_folder = os.path.join(BASE_DIR, "output_6")
        now = datetime.now()
        meta_config_path = os.path.join(BASE_DIR, "meta_config_6.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(
            BASE_DIR, "same_as_input_2_with_other_authors"
        )
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_agents = """
            PREFIX pro: <http://purl.org/spar/pro/>
            SELECT (COUNT (?agent) AS ?agent_count)
            WHERE {
                <https://w3id.org/oc/meta/br/0601> pro:isDocumentContextFor ?agent.
            }
        """
        result = execute_sparql_query(SERVER, query_agents)
        expected_result = {
            "head": {"link": [], "vars": ["agent_count"]},
            "results": {
                "distinct": False,
                "ordered": True,
                "bindings": [
                    {
                        "agent_count": {
                            "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                            "type": "literal",
                            "value": "3",
                        }
                    }
                ],
            },
        }
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(result, expected_result)

    def test_silencer_off(self):
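        """With the silencer disabled, the second run adds the extra authors:
        br/0601 ends up with 6 agent roles."""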

        output_folder = os.path.join(BASE_DIR, "output_7")
        now = datetime.now()
        meta_config_path = os.path.join(BASE_DIR, "meta_config_7.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(
            BASE_DIR, "same_as_input_2_with_other_authors"
        )
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_agents = """
            PREFIX pro: <http://purl.org/spar/pro/>
            SELECT (COUNT (?agent) AS ?agent_count)
            WHERE {
                <https://w3id.org/oc/meta/br/0601> pro:isDocumentContextFor ?agent.
            }
        """
        result = execute_sparql_query(SERVER, query_agents)
        expected_result = {
            "head": {"link": [], "vars": ["agent_count"]},
            "results": {
                "distinct": False,
                "ordered": True,
                "bindings": [
                    {
                        "agent_count": {
                            "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                            "type": "literal",
                            "value": "6",
                        }
                    }
                ],
            },
        }
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(result, expected_result)

    def test_omid_in_input_data(self):
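        """Process input that already contains OMIDs, first without and then
        with the OpenAlex source configuration, and compare the resulting
        journal-article graph with the expected JSON-LD."""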

        query_all = """
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            CONSTRUCT {?s ?p ?o. ?id ?id_p ?id_o.}
            WHERE {
                ?s a fabio:JournalArticle;
                    ?p ?o.
                ?s datacite:hasIdentifier ?id.
                ?id ?id_p ?id_o.
            }
        """
        result = execute_sparql_construct(SERVER, query_all)
        output_folder = os.path.join(BASE_DIR, "output_8")
        meta_config_path_without_openalex = os.path.join(BASE_DIR, "meta_config_8.yaml")
        meta_config_path_with_openalex = os.path.join(BASE_DIR, "meta_config_9.yaml")
        with open(meta_config_path_without_openalex, encoding="utf-8") as file:
            settings_without_openalex = yaml.full_load(file)
        with open(meta_config_path_with_openalex, encoding="utf-8") as file:
            settings_with_openalex = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        cache_settings = {
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }
        settings_without_openalex.update(cache_settings)
        settings_with_openalex.update(cache_settings)

        run_meta_process(
            settings=settings_without_openalex,
            meta_config_path=meta_config_path_without_openalex,
        )
        run_meta_process(
            settings=settings_with_openalex,
            meta_config_path=meta_config_path_with_openalex,
        )
        query_all = """
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            CONSTRUCT {?s ?p ?o. ?id ?id_p ?id_o.}
            WHERE {
                ?s a fabio:JournalArticle;
                    ?p ?o.
                ?s datacite:hasIdentifier ?id.
                ?id ?id_p ?id_o.
            }
        """
        result = execute_sparql_construct(SERVER, query_all)
        expected_result = Graph()
        expected_result.parse(
            location=os.path.join(BASE_DIR, "test_omid_in_input_data.json"),
            format="json-ld",
        )
        prov_graph = Dataset()
        for dirpath, dirnames, filenames in os.walk(os.path.join(output_folder, "rdf")):
            if "br" in dirpath and "prov" in dirpath:
                for filename in filenames:
                    prov_graph.parse(
                        source=os.path.join(dirpath, filename), format="json-ld"
                    )

        expected_prov_graph = Dataset()
        expected_prov_graph.parse(
            os.path.join(BASE_DIR, "test_omid_in_input_data_prov.json"),
            format="json-ld",
        )
        prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#generatedAtTime"), None)
        )
        expected_prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#generatedAtTime"), None)
        )
        prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#invalidatedAtTime"), None)
        )
        expected_prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#invalidatedAtTime"), None)
        )
        shutil.rmtree(output_folder)
        self.assertTrue(
            normalize_graph(result).isomorphic(normalize_graph(expected_result))
        )

    def test_publishers_sequence(self):
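        """Verify the chain of entities reachable from the bibliographic
        resource identified by DOI 10.17117/na.2015.08.1067, including its
        publisher sequence, against the expected JSON-LD graph."""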

        output_folder = os.path.join(BASE_DIR, "output_9")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_10.yaml")
        now = datetime.now()
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_all = """
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            CONSTRUCT {?br ?p ?o. ?o ?op ?oo. ?oo ?oop ?ooo. ?ooo ?ooop ?oooo.}
            WHERE {
                ?id literal:hasLiteralValue "10.17117/na.2015.08.1067"^^<http://www.w3.org/2001/XMLSchema#string>;
                    datacite:usesIdentifierScheme datacite:doi;
                    ^datacite:hasIdentifier ?br.
                ?br ?p ?o.
                ?o ?op ?oo.
                ?oo ?oop ?ooo.
                ?ooo ?ooop ?oooo.
            }
        """
        result = execute_sparql_construct(SERVER, query_all)
        expected_result = Graph()
        expected_result.parse(
            os.path.join(BASE_DIR, "test_publishers_sequence.json"), format="json-ld"
        )
        shutil.rmtree(output_folder)
        self.assertTrue(
            normalize_graph(result).isomorphic(normalize_graph(expected_result))
        )

    def test_duplicate_omids_with_datatype(self):
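        """Verify that pre-existing ISSN identifiers stored without an
        explicit datatype are matched, not duplicated, when the same ISSNs
        arrive again from the input CSV."""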

        output_folder = os.path.join(BASE_DIR, "output_duplicate_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_duplicate.yaml")

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_duplicate"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_host": "localhost",
            "redis_port": 6381,
            "redis_db": 5,
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        # Setup: create test data
        os.makedirs(os.path.join(BASE_DIR, "input_duplicate"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_duplicate", "test.csv"), "w", encoding="utf-8"
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "issn:2543-3288 issn:2078-7685",  # Exact problematic row from production
                    "Journal of Diabetology",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "journal",
                    "Medknow [crossref:2581]",
                    "",
                ]
            )

        # Setup: Insert pre-existing identifiers and BRs in triplestore
        with SPARQLClient(SERVER, timeout=60) as client:
            client.update(
                """
                INSERT DATA {
                    GRAPH <https://w3id.org/oc/meta/br/> {
                        <https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> .
                        <https://w3id.org/oc/meta/br/0602> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0602> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> .
                    }
                    GRAPH <https://w3id.org/oc/meta/id/> {
                        <https://w3id.org/oc/meta/id/0601> <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "2078-7685" ;
                            <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                        <https://w3id.org/oc/meta/id/0602> <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "2543-3288" ;
                            <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    }
                }
                """
            )

        # Update Redis counters to match the inserted data
        redis_handler = RedisCounterHandler(port=6381, db=5)  # Use test db
        redis_handler.set_counter(
            2, "br", supplier_prefix="060"
        )  # BR counter for two BRs
        redis_handler.set_counter(
            2, "id", supplier_prefix="060"
        )  # ID counter for two IDs

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Check for errors
        errors_file = os.path.join(output_folder, "errors.txt")
        if os.path.exists(errors_file):
            with open(errors_file, "r") as f:
                errors = f.read()
                print(f"Errors found:\n{errors}")

        # Query to check for duplicates
        query = """
            SELECT DISTINCT ?id ?value
            WHERE {
                GRAPH <https://w3id.org/oc/meta/id/> {
                    ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    FILTER(?value IN ("2078-7685"^^<http://www.w3.org/2001/XMLSchema#string>, "2078-7685",
                                      "2543-3288"^^<http://www.w3.org/2001/XMLSchema#string>, "2543-3288"))
                }
            }
        """
        result = execute_sparql_query(SERVER, query)
        # Group IDs by value to check for duplicates
        ids_by_value = {}
        for binding in result["results"]["bindings"]:
            value = binding["value"]["value"]
            id = binding["id"]["value"]
            if value not in ids_by_value:
                ids_by_value[value] = []
            ids_by_value[value].append(id)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(os.path.join(BASE_DIR, "input_duplicate"), ignore_errors=True)
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)

        # Check that we have both ISSNs and no duplicates
        for issn_value, ids in ids_by_value.items():
            self.assertEqual(
                len(ids), 1, f"Found multiple IDs for ISSN {issn_value}: {ids}"
            )

        self.assertEqual(
            len(ids_by_value),
            2,
            f"Expected 2 ISSNs, found {len(ids_by_value)}: {list(ids_by_value.keys())}",
        )

    def test_duplicate_omids_with_venue_datatype(self):
        """Test to verify that identifiers are not duplicated when merging previously unconnected venues"""
        output_folder = os.path.join(BASE_DIR, "output_duplicate_venue_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_duplicate_venue.yaml")

        # Setup: create test data
        os.makedirs(os.path.join(BASE_DIR, "input_duplicate_venue"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_duplicate_venue", "test.csv"),
            "w",
            encoding="utf-8",
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "issn:1756-1833",
                    "BMJ",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "journal",
                    "BMJ [crossref:239]",
                    "",
                ]
            )
            writer.writerow(
                [
                    "",  # id
                    "",  # title
                    "",  # author
                    "",  # pub_date
                    "BMJ [issn:0267-0623 issn:0959-8138 issn:1468-5833 issn:0007-1447]",  # venue
                    "283",  # volume
                    "",  # issue
                    "",  # page
                    "journal volume",  # type
                    "BMJ [crossref:239]",  # publisher
                    "",  # editor
                ]
            )

        # Setup: insert pre-existing data - add the initial identifiers
        with SPARQLClient(SERVER, timeout=60) as client:
            client.update(
                """
                INSERT DATA {
                    GRAPH <https://w3id.org/oc/meta/br/> {
                        # First venue - BMJ with initial ISSNs
                        <https://w3id.org/oc/meta/br/0601>
                            <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601>, <https://w3id.org/oc/meta/id/0602> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
                            <http://purl.org/dc/terms/title> "BMJ" .

                        # Second venue
                        <https://w3id.org/oc/meta/br/0602>
                            <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0603> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
                            <http://purl.org/dc/terms/title> "British Medical Journal" .
                    }
                    GRAPH <https://w3id.org/oc/meta/id/> {
                        # First venue's ISSNs
                        <https://w3id.org/oc/meta/id/0601>
                            <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ;
                            <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                        <https://w3id.org/oc/meta/id/0602>
                            <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0959-8138" ;
                            <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                        # Second venue's ISSN
                        <https://w3id.org/oc/meta/id/0603>
                            <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0267-0623" ;
                            <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    }
                }
                """
            )

        # Update Redis counters for the pre-existing entities
        redis_handler = RedisCounterHandler(port=6381, db=5)
        redis_handler.set_counter(
            6, "br", supplier_prefix="060"
        )  # Updated to account for 6 entities (2 venues + 4 volumes)
        redis_handler.set_counter(
            3, "id", supplier_prefix="060"
        )  # Correct: 3 IDs (1756-1833, 0959-8138, 0267-0623)

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_duplicate_venue"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_host": "localhost",
            "redis_port": 6381,
            "redis_db": 5,
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        with open(meta_config_path, "w") as f:
            yaml.dump(settings, f)

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Query to check for duplicates - check all ISSNs
        query = """
            SELECT DISTINCT ?id ?value
            WHERE {
                ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ;
                    <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                FILTER(STR(?value) IN ("1756-1833", "0959-8138", "0267-0623"))
            }
        """
        result = execute_sparql_query(SERVER, query)
        # Group IDs by value to check for duplicates
        ids_by_value = {}
        for binding in result["results"]["bindings"]:
            value = binding["value"]["value"]
            id = binding["id"]["value"]
            if value not in ids_by_value:
                ids_by_value[value] = []
            ids_by_value[value].append(id)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(
            os.path.join(BASE_DIR, "input_duplicate_venue"), ignore_errors=True
        )
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)

        # Check that we don't have duplicate IDs for any ISSN
        for issn_value, ids in ids_by_value.items():
            self.assertEqual(
                len(ids), 1, f"Found multiple IDs for ISSN {issn_value} in venue: {ids}"
            )

        # Verify that pre-existing IDs were reused
        self.assertTrue(
            any("0601" in id for ids in ids_by_value.values() for id in ids)
            and any("0602" in id for ids in ids_by_value.values() for id in ids),
            "Pre-existing IDs were not reused",
        )

1519 

1520 def test_doi_with_multiple_slashes(self): 

1521 """Test handling of DOIs containing multiple forward slashes""" 

1522 output_folder = os.path.join(BASE_DIR, "output_doi_test") 

1523 meta_config_path = os.path.join(BASE_DIR, "meta_config_doi.yaml") 

1524 

1525 # Setup: create test data with problematic DOI 

1526 os.makedirs(os.path.join(BASE_DIR, "input_doi"), exist_ok=True) 

1527 with open( 

1528 os.path.join(BASE_DIR, "input_doi", "test.csv"), "w", encoding="utf-8" 

1529 ) as f: 

1530 writer = csv.writer(f) 

1531 writer.writerow( 

1532 [ 

1533 "id", 

1534 "title", 

1535 "author", 

1536 "pub_date", 

1537 "venue", 

1538 "volume", 

1539 "issue", 

1540 "page", 

1541 "type", 

1542 "publisher", 

1543 "editor", 

1544 ] 

1545 ) 

1546 writer.writerow( 

1547 [ 

1548 "doi:10.1093/acprof:oso/9780199230723.001.0001", # Problematic DOI with multiple slashes 

1549 "Test Book", 

1550 "", 

1551 "", 

1552 "", 

1553 "", 

1554 "", 

1555 "", 

1556 "book", 

1557 "", 

1558 "", 

1559 ] 

1560 ) 

1561 

1562 # Create test settings 

1563 settings = { 

1564 "triplestore_url": SERVER, 

1565 "provenance_triplestore_url": PROV_SERVER, 

1566 "input_csv_dir": os.path.join(BASE_DIR, "input_doi"), 

1567 "base_output_dir": output_folder, 

1568 "output_rdf_dir": output_folder, 

1569 "resp_agent": "test", 

1570 "base_iri": "https://w3id.org/oc/meta/", 

1571 "context_path": None, 

1572 "dir_split_number": 10000, 

1573 "items_per_file": 1000, 

1574 "default_dir": "_", 

1575 "rdf_output_in_chunks": False, 

1576 "zip_output_rdf": True, 

1577 "source": None, 

1578 "supplier_prefix": "060", 

1579 "use_doi_api_service": False, 

1580 "blazegraph_full_text_search": False, 

1581 "virtuoso_full_text_search": True, 

1582 "fuseki_full_text_search": False, 

1583 "graphdb_connector_name": None, 

1584 "cache_endpoint": None, 

1585 "cache_update_endpoint": None, 

1586 "silencer": [], 

1587 "redis_host": "localhost", 

1588 "redis_port": 6381, 

1589 "redis_db": 5, 

1590 "redis_cache_db": 2, 

1591 "ts_upload_cache": self.cache_file, 

1592 "ts_failed_queries": self.failed_file, 

1593 "ts_stop_file": self.stop_file, 

1594 } 

1595 

1596 with open(meta_config_path, "w") as f: 

1597 yaml.dump(settings, f) 

1598 

1599 now = datetime.now() 

1600 

1601 # Run the process 

1602 run_meta_process(settings=settings, meta_config_path=meta_config_path) 

1603 

1604 # Query to verify DOI was processed correctly 

1605 query = """ 

1606 SELECT ?br ?id ?value 

1607 WHERE { 

1608 ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "10.1093/acprof:oso/9780199230723.001.0001"^^<http://www.w3.org/2001/XMLSchema#string> ; 

1609 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/doi> ; 

1610 ^<http://purl.org/spar/datacite/hasIdentifier> ?br . 

1611 } 

1612 """ 

1613 result = execute_sparql_query(SERVER, query) 

1614 

1615 # Cleanup 

1616 shutil.rmtree(output_folder, ignore_errors=True) 

1617 shutil.rmtree(os.path.join(BASE_DIR, "input_doi"), ignore_errors=True) 

1618 if os.path.exists(meta_config_path): 

1619 os.remove(meta_config_path) 

1620 delete_output_zip(".", now) 

1621 

1622 # Verify results 

1623 self.assertTrue( 

1624 len(result["results"]["bindings"]) > 0, 

1625 "DOI with multiple slashes was not processed correctly", 

1626 ) 

1627 

1628 # Check that we got exactly one result 

1629 self.assertEqual( 

1630 len(result["results"]["bindings"]), 

1631 1, 

1632 f"Expected 1 result, got {len(result['results']['bindings'])}", 

1633 ) 

1634 
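# Illustrative aside (split_identifier is a hypothetical helper, not
# oc_meta's implementation): the failure mode this test guards against is
# splitting "scheme:value" identifiers naively. Splitting on the first colon
# only keeps a DOI's internal colons and slashes intact.
def split_identifier(identifier: str) -> tuple:
    scheme, _, value = identifier.partition(":")
    return scheme, value

assert split_identifier("doi:10.1093/acprof:oso/9780199230723.001.0001") == (
    "doi", "10.1093/acprof:oso/9780199230723.001.0001"
)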

1635 def test_volume_issue_deduplication(self): 

1636 """Test to verify that volumes and issues are properly deduplicated""" 

1637 output_folder = os.path.join(BASE_DIR, "output_vvi_test") 

1638 meta_config_path = os.path.join(BASE_DIR, "meta_config_vvi.yaml") 

1639 

1640 # Setup: create test data 

1641 os.makedirs(os.path.join(BASE_DIR, "input_vvi"), exist_ok=True) 

1642 with open( 

1643 os.path.join(BASE_DIR, "input_vvi", "test.csv"), "w", encoding="utf-8" 

1644 ) as f: 

1645 writer = csv.writer(f) 

1646 writer.writerow( 

1647 [ 

1648 "id", 

1649 "title", 

1650 "author", 

1651 "pub_date", 

1652 "venue", 

1653 "volume", 

1654 "issue", 

1655 "page", 

1656 "type", 

1657 "publisher", 

1658 "editor", 

1659 ] 

1660 ) 

1661 # First article in volume 1, issue 1 

1662 writer.writerow( 

1663 [ 

1664 "doi:10.1234/test.1", 

1665 "First Article", 

1666 "", 

1667 "2023", 

1668 "Test Journal [issn:1756-1833]", 

1669 "1", 

1670 "1", 

1671 "1-10", 

1672 "journal article", 

1673 "", 

1674 "", 

1675 ] 

1676 ) 

1677 # Second article in same volume and issue 

1678 writer.writerow( 

1679 [ 

1680 "doi:10.1234/test.2", 

1681 "Second Article", 

1682 "", 

1683 "2023", 

1684 "Test Journal [issn:1756-1833]", 

1685 "1", 

1686 "1", 

1687 "11-20", 

1688 "journal article", 

1689 "", 

1690 "", 

1691 ] 

1692 ) 

1693 

1694 # Create test settings 

1695 settings = { 

1696 "triplestore_url": SERVER, 

1697 "provenance_triplestore_url": PROV_SERVER, 

1698 "input_csv_dir": os.path.join(BASE_DIR, "input_vvi"), 

1699 "base_output_dir": output_folder, 

1700 "output_rdf_dir": output_folder, 

1701 "resp_agent": "test", 

1702 "base_iri": "https://w3id.org/oc/meta/", 

1703 "context_path": None, 

1704 "dir_split_number": 10000, 

1705 "items_per_file": 1000, 

1706 "default_dir": "_", 

1707 "rdf_output_in_chunks": False, 

1708 "zip_output_rdf": True, 

1709 "source": None, 

1710 "supplier_prefix": "060", 

1711 "use_doi_api_service": False, 

1712 "blazegraph_full_text_search": False, 

1713 "virtuoso_full_text_search": True, 

1714 "fuseki_full_text_search": False, 

1715 "graphdb_connector_name": None, 

1716 "cache_endpoint": None, 

1717 "cache_update_endpoint": None, 

1718 "silencer": [], 

1719 "redis_host": "localhost", 

1720 "redis_port": 6381, 

1721 "redis_db": 5, 

1722 "redis_cache_db": 2, 

1723 "ts_upload_cache": self.cache_file, 

1724 "ts_failed_queries": self.failed_file, 

1725 "ts_stop_file": self.stop_file, 

1726 } 

1727 

1728 with open(meta_config_path, "w") as f: 

1729 yaml.dump(settings, f) 

1730 

1731 # Run the process 

1732 run_meta_process(settings=settings, meta_config_path=meta_config_path) 

1733 

1734 # Query to check volume and issue structure 

1735 query = """ 

1736 PREFIX fabio: <http://purl.org/spar/fabio/> 

1737 PREFIX frbr: <http://purl.org/vocab/frbr/core#> 

1738 PREFIX prism: <http://prismstandard.org/namespaces/basic/2.0/> 

1739  

1740 SELECT ?article ?volume ?issue ?seq_id 

1741 WHERE { 

1742 ?article a fabio:JournalArticle ; 

1743 frbr:partOf ?issue . 

1744 ?issue a fabio:JournalIssue ; 

1745 fabio:hasSequenceIdentifier ?seq_id ; 

1746 frbr:partOf ?volume . 

1747 ?volume a fabio:JournalVolume . 

1748 } 

1749 ORDER BY ?article 

1750 """ 

1751 

1752 result = execute_sparql_query(SERVER, query) 

1753 

1754 # Cleanup 

1755 shutil.rmtree(output_folder, ignore_errors=True) 

1756 shutil.rmtree(os.path.join(BASE_DIR, "input_vvi"), ignore_errors=True) 

1757 if os.path.exists(meta_config_path): 

1758 os.remove(meta_config_path) 

1759 

1760 # Verify results 

1761 bindings = result["results"]["bindings"] 

1762 

1763 # Should have 2 articles 

1764 self.assertEqual(len(bindings), 2, "Expected 2 articles") 

1765 

1766 # Both articles should reference the same volume and issue 

1767 first_volume = bindings[0]["volume"]["value"] 

1768 first_issue = bindings[0]["issue"]["value"] 

1769 

1770 for binding in bindings[1:]: 

1771 self.assertEqual( 

1772 binding["volume"]["value"], 

1773 first_volume, 

1774 "Articles reference different volumes", 

1775 ) 

1776 self.assertEqual( 

1777 binding["issue"]["value"], 

1778 first_issue, 

1779 "Articles reference different issues", 

1780 ) 

1781 
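# Illustrative aside (hypothetical names, a sketch of the idea rather than
# the oc_meta code path): the deduplication exercised above amounts to keying
# each volume/issue on (container, sequence identifier), so a second CSV row
# with the same key reuses the entity minted for the first.
volumes: dict = {}

def resolve_volume(venue_uri: str, seq: str, mint) -> str:
    # Reuse the entity already minted for this (venue, sequence) pair;
    # otherwise mint a new one and remember it.
    key = (venue_uri, seq)
    if key not in volumes:
        volumes[key] = mint()
    return volumes[key]

minted = iter(["br/0603", "br/0699"])
first = resolve_volume("br/0601", "1", lambda: next(minted))
second = resolve_volume("br/0601", "1", lambda: next(minted))
assert first == second == "br/0603"  # the second article reuses the same volume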

1782 def test_volume_issue_deduplication_with_triplestore(self): 

1783 """Test that volumes and issues are properly deduplicated when they already exist in the triplestore""" 

1784 output_folder = os.path.join(BASE_DIR, "output_vvi_triplestore_test") 

1785 meta_config_path = os.path.join(BASE_DIR, "meta_config_vvi_triplestore.yaml") 

1786 

1787 # Setup: Insert pre-existing venue with duplicate volumes and issues (with/without datatype) 

1788 with SPARQLClient(SERVER, timeout=60) as client: 

1789 client.update( 

1790 """ 

1791 INSERT DATA { 

1792 GRAPH <https://w3id.org/oc/meta/br/> { 

1793 # Venue 

1794 <https://w3id.org/oc/meta/br/0601> 

1795 <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601> ; 

1796 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ; 

1797 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1798 <http://purl.org/dc/terms/title> "Test Journal" . 

1799 

1800 # Volume 1 (without datatype) 

1801 <https://w3id.org/oc/meta/br/0602> 

1802 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalVolume> ; 

1803 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1804 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0601> ; 

1805 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1" . 

1806 

1807 # Volume 1 (with datatype) 

1808 <https://w3id.org/oc/meta/br/0604> 

1809 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalVolume> ; 

1810 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1811 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0601> ; 

1812 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1"^^<http://www.w3.org/2001/XMLSchema#string> . 

1813 

1814 # Issue 1 (without datatype) 

1815 <https://w3id.org/oc/meta/br/0603> 

1816 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalIssue> ; 

1817 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1818 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0602> ; 

1819 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1" . 

1820 

1821 # Issue 1 (with datatype) 

1822 <https://w3id.org/oc/meta/br/0605> 

1823 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalIssue> ; 

1824 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ; 

1825 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0604> ; 

1826 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1"^^<http://www.w3.org/2001/XMLSchema#string> . 

1827 } 

1828 GRAPH <https://w3id.org/oc/meta/id/> { 

1829 <https://w3id.org/oc/meta/id/0601> 

1830 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ; 

1831 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> . 

1832 } 

1833 } 

1834 """ 

1835 ) 

1836 

1837 # Update Redis counters for pre-existing entities 

1838 redis_handler = RedisCounterHandler(port=6381, db=5) 

1839 redis_handler.set_counter( 

1840 5, "br", supplier_prefix="060" 

1841 ) # 5 entities: venue, 2 volumes, 2 issues 

1842 redis_handler.set_counter( 

1843 1, "id", supplier_prefix="060" 

1844 ) # 1 identifier for venue 

1845 

1846 # Create test data - article that should use existing volume and issue 

1847 os.makedirs(os.path.join(BASE_DIR, "input_vvi_triplestore"), exist_ok=True) 

1848 with open( 

1849 os.path.join(BASE_DIR, "input_vvi_triplestore", "test.csv"), 

1850 "w", 

1851 encoding="utf-8", 

1852 ) as f: 

1853 writer = csv.writer(f) 

1854 writer.writerow( 

1855 [ 

1856 "id", 

1857 "title", 

1858 "author", 

1859 "pub_date", 

1860 "venue", 

1861 "volume", 

1862 "issue", 

1863 "page", 

1864 "type", 

1865 "publisher", 

1866 "editor", 

1867 ] 

1868 ) 

1869 writer.writerow( 

1870 [ 

1871 "doi:10.1234/test.1", 

1872 "Test Article", 

1873 "", 

1874 "2023", 

1875 "Test Journal [issn:1756-1833]", 

1876 "1", # Should match existing volume 

1877 "1", # Should match existing issue 

1878 "1-10", 

1879 "journal article", 

1880 "", 

1881 "", 

1882 ] 

1883 ) 

1884 

1885 # Create test settings 

1886 settings = { 

1887 "triplestore_url": SERVER, 

1888 "provenance_triplestore_url": PROV_SERVER, 

1889 "input_csv_dir": os.path.join(BASE_DIR, "input_vvi_triplestore"), 

1890 "base_output_dir": output_folder, 

1891 "output_rdf_dir": output_folder, 

1892 "resp_agent": "test", 

1893 "base_iri": "https://w3id.org/oc/meta/", 

1894 "context_path": None, 

1895 "dir_split_number": 10000, 

1896 "items_per_file": 1000, 

1897 "default_dir": "_", 

1898 "rdf_output_in_chunks": False, 

1899 "zip_output_rdf": True, 

1900 "source": None, 

1901 "supplier_prefix": "060", 

1902 "use_doi_api_service": False, 

1903 "blazegraph_full_text_search": False, 

1904 "virtuoso_full_text_search": True, 

1905 "fuseki_full_text_search": False, 

1906 "graphdb_connector_name": None, 

1907 "cache_endpoint": None, 

1908 "cache_update_endpoint": None, 

1909 "silencer": [], 

1910 "redis_host": "localhost", 

1911 "redis_port": 6381, 

1912 "redis_db": 5, 

1913 "redis_cache_db": 2, 

1914 "ts_upload_cache": self.cache_file, 

1915 "ts_failed_queries": self.failed_file, 

1916 "ts_stop_file": self.stop_file, 

1917 } 

1918 

1919 with open(meta_config_path, "w") as f: 

1920 yaml.dump(settings, f) 

1921 

1922 # Run the process 

1923 run_meta_process(settings=settings, meta_config_path=meta_config_path) 

1924 

1925 # Check if new volumes/issues were created 

1926 to_be_uploaded_dir = os.path.join(output_folder, "rdf", "to_be_uploaded") 

1927 new_entities_created = False 

1928 if os.path.exists(to_be_uploaded_dir): 

1929 for dirpath, _, filenames in os.walk(to_be_uploaded_dir): 

1930 for f in filenames: 

1931 if f.endswith(".sparql"): 

1932 with open(os.path.join(dirpath, f)) as file: 

1933 content = file.read() 

1934 if any( 

1935 "JournalVolume" in line or "JournalIssue" in line 

1936 for line in content.splitlines() 

1937 ): 

1938 print(f"\nFound new volume/issue creation in {f}:") 

1939 new_entities_created = True 

1940 

1941 # Query to get all entities and their relationships 

1942 query = """ 

1943 PREFIX fabio: <http://purl.org/spar/fabio/> 

1944 PREFIX frbr: <http://purl.org/vocab/frbr/core#> 

1945 PREFIX datacite: <http://purl.org/spar/datacite/> 

1946  

1947 SELECT DISTINCT ?article ?venue ?volume ?issue ?issn 

1948 WHERE { 

1949 ?article a fabio:JournalArticle ; 

1950 frbr:partOf ?issue . 

1951 ?issue a fabio:JournalIssue ; 

1952 frbr:partOf ?volume . 

1953 ?volume a fabio:JournalVolume ; 

1954 frbr:partOf ?venue . 

1955 ?venue datacite:hasIdentifier ?id . 

1956 ?id datacite:usesIdentifierScheme datacite:issn ; 

1957 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?issn . 

1958 } 

1959 """ 

1960 

1961 result = execute_sparql_query(SERVER, query) 

1962 

1963 # Cleanup 

1964 shutil.rmtree(output_folder, ignore_errors=True) 

1965 shutil.rmtree( 

1966 os.path.join(BASE_DIR, "input_vvi_triplestore"), ignore_errors=True 

1967 ) 

1968 if os.path.exists(meta_config_path): 

1969 os.remove(meta_config_path) 

1970 

1971 # Verify results 

1972 bindings = result["results"]["bindings"] 

1973 self.assertEqual(len(bindings), 1, "Expected exactly one article") 

1974 

1975 # Get the URIs from the result 

1976 venue_uri = bindings[0]["venue"]["value"] 

1977 volume_uri = bindings[0]["volume"]["value"] 

1978 issue_uri = bindings[0]["issue"]["value"] 

1979 issn = bindings[0]["issn"]["value"] 

1980 

1981 # Check if venue was deduplicated (should use existing venue) 

1982 self.assertEqual( 

1983 venue_uri, 

1984 "https://w3id.org/oc/meta/br/0601", 

1985 "Venue was not deduplicated correctly", 

1986 ) 

1987 

1988 # Check if volume was deduplicated - either version is valid 

1989 self.assertIn( 

1990 volume_uri, 

1991 ["https://w3id.org/oc/meta/br/0602", "https://w3id.org/oc/meta/br/0604"], 

1992 "Volume was not deduplicated correctly - should use one of the existing volumes", 

1993 ) 

1994 

1995 # Check if issue was deduplicated - either version is valid 

1996 self.assertIn( 

1997 issue_uri, 

1998 ["https://w3id.org/oc/meta/br/0603", "https://w3id.org/oc/meta/br/0605"], 

1999 "Issue was not deduplicated correctly - should use one of the existing issues", 

2000 ) 

2001 

2002 # Check ISSN 

2003 self.assertEqual(issn, "1756-1833", "ISSN does not match") 

2004 

2005 # Verify no new volumes/issues were created 

2006 self.assertFalse( 

2007 new_entities_created, 

2008 "New volumes/issues were created when they should have been deduplicated", 

2009 ) 

2010 
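# Illustrative aside: the pre-seeded data above stores the sequence "1" both
# as a plain literal and as "1"^^xsd:string, which are distinct RDF terms. A
# lookup that must treat them as equal can compare on str() rather than
# matching the term directly; a sketch of such a query (hypothetical, not
# necessarily the one oc_meta issues):
volume_lookup = """
PREFIX fabio: <http://purl.org/spar/fabio/>
PREFIX frbr: <http://purl.org/vocab/frbr/core#>
SELECT ?volume WHERE {
    ?volume a fabio:JournalVolume ;
            frbr:partOf <https://w3id.org/oc/meta/br/0601> ;
            fabio:hasSequenceIdentifier ?seq .
    FILTER(str(?seq) = "1")
}
"""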

2011 def test_temporary_identifiers(self): 

2012 """Test that temporary identifiers are used for deduplication but not saved, and an OMID is generated""" 

2013 output_folder = os.path.join(BASE_DIR, "output_temp_id_test") 

2014 meta_config_path = os.path.join(BASE_DIR, "meta_config_temp.yaml") 

2015 

2016 # Setup: create test data with only temporary identifier 

2017 os.makedirs(os.path.join(BASE_DIR, "input_temp"), exist_ok=True) 

2018 with open( 

2019 os.path.join(BASE_DIR, "input_temp", "test.csv"), "w", encoding="utf-8" 

2020 ) as f: 

2021 writer = csv.writer(f) 

2022 writer.writerow( 

2023 [ 

2024 "id", 

2025 "title", 

2026 "author", 

2027 "pub_date", 

2028 "venue", 

2029 "volume", 

2030 "issue", 

2031 "page", 

2032 "type", 

2033 "publisher", 

2034 "editor", 

2035 ] 

2036 ) 

2037 writer.writerow( 

2038 [ 

2039 "temp:567", # Only temporary identifier 

2040 "Test Article", 

2041 "", 

2042 "2023", 

2043 "", 

2044 "", 

2045 "", 

2046 "", 

2047 "journal article", 

2048 "", 

2049 "", 

2050 ] 

2051 ) 

2052 

2053 # Create test settings 

2054 settings = { 

2055 "triplestore_url": SERVER, 

2056 "provenance_triplestore_url": PROV_SERVER, 

2057 "input_csv_dir": os.path.join(BASE_DIR, "input_temp"), 

2058 "base_output_dir": output_folder, 

2059 "output_rdf_dir": output_folder, 

2060 "resp_agent": "test", 

2061 "base_iri": "https://w3id.org/oc/meta/", 

2062 "context_path": None, 

2063 "dir_split_number": 10000, 

2064 "items_per_file": 1000, 

2065 "default_dir": "_", 

2066 "rdf_output_in_chunks": False, 

2067 "zip_output_rdf": True, 

2068 "source": None, 

2069 "supplier_prefix": "060", 

2070 "use_doi_api_service": False, 

2071 "blazegraph_full_text_search": False, 

2072 "virtuoso_full_text_search": True, 

2073 "fuseki_full_text_search": False, 

2074 "graphdb_connector_name": None, 

2075 "cache_endpoint": None, 

2076 "cache_update_endpoint": None, 

2077 "silencer": [], 

2078 "redis_host": "localhost", 

2079 "redis_port": 6381, 

2080 "redis_db": 5, 

2081 "redis_cache_db": 2, 

2082 "ts_upload_cache": self.cache_file, 

2083 "ts_failed_queries": self.failed_file, 

2084 "ts_stop_file": self.stop_file, 

2085 } 

2086 

2087 with open(meta_config_path, "w") as f: 

2088 yaml.dump(settings, f) 

2089 

2090 now = datetime.now() 

2091 

2092 # Run the process 

2093 run_meta_process(settings=settings, meta_config_path=meta_config_path) 

2094 

2095 # Query to verify an OMID was generated and no temporary identifier was saved 

2096 query = """ 

2097 PREFIX fabio: <http://purl.org/spar/fabio/> 

2098 PREFIX datacite: <http://purl.org/spar/datacite/> 

2099 PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/> 

2100  

2101 SELECT ?br ?id ?value ?scheme 

2102 WHERE { 

2103 ?br a fabio:JournalArticle . 

2104 OPTIONAL { 

2105 ?br datacite:hasIdentifier ?id . 

2106 ?id datacite:usesIdentifierScheme ?scheme ; 

2107 literal:hasLiteralValue ?value . 

2108 } 

2109 } 

2110 """ 

2111 result = execute_sparql_query(SERVER, query) 

2112 

2113 # Cleanup 

2114 shutil.rmtree(output_folder, ignore_errors=True) 

2115 shutil.rmtree(os.path.join(BASE_DIR, "input_temp"), ignore_errors=True) 

2116 if os.path.exists(meta_config_path): 

2117 os.remove(meta_config_path) 

2118 delete_output_zip(".", now) 

2119 

2120 # Verify results 

2121 bindings = result["results"]["bindings"] 

2122 

2123 # Should find exactly one article 

2124 self.assertEqual(len(bindings), 1, "Expected exactly one article") 

2125 

2126 # The article should have a br/ URI (OMID) 

2127 br_uri = bindings[0]["br"]["value"] 

2128 self.assertTrue( 

2129 "br/" in br_uri, 

2130 f"Article URI {br_uri} does not contain expected OMID pattern 'br/'", 

2131 ) 

2132 

2133 # Should not have any saved identifiers 

2134 self.assertNotIn( 

2135 "id", 

2136 bindings[0], 

2137 "Found unexpected identifier when only temporary ID was provided", 

2138 ) 

2139 
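# Illustrative aside (ids_to_persist is a hypothetical helper): the contract
# under test is that temp: identifiers drive deduplication upstream but are
# never persisted, so the bibliographic resource ends up with an OMID and
# nothing else.
def ids_to_persist(identifiers):
    # temp: identifiers are dropped before any triple is written.
    return [i for i in identifiers if not i.startswith("temp:")]

assert ids_to_persist(["temp:567"]) == []
assert ids_to_persist(["temp:567", "doi:10.1234/x"]) == ["doi:10.1234/x"]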

2140 def test_temporary_identifiers_deduplication(self): 

2141 """Test that multiple rows with the same temporary identifier are correctly deduplicated""" 

2142 # Create test data with two rows using the same temporary identifier 

2143 test_data = [ 

2144 { 

2145 "id": "temp:789", 

2146 "title": "Test Article 1", 

2147 "author": "Smith, John [orcid:0000-0002-1234-5678]", 

2148 "pub_date": "2020", 

2149 "venue": "", 

2150 "volume": "", 

2151 "issue": "", 

2152 "page": "", 

2153 "type": "journal article", 

2154 "publisher": "", 

2155 "editor": "", 

2156 }, 

2157 { 

2158 "id": "temp:789", # Same temporary ID 

2159 "title": "Test Article 1", # Same title 

2160 "author": "Smith, John [orcid:0000-0002-1234-5678]", 

2161 "pub_date": "2020", 

2162 "venue": "", 

2163 "volume": "", 

2164 "issue": "", 

2165 "page": "", 

2166 "type": "journal article", 

2167 "publisher": "", 

2168 "editor": "", 

2169 }, 

2170 ] 

2171 

2172 # Write test data to CSV 

2173 input_dir = os.path.join(BASE_DIR, "input_temp_dedup") 

2174 os.makedirs(input_dir, exist_ok=True) 

2175 csv_path = os.path.join(input_dir, "test.csv") 

2176 write_csv(csv_path, test_data) 

2177 

2178 # Run meta process 

2179 output_dir = os.path.join(BASE_DIR, "output_temp_dedup") 

2180 os.makedirs(output_dir, exist_ok=True) 

2181 config = { 

2182 "input_csv_dir": input_dir, 

2183 "base_output_dir": output_dir, 

2184 "output_rdf_dir": output_dir, 

2185 "triplestore_url": SERVER, 

2186 "provenance_triplestore_url": PROV_SERVER, 

2187 "resp_agent": "https://w3id.org/oc/meta/prov/pa/1", 

2188 "base_iri": "https://w3id.org/oc/meta/", 

2189 "context_path": "https://w3id.org/oc/meta/context.json", 

2190 "supplier_prefix": "060", 

2191 "dir_split_number": 10000, 

2192 "items_per_file": 1000, 

2193 "default_dir": "_", 

2194 "rdf_output_in_chunks": True, 

2195 "zip_output_rdf": False, 

2196 "source": None, 

2197 "use_doi_api_service": False, 

2198 "silencer": [], 

2199 "redis_host": "localhost", 

2200 "redis_port": 6381, 

2201 "redis_db": 5, 

2202 "redis_cache_db": 2, 

2203 "ts_upload_cache": self.cache_file, 

2204 "ts_failed_queries": self.failed_file, 

2205 "ts_stop_file": self.stop_file, 

2206 "graphdb_connector_name": None, 

2207 "blazegraph_full_text_search": False, 

2208 "fuseki_full_text_search": False, 

2209 "virtuoso_full_text_search": False, 

2210 "provenance_endpoints": [], 

2211 "cache_endpoint": None, 

2212 "cache_update_endpoint": None, 

2213 "normalize_titles": True, 

2214 } 

2215 config_path = os.path.join(output_dir, "config.yaml") 

2216 with open(config_path, "w") as f: 

2217 yaml.dump(config, f) 

2218 

2219 # Run the process 

2220 run_meta_process(settings=config, meta_config_path=config_path) 

2221 

2222 # Query the triplestore to verify: 

2223 # 1. Only one OMID was generated for both rows 

2224 # 2. The temporary identifier was not saved 

2225 query = """ 

2226 SELECT DISTINCT ?br 

2227 WHERE { 

2228 ?br a <http://purl.org/spar/fabio/JournalArticle> . 

2229 } 

2230 """ 

2231 results = execute_sparql_query(SERVER, query) 

2232 

2233 # Clean up 

2234 shutil.rmtree(input_dir) 

2235 shutil.rmtree(output_dir) 

2236 

2237 # Should only be one article 

2238 articles = [ 

2239 str(result["br"]["value"]) for result in results["results"]["bindings"] 

2240 ] 

2241 self.assertEqual( 

2242 len(articles), 1, "Should only be one article after deduplication" 

2243 ) 

2244 

2245 

2246def normalize_graph(graph): 

2247 """ 

2248 Normalize the literals in the graph by removing explicit datatypes. 

2249 """ 

2250 normalized_graph = Graph() 

2251 for subject, predicate, obj in graph: 

2252 if isinstance(obj, Literal) and obj.datatype is not None: 

2253 normalized_obj = Literal(obj.toPython()) 

2254 normalized_graph.add((subject, predicate, normalized_obj)) 

2255 else: 

2256 normalized_graph.add((subject, predicate, obj)) 

2257 return normalized_graph 

2258 
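# Usage sketch for normalize_graph: two graphs that differ only in explicit
# literal datatypes compare equal once normalized. to_isomorphic (from
# rdflib.compare) canonicalizes a graph for equality checks; Graph, Literal
# and URIRef are already imported at the top of this module.
from rdflib.compare import to_isomorphic
from rdflib.namespace import XSD

_s = URIRef("https://w3id.org/oc/meta/br/0602")
_p = URIRef("http://purl.org/spar/fabio/hasSequenceIdentifier")
_g1, _g2 = Graph(), Graph()
_g1.add((_s, _p, Literal("1")))                       # plain literal
_g2.add((_s, _p, Literal("1", datatype=XSD.string)))  # explicitly typed
assert to_isomorphic(normalize_graph(_g1)) == to_isomorphic(normalize_graph(_g2))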

2259 

2260if __name__ == "__main__": # pragma: no cover 

2261 unittest.main()