Coverage for test/meta_process_test.py: 96%
562 statements
coverage.py v6.5.0, created at 2025-07-14 14:06 +0000
import csv
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import unittest
from datetime import datetime
from test.test_utils import (PROV_SERVER, SERVER, VIRTUOSO_CONTAINER,
                             VIRTUOSO_PROV_CONTAINER, execute_sparql_query,
                             normalize_graph,  # used in graph comparisons below; assumed to be provided by test_utils
                             reset_redis_counters, reset_server)

import yaml
from oc_meta.lib.file_manager import get_csv_data, write_csv
from oc_meta.run.meta_process import run_meta_process
from oc_ocdm.counter_handler.redis_counter_handler import RedisCounterHandler
from rdflib import ConjunctiveGraph, Graph, Literal, URIRef
from SPARQLWrapper import JSON, POST, XML, SPARQLWrapper

BASE_DIR = os.path.join("test", "meta_process")


def delete_output_zip(base_dir: str, start_time: datetime) -> None:
    for file in os.listdir(base_dir):
        if file.startswith("meta_output") and file.endswith(".zip"):
            file_creation_time = file.split("meta_output_")[1].replace(".zip", "")
            file_creation_time = datetime.strptime(
                file_creation_time, "%Y-%m-%dT%H_%M_%S_%f"
            )
            was_created_after_time = file_creation_time > start_time
            if was_created_after_time:
                os.remove(os.path.join(base_dir, file))
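
# Illustrative note (the filename is hypothetical): archives follow the
# "meta_output_<timestamp>.zip" pattern, so a file named
# "meta_output_2025-07-14T14_06_00_000000.zip" parses to
# datetime(2025, 7, 14, 14, 6, 0, 0) and is removed only when that
# timestamp postdates start_time.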


class test_ProcessTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Initial setup executed once for the whole test class"""
        # Wait until Virtuoso is ready
        max_wait = 30  # seconds
        start_time = time.time()
        while time.time() - start_time < max_wait:
            try:
                # Try a simple query
                sparql = SPARQLWrapper(SERVER)
                sparql.setQuery("SELECT * WHERE { ?s ?p ?o } LIMIT 1")
                sparql.setReturnFormat(JSON)
                sparql.query()
                break
            except Exception:
                time.sleep(2)
        else:
            raise TimeoutError(f"Virtuoso not ready after {max_wait} seconds")
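        # Note: this `else` belongs to the `while` loop, so the TimeoutError is
        # raised only when max_wait elapses without any probe query succeeding
        # (i.e. the loop never reaches `break`).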

    def setUp(self):
        """Setup executed before each test"""
        # Create temporary directory for cache files
        self.temp_dir = tempfile.mkdtemp()
        self.cache_file = os.path.join(self.temp_dir, "ts_upload_cache.json")
        self.failed_file = os.path.join(self.temp_dir, "failed_queries.txt")
        self.stop_file = os.path.join(self.temp_dir, ".stop_upload")

        # Reset the database
        reset_server()
        reset_redis_counters()

    def tearDown(self):
        reset_redis_counters()
        # Remove temporary directory and its contents
        if hasattr(self, "temp_dir") and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def test_run_meta_process(self):
        output_folder = os.path.join(BASE_DIR, "output_1")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_1.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        now = datetime.now()
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/0601",
                "title": "",
                "author": "",
                "pub_date": "",
                "venue": "Scientometrics [issn:0138-9130 issn:1588-2861 omid:br/0603]",
                "volume": "26",
                "issue": "",
                "page": "",
                "type": "journal article",
                "publisher": "Consulting Company Ucom [crossref:6623 omid:ra/0601]",
                "editor": "Naimi, Elmehdi [orcid:0000-0002-4126-8519 omid:ra/0602]",
            },
            {
                "id": "issn:1524-4539 issn:0009-7322 omid:br/0602",
                "title": "Circulation",
                "author": "",
                "pub_date": "",
                "venue": "",
                "volume": "",
                "issue": "",
                "page": "",
                "type": "journal",
                "publisher": "",
                "editor": "",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.069 omid:br/0605",
                "title": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                "author": "Cheigh, Chan-Ick [orcid:0000-0003-2542-5788 omid:ra/0603]; Mun, Ji-Hye [omid:ra/0604]; Chung, Myong-Soo [omid:ra/0605]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/0608]",
                "volume": "25",
                "issue": "1",
                "page": "69-76",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/0606]",
                "editor": "Chung, Myong-Soo [orcid:0000-0002-9666-2513 omid:ra/0607]",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.077 omid:br/0606",
                "title": "Properties Of Immature Green Cherry Tomato Pickles",
                "author": "Koh, Jong-Ho [omid:ra/0608]; Shin, Hae-Hun [omid:ra/0609]; Kim, Young-Shik [orcid:0000-0001-5673-6314 omid:ra/06010]; Kook, Moo-Chang [omid:ra/06011]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/0608]",
                "volume": "",
                "issue": "2",
                "page": "77-82",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/0606]",
                "editor": "",
            },
            {
                "id": "doi:10.1097/01.rct.0000185385.35389.cd omid:br/0607",
                "title": "Comprehensive Assessment Of Lung CT Attenuation Alteration At Perfusion Defects Of Acute Pulmonary Thromboembolism With Breath-Hold SPECT-CT Fusion Images",
                "author": "Suga, Kazuyoshi [omid:ra/06012]; Kawakami, Yasuhiko [omid:ra/06013]; Iwanaga, Hideyuki [omid:ra/06014]; Hayashi, Noriko [omid:ra/06015]; Seto, Aska [omid:ra/06016]; Matsunaga, Naofumi [omid:ra/06017]",
                "pub_date": "2006-01",
                "venue": "Journal Of Computer Assisted Tomography [issn:0363-8715 omid:br/06012]",
                "volume": "30",
                "issue": "1",
                "page": "83-91",
                "type": "journal article",
                "publisher": "Ovid Technologies (Wolters Kluwer Health) [crossref:276 omid:ra/06018]",
                "editor": "",
            },
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
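        # Normalizing both sides to sorted (key, value) pairs makes the
        # comparison independent of row order across CSV files and of column
        # order within each row.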
        self.maxDiff = None
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(output, expected_output)

    def test_run_meta_process_ids_only(self):
        output_folder = os.path.join(BASE_DIR, "output_5")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_5.yaml")
        now = datetime.now()
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/0601",
                "title": "Some Aspects Of The Evolution Of Chernozems Under The Influence Of Natural And Anthropogenic Factors",
                "author": "[orcid:0000-0002-4126-8519 omid:ra/0601]; [orcid:0000-0003-0530-4305 omid:ra/0602]",
                "pub_date": "2015-08-22",
                "venue": "[issn:1225-4339 omid:br/0602]",
                "volume": "26",
                "issue": "",
                "page": "50",
                "type": "journal article",
                "publisher": "[crossref:6623 omid:ra/0603]",
                "editor": "[orcid:0000-0002-4126-8519 omid:ra/0601]; [orcid:0000-0002-8420-0696 omid:ra/0604]",
            }
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.maxDiff = None
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(output, expected_output)

    def test_run_meta_process_two_workers(self):
        output_folder = os.path.join(BASE_DIR, "output_2")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_2.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
                "workers_number": 2,
            }
        )

        now = datetime.now()
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/06101",
                "title": "",
                "author": "",
                "pub_date": "",
                "venue": "Scientometrics [issn:0138-9130 issn:1588-2861 omid:br/06103]",
                "volume": "26",
                "issue": "",
                "page": "",
                "type": "journal article",
                "publisher": "Consulting Company Ucom [crossref:6623 omid:ra/06101]",
                "editor": "Naimi, Elmehdi [orcid:0000-0002-4126-8519 omid:ra/06102]",
            },
            {
                "id": "issn:1524-4539 issn:0009-7322 omid:br/06102",
                "title": "Circulation",
                "author": "",
                "pub_date": "",
                "venue": "",
                "volume": "",
                "issue": "",
                "page": "",
                "type": "journal",
                "publisher": "",
                "editor": "",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.069 omid:br/06201",
                "title": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                "author": "Cheigh, Chan-Ick [orcid:0000-0003-2542-5788 omid:ra/06201]; Mun, Ji-Hye [omid:ra/06202]; Chung, Myong-Soo [omid:ra/06203]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/06204]",
                "volume": "25",
                "issue": "1",
                "page": "69-76",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/06204]",
                "editor": "Chung, Myong-Soo [orcid:0000-0002-9666-2513 omid:ra/06205]",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.077 omid:br/06202",
                "title": "Properties Of Immature Green Cherry Tomato Pickles",
                "author": "Koh, Jong-Ho [omid:ra/06206]; Shin, Hae-Hun [omid:ra/06207]; Kim, Young-Shik [orcid:0000-0001-5673-6314 omid:ra/06208]; Kook, Moo-Chang [omid:ra/06209]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/06204]",
                "volume": "",
                "issue": "2",
                "page": "77-82",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/06204]",
                "editor": "",
            },
            {
                "id": "doi:10.1097/01.rct.0000185385.35389.cd omid:br/06203",
                "title": "Comprehensive Assessment Of Lung CT Attenuation Alteration At Perfusion Defects Of Acute Pulmonary Thromboembolism With Breath-Hold SPECT-CT Fusion Images",
                "author": "Suga, Kazuyoshi [omid:ra/062010]; Kawakami, Yasuhiko [omid:ra/062011]; Iwanaga, Hideyuki [omid:ra/062012]; Hayashi, Noriko [omid:ra/062013]; Seto, Aska [omid:ra/062014]; Matsunaga, Naofumi [omid:ra/062015]",
                "pub_date": "2006-01",
                "venue": "Journal Of Computer Assisted Tomography [issn:0363-8715 omid:br/06208]",
                "volume": "30",
                "issue": "1",
                "page": "83-91",
                "type": "journal article",
                "publisher": "Ovid Technologies (Wolters Kluwer Health) [crossref:276 omid:ra/062016]",
                "editor": "",
            },
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.assertEqual(output, expected_output)

    def test_provenance(self):
        output_folder = os.path.join(BASE_DIR, "output_3")
        now = datetime.now()
        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        meta_config_path = os.path.join(BASE_DIR, "meta_config_3.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        reset_server()

        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input_2")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        output = dict()

        entity_types = ['ar', 'br', 'id', 'ra', 're']

        for entity_type in entity_types:
            query = f"""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX prov: <http://www.w3.org/ns/prov#>
            PREFIX oco: <https://w3id.org/oc/ontology/>

            CONSTRUCT {{
                ?s ?p ?o .
            }}
            WHERE {{
                ?s ?p ?o .
                FILTER(REGEX(STR(?s), "https://w3id.org/oc/meta/{entity_type}/[0-9]+/prov/se/[0-9]+"))
            }}
            """

            result = execute_sparql_query(PROV_SERVER, query, return_format=XML)

            g = Graph()
            for s, p, o in result:
                g.add((s, p, o))

            entities = {}
            for s, p, o in g:
                s_str = str(s)
                if s_str not in entities:
                    entities[s_str] = {'@id': s_str, '@type': []}

                p_str = str(p)
                if p == URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'):
                    entities[s_str]['@type'].append(str(o))
                else:
                    if p_str not in entities[s_str]:
                        entities[s_str][p_str] = []

                    if isinstance(o, URIRef):
                        entities[s_str][p_str].append({'@id': str(o)})
                    elif isinstance(o, Literal):
                        if o.datatype:
                            entities[s_str][p_str].append({
                                '@value': str(o),
                                '@type': str(o.datatype)
                            })
                        else:
                            entities[s_str][p_str].append({'@value': str(o)})

            # Group entities by their parent entity (e.g., br/0601/prov/se/1 -> br/0601)
            grouped_entities = {}
            for entity_id, entity_data in entities.items():
                # Extract the parent entity ID from the provenance entity ID
                parent_id = re.match(r'https://w3id.org/oc/meta/([^/]+/[0-9]+)', entity_id).group(0)
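                # e.g. "https://w3id.org/oc/meta/br/0601/prov/se/2" yields the
                # parent "https://w3id.org/oc/meta/br/0601": group(0) is the
                # whole matched prefix, not just the capture group.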

                if parent_id not in grouped_entities:
                    grouped_entities[parent_id] = []

                # Filter out properties we don't need for comparison
                filtered_entity_data = {
                    '@id': entity_data['@id'],
                }

                # Keep the required properties for comparison
                properties_to_keep = [
                    'http://www.w3.org/ns/prov#specializationOf',
                    'http://www.w3.org/ns/prov#wasDerivedFrom'
                ]

                for prop in properties_to_keep:
                    if prop in entity_data:
                        filtered_entity_data[prop] = entity_data[prop]

                # Handle hasUpdateQuery specially
                if 'https://w3id.org/oc/ontology/hasUpdateQuery' in entity_data:
                    # Extract the value from the hasUpdateQuery property
                    update_query_value = entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'][0].get('@value', '')

                    # Split the query into individual statements
                    if update_query_value:
                        # Extract the part between the INSERT DATA { GRAPH <...> { and } }
                        try:
                            query_content = update_query_value.split(
                                "INSERT DATA { GRAPH <https://w3id.org/oc/meta/br/> { "
                            )[1].split(" } }")[0]

                            # Split by dot and space to get individual statements
                            statements = set(query_content.split(" ."))

                            # Add to filtered entity data
                            filtered_entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'] = statements
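                            # Comparing a set of statement strings keeps the
                            # check independent of the order in which the
                            # INSERT DATA triples happen to be serialized.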
                        except IndexError:
                            # If the format is different, just use the original value
                            filtered_entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'] = \
                                entity_data['https://w3id.org/oc/ontology/hasUpdateQuery']

                # Add this filtered entity to its parent's group
                grouped_entities[parent_id].append(filtered_entity_data)

            # Format the output to match the expected structure
            entity_list = []
            for parent_id, entities_list in sorted(grouped_entities.items()):
                entity_list.append({
                    '@graph': sorted(entities_list, key=lambda x: x['@id'])
                })

            output[entity_type] = entity_list
        expected_output = {
            "ar": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"}
                            ],
                        }
                    ]
                },
            ],
            "br": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0601"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601/prov/se/2",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0601"}
                            ],
                            "http://www.w3.org/ns/prov#wasDerivedFrom": [
                                {"@id": "https://w3id.org/oc/meta/br/0601/prov/se/1"}
                            ],
                            "https://w3id.org/oc/ontology/hasUpdateQuery": {
                                "",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0601>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0603>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0602>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0603>",
                                '<https://w3id.org/oc/meta/br/0601> <http://prismstandard.org/namespaces/basic/2.0/publicationDate> "2015-08-22"^^<http://www.w3.org/2001/XMLSchema#date>',
                                '<https://w3id.org/oc/meta/br/0601> <http://purl.org/dc/terms/title> "Some Aspects Of The Evolution Of Chernozems Under The Influence Of Natural And Anthropogenic Factors"^^<http://www.w3.org/2001/XMLSchema#string>',
                            },
                        },
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0603"}
                            ],
                        }
                    ]
                },
            ],
            "id": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0603"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0604/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0604"}
                            ],
                        }
                    ]
                },
            ],
            "ra": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ra/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ra/0602"}
                            ],
                        }
                    ]
                },
            ],
            "re": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/re/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/re/0601"}
                            ],
                        }
                    ]
                }
            ],
        }
        shutil.rmtree(output_folder)
        self.maxDiff = None
        self.assertEqual(output, expected_output)

    def test_run_meta_process_thread_safe(self):
        output_folder = os.path.join(BASE_DIR, "output_4")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_4.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)
        original_input_csv_dir = settings["input_csv_dir"]
        settings["input_csv_dir"] = os.path.join(original_input_csv_dir, "preprocess")
        settings["workers_number"] = 1

        reset_server()

        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Run it again to test thread safety
        proc = subprocess.run(
            [sys.executable, "-m", "oc_meta.run.meta_process", "-c", meta_config_path],
            capture_output=True,
            text=True,
        )
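        # The second pass goes through the CLI entry point in a separate
        # interpreter; its captured stdout is checked at the end of the test
        # for Reader/Storer errors raised while re-processing the same data.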

        output = dict()

        entity_types = ['ar', 'br', 'id', 'ra', 're']

        for entity_type in entity_types:
            query = f"""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX pro: <http://purl.org/spar/pro/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            PREFIX frbr: <http://purl.org/vocab/frbr/core#>
            PREFIX foaf: <http://xmlns.com/foaf/0.1/>
            PREFIX prism: <http://prismstandard.org/namespaces/basic/2.0/>
            PREFIX dcterms: <http://purl.org/dc/terms/>
            PREFIX oco: <https://w3id.org/oc/ontology/>

            CONSTRUCT {{
                ?s ?p ?o .
            }}
            WHERE {{
                ?s ?p ?o .
                FILTER(STRSTARTS(STR(?s), "https://w3id.org/oc/meta/{entity_type}/"))
            }}
            """

            result = execute_sparql_query(SERVER, query, return_format=XML)

            g = Graph()
            for s, p, o in result:
                g.add((s, p, o))

            entities = {}
            for s, p, o in g:
                s_str = str(s)
                if s_str not in entities:
                    entities[s_str] = {'@id': s_str, '@type': []}

                p_str = str(p)
                if p == URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'):
                    entities[s_str]['@type'].append(str(o))
                else:
                    if p_str not in entities[s_str]:
                        entities[s_str][p_str] = []

                    if isinstance(o, URIRef):
                        entities[s_str][p_str].append({'@id': str(o)})
                    elif isinstance(o, Literal):
                        if o.datatype:
                            entities[s_str][p_str].append({
                                '@value': str(o),
                                '@type': str(o.datatype)
                            })
                        else:
                            entities[s_str][p_str].append({'@value': str(o)})

            entity_list = list(entities.values())

            output[entity_type] = [
                {
                    '@graph': entity_list,
                    '@id': f"https://w3id.org/oc/meta/{entity_type}/"
                }
            ]

        expected_output = {
            "ar": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0604",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0604"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/publisher"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0602",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0602"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                            "https://w3id.org/oc/ontology/hasNext": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0603",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0603"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0605",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0605"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/editor"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0601",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0601"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                            "https://w3id.org/oc/ontology/hasNext": [
                                {"@id": "https://w3id.org/oc/meta/ar/0602"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/ar/",
                }
            ],
            "br": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalArticle",
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                                {
                                    "@type": "http://www.w3.org/2001/XMLSchema#date",
                                    "@value": "2012-03-31",
                                }
                            ],
                            "http://purl.org/dc/terms/title": [
                                {
                                    "@value": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                                    "@type": "http://www.w3.org/2001/XMLSchema#string",
                                }
                            ],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0601"}
                            ],
                            "http://purl.org/spar/pro/isDocumentContextFor": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"},
                                {"@id": "https://w3id.org/oc/meta/ar/0601"},
                                {"@id": "https://w3id.org/oc/meta/ar/0604"},
                                {"@id": "https://w3id.org/oc/meta/ar/0602"},
                                {"@id": "https://w3id.org/oc/meta/ar/0605"},
                            ],
                            "http://purl.org/vocab/frbr/core#embodiment": [
                                {"@id": "https://w3id.org/oc/meta/re/0601"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0604"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0604",
                            "@type": [
                                "http://purl.org/spar/fabio/JournalIssue",
                                "http://purl.org/spar/fabio/Expression",
                            ],
                            "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                                {"@value": "1", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0603"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0602",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/Journal",
                            ],
                            "http://purl.org/dc/terms/title": [
                                {"@value": "The Korean Journal Of Food And Nutrition", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0602"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0603",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalVolume",
                            ],
                            "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                                {"@value": "25", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0602"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/br/",
                }
            ],
            "id": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0605",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/orcid"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "0000-0002-9666-2513", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0601",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/doi"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "10.9799/ksfan.2012.25.1.069", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0603",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/orcid"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "0000-0003-2542-5788", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0604",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/crossref"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "4768", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0602",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/issn"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "1225-4339", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/id/",
                }
            ],
            "ra": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0605",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0605"}
                            ],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Chung", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Myong-Soo", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0602",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Mun", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Ji-Hye", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0604",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0604"}
                            ],
                            "http://xmlns.com/foaf/0.1/name": [
                                {"@value": "The Korean Society Of Food And Nutrition", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0603",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Chung", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Myong-Soo", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0601",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0603"}
                            ],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Cheigh", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Chan-Ick", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/ra/",
                }
            ],
            "re": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/re/0601",
                            "@type": ["http://purl.org/spar/fabio/Manifestation"],
                            "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
                                {"@value": "76", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
                                {"@value": "69", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        }
                    ],
                    "@id": "https://w3id.org/oc/meta/re/",
                }
            ],
        }

        processed_output = {}
        for entity_type, entity_data in output.items():
            processed_output[entity_type] = []
            for graph_container in entity_data:
                filtered_graph = []
                for entity in graph_container['@graph']:
                    filtered_entity = {
                        '@id': entity['@id']
                    }
                    for pred, obj in entity.items():
                        if pred != '@id':  # Only exclude @id since we already added it
                            filtered_entity[pred] = obj

                    if len(filtered_entity) > 1:  # Only include if it has predicates beyond @id
                        filtered_graph.append(filtered_entity)

                # Sort the graph by @id
                filtered_graph = sorted(filtered_graph, key=lambda x: x['@id'])

                processed_output[entity_type].append({
                    '@graph': filtered_graph,
                    '@id': graph_container['@id']
                })
        # For each entity type in the expected output, verify that all expected entities exist
        # with their expected properties in the actual output from the triplestore
        for entity_type, expected_graphs in expected_output.items():
            self.assertIn(entity_type, processed_output, f"Entity type {entity_type} missing from triplestore output")

            for expected_graph in expected_graphs:
                expected_entities = expected_graph['@graph']

                # Find the corresponding graph in the processed output
                actual_graph = None
                for graph in processed_output[entity_type]:
                    if graph['@id'] == expected_graph['@id']:
                        actual_graph = graph
                        break

                self.assertIsNotNone(actual_graph, f"Graph {expected_graph['@id']} not found in triplestore output")

                # For each expected entity, verify it exists with all expected properties
                for expected_entity in expected_entities:
                    entity_id = expected_entity['@id']

                    # Find the entity in the actual graph
                    actual_entity = None
                    for entity in actual_graph['@graph']:
                        if entity['@id'] == entity_id:
                            actual_entity = entity
                            break

                    self.assertIsNotNone(actual_entity, f"Entity {entity_id} not found in triplestore output")

                    # Check that all expected predicates and objects exist
                    for pred, expected_objects in expected_entity.items():
                        if pred != '@id':
                            self.assertIn(pred, actual_entity, f"Predicate {pred} missing for entity {entity_id}")

                            # For each expected object, verify it exists in the actual objects
                            for expected_obj in expected_objects:
                                found = False
                                for actual_obj in actual_entity[pred]:
                                    # Require exact matches for all objects
                                    if expected_obj == actual_obj:
                                        found = True
                                        break

                                self.assertTrue(found, f"Object {expected_obj} not found for predicate {pred} of entity {entity_id}\nActual values: {actual_entity[pred]}")

        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)

        self.assertFalse(
            "Reader: ERROR" in proc.stdout or "Storer: ERROR" in proc.stdout
        )

    def test_silencer_on(self):
        output_folder = os.path.join(BASE_DIR, "output_6")
        now = datetime.now()
        meta_config_path = os.path.join(BASE_DIR, "meta_config_6.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(
            BASE_DIR, "same_as_input_2_with_other_authors"
        )
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_agents = """
            PREFIX pro: <http://purl.org/spar/pro/>
            SELECT (COUNT (?agent) AS ?agent_count)
            WHERE {
                <https://w3id.org/oc/meta/br/0601> pro:isDocumentContextFor ?agent.
            }
        """
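        # With the silencer active (presumably configured in meta_config_6.yaml),
        # the second run over the same article with different authors should not
        # attach new agent roles, so the count is expected to stay at 3.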
        result = execute_sparql_query(SERVER, query_agents)
        expected_result = {
            "head": {"link": [], "vars": ["agent_count"]},
            "results": {
                "distinct": False,
                "ordered": True,
                "bindings": [
                    {
                        "agent_count": {
                            "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                            "type": "typed-literal",
                            "value": "3",
                        }
                    }
                ],
            },
        }
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(result, expected_result)

    def test_silencer_off(self):
        output_folder = os.path.join(BASE_DIR, "output_7")
        now = datetime.now()
        meta_config_path = os.path.join(BASE_DIR, "meta_config_7.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(
            BASE_DIR, "same_as_input_2_with_other_authors"
        )
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_agents = """
            PREFIX pro: <http://purl.org/spar/pro/>
            SELECT (COUNT (?agent) AS ?agent_count)
            WHERE {
                <https://w3id.org/oc/meta/br/0601> pro:isDocumentContextFor ?agent.
            }
        """
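        # Without the silencer, the second run should add roles for the extra
        # authors, so the count is expected to rise to 6.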
        result = execute_sparql_query(SERVER, query_agents)
        expected_result = {
            "head": {"link": [], "vars": ["agent_count"]},
            "results": {
                "distinct": False,
                "ordered": True,
                "bindings": [
                    {
                        "agent_count": {
                            "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                            "type": "typed-literal",
                            "value": "6",
                        }
                    }
                ],
            },
        }
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(result, expected_result)

    def test_omid_in_input_data(self):
        output_folder = os.path.join(BASE_DIR, "output_8")
        now = datetime.now()
        meta_config_path_without_openalex = os.path.join(BASE_DIR, "meta_config_8.yaml")
        meta_config_path_with_openalex = os.path.join(BASE_DIR, "meta_config_9.yaml")
        with open(meta_config_path_without_openalex, encoding="utf-8") as file:
            settings_without_openalex = yaml.full_load(file)
        with open(meta_config_path_with_openalex, encoding="utf-8") as file:
            settings_with_openalex = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        cache_settings = {
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }
        settings_without_openalex.update(cache_settings)
        settings_with_openalex.update(cache_settings)

        run_meta_process(
            settings=settings_without_openalex,
            meta_config_path=meta_config_path_without_openalex,
        )
        run_meta_process(
            settings=settings_with_openalex,
            meta_config_path=meta_config_path_with_openalex,
        )
        query_all = """
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            CONSTRUCT {?s ?p ?o. ?id ?id_p ?id_o.}
            WHERE {
                ?s a fabio:JournalArticle;
                    ?p ?o.
                ?s datacite:hasIdentifier ?id.
                ?id ?id_p ?id_o.
            }
        """
        result = execute_sparql_query(SERVER, query_all, return_format=XML)
        expected_result = Graph()
        expected_result.parse(
            location=os.path.join(BASE_DIR, "test_omid_in_input_data.json"),
            format="json-ld",
        )
        prov_graph = ConjunctiveGraph()
        for dirpath, dirnames, filenames in os.walk(os.path.join(output_folder, "rdf")):
            if "br" in dirpath and "prov" in dirpath:
                for filename in filenames:
                    prov_graph.parse(
                        source=os.path.join(dirpath, filename), format="json-ld"
                    )

        expected_prov_graph = ConjunctiveGraph()
        expected_prov_graph.parse(
            os.path.join(BASE_DIR, "test_omid_in_input_data_prov.json"),
            format="json-ld",
        )
        prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#generatedAtTime"), None)
        )
        expected_prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#generatedAtTime"), None)
        )
        prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#invalidatedAtTime"), None)
        )
        expected_prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#invalidatedAtTime"), None)
        )
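        # generatedAtTime and invalidatedAtTime vary from run to run, so both
        # graphs are stripped of them before any structural comparison.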
        shutil.rmtree(output_folder)
        self.assertTrue(
            normalize_graph(result).isomorphic(normalize_graph(expected_result))
        )

    def test_publishers_sequence(self):
        output_folder = os.path.join(BASE_DIR, "output_9")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_10.yaml")
        now = datetime.now()
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_all = """
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            CONSTRUCT {?br ?p ?o. ?o ?op ?oo. ?oo ?oop ?ooo. ?ooo ?ooop ?oooo.}
            WHERE {
                ?id literal:hasLiteralValue "10.17117/na.2015.08.1067"^^<http://www.w3.org/2001/XMLSchema#string>;
                    datacite:usesIdentifierScheme datacite:doi;
                    ^datacite:hasIdentifier ?br.
                ?br ?p ?o.
                ?o ?op ?oo.
                ?oo ?oop ?ooo.
                ?ooo ?ooop ?oooo.
            }
        """
        result = execute_sparql_query(SERVER, query_all, return_format=XML)
        expected_result = Graph()
        expected_result.parse(
            os.path.join(BASE_DIR, "test_publishers_sequence.json"), format="json-ld"
        )
        shutil.rmtree(output_folder)
        self.assertTrue(
            normalize_graph(result).isomorphic(normalize_graph(expected_result))
        )

    def test_duplicate_omids_with_datatype(self):
        output_folder = os.path.join(BASE_DIR, "output_duplicate_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_duplicate.yaml")

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_duplicate"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "workers_number": 1,
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        # Setup: create test data
        os.makedirs(os.path.join(BASE_DIR, "input_duplicate"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_duplicate", "test.csv"), "w", encoding="utf-8"
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "issn:2543-3288 issn:2078-7685",  # Exact problematic row from production
                    "Journal of Diabetology",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "journal",
                    "Medknow [crossref:2581]",
                    "",
                ]
            )

        # Setup: Insert pre-existing identifiers and BRs in triplestore
        sparql = SPARQLWrapper(SERVER)
        sparql.setMethod(POST)
        sparql.setQuery(
            """
            INSERT DATA {
                GRAPH <https://w3id.org/oc/meta/br/> {
                    <https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> .
                    <https://w3id.org/oc/meta/br/0602> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0602> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> .
                }
                GRAPH <https://w3id.org/oc/meta/id/> {
                    <https://w3id.org/oc/meta/id/0601> <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "2078-7685" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    <https://w3id.org/oc/meta/id/0602> <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "2543-3288" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                }
            }
            """
        )
        sparql.query()

        # Update Redis counters to match the inserted data
        redis_handler = RedisCounterHandler(db=5)  # Use test db
        redis_handler.set_counter(
            2, "br", supplier_prefix="060"
        )  # BR counter for two BRs
        redis_handler.set_counter(
            2, "id", supplier_prefix="060"
        )  # ID counter for two IDs
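        # Seeding both counters at 2 means the run mints new OMIDs starting
        # from br/0603 and id/0603; a correct run should therefore reuse
        # id/0601 and id/0602 for the two pre-existing ISSNs instead of
        # creating duplicates.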

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Check for errors
        errors_file = os.path.join(output_folder, "errors.txt")
        if os.path.exists(errors_file):
            with open(errors_file, "r") as f:
                errors = f.read()
                print(f"Errors found:\n{errors}")

        # Query to check for duplicates
        query = """
            SELECT DISTINCT ?id ?value
            WHERE {
                GRAPH <https://w3id.org/oc/meta/id/> {
                    ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    FILTER(?value IN ("2078-7685"^^<http://www.w3.org/2001/XMLSchema#string>, "2078-7685",
                                      "2543-3288"^^<http://www.w3.org/2001/XMLSchema#string>, "2543-3288"))
                }
            }
        """
        result = execute_sparql_query(SERVER, query, return_format=JSON)
        # Group IDs by value to check for duplicates
        ids_by_value = {}
        for binding in result["results"]["bindings"]:
            value = binding["value"]["value"]
            id = binding["id"]["value"]
            if value not in ids_by_value:
                ids_by_value[value] = []
            ids_by_value[value].append(id)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(os.path.join(BASE_DIR, "input_duplicate"), ignore_errors=True)
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)

        # Check that we have both ISSNs and no duplicates
        for issn_value, ids in ids_by_value.items():
            self.assertEqual(
                len(ids), 1, f"Found multiple IDs for ISSN {issn_value}: {ids}"
            )

        self.assertEqual(
            len(ids_by_value),
            2,
            f"Expected 2 ISSNs, found {len(ids_by_value)}: {list(ids_by_value.keys())}",
        )

    def test_duplicate_omids_with_venue_datatype(self):
        """Test to verify that identifiers are not duplicated when merging previously unconnected venues"""
        output_folder = os.path.join(BASE_DIR, "output_duplicate_venue_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_duplicate_venue.yaml")

        # Setup: create test data
        os.makedirs(os.path.join(BASE_DIR, "input_duplicate_venue"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_duplicate_venue", "test.csv"),
            "w",
            encoding="utf-8",
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "issn:1756-1833",
                    "BMJ",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "journal",
                    "BMJ [crossref:239]",
                    "",
                ]
            )
            writer.writerow(
                [
                    "",  # id
                    "",  # title
                    "",  # author
                    "",  # pub_date
                    "BMJ [issn:0267-0623 issn:0959-8138 issn:1468-5833 issn:0007-1447]",  # venue
                    "283",  # volume
                    "",  # issue
                    "",  # page
                    "journal volume",  # type
                    "BMJ [crossref:239]",  # publisher
                    "",  # editor
                ]
            )

        # Setup: insert pre-existing data - add the initial identifiers
        sparql = SPARQLWrapper(SERVER)
        sparql.setMethod(POST)
        sparql.setQuery(
            """
            INSERT DATA {
                GRAPH <https://w3id.org/oc/meta/br/> {
                    # First venue - BMJ with initial ISSNs
                    <https://w3id.org/oc/meta/br/0601>
                        <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601>, <https://w3id.org/oc/meta/id/0602> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
                        <http://purl.org/dc/terms/title> "BMJ" .

                    # Second venue
                    <https://w3id.org/oc/meta/br/0602>
                        <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0603> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
                        <http://purl.org/dc/terms/title> "British Medical Journal" .
                }
                GRAPH <https://w3id.org/oc/meta/id/> {
                    # First venue's ISSNs
                    <https://w3id.org/oc/meta/id/0601>
                        <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    <https://w3id.org/oc/meta/id/0602>
                        <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0959-8138" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    # Second venue's ISSN
                    <https://w3id.org/oc/meta/id/0603>
                        <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0267-0623" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                }
            }
            """
        )
        sparql.query()

        # Update Redis counters for the pre-existing entities
        redis_handler = RedisCounterHandler(db=5)
        redis_handler.set_counter(
            6, "br", supplier_prefix="060"
        )  # Updated to account for 6 entities (2 venues + 4 volumes)
        redis_handler.set_counter(
            3, "id", supplier_prefix="060"
        )  # Correct: 3 IDs (1756-1833, 0959-8138, 0267-0623)
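        # With br seeded at 6 and id at 3, newly minted entities start from
        # br/0607 and id/0604; the run should connect the two pre-existing
        # venue records through their shared ISSNs rather than minting fresh
        # identifiers for them.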

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_duplicate_venue"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "workers_number": 1,
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        with open(meta_config_path, "w") as f:
            yaml.dump(settings, f)

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Query to check for duplicates - check all ISSNs
        query = """
            SELECT DISTINCT ?id ?value
            WHERE {
                ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ;
                    <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                FILTER(STR(?value) IN ("1756-1833", "0959-8138", "0267-0623"))
            }
        """
        result = execute_sparql_query(SERVER, query, return_format=JSON)
        # Group IDs by value to check for duplicates
        ids_by_value = {}
        for binding in result["results"]["bindings"]:
            value = binding["value"]["value"]
            id = binding["id"]["value"]
            if value not in ids_by_value:
                ids_by_value[value] = []
            ids_by_value[value].append(id)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(
            os.path.join(BASE_DIR, "input_duplicate_venue"), ignore_errors=True
        )
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)

        # Check that we don't have duplicate IDs for any ISSN
        for issn_value, ids in ids_by_value.items():
            self.assertEqual(
                len(ids), 1, f"Found multiple IDs for ISSN {issn_value} in venue: {ids}"
            )

        # Verify that pre-existing IDs were reused
        self.assertTrue(
            any("0601" in id for ids in ids_by_value.values() for id in ids)
            and any("0602" in id for ids in ids_by_value.values() for id in ids),
            "Pre-existing IDs were not reused",
        )

    def test_doi_with_multiple_slashes(self):
        """Test handling of DOIs containing multiple forward slashes"""
        output_folder = os.path.join(BASE_DIR, "output_doi_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_doi.yaml")

        # Setup: create test data with problematic DOI
        os.makedirs(os.path.join(BASE_DIR, "input_doi"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_doi", "test.csv"), "w", encoding="utf-8"
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "doi:10.1093/acprof:oso/9780199230723.001.0001",  # Problematic DOI with multiple slashes
                    "Test Book",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "book",
                    "",
                    "",
                ]
            )
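        # DOIs of this form contain a colon and a second slash after the
        # registrant prefix; the assertions below check that the full string
        # survives as a single identifier literal.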
1670 # Create test settings
1671 settings = {
1672 "triplestore_url": SERVER,
1673 "provenance_triplestore_url": PROV_SERVER,
1674 "input_csv_dir": os.path.join(BASE_DIR, "input_doi"),
1675 "base_output_dir": output_folder,
1676 "output_rdf_dir": output_folder,
1677 "resp_agent": "test",
1678 "base_iri": "https://w3id.org/oc/meta/",
1679 "context_path": None,
1680 "dir_split_number": 10000,
1681 "items_per_file": 1000,
1682 "default_dir": "_",
1683 "rdf_output_in_chunks": False,
1684 "zip_output_rdf": True,
1685 "source": None,
1686 "supplier_prefix": "060",
1687 "workers_number": 1,
1688 "use_doi_api_service": False,
1689 "blazegraph_full_text_search": False,
1690 "virtuoso_full_text_search": True,
1691 "fuseki_full_text_search": False,
1692 "graphdb_connector_name": None,
1693 "cache_endpoint": None,
1694 "cache_update_endpoint": None,
1695 "silencer": [],
1696 "redis_cache_db": 2,
1697 "ts_upload_cache": self.cache_file,
1698 "ts_failed_queries": self.failed_file,
1699 "ts_stop_file": self.stop_file,
1700 }
1702 with open(meta_config_path, "w") as f:
1703 yaml.dump(settings, f)
1705 now = datetime.now()
1707 # Run the process
1708 run_meta_process(settings=settings, meta_config_path=meta_config_path)
1710 # Query to verify DOI was processed correctly
1711 query = """
1712 SELECT ?br ?id ?value
1713 WHERE {
1714 ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "10.1093/acprof:oso/9780199230723.001.0001"^^<http://www.w3.org/2001/XMLSchema#string> ;
1715 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/doi> ;
1716 ^<http://purl.org/spar/datacite/hasIdentifier> ?br .
1717 }
1718 """
1719 result = execute_sparql_query(SERVER, query, return_format=JSON)
1721 # Cleanup
1722 shutil.rmtree(output_folder, ignore_errors=True)
1723 shutil.rmtree(os.path.join(BASE_DIR, "input_doi"), ignore_errors=True)
1724 if os.path.exists(meta_config_path):
1725 os.remove(meta_config_path)
1726 delete_output_zip(".", now)
1728 # Verify results
1729 self.assertTrue(
1730 len(result["results"]["bindings"]) > 0,
1731 "DOI with multiple slashes was not processed correctly",
1732 )
1734 # Check that we got exactly one result
1735 self.assertEqual(
1736 len(result["results"]["bindings"]),
1737 1,
1738 f"Expected 1 result, got {len(result['results']['bindings'])}",
1739 )
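# Editor's sketch (a helper of ours, not oc_meta API): a DOI's prefix and
# suffix may only be split at the FIRST slash, because the suffix itself can
# contain any number of slashes, as in the identifier exercised above.
@staticmethod
def _split_doi(doi: str):
    prefix, _, suffix = doi.partition("/")
    return prefix, suffix  # e.g. ("10.1093", "acprof:oso/9780199230723.001.0001")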
1741 def test_volume_issue_deduplication(self):
1742 """Test to verify that volumes and issues are properly deduplicated"""
1743 output_folder = os.path.join(BASE_DIR, "output_vvi_test")
1744 meta_config_path = os.path.join(BASE_DIR, "meta_config_vvi.yaml")
1746 # Setup: create test data
1747 os.makedirs(os.path.join(BASE_DIR, "input_vvi"), exist_ok=True)
1748 with open(
1749 os.path.join(BASE_DIR, "input_vvi", "test.csv"), "w", encoding="utf-8"
1750 ) as f:
1751 writer = csv.writer(f)
1752 writer.writerow(
1753 [
1754 "id",
1755 "title",
1756 "author",
1757 "pub_date",
1758 "venue",
1759 "volume",
1760 "issue",
1761 "page",
1762 "type",
1763 "publisher",
1764 "editor",
1765 ]
1766 )
1767 # First article in volume 1, issue 1
1768 writer.writerow(
1769 [
1770 "doi:10.1234/test.1",
1771 "First Article",
1772 "",
1773 "2023",
1774 "Test Journal [issn:1756-1833]",
1775 "1",
1776 "1",
1777 "1-10",
1778 "journal article",
1779 "",
1780 "",
1781 ]
1782 )
1783 # Second article in same volume and issue
1784 writer.writerow(
1785 [
1786 "doi:10.1234/test.2",
1787 "Second Article",
1788 "",
1789 "2023",
1790 "Test Journal [issn:1756-1833]",
1791 "1",
1792 "1",
1793 "11-20",
1794 "journal article",
1795 "",
1796 "",
1797 ]
1798 )
1800 # Create test settings
1801 settings = {
1802 "triplestore_url": SERVER,
1803 "provenance_triplestore_url": PROV_SERVER,
1804 "input_csv_dir": os.path.join(BASE_DIR, "input_vvi"),
1805 "base_output_dir": output_folder,
1806 "output_rdf_dir": output_folder,
1807 "resp_agent": "test",
1808 "base_iri": "https://w3id.org/oc/meta/",
1809 "context_path": None,
1810 "dir_split_number": 10000,
1811 "items_per_file": 1000,
1812 "default_dir": "_",
1813 "rdf_output_in_chunks": False,
1814 "zip_output_rdf": True,
1815 "source": None,
1816 "supplier_prefix": "060",
1817 "workers_number": 1,
1818 "use_doi_api_service": False,
1819 "blazegraph_full_text_search": False,
1820 "virtuoso_full_text_search": True,
1821 "fuseki_full_text_search": False,
1822 "graphdb_connector_name": None,
1823 "cache_endpoint": None,
1824 "cache_update_endpoint": None,
1825 "silencer": [],
1826 "redis_cache_db": 2,
1827 "ts_upload_cache": self.cache_file,
1828 "ts_failed_queries": self.failed_file,
1829 "ts_stop_file": self.stop_file,
1830 }
1832 with open(meta_config_path, "w") as f:
1833 yaml.dump(settings, f)
1835 # Run the process
1836 run_meta_process(settings=settings, meta_config_path=meta_config_path)
1838 # Query to check volume and issue structure
1839 query = """
1840 PREFIX fabio: <http://purl.org/spar/fabio/>
1841 PREFIX frbr: <http://purl.org/vocab/frbr/core#>
1842 PREFIX prism: <http://prismstandard.org/namespaces/basic/2.0/>
1844 SELECT ?article ?volume ?issue ?seq_id
1845 WHERE {
1846 ?article a fabio:JournalArticle ;
1847 frbr:partOf ?issue .
1848 ?issue a fabio:JournalIssue ;
1849 fabio:hasSequenceIdentifier ?seq_id ;
1850 frbr:partOf ?volume .
1851 ?volume a fabio:JournalVolume .
1852 }
1853 ORDER BY ?article
1854 """
1856 result = execute_sparql_query(SERVER, query)
1858 # Cleanup
1859 shutil.rmtree(output_folder, ignore_errors=True)
1860 shutil.rmtree(os.path.join(BASE_DIR, "input_vvi"), ignore_errors=True)
1861 if os.path.exists(meta_config_path):
1862 os.remove(meta_config_path)
1864 # Verify results
1865 bindings = result["results"]["bindings"]
1867 # Should have 2 articles
1868 self.assertEqual(len(bindings), 2, "Expected 2 articles")
1870 # Both articles should reference the same volume and issue
1871 first_volume = bindings[0]["volume"]["value"]
1872 first_issue = bindings[0]["issue"]["value"]
1874 for binding in bindings[1:]:
1875 self.assertEqual(
1876 binding["volume"]["value"],
1877 first_volume,
1878 "Articles reference different volumes",
1879 )
1880 self.assertEqual(
1881 binding["issue"]["value"],
1882 first_issue,
1883 "Articles reference different issues",
1884 )
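# Editor's note (inferred from this test, not from oc_meta internals): a
# volume or issue counts as "the same" when both its container and its
# fabio:hasSequenceIdentifier match, so the two rows above share a single
# JournalVolume "1" and a single JournalIssue "1" instead of minting four
# new entities.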
1886 def test_volume_issue_deduplication_with_triplestore(self):
1887 """Test that volumes and issues are properly deduplicated when they already exist in the triplestore"""
1888 output_folder = os.path.join(BASE_DIR, "output_vvi_triplestore_test")
1889 meta_config_path = os.path.join(BASE_DIR, "meta_config_vvi_triplestore.yaml")
1891 # Setup: Insert pre-existing venue with duplicate volumes and issues (with/without datatype)
1892 sparql = SPARQLWrapper(SERVER)
1893 sparql.setMethod(POST)
1894 sparql.setQuery(
1895 """
1896 INSERT DATA {
1897 GRAPH <https://w3id.org/oc/meta/br/> {
1898 # Venue
1899 <https://w3id.org/oc/meta/br/0601>
1900 <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601> ;
1901 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
1902 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1903 <http://purl.org/dc/terms/title> "Test Journal" .
1905 # Volume 1 (without datatype)
1906 <https://w3id.org/oc/meta/br/0602>
1907 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalVolume> ;
1908 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1909 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0601> ;
1910 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1" .
1912 # Volume 1 (with datatype)
1913 <https://w3id.org/oc/meta/br/0604>
1914 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalVolume> ;
1915 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1916 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0601> ;
1917 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1"^^<http://www.w3.org/2001/XMLSchema#string> .
1919 # Issue 1 (without datatype)
1920 <https://w3id.org/oc/meta/br/0603>
1921 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalIssue> ;
1922 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1923 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0602> ;
1924 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1" .
1926 # Issue 1 (with datatype)
1927 <https://w3id.org/oc/meta/br/0605>
1928 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalIssue> ;
1929 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1930 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0604> ;
1931 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1"^^<http://www.w3.org/2001/XMLSchema#string> .
1932 }
1933 GRAPH <https://w3id.org/oc/meta/id/> {
1934 <https://w3id.org/oc/meta/id/0601>
1935 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ;
1936 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
1937 }
1938 }
1939 """
1940 )
1941 sparql.query()
1943 # Update Redis counters for pre-existing entities
1944 redis_handler = RedisCounterHandler(db=5)
1945 redis_handler.set_counter(
1946 5, "br", supplier_prefix="060"
1947 ) # 5 entities: venue, 2 volumes, 2 issues
1948 redis_handler.set_counter(
1949 1, "id", supplier_prefix="060"
1950 ) # 1 identifier for venue
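# Sanity-check sketch (assumption: RedisCounterHandler exposes a read_counter
# symmetric to set_counter; verify against oc_ocdm before relying on it):
#
#     assert redis_handler.read_counter("br", supplier_prefix="060") == 5
#     assert redis_handler.read_counter("id", supplier_prefix="060") == 1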
1952 # Create test data - article that should use existing volume and issue
1953 os.makedirs(os.path.join(BASE_DIR, "input_vvi_triplestore"), exist_ok=True)
1954 with open(
1955 os.path.join(BASE_DIR, "input_vvi_triplestore", "test.csv"),
1956 "w",
1957 encoding="utf-8",
1958 ) as f:
1959 writer = csv.writer(f)
1960 writer.writerow(
1961 [
1962 "id",
1963 "title",
1964 "author",
1965 "pub_date",
1966 "venue",
1967 "volume",
1968 "issue",
1969 "page",
1970 "type",
1971 "publisher",
1972 "editor",
1973 ]
1974 )
1975 writer.writerow(
1976 [
1977 "doi:10.1234/test.1",
1978 "Test Article",
1979 "",
1980 "2023",
1981 "Test Journal [issn:1756-1833]",
1982 "1", # Should match existing volume
1983 "1", # Should match existing issue
1984 "1-10",
1985 "journal article",
1986 "",
1987 "",
1988 ]
1989 )
1991 # Create test settings
1992 settings = {
1993 "triplestore_url": SERVER,
1994 "provenance_triplestore_url": PROV_SERVER,
1995 "input_csv_dir": os.path.join(BASE_DIR, "input_vvi_triplestore"),
1996 "base_output_dir": output_folder,
1997 "output_rdf_dir": output_folder,
1998 "resp_agent": "test",
1999 "base_iri": "https://w3id.org/oc/meta/",
2000 "context_path": None,
2001 "dir_split_number": 10000,
2002 "items_per_file": 1000,
2003 "default_dir": "_",
2004 "rdf_output_in_chunks": False,
2005 "zip_output_rdf": True,
2006 "source": None,
2007 "supplier_prefix": "060",
2008 "workers_number": 1,
2009 "use_doi_api_service": False,
2010 "blazegraph_full_text_search": False,
2011 "virtuoso_full_text_search": True,
2012 "fuseki_full_text_search": False,
2013 "graphdb_connector_name": None,
2014 "cache_endpoint": None,
2015 "cache_update_endpoint": None,
2016 "silencer": [],
2017 "redis_cache_db": 2,
2018 "ts_upload_cache": self.cache_file,
2019 "ts_failed_queries": self.failed_file,
2020 "ts_stop_file": self.stop_file,
2021 }
2023 with open(meta_config_path, "w") as f:
2024 yaml.dump(settings, f)
2026 # Run the process
2027 run_meta_process(settings=settings, meta_config_path=meta_config_path)
2029 # Check if new volumes/issues were created
2030 to_be_uploaded_dir = os.path.join(output_folder, "rdf", "to_be_uploaded")
2031 new_entities_created = False
2032 if os.path.exists(to_be_uploaded_dir):
2033 for dirpath, _, filenames in os.walk(to_be_uploaded_dir):
2034 for f in filenames:
2035 if f.endswith(".sparql"):
2036 with open(os.path.join(dirpath, f)) as file:
2037 content = file.read()
2038 if any(
2039 "JournalVolume" in line or "JournalIssue" in line
2040 for line in content.splitlines()
2041 ):
1942 print(f"\nFound new volume/issue creation in {f}")
2043 new_entities_created = True
2045 # Query to get all entities and their relationships
2046 query = """
2047 PREFIX fabio: <http://purl.org/spar/fabio/>
2048 PREFIX frbr: <http://purl.org/vocab/frbr/core#>
2049 PREFIX datacite: <http://purl.org/spar/datacite/>
2051 SELECT DISTINCT ?article ?venue ?volume ?issue ?issn
2052 WHERE {
2053 ?article a fabio:JournalArticle ;
2054 frbr:partOf ?issue .
2055 ?issue a fabio:JournalIssue ;
2056 frbr:partOf ?volume .
2057 ?volume a fabio:JournalVolume ;
2058 frbr:partOf ?venue .
2059 ?venue datacite:hasIdentifier ?id .
2060 ?id datacite:usesIdentifierScheme datacite:issn ;
2061 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?issn .
2062 }
2063 """
2065 result = execute_sparql_query(SERVER, query)
2067 # Cleanup
2068 shutil.rmtree(output_folder, ignore_errors=True)
2069 shutil.rmtree(
2070 os.path.join(BASE_DIR, "input_vvi_triplestore"), ignore_errors=True
2071 )
2072 if os.path.exists(meta_config_path):
2073 os.remove(meta_config_path)
2075 # Verify results
2076 bindings = result["results"]["bindings"]
2077 self.assertEqual(len(bindings), 1, "Expected exactly one article")
2079 # Get the URIs from the result
2080 venue_uri = bindings[0]["venue"]["value"]
2081 volume_uri = bindings[0]["volume"]["value"]
2082 issue_uri = bindings[0]["issue"]["value"]
2083 issn = bindings[0]["issn"]["value"]
2085 # Check if venue was deduplicated (should use existing venue)
2086 self.assertEqual(
2087 venue_uri,
2088 "https://w3id.org/oc/meta/br/0601",
2089 "Venue was not deduplicated correctly",
2090 )
2092 # Check if volume was deduplicated - either version is valid
2093 self.assertIn(
2094 volume_uri,
2095 ["https://w3id.org/oc/meta/br/0602", "https://w3id.org/oc/meta/br/0604"],
2096 "Volume was not deduplicated correctly - should use one of the existing volumes",
2097 )
2099 # Check if issue was deduplicated - either version is valid
2100 self.assertIn(
2101 issue_uri,
2102 ["https://w3id.org/oc/meta/br/0603", "https://w3id.org/oc/meta/br/0605"],
2103 "Issue was not deduplicated correctly - should use one of the existing issues",
2104 )
2106 # Check ISSN
2107 self.assertEqual(issn, "1756-1833", "ISSN does not match")
2109 # Verify no new volumes/issues were created
2110 self.assertFalse(
2111 new_entities_created,
2112 "New volumes/issues were created when they should have been deduplicated",
2113 )
2115 def test_temporary_identifiers(self):
2116 """Test that temporary identifiers are used for deduplication but not saved, and an OMID is generated"""
2117 output_folder = os.path.join(BASE_DIR, "output_temp_id_test")
2118 meta_config_path = os.path.join(BASE_DIR, "meta_config_temp.yaml")
2120 # Setup: create test data with only temporary identifier
2121 os.makedirs(os.path.join(BASE_DIR, "input_temp"), exist_ok=True)
2122 with open(
2123 os.path.join(BASE_DIR, "input_temp", "test.csv"), "w", encoding="utf-8"
2124 ) as f:
2125 writer = csv.writer(f)
2126 writer.writerow(
2127 [
2128 "id",
2129 "title",
2130 "author",
2131 "pub_date",
2132 "venue",
2133 "volume",
2134 "issue",
2135 "page",
2136 "type",
2137 "publisher",
2138 "editor",
2139 ]
2140 )
2141 writer.writerow(
2142 [
2143 "temp:567", # Only temporary identifier
2144 "Test Article",
2145 "",
2146 "2023",
2147 "",
2148 "",
2149 "",
2150 "",
2151 "journal article",
2152 "",
2153 "",
2154 ]
2155 )
2157 # Create test settings
2158 settings = {
2159 "triplestore_url": SERVER,
2160 "provenance_triplestore_url": PROV_SERVER,
2161 "input_csv_dir": os.path.join(BASE_DIR, "input_temp"),
2162 "base_output_dir": output_folder,
2163 "output_rdf_dir": output_folder,
2164 "resp_agent": "test",
2165 "base_iri": "https://w3id.org/oc/meta/",
2166 "context_path": None,
2167 "dir_split_number": 10000,
2168 "items_per_file": 1000,
2169 "default_dir": "_",
2170 "rdf_output_in_chunks": False,
2171 "zip_output_rdf": True,
2172 "source": None,
2173 "supplier_prefix": "060",
2174 "workers_number": 1,
2175 "use_doi_api_service": False,
2176 "blazegraph_full_text_search": False,
2177 "virtuoso_full_text_search": True,
2178 "fuseki_full_text_search": False,
2179 "graphdb_connector_name": None,
2180 "cache_endpoint": None,
2181 "cache_update_endpoint": None,
2182 "silencer": [],
2183 "redis_cache_db": 2,
2184 "ts_upload_cache": self.cache_file,
2185 "ts_failed_queries": self.failed_file,
2186 "ts_stop_file": self.stop_file,
2187 }
2189 with open(meta_config_path, "w") as f:
2190 yaml.dump(settings, f)
2192 now = datetime.now()
2194 # Run the process
2195 run_meta_process(settings=settings, meta_config_path=meta_config_path)
2197 # Query to verify an OMID was generated and no temporary identifier was saved
2198 query = """
2199 PREFIX fabio: <http://purl.org/spar/fabio/>
2200 PREFIX datacite: <http://purl.org/spar/datacite/>
2201 PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
2203 SELECT ?br ?id ?value ?scheme
2204 WHERE {
2205 ?br a fabio:JournalArticle .
2206 OPTIONAL {
2207 ?br datacite:hasIdentifier ?id .
2208 ?id datacite:usesIdentifierScheme ?scheme ;
2209 literal:hasLiteralValue ?value .
2210 }
2211 }
2212 """
2213 result = execute_sparql_query(SERVER, query, return_format=JSON)
2215 # Cleanup
2216 shutil.rmtree(output_folder, ignore_errors=True)
2217 shutil.rmtree(os.path.join(BASE_DIR, "input_temp"), ignore_errors=True)
2218 if os.path.exists(meta_config_path):
2219 os.remove(meta_config_path)
2220 delete_output_zip(".", now)
2222 # Verify results
2223 bindings = result["results"]["bindings"]
2225 # Should find exactly one article
2226 self.assertEqual(len(bindings), 1, "Expected exactly one article")
2228 # The article should have a br/ URI (OMID)
2229 br_uri = bindings[0]["br"]["value"]
2230 self.assertTrue(
2231 "br/" in br_uri,
2232 f"Article URI {br_uri} does not contain expected OMID pattern 'br/'",
2233 )
2235 # Should not have any saved identifiers
2236 self.assertNotIn(
2237 "id",
2238 bindings[0],
2239 "Found unexpected identifier when only temporary ID was provided",
2240 )
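# Editor's sketch (our helper, not oc_meta API): "temp:" identifiers take
# part in in-memory deduplication but must never be persisted, which is what
# the OPTIONAL query above asserts.
@staticmethod
def _is_persistable_id(identifier: str) -> bool:
    return not identifier.startswith("temp:")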
2242 def test_temporary_identifiers_deduplication(self):
2243 """Test that multiple rows with the same temporary identifier are correctly deduplicated"""
2244 # Create test data with two rows using the same temporary identifier
2245 test_data = [
2246 {
2247 "id": "temp:789",
2248 "title": "Test Article 1",
2249 "author": "Smith, John [orcid:0000-0002-1234-5678]",
2250 "pub_date": "2020",
2251 "venue": "",
2252 "volume": "",
2253 "issue": "",
2254 "page": "",
2255 "type": "journal article",
2256 "publisher": "",
2257 "editor": "",
2258 },
2259 {
2260 "id": "temp:789", # Same temporary ID
2261 "title": "Test Article 1", # Same title
2262 "author": "Smith, John [orcid:0000-0002-1234-5678]",
2263 "pub_date": "2020",
2264 "venue": "",
2265 "volume": "",
2266 "issue": "",
2267 "page": "",
2268 "type": "journal article",
2269 "publisher": "",
2270 "editor": "",
2271 },
2272 ]
2274 # Write test data to CSV
2275 input_dir = os.path.join(BASE_DIR, "input_temp_dedup")
2276 os.makedirs(input_dir, exist_ok=True)
2277 csv_path = os.path.join(input_dir, "test.csv")
2278 write_csv(csv_path, test_data)
2280 # Run meta process
2281 output_dir = os.path.join(BASE_DIR, "output_temp_dedup")
2282 os.makedirs(output_dir, exist_ok=True)
2283 config = {
2284 "input_csv_dir": input_dir,
2285 "base_output_dir": output_dir,
2286 "output_rdf_dir": output_dir,
2287 "triplestore_url": SERVER,
2288 "provenance_triplestore_url": PROV_SERVER,
2289 "resp_agent": "https://w3id.org/oc/meta/prov/pa/1",
2290 "base_iri": "https://w3id.org/oc/meta/",
2291 "context_path": "https://w3id.org/oc/meta/context.json",
2292 "supplier_prefix": "060",
2293 "dir_split_number": 10000,
2294 "items_per_file": 1000,
2295 "default_dir": "_",
2296 "rdf_output_in_chunks": True,
2297 "zip_output_rdf": False,
2298 "source": None,
2299 "use_doi_api_service": False,
2300 "workers_number": 1,
2301 "silencer": [],
2302 "redis_host": "localhost",
2303 "redis_port": 6379,
2304 "redis_db": 5,
2305 "redis_cache_db": 2,
2306 "ts_upload_cache": self.cache_file,
2307 "ts_failed_queries": self.failed_file,
2308 "ts_stop_file": self.stop_file,
2309 "graphdb_connector_name": None,
2310 "blazegraph_full_text_search": False,
2311 "fuseki_full_text_search": False,
2312 "virtuoso_full_text_search": False,
2313 "provenance_endpoints": [],
2314 "cache_endpoint": None,
2315 "cache_update_endpoint": None,
2316 "normalize_titles": True,
2317 }
2318 config_path = os.path.join(output_dir, "config.yaml")
2319 with open(config_path, "w") as f:
2320 yaml.dump(config, f)
2322 # Run the process
2323 run_meta_process(settings=config, meta_config_path=config_path)
2325 # Query the triplestore to verify that a single OMID was generated
2326 # for both rows (the absence of the temporary identifier itself is
2327 # covered by test_temporary_identifiers above)
2328 query = """
2329 SELECT DISTINCT ?br
2330 WHERE {
2331 ?br a <http://purl.org/spar/fabio/JournalArticle> .
2332 }
2333 """
2334 results = execute_sparql_query(SERVER, query)
2336 # Clean up
2337 shutil.rmtree(input_dir)
2338 shutil.rmtree(output_dir)
2340 # Should only be one article
2341 articles = [
2342 str(result["br"]["value"]) for result in results["results"]["bindings"]
2343 ]
2344 self.assertEqual(
2345 len(articles), 1, "Should only be one article after deduplication"
2346 )
2349def normalize_graph(graph):
2350 """
2351 Normalize the literals in the graph by removing explicit datatypes.
2352 """
2353 normalized_graph = Graph()
2354 for subject, predicate, obj in graph:
2355 if isinstance(obj, Literal) and obj.datatype is not None:
2356 normalized_obj = Literal(obj.toPython())
2357 normalized_graph.add((subject, predicate, normalized_obj))
2358 else:
2359 normalized_graph.add((subject, predicate, obj))
2360 return normalized_graph
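# Editor's usage sketch for normalize_graph (hypothetical data, not part of
# the original suite). A plain literal and an xsd:string-typed literal become
# isomorphic after normalization, the same "1" vs "1"^^xsd:string duplication
# seeded in the triplestore test above. Caveat: Literal(obj.toPython()) only
# yields a plain literal for string-valued datatypes; rdflib re-types numeric
# values such as "1"^^xsd:integer.
def normalize_graph_demo():  # pragma: no cover
    s = URIRef("http://example.org/s")
    p = URIRef("http://example.org/p")
    plain, typed = Graph(), Graph()
    plain.add((s, p, Literal("1")))
    typed.add((s, p, Literal("1", datatype=URIRef("http://www.w3.org/2001/XMLSchema#string"))))
    assert normalize_graph(plain).isomorphic(normalize_graph(typed))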
2363if __name__ == "__main__": # pragma: no cover
2364 unittest.main()