Coverage for test/meta_process_test.py: 95%
537 statements
coverage.py v6.5.0, created at 2025-12-20 08:55 +0000
import csv
import glob
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import unittest
from datetime import datetime
from test.test_utils import (PROV_SERVER, SERVER, execute_sparql_construct,
                             execute_sparql_query, reset_redis_counters,
                             reset_server, wait_for_virtuoso)

import yaml
from oc_meta.lib.file_manager import get_csv_data, write_csv
from oc_meta.run.meta_process import run_meta_process
from oc_ocdm.counter_handler.redis_counter_handler import RedisCounterHandler
from rdflib import Dataset, Graph, Literal, URIRef
from sparqlite import SPARQLClient

BASE_DIR = os.path.join("test", "meta_process")


def delete_output_zip(base_dir: str, start_time: datetime) -> None:
    for file in os.listdir(base_dir):
        if file.startswith("meta_output") and file.endswith(".zip"):
            file_creation_time = file.split("meta_output_")[1].replace(".zip", "")
            file_creation_time = datetime.strptime(
                file_creation_time, "%Y-%m-%dT%H_%M_%S_%f"
            )
            was_created_after_time = file_creation_time > start_time
            if was_created_after_time:
                os.remove(os.path.join(base_dir, file))

class test_ProcessTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Setup executed once for the whole test class"""
        if not wait_for_virtuoso(SERVER, max_wait=30):
            raise TimeoutError("Virtuoso not ready after 30 seconds")

    def setUp(self):
        """Setup executed before each test"""
        # Create temporary directory for cache files
        self.temp_dir = tempfile.mkdtemp()
        self.cache_file = os.path.join(self.temp_dir, "ts_upload_cache.json")
        self.failed_file = os.path.join(self.temp_dir, "failed_queries.txt")
        self.stop_file = os.path.join(self.temp_dir, ".stop_upload")

        # Reset the database
        reset_server()
        reset_redis_counters()

    def tearDown(self):
        reset_redis_counters()
        # Remove temporary directory and its contents
        if hasattr(self, "temp_dir") and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

        # Clean up bulk load files
        bulk_load_dirs = [
            "test/test_virtuoso_db/bulk_load",
            "test/test_virtuoso_db_prov/bulk_load",
        ]
        for bulk_dir in bulk_load_dirs:
            if os.path.exists(bulk_dir):
                for file in glob.glob(os.path.join(bulk_dir, "*.nq.gz")):
                    os.remove(file)
                for file in glob.glob(os.path.join(bulk_dir, "*.backup")):
                    os.remove(file)

        for i in range(1, 11):
            output_dir = os.path.join(BASE_DIR, f"output_{i}")
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

    def test_run_meta_process(self):
        output_folder = os.path.join(BASE_DIR, "output_1")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_1.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        now = datetime.now()
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/0601",
                "title": "",
                "author": "",
                "pub_date": "",
                "venue": "Scientometrics [issn:0138-9130 issn:1588-2861 omid:br/0603]",
                "volume": "26",
                "issue": "",
                "page": "",
                "type": "journal article",
                "publisher": "Consulting Company Ucom [crossref:6623 omid:ra/0601]",
                "editor": "Naimi, Elmehdi [orcid:0000-0002-4126-8519 omid:ra/0602]",
            },
            {
                "id": "issn:1524-4539 issn:0009-7322 omid:br/0602",
                "title": "Circulation",
                "author": "",
                "pub_date": "",
                "venue": "",
                "volume": "",
                "issue": "",
                "page": "",
                "type": "journal",
                "publisher": "",
                "editor": "",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.069 omid:br/0605",
                "title": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                "author": "Cheigh, Chan-Ick [orcid:0000-0003-2542-5788 omid:ra/0603]; Mun, Ji-Hye [omid:ra/0604]; Chung, Myong-Soo [omid:ra/0605]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/0608]",
                "volume": "25",
                "issue": "1",
                "page": "69-76",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/0606]",
                "editor": "Chung, Myong-Soo [orcid:0000-0002-9666-2513 omid:ra/0607]",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.077 omid:br/0606",
                "title": "Properties Of Immature Green Cherry Tomato Pickles",
                "author": "Koh, Jong-Ho [omid:ra/0608]; Shin, Hae-Hun [omid:ra/0609]; Kim, Young-Shik [orcid:0000-0001-5673-6314 omid:ra/06010]; Kook, Moo-Chang [omid:ra/06011]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/0608]",
                "volume": "",
                "issue": "2",
                "page": "77-82",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/0606]",
                "editor": "",
            },
            {
                "id": "doi:10.1097/01.rct.0000185385.35389.cd omid:br/0607",
                "title": "Comprehensive Assessment Of Lung CT Attenuation Alteration At Perfusion Defects Of Acute Pulmonary Thromboembolism With Breath-Hold SPECT-CT Fusion Images",
                "author": "Suga, Kazuyoshi [omid:ra/06012]; Kawakami, Yasuhiko [omid:ra/06013]; Iwanaga, Hideyuki [omid:ra/06014]; Hayashi, Noriko [omid:ra/06015]; Seto, Aska [omid:ra/06016]; Matsunaga, Naofumi [omid:ra/06017]",
                "pub_date": "2006-01",
                "venue": "Journal Of Computer Assisted Tomography [issn:0363-8715 omid:br/06012]",
                "volume": "30",
                "issue": "1",
                "page": "83-91",
                "type": "journal article",
                "publisher": "Ovid Technologies (Wolters Kluwer Health) [crossref:276 omid:ra/06018]",
                "editor": "",
            },
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.maxDiff = None
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(output, expected_output)

    def test_run_meta_process_ids_only(self):
        output_folder = os.path.join(BASE_DIR, "output_5")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_5.yaml")
        now = datetime.now()
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/0601",
                "title": "Some Aspects Of The Evolution Of Chernozems Under The Influence Of Natural And Anthropogenic Factors",
                "author": "[orcid:0000-0002-4126-8519 omid:ra/0601]; [orcid:0000-0003-0530-4305 omid:ra/0602]",
                "pub_date": "2015-08-22",
                "venue": "[issn:1225-4339 omid:br/0602]",
                "volume": "26",
                "issue": "",
                "page": "50",
                "type": "journal article",
                "publisher": "[crossref:6623 omid:ra/0603]",
                "editor": "[orcid:0000-0002-4126-8519 omid:ra/0601]; [orcid:0000-0002-8420-0696 omid:ra/0604]",
            }
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.maxDiff = None
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(output, expected_output)

    def test_provenance(self):
        # Bulk load disabled in meta_config_3.yaml
        output_folder = os.path.join(BASE_DIR, "output_3")
        now = datetime.now()
        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        meta_config_path = os.path.join(BASE_DIR, "meta_config_3.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        reset_server()

        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input_2")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        output = dict()

        entity_types = ['ar', 'br', 'id', 'ra', 're']

        for entity_type in entity_types:
            query = f"""
            SELECT ?s ?p ?o
            WHERE {{
                ?s ?p ?o .
                FILTER(REGEX(STR(?s), "https://w3id.org/oc/meta/{entity_type}/[0-9]+/prov/se/[0-9]+"))
            }}
            """

            result = execute_sparql_query(PROV_SERVER, query)

            entities = {}
            for binding in result['results']['bindings']:
                s_str = binding['s']['value']
                p_str = binding['p']['value']
                o_data = binding['o']

                if s_str not in entities:
                    entities[s_str] = {'@id': s_str, '@type': []}

                if p_str == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type':
                    entities[s_str]['@type'].append(o_data['value'])
                else:
                    if p_str not in entities[s_str]:
                        entities[s_str][p_str] = []

                    if o_data['type'] == 'uri':
                        entities[s_str][p_str].append({'@id': o_data['value']})
                    elif o_data.get('datatype'):
                        entities[s_str][p_str].append({
                            '@value': o_data['value'],
                            '@type': o_data['datatype']
                        })
                    else:
                        entities[s_str][p_str].append({'@value': o_data['value']})

            # Group entities by their parent entity (e.g., br/0601/prov/se/1 -> br/0601)
            grouped_entities = {}
            for entity_id, entity_data in entities.items():
                # Extract the parent entity ID from the provenance entity ID
                parent_id = re.match(r'https://w3id.org/oc/meta/([^/]+/[0-9]+)', entity_id).group(0)

                if parent_id not in grouped_entities:
                    grouped_entities[parent_id] = []

                # Filter out properties we don't need for comparison
                filtered_entity_data = {
                    '@id': entity_data['@id'],
                }

                # Keep the required properties for comparison
                properties_to_keep = [
                    'http://www.w3.org/ns/prov#specializationOf',
                    'http://www.w3.org/ns/prov#wasDerivedFrom'
                ]

                for prop in properties_to_keep:
                    if prop in entity_data:
                        filtered_entity_data[prop] = entity_data[prop]

                # Handle hasUpdateQuery specially
                if 'https://w3id.org/oc/ontology/hasUpdateQuery' in entity_data:
                    # Extract the value from the hasUpdateQuery property
                    update_query_value = entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'][0].get('@value', '')

                    # Split the query into individual statements
                    if update_query_value:
                        # Extract the part between the INSERT DATA { GRAPH <...> { and } }
                        try:
                            query_content = update_query_value.split(
                                "INSERT DATA { GRAPH <https://w3id.org/oc/meta/br/> { "
                            )[1].split(" } }")[0]

                            # Split by dot and space to get individual statements
                            statements = set(query_content.split(" ."))

                            # Add to filtered entity data
                            filtered_entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'] = statements
                        except IndexError:
                            # If the format is different, just use the original value
                            filtered_entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'] = \
                                entity_data['https://w3id.org/oc/ontology/hasUpdateQuery']

                # Add this filtered entity to its parent's group
                grouped_entities[parent_id].append(filtered_entity_data)

            # Format the output to match the expected structure
            entity_list = []
            for parent_id, entities_list in sorted(grouped_entities.items()):
                entity_list.append({
                    '@graph': sorted(entities_list, key=lambda x: x['@id'])
                })

            output[entity_type] = entity_list
        expected_output = {
            "ar": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"}
                            ],
                        }
                    ]
                },
            ],
            "br": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0601"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601/prov/se/2",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0601"}
                            ],
                            "http://www.w3.org/ns/prov#wasDerivedFrom": [
                                {"@id": "https://w3id.org/oc/meta/br/0601/prov/se/1"}
                            ],
                            "https://w3id.org/oc/ontology/hasUpdateQuery": {
                                "",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0601>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0603>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0602>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0603>",
                                '<https://w3id.org/oc/meta/br/0601> <http://prismstandard.org/namespaces/basic/2.0/publicationDate> "2015-08-22"^^<http://www.w3.org/2001/XMLSchema#date>',
                                '<https://w3id.org/oc/meta/br/0601> <http://purl.org/dc/terms/title> "Some Aspects Of The Evolution Of Chernozems Under The Influence Of Natural And Anthropogenic Factors"^^<http://www.w3.org/2001/XMLSchema#string>',
                            },
                        },
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0603"}
                            ],
                        }
                    ]
                },
            ],
            "id": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0603"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0604/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0604"}
                            ],
                        }
                    ]
                },
            ],
            "ra": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ra/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ra/0602"}
                            ],
                        }
                    ]
                },
            ],
            "re": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/re/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/re/0601"}
                            ],
                        }
                    ]
                }
            ],
        }
        shutil.rmtree(output_folder)
        self.maxDiff = None
        self.assertEqual(output, expected_output)

    def test_run_meta_process_thread_safe(self):
        output_folder = os.path.join(BASE_DIR, "output_4")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_4.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)
        original_input_csv_dir = settings["input_csv_dir"]
        settings["input_csv_dir"] = os.path.join(original_input_csv_dir, "preprocess")

        # Use temporary cache files to avoid corruption
        settings["ts_upload_cache"] = self.cache_file
        settings["ts_failed_queries"] = self.failed_file
        settings["ts_stop_file"] = self.stop_file

        reset_server()

        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Create a temporary config file with updated settings for subprocess
        temp_config_path = os.path.join(self.temp_dir, "temp_meta_config.yaml")
        with open(temp_config_path, "w") as f:
            yaml.dump(settings, f)

        # Run it again to test thread safety
        proc = subprocess.run(
            [sys.executable, "-m", "oc_meta.run.meta_process", "-c", temp_config_path],
            capture_output=True,
            text=True,
        )

        output = dict()

        entity_types = ['ar', 'br', 'id', 'ra', 're']

        for entity_type in entity_types:
            query = f"""
            SELECT ?s ?p ?o
            WHERE {{
                ?s ?p ?o .
                FILTER(STRSTARTS(STR(?s), "https://w3id.org/oc/meta/{entity_type}/"))
            }}
            """

            result = execute_sparql_query(SERVER, query)

            entities = {}
            for binding in result['results']['bindings']:
                s_str = binding['s']['value']
                p_str = binding['p']['value']
                o_data = binding['o']

                if s_str not in entities:
                    entities[s_str] = {'@id': s_str, '@type': []}

                if p_str == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type':
                    entities[s_str]['@type'].append(o_data['value'])
                else:
                    if p_str not in entities[s_str]:
                        entities[s_str][p_str] = []

                    if o_data['type'] == 'uri':
                        entities[s_str][p_str].append({'@id': o_data['value']})
                    elif o_data.get('datatype'):
                        entities[s_str][p_str].append({
                            '@value': o_data['value'],
                            '@type': o_data['datatype']
                        })
                    else:
                        entities[s_str][p_str].append({'@value': o_data['value']})

            entity_list = list(entities.values())

            output[entity_type] = [
                {
                    '@graph': entity_list,
                    '@id': f"https://w3id.org/oc/meta/{entity_type}/"
                }
            ]

        expected_output = {
            "ar": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0604",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0604"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/publisher"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0602",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0602"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                            "https://w3id.org/oc/ontology/hasNext": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0603",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0603"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0605",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0605"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/editor"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0601",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0601"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                            "https://w3id.org/oc/ontology/hasNext": [
                                {"@id": "https://w3id.org/oc/meta/ar/0602"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/ar/",
                }
            ],
            "br": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalArticle",
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                                {
                                    "@type": "http://www.w3.org/2001/XMLSchema#date",
                                    "@value": "2012-03-31",
                                }
                            ],
                            "http://purl.org/dc/terms/title": [
                                {
                                    "@value": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                                    "@type": "http://www.w3.org/2001/XMLSchema#string",
                                }
                            ],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0601"}
                            ],
                            "http://purl.org/spar/pro/isDocumentContextFor": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"},
                                {"@id": "https://w3id.org/oc/meta/ar/0601"},
                                {"@id": "https://w3id.org/oc/meta/ar/0604"},
                                {"@id": "https://w3id.org/oc/meta/ar/0602"},
                                {"@id": "https://w3id.org/oc/meta/ar/0605"},
                            ],
                            "http://purl.org/vocab/frbr/core#embodiment": [
                                {"@id": "https://w3id.org/oc/meta/re/0601"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0604"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0604",
                            "@type": [
                                "http://purl.org/spar/fabio/JournalIssue",
                                "http://purl.org/spar/fabio/Expression",
                            ],
                            "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                                {"@value": "1", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0603"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0602",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/Journal",
                            ],
                            "http://purl.org/dc/terms/title": [
                                {"@value": "The Korean Journal Of Food And Nutrition", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0602"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0603",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalVolume",
                            ],
                            "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                                {"@value": "25", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0602"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/br/",
                }
            ],
            "id": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0605",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/orcid"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "0000-0002-9666-2513", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0601",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/doi"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "10.9799/ksfan.2012.25.1.069", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0603",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/orcid"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "0000-0003-2542-5788", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0604",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/crossref"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "4768", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0602",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/issn"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "1225-4339", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/id/",
                }
            ],
            "ra": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0605",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0605"}
                            ],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Chung", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Myong-Soo", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0602",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Mun", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Ji-Hye", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0604",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0604"}
                            ],
                            "http://xmlns.com/foaf/0.1/name": [
                                {"@value": "The Korean Society Of Food And Nutrition", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0603",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Chung", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Myong-Soo", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0601",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0603"}
                            ],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Cheigh", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Chan-Ick", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/ra/",
                }
            ],
            "re": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/re/0601",
                            "@type": ["http://purl.org/spar/fabio/Manifestation"],
                            "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
                                {"@value": "76", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
                                {"@value": "69", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        }
                    ],
                    "@id": "https://w3id.org/oc/meta/re/",
                }
            ],
        }

        processed_output = {}
        for entity_type, entity_data in output.items():
            processed_output[entity_type] = []
            for graph_container in entity_data:
                filtered_graph = []
                for entity in graph_container['@graph']:
                    filtered_entity = {
                        '@id': entity['@id']
                    }
                    for pred, obj in entity.items():
                        if pred != '@id':  # Only exclude @id since we already added it
                            filtered_entity[pred] = obj

                    if len(filtered_entity) > 1:  # Only include if it has predicates beyond @id
                        filtered_graph.append(filtered_entity)

                # Sort the graph by @id
                filtered_graph = sorted(filtered_graph, key=lambda x: x['@id'])

                processed_output[entity_type].append({
                    '@graph': filtered_graph,
                    '@id': graph_container['@id']
                })
        # For each entity type in the expected output, verify that all expected entities exist
        # with their expected properties in the actual output from the triplestore
        for entity_type, expected_graphs in expected_output.items():
            self.assertIn(entity_type, processed_output, f"Entity type {entity_type} missing from triplestore output")

            for expected_graph in expected_graphs:
                expected_entities = expected_graph['@graph']

                # Find the corresponding graph in the processed output
                actual_graph = None
                for graph in processed_output[entity_type]:
                    if graph['@id'] == expected_graph['@id']:
                        actual_graph = graph
                        break

                self.assertIsNotNone(actual_graph, f"Graph {expected_graph['@id']} not found in triplestore output")

                # For each expected entity, verify it exists with all expected properties
                for expected_entity in expected_entities:
                    entity_id = expected_entity['@id']

                    # Find the entity in the actual graph
                    actual_entity = None
                    for entity in actual_graph['@graph']:
                        if entity['@id'] == entity_id:
                            actual_entity = entity
                            break

                    self.assertIsNotNone(actual_entity, f"Entity {entity_id} not found in triplestore output")

                    # Check that all expected predicates and objects exist
                    for pred, expected_objects in expected_entity.items():
                        if pred != '@id':
                            self.assertIn(pred, actual_entity, f"Predicate {pred} missing for entity {entity_id}")

                            # For each expected object, verify it exists in the actual objects
                            for expected_obj in expected_objects:
                                found = False
                                for actual_obj in actual_entity[pred]:
                                    # Require exact matches for all objects
                                    if expected_obj == actual_obj:
                                        found = True
                                        break

                                self.assertTrue(found, f"Object {expected_obj} not found for predicate {pred} of entity {entity_id}\nActual values: {actual_entity[pred]}")

        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)

        self.assertFalse(
            "Reader: ERROR" in proc.stdout or "Storer: ERROR" in proc.stdout
        )
        self.assertFalse(
            "Reader: ERROR" in proc.stderr or "Storer: ERROR" in proc.stderr
        )

    def test_silencer_on(self):
        output_folder = os.path.join(BASE_DIR, "output_6")
        now = datetime.now()
        meta_config_path = os.path.join(BASE_DIR, "meta_config_6.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(
            BASE_DIR, "same_as_input_2_with_other_authors"
        )
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_agents = """
            PREFIX pro: <http://purl.org/spar/pro/>
            SELECT (COUNT (?agent) AS ?agent_count)
            WHERE {
                <https://w3id.org/oc/meta/br/0601> pro:isDocumentContextFor ?agent.
            }
        """
        result = execute_sparql_query(SERVER, query_agents)
        expected_result = {
            "head": {"link": [], "vars": ["agent_count"]},
            "results": {
                "distinct": False,
                "ordered": True,
                "bindings": [
                    {
                        "agent_count": {
                            "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                            "type": "literal",
                            "value": "3",
                        }
                    }
                ],
            },
        }
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(result, expected_result)

    def test_silencer_off(self):
        output_folder = os.path.join(BASE_DIR, "output_7")
        now = datetime.now()
        meta_config_path = os.path.join(BASE_DIR, "meta_config_7.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(
            BASE_DIR, "same_as_input_2_with_other_authors"
        )
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_agents = """
            PREFIX pro: <http://purl.org/spar/pro/>
            SELECT (COUNT (?agent) AS ?agent_count)
            WHERE {
                <https://w3id.org/oc/meta/br/0601> pro:isDocumentContextFor ?agent.
            }
        """
        result = execute_sparql_query(SERVER, query_agents)
        expected_result = {
            "head": {"link": [], "vars": ["agent_count"]},
            "results": {
                "distinct": False,
                "ordered": True,
                "bindings": [
                    {
                        "agent_count": {
                            "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                            "type": "literal",
                            "value": "6",
                        }
                    }
                ],
            },
        }
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(result, expected_result)

    def test_omid_in_input_data(self):
        query_all = """
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            CONSTRUCT {?s ?p ?o. ?id ?id_p ?id_o.}
            WHERE {
                ?s a fabio:JournalArticle;
                    ?p ?o.
                ?s datacite:hasIdentifier ?id.
                ?id ?id_p ?id_o.
            }
        """
        result = execute_sparql_construct(SERVER, query_all)
        output_folder = os.path.join(BASE_DIR, "output_8")
        meta_config_path_without_openalex = os.path.join(BASE_DIR, "meta_config_8.yaml")
        meta_config_path_with_openalex = os.path.join(BASE_DIR, "meta_config_9.yaml")
        with open(meta_config_path_without_openalex, encoding="utf-8") as file:
            settings_without_openalex = yaml.full_load(file)
        with open(meta_config_path_with_openalex, encoding="utf-8") as file:
            settings_with_openalex = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        cache_settings = {
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }
        settings_without_openalex.update(cache_settings)
        settings_with_openalex.update(cache_settings)

        run_meta_process(
            settings=settings_without_openalex,
            meta_config_path=meta_config_path_without_openalex,
        )
        run_meta_process(
            settings=settings_with_openalex,
            meta_config_path=meta_config_path_with_openalex,
        )
        query_all = """
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            CONSTRUCT {?s ?p ?o. ?id ?id_p ?id_o.}
            WHERE {
                ?s a fabio:JournalArticle;
                    ?p ?o.
                ?s datacite:hasIdentifier ?id.
                ?id ?id_p ?id_o.
            }
        """
        result = execute_sparql_construct(SERVER, query_all)
        expected_result = Graph()
        expected_result.parse(
            location=os.path.join(BASE_DIR, "test_omid_in_input_data.json"),
            format="json-ld",
        )
        prov_graph = Dataset()
        for dirpath, dirnames, filenames in os.walk(os.path.join(output_folder, "rdf")):
            if "br" in dirpath and "prov" in dirpath:
                for filename in filenames:
                    prov_graph.parse(
                        source=os.path.join(dirpath, filename), format="json-ld"
                    )

        expected_prov_graph = Dataset()
        expected_prov_graph.parse(
            os.path.join(BASE_DIR, "test_omid_in_input_data_prov.json"),
            format="json-ld",
        )
        prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#generatedAtTime"), None)
        )
        expected_prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#generatedAtTime"), None)
        )
        prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#invalidatedAtTime"), None)
        )
        expected_prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#invalidatedAtTime"), None)
        )
        shutil.rmtree(output_folder)
        self.assertTrue(
            normalize_graph(result).isomorphic(normalize_graph(expected_result))
        )

    def test_publishers_sequence(self):
        output_folder = os.path.join(BASE_DIR, "output_9")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_10.yaml")
        now = datetime.now()
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_all = """
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            CONSTRUCT {?br ?p ?o. ?o ?op ?oo. ?oo ?oop ?ooo. ?ooo ?ooop ?oooo.}
            WHERE {
                ?id literal:hasLiteralValue "10.17117/na.2015.08.1067"^^<http://www.w3.org/2001/XMLSchema#string>;
                    datacite:usesIdentifierScheme datacite:doi;
                    ^datacite:hasIdentifier ?br.
                ?br ?p ?o.
                ?o ?op ?oo.
                ?oo ?oop ?ooo.
                ?ooo ?ooop ?oooo.
            }
        """
        result = execute_sparql_construct(SERVER, query_all)
        expected_result = Graph()
        expected_result.parse(
            os.path.join(BASE_DIR, "test_publishers_sequence.json"), format="json-ld"
        )
        shutil.rmtree(output_folder)
        self.assertTrue(
            normalize_graph(result).isomorphic(normalize_graph(expected_result))
        )

    def test_duplicate_omids_with_datatype(self):
        output_folder = os.path.join(BASE_DIR, "output_duplicate_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_duplicate.yaml")

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_duplicate"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_host": "localhost",
            "redis_port": 6381,
            "redis_db": 5,
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        # Setup: create test data
        os.makedirs(os.path.join(BASE_DIR, "input_duplicate"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_duplicate", "test.csv"), "w", encoding="utf-8"
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "issn:2543-3288 issn:2078-7685",  # Exact problematic row from production
                    "Journal of Diabetology",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "journal",
                    "Medknow [crossref:2581]",
                    "",
                ]
            )

        # Setup: Insert pre-existing identifiers and BRs in triplestore
        with SPARQLClient(SERVER, timeout=60) as client:
            client.update(
                """
                INSERT DATA {
                    GRAPH <https://w3id.org/oc/meta/br/> {
                        <https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> .
                        <https://w3id.org/oc/meta/br/0602> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0602> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> .
                    }
                    GRAPH <https://w3id.org/oc/meta/id/> {
                        <https://w3id.org/oc/meta/id/0601> <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "2078-7685" ;
                            <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                        <https://w3id.org/oc/meta/id/0602> <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "2543-3288" ;
                            <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    }
                }
                """
            )

        # Update Redis counters to match the inserted data
        redis_handler = RedisCounterHandler(port=6381, db=5)  # Use test db
        redis_handler.set_counter(
            2, "br", supplier_prefix="060"
        )  # BR counter for two BRs
        redis_handler.set_counter(
            2, "id", supplier_prefix="060"
        )  # ID counter for two IDs

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Check for errors
        errors_file = os.path.join(output_folder, "errors.txt")
        if os.path.exists(errors_file):
            with open(errors_file, "r") as f:
                errors = f.read()
                print(f"Errors found:\n{errors}")

        # Query to check for duplicates
        query = """
            SELECT DISTINCT ?id ?value
            WHERE {
                GRAPH <https://w3id.org/oc/meta/id/> {
                    ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    FILTER(?value IN ("2078-7685"^^<http://www.w3.org/2001/XMLSchema#string>, "2078-7685",
                                      "2543-3288"^^<http://www.w3.org/2001/XMLSchema#string>, "2543-3288"))
                }
            }
        """
        result = execute_sparql_query(SERVER, query)
        # Group IDs by value to check for duplicates
        ids_by_value = {}
        for binding in result["results"]["bindings"]:
            value = binding["value"]["value"]
            id = binding["id"]["value"]
            if value not in ids_by_value:
                ids_by_value[value] = []
            ids_by_value[value].append(id)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(os.path.join(BASE_DIR, "input_duplicate"), ignore_errors=True)
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)

        # Check that we have both ISSNs and no duplicates
        for issn_value, ids in ids_by_value.items():
            self.assertEqual(
                len(ids), 1, f"Found multiple IDs for ISSN {issn_value}: {ids}"
            )

        self.assertEqual(
            len(ids_by_value),
            2,
            f"Expected 2 ISSNs, found {len(ids_by_value)}: {list(ids_by_value.keys())}",
        )

    def test_duplicate_omids_with_venue_datatype(self):
        """Test to verify that identifiers are not duplicated when merging previously unconnected venues"""
        output_folder = os.path.join(BASE_DIR, "output_duplicate_venue_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_duplicate_venue.yaml")

        # Setup: create test data
        os.makedirs(os.path.join(BASE_DIR, "input_duplicate_venue"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_duplicate_venue", "test.csv"),
            "w",
            encoding="utf-8",
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "issn:1756-1833",
                    "BMJ",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "journal",
                    "BMJ [crossref:239]",
                    "",
                ]
            )
            writer.writerow(
                [
                    "",  # id
                    "",  # title
                    "",  # author
                    "",  # pub_date
                    "BMJ [issn:0267-0623 issn:0959-8138 issn:1468-5833 issn:0007-1447]",  # venue
                    "283",  # volume
                    "",  # issue
                    "",  # page
                    "journal volume",  # type
                    "BMJ [crossref:239]",  # publisher
                    "",  # editor
                ]
            )

        # Setup: Insert pre-existing data - add the initial identifiers
        with SPARQLClient(SERVER, timeout=60) as client:
            client.update(
                """
                INSERT DATA {
                    GRAPH <https://w3id.org/oc/meta/br/> {
                        # First venue - BMJ with initial ISSNs
                        <https://w3id.org/oc/meta/br/0601>
                            <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601>, <https://w3id.org/oc/meta/id/0602> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
                            <http://purl.org/dc/terms/title> "BMJ" .

                        # Second venue
                        <https://w3id.org/oc/meta/br/0602>
                            <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0603> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
                            <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
                            <http://purl.org/dc/terms/title> "British Medical Journal" .
                    }
                    GRAPH <https://w3id.org/oc/meta/id/> {
                        # First venue's ISSNs
                        <https://w3id.org/oc/meta/id/0601>
                            <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ;
                            <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                        <https://w3id.org/oc/meta/id/0602>
                            <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0959-8138" ;
                            <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                        # Second venue's ISSN
                        <https://w3id.org/oc/meta/id/0603>
                            <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0267-0623" ;
                            <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    }
                }
                """
            )

        # Update Redis counters for the pre-existing entities
        redis_handler = RedisCounterHandler(port=6381, db=5)
        redis_handler.set_counter(
            6, "br", supplier_prefix="060"
        )  # Updated to account for 6 entities (2 venues + 4 volumes)
        redis_handler.set_counter(
            3, "id", supplier_prefix="060"
        )  # Corrected: 3 IDs (1756-1833, 0959-8138, 0267-0623)

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_duplicate_venue"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_host": "localhost",
            "redis_port": 6381,
            "redis_db": 5,
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        with open(meta_config_path, "w") as f:
            yaml.dump(settings, f)

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Query to check for duplicates - check all ISSNs
        query = """
            SELECT DISTINCT ?id ?value
            WHERE {
                ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ;
                    <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                FILTER(STR(?value) IN ("1756-1833", "0959-8138", "0267-0623"))
            }
        """
        result = execute_sparql_query(SERVER, query)
        # Group IDs by value to check for duplicates
        ids_by_value = {}
        for binding in result["results"]["bindings"]:
            value = binding["value"]["value"]
            id = binding["id"]["value"]
            if value not in ids_by_value:
                ids_by_value[value] = []
            ids_by_value[value].append(id)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(
            os.path.join(BASE_DIR, "input_duplicate_venue"), ignore_errors=True
        )
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)

        # Check that we don't have duplicate IDs for any ISSN
        for issn_value, ids in ids_by_value.items():
            self.assertEqual(
                len(ids), 1, f"Found multiple IDs for ISSN {issn_value} in venue: {ids}"
            )

        # Verify that pre-existing IDs were reused
        self.assertTrue(
            any("0601" in id for ids in ids_by_value.values() for id in ids)
            and any("0602" in id for ids in ids_by_value.values() for id in ids),
            "Pre-existing IDs were not reused",
        )

    def test_doi_with_multiple_slashes(self):
        """Test handling of DOIs containing multiple forward slashes"""
        output_folder = os.path.join(BASE_DIR, "output_doi_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_doi.yaml")

        # Setup: create test data with problematic DOI
        os.makedirs(os.path.join(BASE_DIR, "input_doi"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_doi", "test.csv"), "w", encoding="utf-8"
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "doi:10.1093/acprof:oso/9780199230723.001.0001",  # Problematic DOI with multiple slashes
                    "Test Book",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "book",
                    "",
                    "",
                ]
            )

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_doi"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_host": "localhost",
            "redis_port": 6381,
            "redis_db": 5,
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        with open(meta_config_path, "w") as f:
            yaml.dump(settings, f)

        now = datetime.now()

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Query to verify DOI was processed correctly
        query = """
            SELECT ?br ?id ?value
            WHERE {
                ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "10.1093/acprof:oso/9780199230723.001.0001"^^<http://www.w3.org/2001/XMLSchema#string> ;
                    <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/doi> ;
                    ^<http://purl.org/spar/datacite/hasIdentifier> ?br .
            }
        """
        result = execute_sparql_query(SERVER, query)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(os.path.join(BASE_DIR, "input_doi"), ignore_errors=True)
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)
        delete_output_zip(".", now)

        # Verify results
        self.assertTrue(
            len(result["results"]["bindings"]) > 0,
            "DOI with multiple slashes was not processed correctly",
        )

        # Check that we got exactly one result
        self.assertEqual(
            len(result["results"]["bindings"]),
            1,
            f"Expected 1 result, got {len(result['results']['bindings'])}",
        )

    def test_volume_issue_deduplication(self):
        """Test to verify that volumes and issues are properly deduplicated"""
        output_folder = os.path.join(BASE_DIR, "output_vvi_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_vvi.yaml")

        # Setup: create test data
        os.makedirs(os.path.join(BASE_DIR, "input_vvi"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_vvi", "test.csv"), "w", encoding="utf-8"
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            # First article in volume 1, issue 1
            writer.writerow(
                [
                    "doi:10.1234/test.1",
                    "First Article",
                    "",
                    "2023",
                    "Test Journal [issn:1756-1833]",
                    "1",
                    "1",
                    "1-10",
                    "journal article",
                    "",
                    "",
                ]
            )
            # Second article in same volume and issue
            writer.writerow(
                [
                    "doi:10.1234/test.2",
                    "Second Article",
                    "",
                    "2023",
                    "Test Journal [issn:1756-1833]",
                    "1",
                    "1",
                    "11-20",
                    "journal article",
                    "",
                    "",
                ]
            )

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_vvi"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_host": "localhost",
            "redis_port": 6381,
            "redis_db": 5,
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        with open(meta_config_path, "w") as f:
            yaml.dump(settings, f)

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Query to check volume and issue structure
        query = """
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX frbr: <http://purl.org/vocab/frbr/core#>
            PREFIX prism: <http://prismstandard.org/namespaces/basic/2.0/>

            SELECT ?article ?volume ?issue ?seq_id
            WHERE {
                ?article a fabio:JournalArticle ;
                    frbr:partOf ?issue .
                ?issue a fabio:JournalIssue ;
                    fabio:hasSequenceIdentifier ?seq_id ;
                    frbr:partOf ?volume .
                ?volume a fabio:JournalVolume .
            }
            ORDER BY ?article
        """

        result = execute_sparql_query(SERVER, query)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(os.path.join(BASE_DIR, "input_vvi"), ignore_errors=True)
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)

        # Verify results
        bindings = result["results"]["bindings"]

        # Should have 2 articles
        self.assertEqual(len(bindings), 2, "Expected 2 articles")

        # Both articles should reference the same volume and issue
        first_volume = bindings[0]["volume"]["value"]
        first_issue = bindings[0]["issue"]["value"]

        for binding in bindings[1:]:
            self.assertEqual(
                binding["volume"]["value"],
                first_volume,
                "Articles reference different volumes",
            )
            self.assertEqual(
                binding["issue"]["value"],
                first_issue,
                "Articles reference different issues",
            )

1782 def test_volume_issue_deduplication_with_triplestore(self):
1783 """Test that volumes and issues are properly deduplicated when they already exist in the triplestore"""
1784 output_folder = os.path.join(BASE_DIR, "output_vvi_triplestore_test")
1785 meta_config_path = os.path.join(BASE_DIR, "meta_config_vvi_triplestore.yaml")
1787 # Setup: Insert pre-existing venue with duplicate volumes and issues (with/without datatype)
1788 with SPARQLClient(SERVER, timeout=60) as client:
1789 client.update(
1790 """
1791 INSERT DATA {
1792 GRAPH <https://w3id.org/oc/meta/br/> {
1793 # Venue
1794 <https://w3id.org/oc/meta/br/0601>
1795 <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601> ;
1796 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
1797 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1798 <http://purl.org/dc/terms/title> "Test Journal" .
1800 # Volume 1 (without datatype)
1801 <https://w3id.org/oc/meta/br/0602>
1802 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalVolume> ;
1803 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1804 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0601> ;
1805 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1" .
1807 # Volume 1 (with datatype)
1808 <https://w3id.org/oc/meta/br/0604>
1809 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalVolume> ;
1810 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1811 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0601> ;
1812 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1"^^<http://www.w3.org/2001/XMLSchema#string> .
1814 # Issue 1 (without datatype)
1815 <https://w3id.org/oc/meta/br/0603>
1816 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalIssue> ;
1817 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1818 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0602> ;
1819 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1" .
1821 # Issue 1 (with datatype)
1822 <https://w3id.org/oc/meta/br/0605>
1823 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalIssue> ;
1824 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1825 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0604> ;
1826 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1"^^<http://www.w3.org/2001/XMLSchema#string> .
1827 }
1828 GRAPH <https://w3id.org/oc/meta/id/> {
1829 <https://w3id.org/oc/meta/id/0601>
1830 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ;
1831 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
1832 }
1833 }
1834 """
1835 )
1837 # Update Redis counters for pre-existing entities
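# (aligning the counters with the manually inserted entities ensures that OMIDs minted during the run start after br/0605 and id/0601)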
1838 redis_handler = RedisCounterHandler(port=6381, db=5)
1839 redis_handler.set_counter(
1840 5, "br", supplier_prefix="060"
1841 ) # 5 entities: venue, 2 volumes, 2 issues
1842 redis_handler.set_counter(
1843 1, "id", supplier_prefix="060"
1844 ) # 1 identifier for venue
1846 # Create test data - article that should use existing volume and issue
1847 os.makedirs(os.path.join(BASE_DIR, "input_vvi_triplestore"), exist_ok=True)
1848 with open(
1849 os.path.join(BASE_DIR, "input_vvi_triplestore", "test.csv"),
1850 "w",
1851 encoding="utf-8",
1852 ) as f:
1853 writer = csv.writer(f)
1854 writer.writerow(
1855 [
1856 "id",
1857 "title",
1858 "author",
1859 "pub_date",
1860 "venue",
1861 "volume",
1862 "issue",
1863 "page",
1864 "type",
1865 "publisher",
1866 "editor",
1867 ]
1868 )
1869 writer.writerow(
1870 [
1871 "doi:10.1234/test.1",
1872 "Test Article",
1873 "",
1874 "2023",
1875 "Test Journal [issn:1756-1833]",
1876 "1", # Should match existing volume
1877 "1", # Should match existing issue
1878 "1-10",
1879 "journal article",
1880 "",
1881 "",
1882 ]
1883 )
1885 # Create test settings
1886 settings = {
1887 "triplestore_url": SERVER,
1888 "provenance_triplestore_url": PROV_SERVER,
1889 "input_csv_dir": os.path.join(BASE_DIR, "input_vvi_triplestore"),
1890 "base_output_dir": output_folder,
1891 "output_rdf_dir": output_folder,
1892 "resp_agent": "test",
1893 "base_iri": "https://w3id.org/oc/meta/",
1894 "context_path": None,
1895 "dir_split_number": 10000,
1896 "items_per_file": 1000,
1897 "default_dir": "_",
1898 "rdf_output_in_chunks": False,
1899 "zip_output_rdf": True,
1900 "source": None,
1901 "supplier_prefix": "060",
1902 "use_doi_api_service": False,
1903 "blazegraph_full_text_search": False,
1904 "virtuoso_full_text_search": True,
1905 "fuseki_full_text_search": False,
1906 "graphdb_connector_name": None,
1907 "cache_endpoint": None,
1908 "cache_update_endpoint": None,
1909 "silencer": [],
1910 "redis_host": "localhost",
1911 "redis_port": 6381,
1912 "redis_db": 5,
1913 "redis_cache_db": 2,
1914 "ts_upload_cache": self.cache_file,
1915 "ts_failed_queries": self.failed_file,
1916 "ts_stop_file": self.stop_file,
1917 }
1919 with open(meta_config_path, "w") as f:
1920 yaml.dump(settings, f)
1922 # Run the process
1923 run_meta_process(settings=settings, meta_config_path=meta_config_path)
1925 # Check if new volumes/issues were created
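# The generated SPARQL update files staged under rdf/to_be_uploaded are scanned: an update that types a new fabio:JournalVolume or fabio:JournalIssue means deduplication against the pre-existing entities failed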
1926 to_be_uploaded_dir = os.path.join(output_folder, "rdf", "to_be_uploaded")
1927 new_entities_created = False
1928 if os.path.exists(to_be_uploaded_dir):
1929 for dirpath, _, filenames in os.walk(to_be_uploaded_dir):
1930 for f in filenames:
1931 if f.endswith(".sparql"):
1932 with open(os.path.join(dirpath, f)) as file:
1933 content = file.read()
1934 if any(
1935 "JournalVolume" in line or "JournalIssue" in line
1936 for line in content.splitlines()
1937 ):
1938 print(f"\nFound new volume/issue creation in {f}")
1939 new_entities_created = True
1941 # Query to get all entities and their relationships
1942 query = """
1943 PREFIX fabio: <http://purl.org/spar/fabio/>
1944 PREFIX frbr: <http://purl.org/vocab/frbr/core#>
1945 PREFIX datacite: <http://purl.org/spar/datacite/>
1947 SELECT DISTINCT ?article ?venue ?volume ?issue ?issn
1948 WHERE {
1949 ?article a fabio:JournalArticle ;
1950 frbr:partOf ?issue .
1951 ?issue a fabio:JournalIssue ;
1952 frbr:partOf ?volume .
1953 ?volume a fabio:JournalVolume ;
1954 frbr:partOf ?venue .
1955 ?venue datacite:hasIdentifier ?id .
1956 ?id datacite:usesIdentifierScheme datacite:issn ;
1957 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?issn .
1958 }
1959 """
1961 result = execute_sparql_query(SERVER, query)
1963 # Cleanup
1964 shutil.rmtree(output_folder, ignore_errors=True)
1965 shutil.rmtree(
1966 os.path.join(BASE_DIR, "input_vvi_triplestore"), ignore_errors=True
1967 )
1968 if os.path.exists(meta_config_path):
1969 os.remove(meta_config_path)
1971 # Verify results
1972 bindings = result["results"]["bindings"]
1973 self.assertEqual(len(bindings), 1, "Expected exactly one article")
1975 # Get the URIs from the result
1976 venue_uri = bindings[0]["venue"]["value"]
1977 volume_uri = bindings[0]["volume"]["value"]
1978 issue_uri = bindings[0]["issue"]["value"]
1979 issn = bindings[0]["issn"]["value"]
1981 # Check if venue was deduplicated (should use existing venue)
1982 self.assertEqual(
1983 venue_uri,
1984 "https://w3id.org/oc/meta/br/0601",
1985 "Venue was not deduplicated correctly",
1986 )
1988 # Check if volume was deduplicated - either version is valid
1989 self.assertIn(
1990 volume_uri,
1991 ["https://w3id.org/oc/meta/br/0602", "https://w3id.org/oc/meta/br/0604"],
1992 "Volume was not deduplicated correctly - should use one of the existing volumes",
1993 )
1995 # Check if issue was deduplicated - either version is valid
1996 self.assertIn(
1997 issue_uri,
1998 ["https://w3id.org/oc/meta/br/0603", "https://w3id.org/oc/meta/br/0605"],
1999 "Issue was not deduplicated correctly - should use one of the existing issues",
2000 )
2002 # Check ISSN
2003 self.assertEqual(issn, "1756-1833", "ISSN does not match")
2005 # Verify no new volumes/issues were created
2006 self.assertFalse(
2007 new_entities_created,
2008 "New volumes/issues were created when they should have been deduplicated",
2009 )
2011 def test_temporary_identifiers(self):
2012 """Test that temporary identifiers are used for deduplication but not saved, and an OMID is generated"""
2013 output_folder = os.path.join(BASE_DIR, "output_temp_id_test")
2014 meta_config_path = os.path.join(BASE_DIR, "meta_config_temp.yaml")
2016 # Setup: create test data with only temporary identifier
2017 os.makedirs(os.path.join(BASE_DIR, "input_temp"), exist_ok=True)
2018 with open(
2019 os.path.join(BASE_DIR, "input_temp", "test.csv"), "w", encoding="utf-8"
2020 ) as f:
2021 writer = csv.writer(f)
2022 writer.writerow(
2023 [
2024 "id",
2025 "title",
2026 "author",
2027 "pub_date",
2028 "venue",
2029 "volume",
2030 "issue",
2031 "page",
2032 "type",
2033 "publisher",
2034 "editor",
2035 ]
2036 )
2037 writer.writerow(
2038 [
2039 "temp:567", # Only temporary identifier
2040 "Test Article",
2041 "",
2042 "2023",
2043 "",
2044 "",
2045 "",
2046 "",
2047 "journal article",
2048 "",
2049 "",
2050 ]
2051 )
2053 # Create test settings
2054 settings = {
2055 "triplestore_url": SERVER,
2056 "provenance_triplestore_url": PROV_SERVER,
2057 "input_csv_dir": os.path.join(BASE_DIR, "input_temp"),
2058 "base_output_dir": output_folder,
2059 "output_rdf_dir": output_folder,
2060 "resp_agent": "test",
2061 "base_iri": "https://w3id.org/oc/meta/",
2062 "context_path": None,
2063 "dir_split_number": 10000,
2064 "items_per_file": 1000,
2065 "default_dir": "_",
2066 "rdf_output_in_chunks": False,
2067 "zip_output_rdf": True,
2068 "source": None,
2069 "supplier_prefix": "060",
2070 "use_doi_api_service": False,
2071 "blazegraph_full_text_search": False,
2072 "virtuoso_full_text_search": True,
2073 "fuseki_full_text_search": False,
2074 "graphdb_connector_name": None,
2075 "cache_endpoint": None,
2076 "cache_update_endpoint": None,
2077 "silencer": [],
2078 "redis_host": "localhost",
2079 "redis_port": 6381,
2080 "redis_db": 5,
2081 "redis_cache_db": 2,
2082 "ts_upload_cache": self.cache_file,
2083 "ts_failed_queries": self.failed_file,
2084 "ts_stop_file": self.stop_file,
2085 }
2087 with open(meta_config_path, "w") as f:
2088 yaml.dump(settings, f)
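# Record the start time so that delete_output_zip can later remove only the meta_output archives produced by this run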
2090 now = datetime.now()
2092 # Run the process
2093 run_meta_process(settings=settings, meta_config_path=meta_config_path)
2095 # Query to verify an OMID was generated and no temporary identifier was saved
2096 query = """
2097 PREFIX fabio: <http://purl.org/spar/fabio/>
2098 PREFIX datacite: <http://purl.org/spar/datacite/>
2099 PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
2101 SELECT ?br ?id ?value ?scheme
2102 WHERE {
2103 ?br a fabio:JournalArticle .
2104 OPTIONAL {
2105 ?br datacite:hasIdentifier ?id .
2106 ?id datacite:usesIdentifierScheme ?scheme ;
2107 literal:hasLiteralValue ?value .
2108 }
2109 }
2110 """
2111 result = execute_sparql_query(SERVER, query)
2113 # Cleanup
2114 shutil.rmtree(output_folder, ignore_errors=True)
2115 shutil.rmtree(os.path.join(BASE_DIR, "input_temp"), ignore_errors=True)
2116 if os.path.exists(meta_config_path):
2117 os.remove(meta_config_path)
2118 delete_output_zip(".", now)
2120 # Verify results
2121 bindings = result["results"]["bindings"]
2123 # Should find exactly one article
2124 self.assertEqual(len(bindings), 1, "Expected exactly one article")
2126 # The article should have a br/ URI (OMID)
2127 br_uri = bindings[0]["br"]["value"]
2128 self.assertTrue(
2129 "br/" in br_uri,
2130 f"Article URI {br_uri} does not contain expected OMID pattern 'br/'",
2131 )
2133 # Should not have any saved identifiers
2134 self.assertNotIn(
2135 "id",
2136 bindings[0],
2137 "Found unexpected identifier when only temporary ID was provided",
2138 )
2140 def test_temporary_identifiers_deduplication(self):
2141 """Test that multiple rows with the same temporary identifier are correctly deduplicated"""
2142 # Create test data with two rows using the same temporary identifier
2143 test_data = [
2144 {
2145 "id": "temp:789",
2146 "title": "Test Article 1",
2147 "author": "Smith, John [orcid:0000-0002-1234-5678]",
2148 "pub_date": "2020",
2149 "venue": "",
2150 "volume": "",
2151 "issue": "",
2152 "page": "",
2153 "type": "journal article",
2154 "publisher": "",
2155 "editor": "",
2156 },
2157 {
2158 "id": "temp:789", # Same temporary ID
2159 "title": "Test Article 1", # Same title
2160 "author": "Smith, John [orcid:0000-0002-1234-5678]",
2161 "pub_date": "2020",
2162 "venue": "",
2163 "volume": "",
2164 "issue": "",
2165 "page": "",
2166 "type": "journal article",
2167 "publisher": "",
2168 "editor": "",
2169 },
2170 ]
2172 # Write test data to CSV
2173 input_dir = os.path.join(BASE_DIR, "input_temp_dedup")
2174 os.makedirs(input_dir, exist_ok=True)
2175 csv_path = os.path.join(input_dir, "test.csv")
2176 write_csv(csv_path, test_data)
2178 # Run meta process
2179 output_dir = os.path.join(BASE_DIR, "output_temp_dedup")
2180 os.makedirs(output_dir, exist_ok=True)
2181 config = {
2182 "input_csv_dir": input_dir,
2183 "base_output_dir": output_dir,
2184 "output_rdf_dir": output_dir,
2185 "triplestore_url": SERVER,
2186 "provenance_triplestore_url": PROV_SERVER,
2187 "resp_agent": "https://w3id.org/oc/meta/prov/pa/1",
2188 "base_iri": "https://w3id.org/oc/meta/",
2189 "context_path": "https://w3id.org/oc/meta/context.json",
2190 "supplier_prefix": "060",
2191 "dir_split_number": 10000,
2192 "items_per_file": 1000,
2193 "default_dir": "_",
2194 "rdf_output_in_chunks": True,
2195 "zip_output_rdf": False,
2196 "source": None,
2197 "use_doi_api_service": False,
2198 "silencer": [],
2199 "redis_host": "localhost",
2200 "redis_port": 6381,
2201 "redis_db": 5,
2202 "redis_cache_db": 2,
2203 "ts_upload_cache": self.cache_file,
2204 "ts_failed_queries": self.failed_file,
2205 "ts_stop_file": self.stop_file,
2206 "graphdb_connector_name": None,
2207 "blazegraph_full_text_search": False,
2208 "fuseki_full_text_search": False,
2209 "virtuoso_full_text_search": False,
2210 "provenance_endpoints": [],
2211 "cache_endpoint": None,
2212 "cache_update_endpoint": None,
2213 "normalize_titles": True,
2214 }
2215 config_path = os.path.join(output_dir, "config.yaml")
2216 with open(config_path, "w") as f:
2217 yaml.dump(config, f)
2219 # Run the process
2220 run_meta_process(settings=config, meta_config_path=config_path)
2222 # Query the triplestore to verify that only one OMID was generated
2223 # for both rows (the temporary identifier is used only for deduplication;
2224 # see test_temporary_identifiers for the check that it is never saved)
2225 query = """
2226 SELECT DISTINCT ?br
2227 WHERE {
2228 ?br a <http://purl.org/spar/fabio/JournalArticle> .
2229 }
2230 """
2231 results = execute_sparql_query(SERVER, query)
2233 # Clean up
2234 shutil.rmtree(input_dir)
2235 shutil.rmtree(output_dir)
2237 # Should only be one article
2238 articles = [
2239 str(result["br"]["value"]) for result in results["results"]["bindings"]
2240 ]
2241 self.assertEqual(
2242 len(articles), 1, "Should only be one article after deduplication"
2243 )
2246def normalize_graph(graph):
2247 """
2248 Normalize the literals in the graph by removing explicit datatypes.
2249 """
2250 normalized_graph = Graph()
2251 for subject, predicate, obj in graph:
2252 if isinstance(obj, Literal) and obj.datatype is not None:
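# Re-create the literal from its Python value, dropping the explicit datatype so that e.g. "1" and "1"^^xsd:string compare as equal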
2253 normalized_obj = Literal(obj.toPython())
2254 normalized_graph.add((subject, predicate, normalized_obj))
2255 else:
2256 normalized_graph.add((subject, predicate, obj))
2257 return normalized_graph
2260if __name__ == "__main__": # pragma: no cover
2261 unittest.main()