Coverage for test/meta_process_test.py: 96%
562 statements
coverage.py v6.5.0, created at 2025-07-14 14:06 +0000
import csv
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import unittest
from datetime import datetime
from test.test_utils import (PROV_SERVER, SERVER, VIRTUOSO_CONTAINER,
                             VIRTUOSO_PROV_CONTAINER, execute_sparql_query,
                             normalize_graph,  # used in graph comparisons below; assumed to be provided by test_utils
                             reset_redis_counters, reset_server)

import yaml
from oc_meta.lib.file_manager import get_csv_data, write_csv
from oc_meta.run.meta_process import run_meta_process
from oc_ocdm.counter_handler.redis_counter_handler import RedisCounterHandler
from rdflib import ConjunctiveGraph, Graph, Literal, URIRef
from SPARQLWrapper import JSON, POST, XML, SPARQLWrapper

BASE_DIR = os.path.join("test", "meta_process")


def delete_output_zip(base_dir: str, start_time: datetime) -> None:
    for file in os.listdir(base_dir):
        if file.startswith("meta_output") and file.endswith(".zip"):
            file_creation_time = file.split("meta_output_")[1].replace(".zip", "")
            file_creation_time = datetime.strptime(
                file_creation_time, "%Y-%m-%dT%H_%M_%S_%f"
            )
            was_created_after_time = file_creation_time > start_time
            if was_created_after_time:
                os.remove(os.path.join(base_dir, file))
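
# Illustrative note (the filename is hypothetical): archives follow the
# "meta_output_<timestamp>.zip" pattern, so a file named
# "meta_output_2025-07-14T14_06_00_000000.zip" parses to
# datetime(2025, 7, 14, 14, 6, 0, 0) and is removed only when that
# timestamp postdates start_time.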


class test_ProcessTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Initial setup executed once for the whole test class"""
        # Wait until Virtuoso is ready
        max_wait = 30  # seconds
        start_time = time.time()
        while time.time() - start_time < max_wait:
            try:
                # Try a simple query
                sparql = SPARQLWrapper(SERVER)
                sparql.setQuery("SELECT * WHERE { ?s ?p ?o } LIMIT 1")
                sparql.setReturnFormat(JSON)
                sparql.query()
                break
            except Exception:
                time.sleep(2)
        else:
            raise TimeoutError(f"Virtuoso not ready after {max_wait} seconds")
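        # Note: this `else` belongs to the `while` loop, so the TimeoutError is
        # raised only when max_wait elapses without any probe query succeeding
        # (i.e. the loop never reaches `break`).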

    def setUp(self):
        """Setup executed before each test"""
        # Create temporary directory for cache files
        self.temp_dir = tempfile.mkdtemp()
        self.cache_file = os.path.join(self.temp_dir, "ts_upload_cache.json")
        self.failed_file = os.path.join(self.temp_dir, "failed_queries.txt")
        self.stop_file = os.path.join(self.temp_dir, ".stop_upload")

        # Reset the database
        reset_server()
        reset_redis_counters()

    def tearDown(self):
        reset_redis_counters()
        # Remove temporary directory and its contents
        if hasattr(self, "temp_dir") and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def test_run_meta_process(self):
        output_folder = os.path.join(BASE_DIR, "output_1")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_1.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        now = datetime.now()
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/0601",
                "title": "",
                "author": "",
                "pub_date": "",
                "venue": "Scientometrics [issn:0138-9130 issn:1588-2861 omid:br/0603]",
                "volume": "26",
                "issue": "",
                "page": "",
                "type": "journal article",
                "publisher": "Consulting Company Ucom [crossref:6623 omid:ra/0601]",
                "editor": "Naimi, Elmehdi [orcid:0000-0002-4126-8519 omid:ra/0602]",
            },
            {
                "id": "issn:1524-4539 issn:0009-7322 omid:br/0602",
                "title": "Circulation",
                "author": "",
                "pub_date": "",
                "venue": "",
                "volume": "",
                "issue": "",
                "page": "",
                "type": "journal",
                "publisher": "",
                "editor": "",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.069 omid:br/0605",
                "title": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                "author": "Cheigh, Chan-Ick [orcid:0000-0003-2542-5788 omid:ra/0603]; Mun, Ji-Hye [omid:ra/0604]; Chung, Myong-Soo [omid:ra/0605]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/0608]",
                "volume": "25",
                "issue": "1",
                "page": "69-76",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/0606]",
                "editor": "Chung, Myong-Soo [orcid:0000-0002-9666-2513 omid:ra/0607]",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.077 omid:br/0606",
                "title": "Properties Of Immature Green Cherry Tomato Pickles",
                "author": "Koh, Jong-Ho [omid:ra/0608]; Shin, Hae-Hun [omid:ra/0609]; Kim, Young-Shik [orcid:0000-0001-5673-6314 omid:ra/06010]; Kook, Moo-Chang [omid:ra/06011]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/0608]",
                "volume": "",
                "issue": "2",
                "page": "77-82",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/0606]",
                "editor": "",
            },
            {
                "id": "doi:10.1097/01.rct.0000185385.35389.cd omid:br/0607",
                "title": "Comprehensive Assessment Of Lung CT Attenuation Alteration At Perfusion Defects Of Acute Pulmonary Thromboembolism With Breath-Hold SPECT-CT Fusion Images",
                "author": "Suga, Kazuyoshi [omid:ra/06012]; Kawakami, Yasuhiko [omid:ra/06013]; Iwanaga, Hideyuki [omid:ra/06014]; Hayashi, Noriko [omid:ra/06015]; Seto, Aska [omid:ra/06016]; Matsunaga, Naofumi [omid:ra/06017]",
                "pub_date": "2006-01",
                "venue": "Journal Of Computer Assisted Tomography [issn:0363-8715 omid:br/06012]",
                "volume": "30",
                "issue": "1",
                "page": "83-91",
                "type": "journal article",
                "publisher": "Ovid Technologies (Wolters Kluwer Health) [crossref:276 omid:ra/06018]",
                "editor": "",
            },
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
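        # Normalizing both sides to sorted (key, value) pairs makes the
        # comparison independent of row order across CSV files and of column
        # order within each row.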
        self.maxDiff = None
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(output, expected_output)

    def test_run_meta_process_ids_only(self):
        output_folder = os.path.join(BASE_DIR, "output_5")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_5.yaml")
        now = datetime.now()
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/0601",
                "title": "Some Aspects Of The Evolution Of Chernozems Under The Influence Of Natural And Anthropogenic Factors",
                "author": "[orcid:0000-0002-4126-8519 omid:ra/0601]; [orcid:0000-0003-0530-4305 omid:ra/0602]",
                "pub_date": "2015-08-22",
                "venue": "[issn:1225-4339 omid:br/0602]",
                "volume": "26",
                "issue": "",
                "page": "50",
                "type": "journal article",
                "publisher": "[crossref:6623 omid:ra/0603]",
                "editor": "[orcid:0000-0002-4126-8519 omid:ra/0601]; [orcid:0000-0002-8420-0696 omid:ra/0604]",
            }
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.maxDiff = None
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(output, expected_output)

    def test_run_meta_process_two_workers(self):
        output_folder = os.path.join(BASE_DIR, "output_2")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_2.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
                "workers_number": 2,
            }
        )

        now = datetime.now()
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        output = list()
        for dirpath, _, filenames in os.walk(os.path.join(output_folder, "csv")):
            for file in filenames:
                output.extend(get_csv_data(os.path.join(dirpath, file)))
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        expected_output = [
            {
                "id": "doi:10.17117/na.2015.08.1067 omid:br/06101",
                "title": "",
                "author": "",
                "pub_date": "",
                "venue": "Scientometrics [issn:0138-9130 issn:1588-2861 omid:br/06103]",
                "volume": "26",
                "issue": "",
                "page": "",
                "type": "journal article",
                "publisher": "Consulting Company Ucom [crossref:6623 omid:ra/06101]",
                "editor": "Naimi, Elmehdi [orcid:0000-0002-4126-8519 omid:ra/06102]",
            },
            {
                "id": "issn:1524-4539 issn:0009-7322 omid:br/06102",
                "title": "Circulation",
                "author": "",
                "pub_date": "",
                "venue": "",
                "volume": "",
                "issue": "",
                "page": "",
                "type": "journal",
                "publisher": "",
                "editor": "",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.069 omid:br/06201",
                "title": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                "author": "Cheigh, Chan-Ick [orcid:0000-0003-2542-5788 omid:ra/06201]; Mun, Ji-Hye [omid:ra/06202]; Chung, Myong-Soo [omid:ra/06203]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/06204]",
                "volume": "25",
                "issue": "1",
                "page": "69-76",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/06204]",
                "editor": "Chung, Myong-Soo [orcid:0000-0002-9666-2513 omid:ra/06205]",
            },
            {
                "id": "doi:10.9799/ksfan.2012.25.1.077 omid:br/06202",
                "title": "Properties Of Immature Green Cherry Tomato Pickles",
                "author": "Koh, Jong-Ho [omid:ra/06206]; Shin, Hae-Hun [omid:ra/06207]; Kim, Young-Shik [orcid:0000-0001-5673-6314 omid:ra/06208]; Kook, Moo-Chang [omid:ra/06209]",
                "pub_date": "2012-03-31",
                "venue": "The Korean Journal Of Food And Nutrition [issn:1225-4339 omid:br/06204]",
                "volume": "",
                "issue": "2",
                "page": "77-82",
                "type": "journal article",
                "publisher": "The Korean Society Of Food And Nutrition [crossref:4768 omid:ra/06204]",
                "editor": "",
            },
            {
                "id": "doi:10.1097/01.rct.0000185385.35389.cd omid:br/06203",
                "title": "Comprehensive Assessment Of Lung CT Attenuation Alteration At Perfusion Defects Of Acute Pulmonary Thromboembolism With Breath-Hold SPECT-CT Fusion Images",
                "author": "Suga, Kazuyoshi [omid:ra/062010]; Kawakami, Yasuhiko [omid:ra/062011]; Iwanaga, Hideyuki [omid:ra/062012]; Hayashi, Noriko [omid:ra/062013]; Seto, Aska [omid:ra/062014]; Matsunaga, Naofumi [omid:ra/062015]",
                "pub_date": "2006-01",
                "venue": "Journal Of Computer Assisted Tomography [issn:0363-8715 omid:br/06208]",
                "volume": "30",
                "issue": "1",
                "page": "83-91",
                "type": "journal article",
                "publisher": "Ovid Technologies (Wolters Kluwer Health) [crossref:276 omid:ra/062016]",
                "editor": "",
            },
        ]
        output = sorted(sorted(d.items()) for d in output)
        expected_output = sorted(sorted(d.items()) for d in expected_output)
        self.assertEqual(output, expected_output)

    def test_provenance(self):
        output_folder = os.path.join(BASE_DIR, "output_3")
        now = datetime.now()
        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        meta_config_path = os.path.join(BASE_DIR, "meta_config_3.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        reset_server()

        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input_2")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(BASE_DIR, "input")
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        output = dict()

        entity_types = ['ar', 'br', 'id', 'ra', 're']

        for entity_type in entity_types:
            query = f"""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX prov: <http://www.w3.org/ns/prov#>
            PREFIX oco: <https://w3id.org/oc/ontology/>

            CONSTRUCT {{
                ?s ?p ?o .
            }}
            WHERE {{
                ?s ?p ?o .
                FILTER(REGEX(STR(?s), "https://w3id.org/oc/meta/{entity_type}/[0-9]+/prov/se/[0-9]+"))
            }}
            """

            result = execute_sparql_query(PROV_SERVER, query, return_format=XML)

            g = Graph()
            for s, p, o in result:
                g.add((s, p, o))

            entities = {}
            for s, p, o in g:
                s_str = str(s)
                if s_str not in entities:
                    entities[s_str] = {'@id': s_str, '@type': []}

                p_str = str(p)
                if p == URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'):
                    entities[s_str]['@type'].append(str(o))
                else:
                    if p_str not in entities[s_str]:
                        entities[s_str][p_str] = []

                    if isinstance(o, URIRef):
                        entities[s_str][p_str].append({'@id': str(o)})
                    elif isinstance(o, Literal):
                        if o.datatype:
                            entities[s_str][p_str].append({
                                '@value': str(o),
                                '@type': str(o.datatype)
                            })
                        else:
                            entities[s_str][p_str].append({'@value': str(o)})

            # Group entities by their parent entity (e.g., br/0601/prov/se/1 -> br/0601)
            grouped_entities = {}
            for entity_id, entity_data in entities.items():
                # Extract the parent entity ID from the provenance entity ID
                parent_id = re.match(r'https://w3id.org/oc/meta/([^/]+/[0-9]+)', entity_id).group(0)
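                # e.g. "https://w3id.org/oc/meta/br/0601/prov/se/2" yields the
                # parent "https://w3id.org/oc/meta/br/0601": group(0) is the
                # whole matched prefix, not just the capture group.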

                if parent_id not in grouped_entities:
                    grouped_entities[parent_id] = []

                # Filter out properties we don't need for comparison
                filtered_entity_data = {
                    '@id': entity_data['@id'],
                }

                # Keep the required properties for comparison
                properties_to_keep = [
                    'http://www.w3.org/ns/prov#specializationOf',
                    'http://www.w3.org/ns/prov#wasDerivedFrom'
                ]

                for prop in properties_to_keep:
                    if prop in entity_data:
                        filtered_entity_data[prop] = entity_data[prop]

                # Handle hasUpdateQuery specially
                if 'https://w3id.org/oc/ontology/hasUpdateQuery' in entity_data:
                    # Extract the value from the hasUpdateQuery property
                    update_query_value = entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'][0].get('@value', '')

                    # Split the query into individual statements
                    if update_query_value:
                        # Extract the part between the INSERT DATA { GRAPH <...> { and } }
                        try:
                            query_content = update_query_value.split(
                                "INSERT DATA { GRAPH <https://w3id.org/oc/meta/br/> { "
                            )[1].split(" } }")[0]

                            # Split by dot and space to get individual statements
                            statements = set(query_content.split(" ."))

                            # Add to filtered entity data
                            filtered_entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'] = statements
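                            # Comparing a set of statement strings keeps the
                            # check independent of the order in which the
                            # INSERT DATA triples happen to be serialized.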
                        except IndexError:
                            # If the format is different, just use the original value
                            filtered_entity_data['https://w3id.org/oc/ontology/hasUpdateQuery'] = \
                                entity_data['https://w3id.org/oc/ontology/hasUpdateQuery']

                # Add this filtered entity to its parent's group
                grouped_entities[parent_id].append(filtered_entity_data)

            # Format the output to match the expected structure
            entity_list = []
            for parent_id, entities_list in sorted(grouped_entities.items()):
                entity_list.append({
                    '@graph': sorted(entities_list, key=lambda x: x['@id'])
                })

            output[entity_type] = entity_list
        expected_output = {
            "ar": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"}
                            ],
                        }
                    ]
                },
            ],
            "br": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0601"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601/prov/se/2",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0601"}
                            ],
                            "http://www.w3.org/ns/prov#wasDerivedFrom": [
                                {"@id": "https://w3id.org/oc/meta/br/0601/prov/se/1"}
                            ],
                            "https://w3id.org/oc/ontology/hasUpdateQuery": {
                                "",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0601>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0603>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0602>",
                                "<https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/pro/isDocumentContextFor> <https://w3id.org/oc/meta/ar/0603>",
                                '<https://w3id.org/oc/meta/br/0601> <http://prismstandard.org/namespaces/basic/2.0/publicationDate> "2015-08-22"^^<http://www.w3.org/2001/XMLSchema#date>',
                                '<https://w3id.org/oc/meta/br/0601> <http://purl.org/dc/terms/title> "Some Aspects Of The Evolution Of Chernozems Under The Influence Of Natural And Anthropogenic Factors"^^<http://www.w3.org/2001/XMLSchema#string>',
                            },
                        },
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0603"}
                            ],
                        }
                    ]
                },
            ],
            "id": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0602"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0603/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0603"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0604/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/id/0604"}
                            ],
                        }
                    ]
                },
            ],
            "ra": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ra/0601"}
                            ],
                        }
                    ]
                },
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0602/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/ra/0602"}
                            ],
                        }
                    ]
                },
            ],
            "re": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/re/0601/prov/se/1",
                            "http://www.w3.org/ns/prov#specializationOf": [
                                {"@id": "https://w3id.org/oc/meta/re/0601"}
                            ],
                        }
                    ]
                }
            ],
        }
        shutil.rmtree(output_folder)
        self.maxDiff = None
        self.assertEqual(output, expected_output)

    def test_run_meta_process_thread_safe(self):
        output_folder = os.path.join(BASE_DIR, "output_4")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_4.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)
        original_input_csv_dir = settings["input_csv_dir"]
        settings["input_csv_dir"] = os.path.join(original_input_csv_dir, "preprocess")
        settings["workers_number"] = 1

        reset_server()

        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Run it again to test thread safety
        proc = subprocess.run(
            [sys.executable, "-m", "oc_meta.run.meta_process", "-c", meta_config_path],
            capture_output=True,
            text=True,
        )
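        # The second pass goes through the CLI entry point in a separate
        # interpreter; its captured stdout is checked at the end of the test
        # for Reader/Storer errors raised while re-processing the same data.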

        output = dict()

        entity_types = ['ar', 'br', 'id', 'ra', 're']

        for entity_type in entity_types:
            query = f"""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX pro: <http://purl.org/spar/pro/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            PREFIX frbr: <http://purl.org/vocab/frbr/core#>
            PREFIX foaf: <http://xmlns.com/foaf/0.1/>
            PREFIX prism: <http://prismstandard.org/namespaces/basic/2.0/>
            PREFIX dcterms: <http://purl.org/dc/terms/>
            PREFIX oco: <https://w3id.org/oc/ontology/>

            CONSTRUCT {{
                ?s ?p ?o .
            }}
            WHERE {{
                ?s ?p ?o .
                FILTER(STRSTARTS(STR(?s), "https://w3id.org/oc/meta/{entity_type}/"))
            }}
            """

            result = execute_sparql_query(SERVER, query, return_format=XML)

            g = Graph()
            for s, p, o in result:
                g.add((s, p, o))

            entities = {}
            for s, p, o in g:
                s_str = str(s)
                if s_str not in entities:
                    entities[s_str] = {'@id': s_str, '@type': []}

                p_str = str(p)
                if p == URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'):
                    entities[s_str]['@type'].append(str(o))
                else:
                    if p_str not in entities[s_str]:
                        entities[s_str][p_str] = []

                    if isinstance(o, URIRef):
                        entities[s_str][p_str].append({'@id': str(o)})
                    elif isinstance(o, Literal):
                        if o.datatype:
                            entities[s_str][p_str].append({
                                '@value': str(o),
                                '@type': str(o.datatype)
                            })
                        else:
                            entities[s_str][p_str].append({'@value': str(o)})

            entity_list = list(entities.values())

            output[entity_type] = [
                {
                    '@graph': entity_list,
                    '@id': f"https://w3id.org/oc/meta/{entity_type}/"
                }
            ]

        expected_output = {
            "ar": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0604",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0604"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/publisher"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0602",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0602"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                            "https://w3id.org/oc/ontology/hasNext": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0603",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0603"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0605",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0605"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/editor"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ar/0601",
                            "@type": ["http://purl.org/spar/pro/RoleInTime"],
                            "http://purl.org/spar/pro/isHeldBy": [
                                {"@id": "https://w3id.org/oc/meta/ra/0601"}
                            ],
                            "http://purl.org/spar/pro/withRole": [
                                {"@id": "http://purl.org/spar/pro/author"}
                            ],
                            "https://w3id.org/oc/ontology/hasNext": [
                                {"@id": "https://w3id.org/oc/meta/ar/0602"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/ar/",
                }
            ],
            "br": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/br/0601",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalArticle",
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                                {
                                    "@type": "http://www.w3.org/2001/XMLSchema#date",
                                    "@value": "2012-03-31",
                                }
                            ],
                            "http://purl.org/dc/terms/title": [
                                {
                                    "@value": "Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment",
                                    "@type": "http://www.w3.org/2001/XMLSchema#string",
                                }
                            ],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0601"}
                            ],
                            "http://purl.org/spar/pro/isDocumentContextFor": [
                                {"@id": "https://w3id.org/oc/meta/ar/0603"},
                                {"@id": "https://w3id.org/oc/meta/ar/0601"},
                                {"@id": "https://w3id.org/oc/meta/ar/0604"},
                                {"@id": "https://w3id.org/oc/meta/ar/0602"},
                                {"@id": "https://w3id.org/oc/meta/ar/0605"},
                            ],
                            "http://purl.org/vocab/frbr/core#embodiment": [
                                {"@id": "https://w3id.org/oc/meta/re/0601"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0604"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0604",
                            "@type": [
                                "http://purl.org/spar/fabio/JournalIssue",
                                "http://purl.org/spar/fabio/Expression",
                            ],
                            "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                                {"@value": "1", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0603"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0602",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/Journal",
                            ],
                            "http://purl.org/dc/terms/title": [
                                {"@value": "The Korean Journal Of Food And Nutrition", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0602"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/br/0603",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalVolume",
                            ],
                            "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                                {"@value": "25", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://purl.org/vocab/frbr/core#partOf": [
                                {"@id": "https://w3id.org/oc/meta/br/0602"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/br/",
                }
            ],
            "id": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/id/0605",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/orcid"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "0000-0002-9666-2513", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0601",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/doi"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "10.9799/ksfan.2012.25.1.069", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0603",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/orcid"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "0000-0003-2542-5788", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0604",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/crossref"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "4768", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/id/0602",
                            "@type": ["http://purl.org/spar/datacite/Identifier"],
                            "http://purl.org/spar/datacite/usesIdentifierScheme": [
                                {"@id": "http://purl.org/spar/datacite/issn"}
                            ],
                            "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                                {"@value": "1225-4339", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/id/",
                }
            ],
            "ra": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0605",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0605"}
                            ],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Chung", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Myong-Soo", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0602",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Mun", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Ji-Hye", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0604",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0604"}
                            ],
                            "http://xmlns.com/foaf/0.1/name": [
                                {"@value": "The Korean Society Of Food And Nutrition", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0603",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Chung", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Myong-Soo", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                        {
                            "@id": "https://w3id.org/oc/meta/ra/0601",
                            "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                            "http://purl.org/spar/datacite/hasIdentifier": [
                                {"@id": "https://w3id.org/oc/meta/id/0603"}
                            ],
                            "http://xmlns.com/foaf/0.1/familyName": [
                                {"@value": "Cheigh", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://xmlns.com/foaf/0.1/givenName": [
                                {"@value": "Chan-Ick", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        },
                    ],
                    "@id": "https://w3id.org/oc/meta/ra/",
                }
            ],
            "re": [
                {
                    "@graph": [
                        {
                            "@id": "https://w3id.org/oc/meta/re/0601",
                            "@type": ["http://purl.org/spar/fabio/Manifestation"],
                            "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
                                {"@value": "76", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
                                {"@value": "69", "@type": "http://www.w3.org/2001/XMLSchema#string"}
                            ],
                        }
                    ],
                    "@id": "https://w3id.org/oc/meta/re/",
                }
            ],
        }

        processed_output = {}
        for entity_type, entity_data in output.items():
            processed_output[entity_type] = []
            for graph_container in entity_data:
                filtered_graph = []
                for entity in graph_container['@graph']:
                    filtered_entity = {
                        '@id': entity['@id']
                    }
                    for pred, obj in entity.items():
                        if pred != '@id':  # Only exclude @id since we already added it
                            filtered_entity[pred] = obj

                    if len(filtered_entity) > 1:  # Only include if it has predicates beyond @id
                        filtered_graph.append(filtered_entity)

                # Sort the graph by @id
                filtered_graph = sorted(filtered_graph, key=lambda x: x['@id'])

                processed_output[entity_type].append({
                    '@graph': filtered_graph,
                    '@id': graph_container['@id']
                })
        # For each entity type in the expected output, verify that all expected entities exist
        # with their expected properties in the actual output from the triplestore
        for entity_type, expected_graphs in expected_output.items():
            self.assertIn(entity_type, processed_output, f"Entity type {entity_type} missing from triplestore output")

            for expected_graph in expected_graphs:
                expected_entities = expected_graph['@graph']

                # Find the corresponding graph in the processed output
                actual_graph = None
                for graph in processed_output[entity_type]:
                    if graph['@id'] == expected_graph['@id']:
                        actual_graph = graph
                        break

                self.assertIsNotNone(actual_graph, f"Graph {expected_graph['@id']} not found in triplestore output")

                # For each expected entity, verify it exists with all expected properties
                for expected_entity in expected_entities:
                    entity_id = expected_entity['@id']

                    # Find the entity in the actual graph
                    actual_entity = None
                    for entity in actual_graph['@graph']:
                        if entity['@id'] == entity_id:
                            actual_entity = entity
                            break

                    self.assertIsNotNone(actual_entity, f"Entity {entity_id} not found in triplestore output")

                    # Check that all expected predicates and objects exist
                    for pred, expected_objects in expected_entity.items():
                        if pred != '@id':
                            self.assertIn(pred, actual_entity, f"Predicate {pred} missing for entity {entity_id}")

                            # For each expected object, verify it exists in the actual objects
                            for expected_obj in expected_objects:
                                found = False
                                for actual_obj in actual_entity[pred]:
                                    # Require exact matches for all objects
                                    if expected_obj == actual_obj:
                                        found = True
                                        break

                                self.assertTrue(found, f"Object {expected_obj} not found for predicate {pred} of entity {entity_id}\nActual values: {actual_entity[pred]}")

        if os.path.exists(output_folder):
            shutil.rmtree(output_folder)

        self.assertFalse(
            "Reader: ERROR" in proc.stdout or "Storer: ERROR" in proc.stdout
        )

    def test_silencer_on(self):
        output_folder = os.path.join(BASE_DIR, "output_6")
        now = datetime.now()
        meta_config_path = os.path.join(BASE_DIR, "meta_config_6.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(
            BASE_DIR, "same_as_input_2_with_other_authors"
        )
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_agents = """
            PREFIX pro: <http://purl.org/spar/pro/>
            SELECT (COUNT (?agent) AS ?agent_count)
            WHERE {
                <https://w3id.org/oc/meta/br/0601> pro:isDocumentContextFor ?agent.
            }
        """
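        # With the silencer active (presumably configured in meta_config_6.yaml),
        # the second run over the same article with different authors should not
        # attach new agent roles, so the count is expected to stay at 3.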
        result = execute_sparql_query(SERVER, query_agents)
        expected_result = {
            "head": {"link": [], "vars": ["agent_count"]},
            "results": {
                "distinct": False,
                "ordered": True,
                "bindings": [
                    {
                        "agent_count": {
                            "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                            "type": "typed-literal",
                            "value": "3",
                        }
                    }
                ],
            },
        }
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(result, expected_result)

    def test_silencer_off(self):
        output_folder = os.path.join(BASE_DIR, "output_7")
        now = datetime.now()
        meta_config_path = os.path.join(BASE_DIR, "meta_config_7.yaml")
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        settings["input_csv_dir"] = os.path.join(
            BASE_DIR, "same_as_input_2_with_other_authors"
        )
        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_agents = """
            PREFIX pro: <http://purl.org/spar/pro/>
            SELECT (COUNT (?agent) AS ?agent_count)
            WHERE {
                <https://w3id.org/oc/meta/br/0601> pro:isDocumentContextFor ?agent.
            }
        """
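        # Without the silencer, the second run should add roles for the extra
        # authors, so the count is expected to rise to 6.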
        result = execute_sparql_query(SERVER, query_agents)
        expected_result = {
            "head": {"link": [], "vars": ["agent_count"]},
            "results": {
                "distinct": False,
                "ordered": True,
                "bindings": [
                    {
                        "agent_count": {
                            "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                            "type": "typed-literal",
                            "value": "6",
                        }
                    }
                ],
            },
        }
        shutil.rmtree(output_folder)
        delete_output_zip(".", now)
        self.assertEqual(result, expected_result)

    def test_omid_in_input_data(self):
        output_folder = os.path.join(BASE_DIR, "output_8")
        now = datetime.now()
        meta_config_path_without_openalex = os.path.join(BASE_DIR, "meta_config_8.yaml")
        meta_config_path_with_openalex = os.path.join(BASE_DIR, "meta_config_9.yaml")
        with open(meta_config_path_without_openalex, encoding="utf-8") as file:
            settings_without_openalex = yaml.full_load(file)
        with open(meta_config_path_with_openalex, encoding="utf-8") as file:
            settings_with_openalex = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        cache_settings = {
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }
        settings_without_openalex.update(cache_settings)
        settings_with_openalex.update(cache_settings)

        run_meta_process(
            settings=settings_without_openalex,
            meta_config_path=meta_config_path_without_openalex,
        )
        run_meta_process(
            settings=settings_with_openalex,
            meta_config_path=meta_config_path_with_openalex,
        )
        query_all = """
            PREFIX fabio: <http://purl.org/spar/fabio/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            CONSTRUCT {?s ?p ?o. ?id ?id_p ?id_o.}
            WHERE {
                ?s a fabio:JournalArticle;
                    ?p ?o.
                ?s datacite:hasIdentifier ?id.
                ?id ?id_p ?id_o.
            }
        """
        result = execute_sparql_query(SERVER, query_all, return_format=XML)
        expected_result = Graph()
        expected_result.parse(
            location=os.path.join(BASE_DIR, "test_omid_in_input_data.json"),
            format="json-ld",
        )
        prov_graph = ConjunctiveGraph()
        for dirpath, dirnames, filenames in os.walk(os.path.join(output_folder, "rdf")):
            if "br" in dirpath and "prov" in dirpath:
                for filename in filenames:
                    prov_graph.parse(
                        source=os.path.join(dirpath, filename), format="json-ld"
                    )

        expected_prov_graph = ConjunctiveGraph()
        expected_prov_graph.parse(
            os.path.join(BASE_DIR, "test_omid_in_input_data_prov.json"),
            format="json-ld",
        )
        prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#generatedAtTime"), None)
        )
        expected_prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#generatedAtTime"), None)
        )
        prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#invalidatedAtTime"), None)
        )
        expected_prov_graph.remove(
            (None, URIRef("http://www.w3.org/ns/prov#invalidatedAtTime"), None)
        )
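        # generatedAtTime and invalidatedAtTime vary from run to run, so both
        # graphs are stripped of them before any structural comparison.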
        shutil.rmtree(output_folder)
        self.assertTrue(
            normalize_graph(result).isomorphic(normalize_graph(expected_result))
        )

    def test_publishers_sequence(self):
        output_folder = os.path.join(BASE_DIR, "output_9")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_10.yaml")
        now = datetime.now()
        with open(meta_config_path, encoding="utf-8") as file:
            settings = yaml.full_load(file)

        # Update settings with temporary files and Redis cache DB
        settings.update(
            {
                "redis_cache_db": 2,
                "ts_upload_cache": self.cache_file,
                "ts_failed_queries": self.failed_file,
                "ts_stop_file": self.stop_file,
            }
        )

        run_meta_process(settings=settings, meta_config_path=meta_config_path)
        query_all = """
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            CONSTRUCT {?br ?p ?o. ?o ?op ?oo. ?oo ?oop ?ooo. ?ooo ?ooop ?oooo.}
            WHERE {
                ?id literal:hasLiteralValue "10.17117/na.2015.08.1067"^^<http://www.w3.org/2001/XMLSchema#string>;
                    datacite:usesIdentifierScheme datacite:doi;
                    ^datacite:hasIdentifier ?br.
                ?br ?p ?o.
                ?o ?op ?oo.
                ?oo ?oop ?ooo.
                ?ooo ?ooop ?oooo.
            }
        """
        result = execute_sparql_query(SERVER, query_all, return_format=XML)
        expected_result = Graph()
        expected_result.parse(
            os.path.join(BASE_DIR, "test_publishers_sequence.json"), format="json-ld"
        )
        shutil.rmtree(output_folder)
        self.assertTrue(
            normalize_graph(result).isomorphic(normalize_graph(expected_result))
        )

    def test_duplicate_omids_with_datatype(self):
        output_folder = os.path.join(BASE_DIR, "output_duplicate_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_duplicate.yaml")

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_duplicate"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "workers_number": 1,
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        # Setup: create test data
        os.makedirs(os.path.join(BASE_DIR, "input_duplicate"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_duplicate", "test.csv"), "w", encoding="utf-8"
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "issn:2543-3288 issn:2078-7685",  # Exact problematic row from production
                    "Journal of Diabetology",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "journal",
                    "Medknow [crossref:2581]",
                    "",
                ]
            )

        # Setup: Insert pre-existing identifiers and BRs in triplestore
        sparql = SPARQLWrapper(SERVER)
        sparql.setMethod(POST)
        sparql.setQuery(
            """
            INSERT DATA {
                GRAPH <https://w3id.org/oc/meta/br/> {
                    <https://w3id.org/oc/meta/br/0601> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> .
                    <https://w3id.org/oc/meta/br/0602> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0602> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> .
                }
                GRAPH <https://w3id.org/oc/meta/id/> {
                    <https://w3id.org/oc/meta/id/0601> <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "2078-7685" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    <https://w3id.org/oc/meta/id/0602> <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "2543-3288" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                }
            }
            """
        )
        sparql.query()

        # Update Redis counters to match the inserted data
        redis_handler = RedisCounterHandler(db=5)  # Use test db
        redis_handler.set_counter(
            2, "br", supplier_prefix="060"
        )  # BR counter for two BRs
        redis_handler.set_counter(
            2, "id", supplier_prefix="060"
        )  # ID counter for two IDs
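        # Seeding both counters at 2 means the run mints new OMIDs starting
        # from br/0603 and id/0603; a correct run should therefore reuse
        # id/0601 and id/0602 for the two pre-existing ISSNs instead of
        # creating duplicates.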

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Check for errors
        errors_file = os.path.join(output_folder, "errors.txt")
        if os.path.exists(errors_file):
            with open(errors_file, "r") as f:
                errors = f.read()
                print(f"Errors found:\n{errors}")

        # Query to check for duplicates
        query = """
            SELECT DISTINCT ?id ?value
            WHERE {
                GRAPH <https://w3id.org/oc/meta/id/> {
                    ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    FILTER(?value IN ("2078-7685"^^<http://www.w3.org/2001/XMLSchema#string>, "2078-7685",
                                      "2543-3288"^^<http://www.w3.org/2001/XMLSchema#string>, "2543-3288"))
                }
            }
        """
        result = execute_sparql_query(SERVER, query, return_format=JSON)
        # Group IDs by value to check for duplicates
        ids_by_value = {}
        for binding in result["results"]["bindings"]:
            value = binding["value"]["value"]
            id = binding["id"]["value"]
            if value not in ids_by_value:
                ids_by_value[value] = []
            ids_by_value[value].append(id)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(os.path.join(BASE_DIR, "input_duplicate"), ignore_errors=True)
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)

        # Check that we have both ISSNs and no duplicates
        for issn_value, ids in ids_by_value.items():
            self.assertEqual(
                len(ids), 1, f"Found multiple IDs for ISSN {issn_value}: {ids}"
            )

        self.assertEqual(
            len(ids_by_value),
            2,
            f"Expected 2 ISSNs, found {len(ids_by_value)}: {list(ids_by_value.keys())}",
        )

    def test_duplicate_omids_with_venue_datatype(self):
        """Test to verify that identifiers are not duplicated when merging previously unconnected venues"""
        output_folder = os.path.join(BASE_DIR, "output_duplicate_venue_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_duplicate_venue.yaml")

        # Setup: create test data
        os.makedirs(os.path.join(BASE_DIR, "input_duplicate_venue"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_duplicate_venue", "test.csv"),
            "w",
            encoding="utf-8",
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "issn:1756-1833",
                    "BMJ",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "journal",
                    "BMJ [crossref:239]",
                    "",
                ]
            )
            writer.writerow(
                [
                    "",  # id
                    "",  # title
                    "",  # author
                    "",  # pub_date
                    "BMJ [issn:0267-0623 issn:0959-8138 issn:1468-5833 issn:0007-1447]",  # venue
                    "283",  # volume
                    "",  # issue
                    "",  # page
                    "journal volume",  # type
                    "BMJ [crossref:239]",  # publisher
                    "",  # editor
                ]
            )

        # Setup: insert pre-existing data - add the initial identifiers
        sparql = SPARQLWrapper(SERVER)
        sparql.setMethod(POST)
        sparql.setQuery(
            """
            INSERT DATA {
                GRAPH <https://w3id.org/oc/meta/br/> {
                    # First venue - BMJ with initial ISSNs
                    <https://w3id.org/oc/meta/br/0601>
                        <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601>, <https://w3id.org/oc/meta/id/0602> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
                        <http://purl.org/dc/terms/title> "BMJ" .

                    # Second venue
                    <https://w3id.org/oc/meta/br/0602>
                        <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0603> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
                        <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
                        <http://purl.org/dc/terms/title> "British Medical Journal" .
                }
                GRAPH <https://w3id.org/oc/meta/id/> {
                    # First venue's ISSNs
                    <https://w3id.org/oc/meta/id/0601>
                        <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    <https://w3id.org/oc/meta/id/0602>
                        <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0959-8138" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                    # Second venue's ISSN
                    <https://w3id.org/oc/meta/id/0603>
                        <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "0267-0623" ;
                        <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                }
            }
            """
        )
        sparql.query()

        # Update Redis counters for the pre-existing entities
        redis_handler = RedisCounterHandler(db=5)
        redis_handler.set_counter(
            6, "br", supplier_prefix="060"
        )  # Updated to account for 6 entities (2 venues + 4 volumes)
        redis_handler.set_counter(
            3, "id", supplier_prefix="060"
        )  # Correct: 3 IDs (1756-1833, 0959-8138, 0267-0623)
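        # With br seeded at 6 and id at 3, newly minted entities start from
        # br/0607 and id/0604; the run should connect the two pre-existing
        # venue records through their shared ISSNs rather than minting fresh
        # identifiers for them.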

        # Create test settings
        settings = {
            "triplestore_url": SERVER,
            "provenance_triplestore_url": PROV_SERVER,
            "input_csv_dir": os.path.join(BASE_DIR, "input_duplicate_venue"),
            "base_output_dir": output_folder,
            "output_rdf_dir": output_folder,
            "resp_agent": "test",
            "base_iri": "https://w3id.org/oc/meta/",
            "context_path": None,
            "dir_split_number": 10000,
            "items_per_file": 1000,
            "default_dir": "_",
            "rdf_output_in_chunks": False,
            "zip_output_rdf": True,
            "source": None,
            "supplier_prefix": "060",
            "workers_number": 1,
            "use_doi_api_service": False,
            "blazegraph_full_text_search": False,
            "virtuoso_full_text_search": True,
            "fuseki_full_text_search": False,
            "graphdb_connector_name": None,
            "cache_endpoint": None,
            "cache_update_endpoint": None,
            "silencer": [],
            "redis_cache_db": 2,
            "ts_upload_cache": self.cache_file,
            "ts_failed_queries": self.failed_file,
            "ts_stop_file": self.stop_file,
        }

        with open(meta_config_path, "w") as f:
            yaml.dump(settings, f)

        # Run the process
        run_meta_process(settings=settings, meta_config_path=meta_config_path)

        # Query to check for duplicates - check all ISSNs
        query = """
            SELECT DISTINCT ?id ?value
            WHERE {
                ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?value ;
                    <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
                FILTER(STR(?value) IN ("1756-1833", "0959-8138", "0267-0623"))
            }
        """
        result = execute_sparql_query(SERVER, query, return_format=JSON)
        # Group IDs by value to check for duplicates
        ids_by_value = {}
        for binding in result["results"]["bindings"]:
            value = binding["value"]["value"]
            id = binding["id"]["value"]
            if value not in ids_by_value:
                ids_by_value[value] = []
            ids_by_value[value].append(id)

        # Cleanup
        shutil.rmtree(output_folder, ignore_errors=True)
        shutil.rmtree(
            os.path.join(BASE_DIR, "input_duplicate_venue"), ignore_errors=True
        )
        if os.path.exists(meta_config_path):
            os.remove(meta_config_path)

        # Check that we don't have duplicate IDs for any ISSN
        for issn_value, ids in ids_by_value.items():
            self.assertEqual(
                len(ids), 1, f"Found multiple IDs for ISSN {issn_value} in venue: {ids}"
            )

        # Verify that pre-existing IDs were reused
        self.assertTrue(
            any("0601" in id for ids in ids_by_value.values() for id in ids)
            and any("0602" in id for ids in ids_by_value.values() for id in ids),
            "Pre-existing IDs were not reused",
        )

    def test_doi_with_multiple_slashes(self):
        """Test handling of DOIs containing multiple forward slashes"""
        output_folder = os.path.join(BASE_DIR, "output_doi_test")
        meta_config_path = os.path.join(BASE_DIR, "meta_config_doi.yaml")

        # Setup: create test data with problematic DOI
        os.makedirs(os.path.join(BASE_DIR, "input_doi"), exist_ok=True)
        with open(
            os.path.join(BASE_DIR, "input_doi", "test.csv"), "w", encoding="utf-8"
        ) as f:
            writer = csv.writer(f)
            writer.writerow(
                [
                    "id",
                    "title",
                    "author",
                    "pub_date",
                    "venue",
                    "volume",
                    "issue",
                    "page",
                    "type",
                    "publisher",
                    "editor",
                ]
            )
            writer.writerow(
                [
                    "doi:10.1093/acprof:oso/9780199230723.001.0001",  # Problematic DOI with multiple slashes
                    "Test Book",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "",
                    "book",
                    "",
                    "",
                ]
            )
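        # DOIs of this form contain a colon and a second slash after the
        # registrant prefix; the assertions below check that the full string
        # survives as a single identifier literal.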
1670 # Create test settings
1671 settings = {
1672 "triplestore_url": SERVER,
1673 "provenance_triplestore_url": PROV_SERVER,
1674 "input_csv_dir": os.path.join(BASE_DIR, "input_doi"),
1675 "base_output_dir": output_folder,
1676 "output_rdf_dir": output_folder,
1677 "resp_agent": "test",
1678 "base_iri": "https://w3id.org/oc/meta/",
1679 "context_path": None,
1680 "dir_split_number": 10000,
1681 "items_per_file": 1000,
1682 "default_dir": "_",
1683 "rdf_output_in_chunks": False,
1684 "zip_output_rdf": True,
1685 "source": None,
1686 "supplier_prefix": "060",
1687 "workers_number": 1,
1688 "use_doi_api_service": False,
1689 "blazegraph_full_text_search": False,
1690 "virtuoso_full_text_search": True,
1691 "fuseki_full_text_search": False,
1692 "graphdb_connector_name": None,
1693 "cache_endpoint": None,
1694 "cache_update_endpoint": None,
1695 "silencer": [],
1696 "redis_cache_db": 2,
1697 "ts_upload_cache": self.cache_file,
1698 "ts_failed_queries": self.failed_file,
1699 "ts_stop_file": self.stop_file,
1700 }
1702 with open(meta_config_path, "w") as f:
1703 yaml.dump(settings, f)
1705 now = datetime.now()
1707 # Run the process
1708 run_meta_process(settings=settings, meta_config_path=meta_config_path)
1710 # Query to verify DOI was processed correctly
1711 query = """
1712 SELECT ?br ?id ?value
1713 WHERE {
1714 ?id <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "10.1093/acprof:oso/9780199230723.001.0001"^^<http://www.w3.org/2001/XMLSchema#string> ;
1715 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/doi> ;
1716 ^<http://purl.org/spar/datacite/hasIdentifier> ?br .
1717 }
1718 """
1719 result = execute_sparql_query(SERVER, query, return_format=JSON)
1721 # Cleanup
1722 shutil.rmtree(output_folder, ignore_errors=True)
1723 shutil.rmtree(os.path.join(BASE_DIR, "input_doi"), ignore_errors=True)
1724 if os.path.exists(meta_config_path):
1725 os.remove(meta_config_path)
1726 delete_output_zip(".", now)
1728 # Verify results
1729 self.assertTrue(
1730 len(result["results"]["bindings"]) > 0,
1731 "DOI with multiple slashes was not processed correctly",
1732 )
1734 # Check that we got exactly one result
1735 self.assertEqual(
1736 len(result["results"]["bindings"]),
1737 1,
1738 f"Expected 1 result, got {len(result['results']['bindings'])}",
1739 )
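# Editor's sketch (a helper of ours, not oc_meta API): a DOI's prefix and
# suffix may only be split at the FIRST slash, because the suffix itself can
# contain any number of slashes, as in the identifier exercised above.
@staticmethod
def _split_doi(doi: str):
    prefix, _, suffix = doi.partition("/")
    return prefix, suffix  # e.g. ("10.1093", "acprof:oso/9780199230723.001.0001")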
1741 def test_volume_issue_deduplication(self):
1742 """Test to verify that volumes and issues are properly deduplicated"""
1743 output_folder = os.path.join(BASE_DIR, "output_vvi_test")
1744 meta_config_path = os.path.join(BASE_DIR, "meta_config_vvi.yaml")
1746 # Setup: create test data
1747 os.makedirs(os.path.join(BASE_DIR, "input_vvi"), exist_ok=True)
1748 with open(
1749 os.path.join(BASE_DIR, "input_vvi", "test.csv"), "w", encoding="utf-8"
1750 ) as f:
1751 writer = csv.writer(f)
1752 writer.writerow(
1753 [
1754 "id",
1755 "title",
1756 "author",
1757 "pub_date",
1758 "venue",
1759 "volume",
1760 "issue",
1761 "page",
1762 "type",
1763 "publisher",
1764 "editor",
1765 ]
1766 )
1767 # First article in volume 1, issue 1
1768 writer.writerow(
1769 [
1770 "doi:10.1234/test.1",
1771 "First Article",
1772 "",
1773 "2023",
1774 "Test Journal [issn:1756-1833]",
1775 "1",
1776 "1",
1777 "1-10",
1778 "journal article",
1779 "",
1780 "",
1781 ]
1782 )
1783 # Second article in same volume and issue
1784 writer.writerow(
1785 [
1786 "doi:10.1234/test.2",
1787 "Second Article",
1788 "",
1789 "2023",
1790 "Test Journal [issn:1756-1833]",
1791 "1",
1792 "1",
1793 "11-20",
1794 "journal article",
1795 "",
1796 "",
1797 ]
1798 )
1800 # Create test settings
1801 settings = {
1802 "triplestore_url": SERVER,
1803 "provenance_triplestore_url": PROV_SERVER,
1804 "input_csv_dir": os.path.join(BASE_DIR, "input_vvi"),
1805 "base_output_dir": output_folder,
1806 "output_rdf_dir": output_folder,
1807 "resp_agent": "test",
1808 "base_iri": "https://w3id.org/oc/meta/",
1809 "context_path": None,
1810 "dir_split_number": 10000,
1811 "items_per_file": 1000,
1812 "default_dir": "_",
1813 "rdf_output_in_chunks": False,
1814 "zip_output_rdf": True,
1815 "source": None,
1816 "supplier_prefix": "060",
1817 "workers_number": 1,
1818 "use_doi_api_service": False,
1819 "blazegraph_full_text_search": False,
1820 "virtuoso_full_text_search": True,
1821 "fuseki_full_text_search": False,
1822 "graphdb_connector_name": None,
1823 "cache_endpoint": None,
1824 "cache_update_endpoint": None,
1825 "silencer": [],
1826 "redis_cache_db": 2,
1827 "ts_upload_cache": self.cache_file,
1828 "ts_failed_queries": self.failed_file,
1829 "ts_stop_file": self.stop_file,
1830 }
1832 with open(meta_config_path, "w") as f:
1833 yaml.dump(settings, f)
1835 # Run the process
1836 run_meta_process(settings=settings, meta_config_path=meta_config_path)
1838 # Query to check volume and issue structure
1839 query = """
1840 PREFIX fabio: <http://purl.org/spar/fabio/>
1841 PREFIX frbr: <http://purl.org/vocab/frbr/core#>
1842 PREFIX prism: <http://prismstandard.org/namespaces/basic/2.0/>
1844 SELECT ?article ?volume ?issue ?seq_id
1845 WHERE {
1846 ?article a fabio:JournalArticle ;
1847 frbr:partOf ?issue .
1848 ?issue a fabio:JournalIssue ;
1849 fabio:hasSequenceIdentifier ?seq_id ;
1850 frbr:partOf ?volume .
1851 ?volume a fabio:JournalVolume .
1852 }
1853 ORDER BY ?article
1854 """
1856 result = execute_sparql_query(SERVER, query)
1858 # Cleanup
1859 shutil.rmtree(output_folder, ignore_errors=True)
1860 shutil.rmtree(os.path.join(BASE_DIR, "input_vvi"), ignore_errors=True)
1861 if os.path.exists(meta_config_path):
1862 os.remove(meta_config_path)
1864 # Verify results
1865 bindings = result["results"]["bindings"]
1867 # Should have 2 articles
1868 self.assertEqual(len(bindings), 2, "Expected 2 articles")
1870 # Both articles should reference the same volume and issue
1871 first_volume = bindings[0]["volume"]["value"]
1872 first_issue = bindings[0]["issue"]["value"]
1874 for binding in bindings[1:]:
1875 self.assertEqual(
1876 binding["volume"]["value"],
1877 first_volume,
1878 "Articles reference different volumes",
1879 )
1880 self.assertEqual(
1881 binding["issue"]["value"],
1882 first_issue,
1883 "Articles reference different issues",
1884 )
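# Editor's note (inferred from this test, not from oc_meta internals): a
# volume or issue counts as "the same" when both its container and its
# fabio:hasSequenceIdentifier match, so the two rows above share a single
# JournalVolume "1" and a single JournalIssue "1" instead of minting four
# new entities.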
1886 def test_volume_issue_deduplication_with_triplestore(self):
1887 """Test that volumes and issues are properly deduplicated when they already exist in the triplestore"""
1888 output_folder = os.path.join(BASE_DIR, "output_vvi_triplestore_test")
1889 meta_config_path = os.path.join(BASE_DIR, "meta_config_vvi_triplestore.yaml")
1891 # Setup: Insert pre-existing venue with duplicate volumes and issues (with/without datatype)
1892 sparql = SPARQLWrapper(SERVER)
1893 sparql.setMethod(POST)
1894 sparql.setQuery(
1895 """
1896 INSERT DATA {
1897 GRAPH <https://w3id.org/oc/meta/br/> {
1898 # Venue
1899 <https://w3id.org/oc/meta/br/0601>
1900 <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0601> ;
1901 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Journal> ;
1902 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1903 <http://purl.org/dc/terms/title> "Test Journal" .
1905 # Volume 1 (without datatype)
1906 <https://w3id.org/oc/meta/br/0602>
1907 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalVolume> ;
1908 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1909 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0601> ;
1910 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1" .
1912 # Volume 1 (with datatype)
1913 <https://w3id.org/oc/meta/br/0604>
1914 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalVolume> ;
1915 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1916 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0601> ;
1917 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1"^^<http://www.w3.org/2001/XMLSchema#string> .
1919 # Issue 1 (without datatype)
1920 <https://w3id.org/oc/meta/br/0603>
1921 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalIssue> ;
1922 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1923 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0602> ;
1924 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1" .
1926 # Issue 1 (with datatype)
1927 <https://w3id.org/oc/meta/br/0605>
1928 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/JournalIssue> ;
1929 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/spar/fabio/Expression> ;
1930 <http://purl.org/vocab/frbr/core#partOf> <https://w3id.org/oc/meta/br/0604> ;
1931 <http://purl.org/spar/fabio/hasSequenceIdentifier> "1"^^<http://www.w3.org/2001/XMLSchema#string> .
1932 }
1933 GRAPH <https://w3id.org/oc/meta/id/> {
1934 <https://w3id.org/oc/meta/id/0601>
1935 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> "1756-1833" ;
1936 <http://purl.org/spar/datacite/usesIdentifierScheme> <http://purl.org/spar/datacite/issn> .
1937 }
1938 }
1939 """
1940 )
1941 sparql.query()
1943 # Update Redis counters for pre-existing entities
1944 redis_handler = RedisCounterHandler(db=5)
1945 redis_handler.set_counter(
1946 5, "br", supplier_prefix="060"
1947 ) # 5 entities: venue, 2 volumes, 2 issues
1948 redis_handler.set_counter(
1949 1, "id", supplier_prefix="060"
1950 ) # 1 identifier for venue
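# Sanity-check sketch (assumption: RedisCounterHandler exposes a read_counter
# symmetric to set_counter; verify against oc_ocdm before relying on it):
#
#     assert redis_handler.read_counter("br", supplier_prefix="060") == 5
#     assert redis_handler.read_counter("id", supplier_prefix="060") == 1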
1952 # Create test data - article that should use existing volume and issue
1953 os.makedirs(os.path.join(BASE_DIR, "input_vvi_triplestore"), exist_ok=True)
1954 with open(
1955 os.path.join(BASE_DIR, "input_vvi_triplestore", "test.csv"),
1956 "w",
1957 encoding="utf-8",
1958 ) as f:
1959 writer = csv.writer(f)
1960 writer.writerow(
1961 [
1962 "id",
1963 "title",
1964 "author",
1965 "pub_date",
1966 "venue",
1967 "volume",
1968 "issue",
1969 "page",
1970 "type",
1971 "publisher",
1972 "editor",
1973 ]
1974 )
1975 writer.writerow(
1976 [
1977 "doi:10.1234/test.1",
1978 "Test Article",
1979 "",
1980 "2023",
1981 "Test Journal [issn:1756-1833]",
1982 "1", # Should match existing volume
1983 "1", # Should match existing issue
1984 "1-10",
1985 "journal article",
1986 "",
1987 "",
1988 ]
1989 )
1991 # Create test settings
1992 settings = {
1993 "triplestore_url": SERVER,
1994 "provenance_triplestore_url": PROV_SERVER,
1995 "input_csv_dir": os.path.join(BASE_DIR, "input_vvi_triplestore"),
1996 "base_output_dir": output_folder,
1997 "output_rdf_dir": output_folder,
1998 "resp_agent": "test",
1999 "base_iri": "https://w3id.org/oc/meta/",
2000 "context_path": None,
2001 "dir_split_number": 10000,
2002 "items_per_file": 1000,
2003 "default_dir": "_",
2004 "rdf_output_in_chunks": False,
2005 "zip_output_rdf": True,
2006 "source": None,
2007 "supplier_prefix": "060",
2008 "workers_number": 1,
2009 "use_doi_api_service": False,
2010 "blazegraph_full_text_search": False,
2011 "virtuoso_full_text_search": True,
2012 "fuseki_full_text_search": False,
2013 "graphdb_connector_name": None,
2014 "cache_endpoint": None,
2015 "cache_update_endpoint": None,
2016 "silencer": [],
2017 "redis_cache_db": 2,
2018 "ts_upload_cache": self.cache_file,
2019 "ts_failed_queries": self.failed_file,
2020 "ts_stop_file": self.stop_file,
2021 }
2023 with open(meta_config_path, "w") as f:
2024 yaml.dump(settings, f)
2026 # Run the process
2027 run_meta_process(settings=settings, meta_config_path=meta_config_path)
2029 # Check if new volumes/issues were created
2030 to_be_uploaded_dir = os.path.join(output_folder, "rdf", "to_be_uploaded")
2031 new_entities_created = False
2032 if os.path.exists(to_be_uploaded_dir):
2033 for dirpath, _, filenames in os.walk(to_be_uploaded_dir):
2034 for f in filenames:
2035 if f.endswith(".sparql"):
2036 with open(os.path.join(dirpath, f)) as file:
2037 content = file.read()
2038 if any(
2039 "JournalVolume" in line or "JournalIssue" in line
2040 for line in content.splitlines()
2041 ):
1942 print(f"\nFound new volume/issue creation in {f}")
2043 new_entities_created = True
2045 # Query to get all entities and their relationships
2046 query = """
2047 PREFIX fabio: <http://purl.org/spar/fabio/>
2048 PREFIX frbr: <http://purl.org/vocab/frbr/core#>
2049 PREFIX datacite: <http://purl.org/spar/datacite/>
2051 SELECT DISTINCT ?article ?venue ?volume ?issue ?issn
2052 WHERE {
2053 ?article a fabio:JournalArticle ;
2054 frbr:partOf ?issue .
2055 ?issue a fabio:JournalIssue ;
2056 frbr:partOf ?volume .
2057 ?volume a fabio:JournalVolume ;
2058 frbr:partOf ?venue .
2059 ?venue datacite:hasIdentifier ?id .
2060 ?id datacite:usesIdentifierScheme datacite:issn ;
2061 <http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue> ?issn .
2062 }
2063 """
2065 result = execute_sparql_query(SERVER, query)
2067 # Cleanup
2068 shutil.rmtree(output_folder, ignore_errors=True)
2069 shutil.rmtree(
2070 os.path.join(BASE_DIR, "input_vvi_triplestore"), ignore_errors=True
2071 )
2072 if os.path.exists(meta_config_path):
2073 os.remove(meta_config_path)
2075 # Verify results
2076 bindings = result["results"]["bindings"]
2077 self.assertEqual(len(bindings), 1, "Expected exactly one article")
2079 # Get the URIs from the result
2080 venue_uri = bindings[0]["venue"]["value"]
2081 volume_uri = bindings[0]["volume"]["value"]
2082 issue_uri = bindings[0]["issue"]["value"]
2083 issn = bindings[0]["issn"]["value"]
2085 # Check if venue was deduplicated (should use existing venue)
2086 self.assertEqual(
2087 venue_uri,
2088 "https://w3id.org/oc/meta/br/0601",
2089 "Venue was not deduplicated correctly",
2090 )
2092 # Check if volume was deduplicated - either version is valid
2093 self.assertIn(
2094 volume_uri,
2095 ["https://w3id.org/oc/meta/br/0602", "https://w3id.org/oc/meta/br/0604"],
2096 "Volume was not deduplicated correctly - should use one of the existing volumes",
2097 )
2099 # Check if issue was deduplicated - either version is valid
2100 self.assertIn(
2101 issue_uri,
2102 ["https://w3id.org/oc/meta/br/0603", "https://w3id.org/oc/meta/br/0605"],
2103 "Issue was not deduplicated correctly - should use one of the existing issues",
2104 )
2106 # Check ISSN
2107 self.assertEqual(issn, "1756-1833", "ISSN does not match")
2109 # Verify no new volumes/issues were created
2110 self.assertFalse(
2111 new_entities_created,
2112 "New volumes/issues were created when they should have been deduplicated",
2113 )
2115 def test_temporary_identifiers(self):
2116 """Test that temporary identifiers are used for deduplication but not saved, and an OMID is generated"""
2117 output_folder = os.path.join(BASE_DIR, "output_temp_id_test")
2118 meta_config_path = os.path.join(BASE_DIR, "meta_config_temp.yaml")
2120 # Setup: create test data with only temporary identifier
2121 os.makedirs(os.path.join(BASE_DIR, "input_temp"), exist_ok=True)
2122 with open(
2123 os.path.join(BASE_DIR, "input_temp", "test.csv"), "w", encoding="utf-8"
2124 ) as f:
2125 writer = csv.writer(f)
2126 writer.writerow(
2127 [
2128 "id",
2129 "title",
2130 "author",
2131 "pub_date",
2132 "venue",
2133 "volume",
2134 "issue",
2135 "page",
2136 "type",
2137 "publisher",
2138 "editor",
2139 ]
2140 )
2141 writer.writerow(
2142 [
2143 "temp:567", # Only temporary identifier
2144 "Test Article",
2145 "",
2146 "2023",
2147 "",
2148 "",
2149 "",
2150 "",
2151 "journal article",
2152 "",
2153 "",
2154 ]
2155 )
2157 # Create test settings
2158 settings = {
2159 "triplestore_url": SERVER,
2160 "provenance_triplestore_url": PROV_SERVER,
2161 "input_csv_dir": os.path.join(BASE_DIR, "input_temp"),
2162 "base_output_dir": output_folder,
2163 "output_rdf_dir": output_folder,
2164 "resp_agent": "test",
2165 "base_iri": "https://w3id.org/oc/meta/",
2166 "context_path": None,
2167 "dir_split_number": 10000,
2168 "items_per_file": 1000,
2169 "default_dir": "_",
2170 "rdf_output_in_chunks": False,
2171 "zip_output_rdf": True,
2172 "source": None,
2173 "supplier_prefix": "060",
2174 "workers_number": 1,
2175 "use_doi_api_service": False,
2176 "blazegraph_full_text_search": False,
2177 "virtuoso_full_text_search": True,
2178 "fuseki_full_text_search": False,
2179 "graphdb_connector_name": None,
2180 "cache_endpoint": None,
2181 "cache_update_endpoint": None,
2182 "silencer": [],
2183 "redis_cache_db": 2,
2184 "ts_upload_cache": self.cache_file,
2185 "ts_failed_queries": self.failed_file,
2186 "ts_stop_file": self.stop_file,
2187 }
2189 with open(meta_config_path, "w") as f:
2190 yaml.dump(settings, f)
2192 now = datetime.now()
2194 # Run the process
2195 run_meta_process(settings=settings, meta_config_path=meta_config_path)
2197 # Query to verify an OMID was generated and no temporary identifier was saved
2198 query = """
2199 PREFIX fabio: <http://purl.org/spar/fabio/>
2200 PREFIX datacite: <http://purl.org/spar/datacite/>
2201 PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
2203 SELECT ?br ?id ?value ?scheme
2204 WHERE {
2205 ?br a fabio:JournalArticle .
2206 OPTIONAL {
2207 ?br datacite:hasIdentifier ?id .
2208 ?id datacite:usesIdentifierScheme ?scheme ;
2209 literal:hasLiteralValue ?value .
2210 }
2211 }
2212 """
2213 result = execute_sparql_query(SERVER, query, return_format=JSON)
2215 # Cleanup
2216 shutil.rmtree(output_folder, ignore_errors=True)
2217 shutil.rmtree(os.path.join(BASE_DIR, "input_temp"), ignore_errors=True)
2218 if os.path.exists(meta_config_path):
2219 os.remove(meta_config_path)
2220 delete_output_zip(".", now)
2222 # Verify results
2223 bindings = result["results"]["bindings"]
2225 # Should find exactly one article
2226 self.assertEqual(len(bindings), 1, "Expected exactly one article")
2228 # The article should have a br/ URI (OMID)
2229 br_uri = bindings[0]["br"]["value"]
2230 self.assertTrue(
2231 "br/" in br_uri,
2232 f"Article URI {br_uri} does not contain expected OMID pattern 'br/'",
2233 )
2235 # Should not have any saved identifiers
2236 self.assertNotIn(
2237 "id",
2238 bindings[0],
2239 "Found unexpected identifier when only temporary ID was provided",
2240 )
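# Editor's sketch (our helper, not oc_meta API): "temp:" identifiers take
# part in in-memory deduplication but must never be persisted, which is what
# the OPTIONAL query above asserts.
@staticmethod
def _is_persistable_id(identifier: str) -> bool:
    return not identifier.startswith("temp:")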
2242 def test_temporary_identifiers_deduplication(self):
2243 """Test that multiple rows with the same temporary identifier are correctly deduplicated"""
2244 # Create test data with two rows using the same temporary identifier
2245 test_data = [
2246 {
2247 "id": "temp:789",
2248 "title": "Test Article 1",
2249 "author": "Smith, John [orcid:0000-0002-1234-5678]",
2250 "pub_date": "2020",
2251 "venue": "",
2252 "volume": "",
2253 "issue": "",
2254 "page": "",
2255 "type": "journal article",
2256 "publisher": "",
2257 "editor": "",
2258 },
2259 {
2260 "id": "temp:789", # Same temporary ID
2261 "title": "Test Article 1", # Same title
2262 "author": "Smith, John [orcid:0000-0002-1234-5678]",
2263 "pub_date": "2020",
2264 "venue": "",
2265 "volume": "",
2266 "issue": "",
2267 "page": "",
2268 "type": "journal article",
2269 "publisher": "",
2270 "editor": "",
2271 },
2272 ]
2274 # Write test data to CSV
2275 input_dir = os.path.join(BASE_DIR, "input_temp_dedup")
2276 os.makedirs(input_dir, exist_ok=True)
2277 csv_path = os.path.join(input_dir, "test.csv")
2278 write_csv(csv_path, test_data)
2280 # Run meta process
2281 output_dir = os.path.join(BASE_DIR, "output_temp_dedup")
2282 os.makedirs(output_dir, exist_ok=True)
2283 config = {
2284 "input_csv_dir": input_dir,
2285 "base_output_dir": output_dir,
2286 "output_rdf_dir": output_dir,
2287 "triplestore_url": SERVER,
2288 "provenance_triplestore_url": PROV_SERVER,
2289 "resp_agent": "https://w3id.org/oc/meta/prov/pa/1",
2290 "base_iri": "https://w3id.org/oc/meta/",
2291 "context_path": "https://w3id.org/oc/meta/context.json",
2292 "supplier_prefix": "060",
2293 "dir_split_number": 10000,
2294 "items_per_file": 1000,
2295 "default_dir": "_",
2296 "rdf_output_in_chunks": True,
2297 "zip_output_rdf": False,
2298 "source": None,
2299 "use_doi_api_service": False,
2300 "workers_number": 1,
2301 "silencer": [],
2302 "redis_host": "localhost",
2303 "redis_port": 6379,
2304 "redis_db": 5,
2305 "redis_cache_db": 2,
2306 "ts_upload_cache": self.cache_file,
2307 "ts_failed_queries": self.failed_file,
2308 "ts_stop_file": self.stop_file,
2309 "graphdb_connector_name": None,
2310 "blazegraph_full_text_search": False,
2311 "fuseki_full_text_search": False,
2312 "virtuoso_full_text_search": False,
2313 "provenance_endpoints": [],
2314 "cache_endpoint": None,
2315 "cache_update_endpoint": None,
2316 "normalize_titles": True,
2317 }
2318 config_path = os.path.join(output_dir, "config.yaml")
2319 with open(config_path, "w") as f:
2320 yaml.dump(config, f)
2322 # Run the process
2323 run_meta_process(settings=config, meta_config_path=config_path)
2325 # Query the triplestore to verify that a single OMID was generated
2326 # for both rows (the absence of the temporary identifier itself is
2327 # covered by test_temporary_identifiers above)
2328 query = """
2329 SELECT DISTINCT ?br
2330 WHERE {
2331 ?br a <http://purl.org/spar/fabio/JournalArticle> .
2332 }
2333 """
2334 results = execute_sparql_query(SERVER, query)
2336 # Clean up
2337 shutil.rmtree(input_dir)
2338 shutil.rmtree(output_dir)
2340 # Should only be one article
2341 articles = [
2342 str(result["br"]["value"]) for result in results["results"]["bindings"]
2343 ]
2344 self.assertEqual(
2345 len(articles), 1, "Should only be one article after deduplication"
2346 )
2349def normalize_graph(graph):
2350 """
2351 Normalize the literals in the graph by removing explicit datatypes.
2352 """
2353 normalized_graph = Graph()
2354 for subject, predicate, obj in graph:
2355 if isinstance(obj, Literal) and obj.datatype is not None:
2356 normalized_obj = Literal(obj.toPython())
2357 normalized_graph.add((subject, predicate, normalized_obj))
2358 else:
2359 normalized_graph.add((subject, predicate, obj))
2360 return normalized_graph
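# Editor's usage sketch for normalize_graph (hypothetical data, not part of
# the original suite). A plain literal and an xsd:string-typed literal become
# isomorphic after normalization, the same "1" vs "1"^^xsd:string duplication
# seeded in the triplestore test above. Caveat: Literal(obj.toPython()) only
# yields a plain literal for string-valued datatypes; rdflib re-types numeric
# values such as "1"^^xsd:integer.
def normalize_graph_demo():  # pragma: no cover
    s = URIRef("http://example.org/s")
    p = URIRef("http://example.org/p")
    plain, typed = Graph(), Graph()
    plain.add((s, p, Literal("1")))
    typed.add((s, p, Literal("1", datatype=URIRef("http://www.w3.org/2001/XMLSchema#string"))))
    assert normalize_graph(plain).isomorphic(normalize_graph(typed))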
2363if __name__ == "__main__": # pragma: no cover
2364 unittest.main()