Coverage for heritrace/uri_generator/meta_uri_generator.py: 100%
55 statements
coverage.py v7.6.12, created at 2025-04-18 11:10 +0000
1from collections import defaultdict
3from heritrace.uri_generator.uri_generator import URIGenerator
4from rdflib import URIRef
5from rdflib_ocdm.counter_handler.redis_counter_handler import RedisCounterHandler
6from SPARQLWrapper import JSON, SPARQLWrapper
class InvalidURIFormatError(Exception):
    """Raised when a URI does not have the expected format."""
class MetaURIGenerator(URIGenerator):
    """Generate sequential URIs of the form
    ``{base_iri}/{abbr}/{supplier_prefix}{number}``.

    ``abbr`` is the short code associated with the entity type (see
    ``entity_type_abbr``) and ``number`` comes from a counter kept by
    ``counter_handler``, keyed by entity type.
    """

    def __init__(
        self, base_iri: str, supplier_prefix: str, counter_handler: RedisCounterHandler
    ):
        """
        :param base_iri: IRI prepended to every generated URI (no trailing slash)
        :param supplier_prefix: string placed immediately before the sequential number
        :param counter_handler: object used to read and store the last number
            used for each entity type
        """
        self.base_iri = base_iri
        self.supplier_prefix = supplier_prefix
        self.counter_handler = counter_handler
        # Maps each supported entity type IRI to the abbreviation used as the
        # path segment of its URIs. Several types share the same abbreviation.
        self.entity_type_abbr = {
            "http://purl.org/spar/fabio/Expression": "br",
            "http://purl.org/spar/fabio/Article": "br",
            "http://purl.org/spar/fabio/JournalArticle": "br",
            "http://purl.org/spar/fabio/Book": "br",
            "http://purl.org/spar/fabio/BookChapter": "br",
            "http://purl.org/spar/fabio/JournalIssue": "br",
            "http://purl.org/spar/fabio/JournalVolume": "br",
            "http://purl.org/spar/fabio/Journal": "br",
            "http://purl.org/spar/fabio/AcademicProceedings": "br",
            "http://purl.org/spar/fabio/ProceedingsPaper": "br",
            "http://purl.org/spar/fabio/ReferenceBook": "br",
            "http://purl.org/spar/fabio/Review": "br",
            "http://purl.org/spar/fabio/ReviewArticle": "br",
            "http://purl.org/spar/fabio/Series": "br",
            "http://purl.org/spar/fabio/Thesis": "br",
            "http://purl.org/spar/pro/RoleInTime": "ar",
            "http://purl.org/spar/fabio/Manifestation": "re",
            "http://xmlns.com/foaf/0.1/Agent": "ra",
            "http://purl.org/spar/datacite/Identifier": "id",
        }

    def generate_uri(self, entity_type: str) -> str:
        """
        Return the next URI for ``entity_type`` and store the advanced counter.

        :param entity_type: IRI of the entity type; must be a key of ``entity_type_abbr``
        :return: the new URI, as a ``URIRef``
        :raises KeyError: if ``entity_type`` is not supported
        """
        # Resolve the abbreviation first so that an unsupported type fails
        # before the stored counter is advanced.
        abbr = self.entity_type_abbr[entity_type]
        next_number = self.counter_handler.read_counter(entity_type) + 1
        self.counter_handler.set_counter(next_number, entity_type)
        return URIRef(f"{self.base_iri}/{abbr}/{self.supplier_prefix}{next_number}")

    def _extract_number(self, entity_uri: str):
        """
        Parse the sequential number from the last path segment of ``entity_uri``.

        :return: the number, or ``None`` when the last segment does not start
            with the supplier prefix
        :raises ValueError: if the text after the supplier prefix is not an integer
        """
        local_name = entity_uri.rstrip("/").rsplit("/", 1)[-1]
        if not local_name.startswith(self.supplier_prefix):
            return None
        # Slice the prefix off instead of splitting on it: the prefix digits
        # may occur again inside the number itself (e.g. prefix "060" and
        # local name "0601060"), which would break a split-based parse.
        return int(local_name[len(self.supplier_prefix):])

    def initialize_counters(self, sparql: SPARQLWrapper):
        """
        Initialize counters for entity types supported by this URI generator.
        Extracts sequential numbers from both data and provenance for each abbreviation.

        :param sparql: SPARQLWrapper instance to execute queries on the dataset
        :raises InvalidURIFormatError: If an URI with invalid format is found
        """
        # Highest number seen so far for each abbreviation (0 when none is found).
        max_numbers = defaultdict(int)

        data_query = f"""
            SELECT ?s ?type
            WHERE {{
                ?s a ?type .
                FILTER(STRSTARTS(str(?s), "{self.base_iri}/"))
            }}
        """

        sparql.setQuery(data_query)
        sparql.setReturnFormat(JSON)
        data_results = sparql.query().convert()

        for result in data_results["results"]["bindings"]:
            abbr = self.entity_type_abbr.get(result["type"]["value"])
            if abbr is None:
                # Typed with a class this generator does not manage.
                continue
            entity_uri = result["s"]["value"]
            try:
                number = self._extract_number(entity_uri)
            except ValueError as err:
                raise InvalidURIFormatError(
                    f"Invalid URI format found for entity: {entity_uri}"
                ) from err
            if number is not None:
                max_numbers[abbr] = max(max_numbers[abbr], number)

        prov_query = """
            SELECT ?entity
            WHERE {
                ?snapshot <http://www.w3.org/ns/prov#specializationOf> ?entity .
            }
        """

        sparql.setQuery(prov_query)
        prov_results = sparql.query().convert()

        # Built once, and sorted so the abbreviation chosen for a URI does not
        # depend on set iteration order.
        known_abbrs = sorted(set(self.entity_type_abbr.values()))

        for result in prov_results["results"]["bindings"]:
            entity_uri = result["entity"]["value"]

            # For provenance, we directly search for the abbreviation in the URI
            abbr = next(
                (candidate for candidate in known_abbrs if f"/{candidate}/" in entity_uri),
                None,
            )
            if abbr is None:
                continue
            try:
                number = self._extract_number(entity_uri)
            except ValueError as err:
                raise InvalidURIFormatError(
                    f"Invalid URI format found in provenance for entity: {entity_uri}"
                ) from err
            if number is not None:
                max_numbers[abbr] = max(max_numbers[abbr], number)

        # Every entity type sharing an abbreviation gets that abbreviation's maximum.
        for entity_type, abbr in self.entity_type_abbr.items():
            self.counter_handler.set_counter(max_numbers[abbr], entity_type)