Coverage for heritrace/uri_generator/meta_uri_generator.py: 100%

55 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-04-18 11:10 +0000

1from collections import defaultdict 

2 

3from heritrace.uri_generator.uri_generator import URIGenerator 

4from rdflib import URIRef 

5from rdflib_ocdm.counter_handler.redis_counter_handler import RedisCounterHandler 

6from SPARQLWrapper import JSON, SPARQLWrapper 

7 

8 

class InvalidURIFormatError(Exception):
    """Exception raised when a URI has an invalid format."""

12 

13 

class MetaURIGenerator(URIGenerator):
    """
    Generate sequential URIs of the form
    ``{base_iri}/{abbr}/{supplier_prefix}{number}``.

    ``abbr`` is the short code associated with the entity type in
    ``entity_type_abbr``. Several entity types map to the same abbreviation
    and therefore share a single sequence of numbers.
    """

    def __init__(
        self, base_iri: str, supplier_prefix: str, counter_handler: RedisCounterHandler
    ):
        """
        :param base_iri: IRI placed in front of every generated URI (no trailing slash)
        :param supplier_prefix: string placed directly in front of the sequential number
        :param counter_handler: storage used to read and write the last used number
            of each entity type
        """
        self.base_iri = base_iri
        self.supplier_prefix = supplier_prefix
        self.counter_handler = counter_handler
        # Entity type IRI -> abbreviation used as path segment in generated URIs.
        self.entity_type_abbr = {
            "http://purl.org/spar/fabio/Expression": "br",
            "http://purl.org/spar/fabio/Article": "br",
            "http://purl.org/spar/fabio/JournalArticle": "br",
            "http://purl.org/spar/fabio/Book": "br",
            "http://purl.org/spar/fabio/BookChapter": "br",
            "http://purl.org/spar/fabio/JournalIssue": "br",
            "http://purl.org/spar/fabio/JournalVolume": "br",
            "http://purl.org/spar/fabio/Journal": "br",
            "http://purl.org/spar/fabio/AcademicProceedings": "br",
            "http://purl.org/spar/fabio/ProceedingsPaper": "br",
            "http://purl.org/spar/fabio/ReferenceBook": "br",
            "http://purl.org/spar/fabio/Review": "br",
            "http://purl.org/spar/fabio/ReviewArticle": "br",
            "http://purl.org/spar/fabio/Series": "br",
            "http://purl.org/spar/fabio/Thesis": "br",
            "http://purl.org/spar/pro/RoleInTime": "ar",
            "http://purl.org/spar/fabio/Manifestation": "re",
            "http://xmlns.com/foaf/0.1/Agent": "ra",
            "http://purl.org/spar/datacite/Identifier": "id",
        }

    def generate_uri(self, entity_type: str) -> str:
        """
        Return the next free URI for ``entity_type`` and record it as used.

        :param entity_type: IRI of the entity type; must be a key of ``entity_type_abbr``
        :return: the new URI as a ``URIRef``
        :raises KeyError: if ``entity_type`` is not supported (no counter is modified)
        """
        # Resolve the abbreviation first so an unsupported type fails before
        # any counter has been touched.
        abbr = self.entity_type_abbr[entity_type]

        # Counters are stored per entity type, but every type sharing an
        # abbreviation draws from the same URI namespace. Advance all of them
        # together, otherwise two sibling types would be handed the same number.
        sibling_types = [
            known_type
            for known_type, known_abbr in self.entity_type_abbr.items()
            if known_abbr == abbr
        ]
        last_used = max(
            self.counter_handler.read_counter(known_type)
            for known_type in sibling_types
        )
        next_number = last_used + 1
        for known_type in sibling_types:
            self.counter_handler.set_counter(next_number, known_type)

        return URIRef(
            f"{self.base_iri}/{abbr}/{self.supplier_prefix}{next_number}"
        )

    def _extract_number(self, entity_uri: str):
        """
        Return the sequential number encoded in ``entity_uri``.

        :param entity_uri: URI to parse
        :return: the number following the supplier prefix, or ``None`` when the
            supplier prefix does not occur in the URI
        :raises ValueError: if the text after the supplier prefix is not an
            integer, or if the supplier prefix is an empty string
        """
        local_part = entity_uri
        # Drop the base IRI so a supplier prefix that happens to occur inside
        # it is not mistaken for the separator.
        if local_part.startswith(self.base_iri):
            local_part = local_part[len(self.base_iri):]

        # Split on the FIRST occurrence only: the number itself may contain the
        # digits of the prefix (e.g. prefix "060" and number 1060 -> "0601060").
        _, separator, tail = local_part.partition(self.supplier_prefix)
        if not separator:
            return None
        return int(tail.strip("/"))

    def initialize_counters(self, sparql: SPARQLWrapper):
        """
        Initialize counters for entity types supported by this URI generator.
        Extracts sequential numbers from both data and provenance for each abbreviation.

        :param sparql: SPARQLWrapper instance to execute queries on the dataset
        :raises InvalidURIFormatError: If an URI with invalid format is found
        """
        # Highest number seen per abbreviation; abbreviations never seen stay at 0.
        max_numbers = defaultdict(int)

        data_query = f"""
            SELECT ?s ?type
            WHERE {{
                ?s a ?type .
                FILTER(STRSTARTS(str(?s), "{self.base_iri}/"))
            }}
        """

        sparql.setQuery(data_query)
        sparql.setReturnFormat(JSON)
        data_results = sparql.query().convert()

        for result in data_results["results"]["bindings"]:
            entity_type = result["type"]["value"]
            entity_uri = result["s"]["value"]

            if entity_type not in self.entity_type_abbr:
                continue

            try:
                number = self._extract_number(entity_uri)
            except ValueError as err:
                raise InvalidURIFormatError(
                    f"Invalid URI format found for entity: {entity_uri}"
                ) from err

            if number is not None:
                abbr = self.entity_type_abbr[entity_type]
                max_numbers[abbr] = max(max_numbers[abbr], number)

        prov_query = f"""
            SELECT ?entity
            WHERE {{
                ?snapshot <http://www.w3.org/ns/prov#specializationOf> ?entity .
            }}
        """

        sparql.setQuery(prov_query)
        prov_results = sparql.query().convert()

        # Loop-invariant: the distinct abbreviations to look for in provenance URIs.
        known_abbrs = set(self.entity_type_abbr.values())

        for result in prov_results["results"]["bindings"]:
            entity_uri = result["entity"]["value"]

            # For provenance, we directly search for the abbreviation in the URI
            for abbr in known_abbrs:
                if f"/{abbr}/" not in entity_uri:
                    continue

                try:
                    number = self._extract_number(entity_uri)
                except ValueError as err:
                    raise InvalidURIFormatError(
                        f"Invalid URI format found in provenance for entity: {entity_uri}"
                    ) from err

                if number is not None:
                    max_numbers[abbr] = max(max_numbers[abbr], number)
                # Only the first matching abbreviation is considered for a URI.
                break

        # Every entity type sharing an abbreviation starts from the same maximum.
        for entity_type, abbr in self.entity_type_abbr.items():
            self.counter_handler.set_counter(max_numbers[abbr], entity_type)