Coverage for heritrace / scripts / clean_missing_entities.py: 100%
112 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-21 12:56 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-21 12:56 +0000
1#!/usr/bin/env python3
3# SPDX-FileCopyrightText: 2025 Arcangelo Massari <arcangelo.massari@unibo.it>
4#
5# SPDX-License-Identifier: ISC
7import argparse
8import importlib.util
9import logging
10import sys
11from typing import Dict, List
13from heritrace.extensions import SPARQLWrapperWithRetry
14from heritrace.utils.sparql_utils import VIRTUOSO_EXCLUDED_GRAPHS
15from SPARQLWrapper import JSON
class MissingEntityCleaner:
    """
    Detect and remove dangling references to entities absent from the dataset.

    An entity counts as missing when its IRI is used as the object of at least
    one triple but never occurs as the subject of any triple. Every triple that
    points at such an IRI is deleted from the store.
    """

    def __init__(self, endpoint: str, is_virtuoso: bool = False):
        """
        Create the cleaner and the SPARQL client it queries through.

        Args:
            endpoint: The SPARQL endpoint for the database
            is_virtuoso: Boolean indicating if the endpoint is Virtuoso
        """
        self.endpoint = endpoint
        self.is_virtuoso = is_virtuoso
        self.sparql = SPARQLWrapperWithRetry(endpoint)
        self.sparql.setReturnFormat(JSON)
        self.logger = logging.getLogger(__name__)

    def _find_missing_entities_with_references(self) -> Dict[str, List[Dict[str, str]]]:
        """
        Collect every missing entity together with the triples that point at it.

        A missing entity is an IRI that:
        1. Is the object of at least one triple
        2. Is never the subject of any triple

        Objects of these predicates are never reported as missing:
        - rdf:type (types are not considered entities)
        - pro:withRole
        - datacite:usesIdentifierScheme

        Returns:
            Dictionary mapping each missing entity URI to a list of
            ``{"subject": ..., "predicate": ...}`` dictionaries, one per
            referencing triple.
        """
        # Predicates whose objects are vocabulary terms rather than dataset entities
        skipped_predicates = (
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
            "http://purl.org/spar/pro/withRole",
            "http://purl.org/spar/datacite/usesIdentifierScheme",
        )
        excluded_predicates_filter = " && ".join(
            f"?p != <{predicate_iri}>" for predicate_iri in skipped_predicates
        )

        if not self.is_virtuoso:
            # Plain triple store: a single default graph holds everything
            query = f"""
                SELECT DISTINCT ?entity ?s ?p
                WHERE {{
                    # Entity is referenced as an object
                    ?s ?p ?entity .
                    FILTER(isIRI(?entity))

                    # Exclude specified predicates
                    FILTER({excluded_predicates_filter})

                    # But has no triples where it is the subject
                    FILTER NOT EXISTS {{
                        ?entity ?anyPredicate ?anyObject .
                    }}
                }}
                """
        else:
            # Quad store (Virtuoso): look across all named graphs except system ones
            excluded_graphs = ">, <".join(VIRTUOSO_EXCLUDED_GRAPHS)
            query = f"""
                SELECT DISTINCT ?entity ?s ?p
                WHERE {{
                    # Entity is referenced as an object
                    GRAPH ?g1 {{
                        ?s ?p ?entity .
                        FILTER(isIRI(?entity))

                        # Exclude specified predicates
                        FILTER({excluded_predicates_filter})
                    }}

                    # But has no triples where it is the subject
                    FILTER NOT EXISTS {{
                        GRAPH ?g2 {{
                            ?entity ?anyPredicate ?anyObject .
                        }}
                    }}

                    # Exclude system graphs
                    FILTER(?g1 NOT IN (<{excluded_graphs}>))
                }}
                """

        self.sparql.setQuery(query)
        response = self.sparql.queryAndConvert()

        # Group the referencing triples by the missing entity they point at
        references_by_entity = {}
        for binding in response["results"]["bindings"]:
            missing_uri = binding["entity"]["value"]
            reference = {
                "subject": binding["s"]["value"],
                "predicate": binding["p"]["value"],
            }
            references_by_entity.setdefault(missing_uri, []).append(reference)

        return references_by_entity

    def _remove_references(self, entity_uri: str, references: List[Dict[str, str]]) -> bool:
        """
        Delete every triple that references a missing entity.

        One DELETE is issued per reference. A failing DELETE is logged and the
        remaining references are still attempted.

        Args:
            entity_uri: The URI of the missing entity
            references: List of references to the missing entity

        Returns:
            bool: True if all references were successfully removed, False otherwise
        """
        all_removed = True

        for ref in references:
            subject, predicate = ref["subject"], ref["predicate"]
            triple = f"<{subject}> <{predicate}> <{entity_uri}> ."

            if self.is_virtuoso:
                # Quad store: delete from every graph holding the triple,
                # leaving the system graphs untouched
                excluded_graphs = ">, <".join(VIRTUOSO_EXCLUDED_GRAPHS)
                query = f"""
                    DELETE {{
                        GRAPH ?g {{
                            {triple}
                        }}
                    }}
                    WHERE {{
                        GRAPH ?g {{
                            {triple}
                        }}
                        FILTER(?g NOT IN (<{excluded_graphs}>))
                    }}
                    """
            else:
                # Plain triple store
                query = f"""
                    DELETE {{
                        {triple}
                    }}
                    WHERE {{
                        {triple}
                    }}
                    """

            try:
                self.sparql.setQuery(query)
                # Updates are sent with POST
                self.sparql.method = "POST"
                self.sparql.query()
                self.logger.info(f"Removed reference from {subject} to {entity_uri} via {predicate}")
            except Exception as e:
                self.logger.error(f"Error removing reference from {subject} to {entity_uri} via {predicate}: {e}")
                all_removed = False

        return all_removed

    def process_missing_entities(self) -> List[Dict]:
        """
        Find every missing entity and remove all references to it.

        Returns:
            List[Dict]: One dictionary per missing entity processed, with keys:
                - uri: the URI of the missing entity
                - references: list of references that were processed
                - success: boolean indicating if all references were successfully removed
            The list is empty when nothing is missing.
        """
        found = self._find_missing_entities_with_references()

        if not found:
            self.logger.info("No missing entity references found.")
            return []

        num_missing_entities = len(found)
        self.logger.info(f"Found {num_missing_entities} missing entity references.")

        total_references = sum(map(len, found.values()))
        results = []

        for entity_uri, references in found.items():
            self.logger.info(f"Processing missing entity: {entity_uri}")
            self.logger.info(f"Found {len(references)} references to missing entity {entity_uri}")

            removed = self._remove_references(entity_uri, references)
            if not removed:
                self.logger.error(f"Failed to remove references to missing entity {entity_uri}")

            results.append({
                "uri": entity_uri,
                "references": references,
                "success": removed,
            })

        # The summary line is only emitted when nothing failed
        if all(entry["success"] for entry in results):
            self.logger.info(f"Successfully processed all missing entities. Found {num_missing_entities} missing entities and removed {total_references} references.")

        return results
def clean_missing_entities(endpoint: str, is_virtuoso: bool = False) -> List[Dict]:
    """
    Run a full missing-entity cleanup against a SPARQL endpoint.

    Convenience wrapper that builds a MissingEntityCleaner and immediately
    processes every missing entity it finds.

    Args:
        endpoint: The SPARQL endpoint for the database
        is_virtuoso: Boolean indicating if the endpoint is Virtuoso

    Returns:
        List[Dict]: Results of processing each missing entity
    """
    return MissingEntityCleaner(
        endpoint=endpoint,
        is_virtuoso=is_virtuoso,
    ).process_missing_entities()
def load_config(config_path):
    """
    Load configuration from a Python file.

    The file is executed as a module named "config". Any failure is logged
    and terminates the process.

    Args:
        config_path: Path to the configuration file

    Returns:
        module: The loaded configuration module

    Raises:
        SystemExit: With exit code 1 when the file cannot be loaded as a
            Python module or raises an exception while being executed.
    """
    try:
        spec = importlib.util.spec_from_file_location("config", config_path)
        # spec_from_file_location signals "no loader for this path" (for
        # example a file without a Python suffix) by returning None rather
        # than raising, so report that case with a message naming the path.
        if spec is None or spec.loader is None:
            raise ImportError(f"cannot load '{config_path}' as a Python module")
        config = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(config)
        return config
    except Exception as e:
        logging.error(f"Error loading configuration file: {e}")
        sys.exit(1)
def main():
    """
    Main entry point for the script when run from the command line.

    Parses ``--config`` / ``--verbose``, loads the configuration module, reads
    the endpoint settings from its ``Config`` class and runs the cleanup.

    Returns:
        int: 0 when nothing was missing or every reference was removed,
        1 when the configuration is incomplete or some removals failed.
    """
    parser = argparse.ArgumentParser(description="Detect and clean up references to missing entities from the dataset")
    parser.add_argument("--config", "-c", required=True, help="Path to the configuration file")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    # Setup logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Load configuration
    config = load_config(args.config)

    # A configuration file without a Config class is reported like any other
    # incomplete configuration instead of surfacing as an AttributeError.
    config_class = getattr(config, "Config", None)
    if config_class is None:
        logging.error("Configuration file must define a Config class")
        return 1

    # Get required configuration from Config class
    if not hasattr(config_class, "DATASET_DB_URL"):
        logging.error("Config class must define DATASET_DB_URL")
        return 1

    endpoint = config_class.DATASET_DB_URL

    # Check if database is Virtuoso directly from config instead of using is_virtuoso().
    # A missing or non-string value is treated as "not Virtuoso".
    triplestore = getattr(config_class, "DATASET_DB_TRIPLESTORE", None)
    is_virtuoso = isinstance(triplestore, str) and triplestore.lower() == "virtuoso"

    logging.info(f"Starting missing entity detection and cleanup using endpoint: {endpoint}")

    results = clean_missing_entities(endpoint=endpoint, is_virtuoso=is_virtuoso)

    if not results:
        logging.info("No missing entity references found")
        return 0

    failed = [r for r in results if not r["success"]]
    if not failed:
        logging.info(f"Successfully cleaned up missing entity references from the dataset. Processed {len(results)} missing entities.")
        return 0

    logging.error(f"Failed to clean up some missing entity references from the dataset. {len(failed)} entities had errors.")
    return 1
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())