Coverage for heritrace/scripts/clean_missing_entities.py: 100%
111 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-04-18 11:10 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-04-18 11:10 +0000
1#!/usr/bin/env python3
3import argparse
4import importlib.util
5import logging
6import sys
7from typing import Dict, List, Tuple
9from SPARQLWrapper import JSON, SPARQLWrapper
11from heritrace.utils.sparql_utils import VIRTUOSO_EXCLUDED_GRAPHS
class MissingEntityCleaner:
    """
    A class to detect and clean up references to missing entities from the dataset.

    Missing entities are URIs that are referenced by triples but don't actually exist
    in the dataset (they have no triples where they are the subject). The cleaner
    identifies these missing references and removes all triples that reference them.
    """

    def __init__(self, endpoint: str, is_virtuoso: bool = False):
        """
        Initialize the MissingEntityCleaner.

        Args:
            endpoint: The SPARQL endpoint for the database
            is_virtuoso: Boolean indicating if the endpoint is Virtuoso
        """
        self.endpoint = endpoint
        self.is_virtuoso = is_virtuoso
        self.sparql = SPARQLWrapper(endpoint)
        self.sparql.setReturnFormat(JSON)
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _excluded_graphs_filter(graph_var: str) -> str:
        """
        Build the SPARQL FILTER that keeps a graph variable out of the excluded graphs.

        Args:
            graph_var: The SPARQL variable (including the leading ``?``) bound to the graph

        Returns:
            A ``FILTER(<var> NOT IN (...))`` clause listing every IRI in
            ``VIRTUOSO_EXCLUDED_GRAPHS``
        """
        excluded_iris = ">, <".join(VIRTUOSO_EXCLUDED_GRAPHS)
        return f"FILTER({graph_var} NOT IN (<{excluded_iris}>))"

    def _find_missing_entities_with_references(self) -> Dict[str, List[Dict[str, str]]]:
        """
        Find missing entity references in the dataset along with their references.

        A missing entity is one that:
        1. Is referenced as an object in at least one triple
        2. Has no triples where it is the subject (completely missing)

        The following are excluded from being considered missing entities:
        - Objects of rdf:type triples (types are not considered entities)
        - Objects of pro:withRole triples
        - Objects of datacite:usesIdentifierScheme triples

        Returns:
            Dictionary mapping missing entity URIs to lists of reference dictionaries,
            each holding the referencing ``subject`` and ``predicate``
        """
        # Predicates whose objects are never treated as missing entities
        rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
        pro_with_role = "http://purl.org/spar/pro/withRole"
        datacite_uses_identifier_scheme = "http://purl.org/spar/datacite/usesIdentifierScheme"
        excluded_predicates = [
            rdf_type,
            pro_with_role,
            datacite_uses_identifier_scheme,
        ]

        # Format the excluded predicates for SPARQL
        excluded_predicates_filter = " && ".join(
            f"?p != <{pred}>" for pred in excluded_predicates
        )

        if self.is_virtuoso:
            # For quad stores like Virtuoso, we need to query across all graphs
            graphs_filter = self._excluded_graphs_filter("?g1")
            query = f"""
            SELECT DISTINCT ?entity ?s ?p
            WHERE {{
                # Entity is referenced as an object
                GRAPH ?g1 {{
                    ?s ?p ?entity .
                    FILTER(isIRI(?entity))

                    # Exclude specified predicates
                    FILTER({excluded_predicates_filter})
                }}

                # But has no triples where it is the subject
                FILTER NOT EXISTS {{
                    GRAPH ?g2 {{
                        ?entity ?anyPredicate ?anyObject .
                    }}
                }}

                # Exclude system graphs
                {graphs_filter}
            }}
            """
        else:
            # For regular triple stores
            query = f"""
            SELECT DISTINCT ?entity ?s ?p
            WHERE {{
                # Entity is referenced as an object
                ?s ?p ?entity .
                FILTER(isIRI(?entity))

                # Exclude specified predicates
                FILTER({excluded_predicates_filter})

                # But has no triples where it is the subject
                FILTER NOT EXISTS {{
                    ?entity ?anyPredicate ?anyObject .
                }}
            }}
            """

        self.sparql.setQuery(query)
        results = self.sparql.queryAndConvert()

        # Group the (subject, predicate) pairs under the missing entity they point to
        missing_entities: Dict[str, List[Dict[str, str]]] = {}
        for binding in results["results"]["bindings"]:
            entity_uri = binding["entity"]["value"]
            missing_entities.setdefault(entity_uri, []).append({
                "subject": binding["s"]["value"],
                "predicate": binding["p"]["value"],
            })

        return missing_entities

    def _remove_references(self, entity_uri: str, references: List[Dict[str, str]]) -> bool:
        """
        Remove all references to a missing entity.

        Each reference is deleted with its own update request; a failure on one
        reference is logged and does not stop the remaining ones from being tried.

        Args:
            entity_uri: The URI of the missing entity
            references: List of references to the missing entity

        Returns:
            bool: True if all references were successfully removed, False otherwise
        """
        # Neither the store kind nor the graph filter depends on the reference,
        # so resolve them once instead of on every iteration.
        is_quad_store = self.is_virtuoso
        graphs_filter = self._excluded_graphs_filter("?g") if is_quad_store else ""

        success = True

        for reference in references:
            subject = reference["subject"]
            predicate = reference["predicate"]
            triple = f"<{subject}> <{predicate}> <{entity_uri}> ."

            if is_quad_store:
                # For quad stores, we need to specify the graph
                query = f"""
                DELETE {{
                    GRAPH ?g {{
                        {triple}
                    }}
                }}
                WHERE {{
                    GRAPH ?g {{
                        {triple}
                    }}
                    {graphs_filter}
                }}
                """
            else:
                # For regular triple stores
                query = f"""
                DELETE {{
                    {triple}
                }}
                WHERE {{
                    {triple}
                }}
                """

            try:
                self.sparql.setQuery(query)
                self.sparql.method = "POST"
                self.sparql.query()
                self.logger.info(f"Removed reference from {subject} to {entity_uri} via {predicate}")
            except Exception as e:
                self.logger.error(f"Error removing reference from {subject} to {entity_uri} via {predicate}: {e}")
                success = False

        return success

    def process_missing_entities(self) -> List[Dict]:
        """
        Process all missing entity references in the dataset.

        This method:
        1. Finds all missing entities along with the triples that reference them
        2. For each missing entity, removes all references to it

        Returns:
            List[Dict]: A list of dictionaries containing results for each missing entity processed
                Each dictionary includes:
                - uri: the URI of the missing entity
                - references: list of references that were processed
                - success: boolean indicating if all references were successfully removed
        """
        missing_entities_with_refs = self._find_missing_entities_with_references()

        if not missing_entities_with_refs:
            self.logger.info("No missing entity references found.")
            return []

        num_missing_entities = len(missing_entities_with_refs)
        self.logger.info(f"Found {num_missing_entities} missing entity references.")

        total_references = sum(len(refs) for refs in missing_entities_with_refs.values())
        results = []

        for entity_uri, references in missing_entities_with_refs.items():
            self.logger.info(f"Processing missing entity: {entity_uri}")
            self.logger.info(f"Found {len(references)} references to missing entity {entity_uri}")

            # Remove references to the missing entity
            success = self._remove_references(entity_uri, references)

            if not success:
                self.logger.error(f"Failed to remove references to missing entity {entity_uri}")

            results.append({
                "uri": entity_uri,
                "references": references,
                "success": success,
            })

        # The summary line is only emitted when every entity was cleaned up
        if all(result["success"] for result in results):
            self.logger.info(f"Successfully processed all missing entities. Found {num_missing_entities} missing entities and removed {total_references} references.")

        return results
def clean_missing_entities(endpoint: str, is_virtuoso: bool = False) -> List[Dict]:
    """
    Run a one-shot cleanup of references to missing entities.

    Builds a MissingEntityCleaner for the given endpoint and immediately
    processes every missing entity it finds.

    Args:
        endpoint: The SPARQL endpoint for the database
        is_virtuoso: Boolean indicating if the endpoint is Virtuoso

    Returns:
        List[Dict]: Results of processing each missing entity
    """
    return MissingEntityCleaner(
        endpoint=endpoint,
        is_virtuoso=is_virtuoso,
    ).process_missing_entities()
def load_config(config_path):
    """
    Load configuration from a Python file.

    The file is imported as a module named ``config``. Any failure (missing
    file, a path that cannot be loaded as Python source, or an error raised
    while executing the file) is logged and terminates the process.

    Args:
        config_path: Path to the configuration file

    Returns:
        module: The loaded configuration module

    Raises:
        SystemExit: With exit status 1 if the configuration cannot be loaded
    """
    try:
        spec = importlib.util.spec_from_file_location("config", config_path)
        # spec_from_file_location returns None when no loader matches the path
        # (e.g. the file has no Python suffix); report that explicitly instead
        # of failing later with an opaque NoneType attribute error.
        if spec is None or spec.loader is None:
            raise ImportError(f"cannot load {config_path!r} as a Python module")
        config = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(config)
        return config
    except Exception as e:
        logging.error(f"Error loading configuration file: {e}")
        sys.exit(1)
def main():
    """
    Main entry point for the script when run from the command line.

    Parses the command-line arguments, loads the configuration file, and runs
    the missing-entity cleanup against the configured dataset endpoint.

    Returns:
        int: 0 when nothing needed cleaning or every missing entity was cleaned
        up, 1 when the configuration is incomplete or some removals failed
    """
    parser = argparse.ArgumentParser(description="Detect and clean up references to missing entities from the dataset")
    parser.add_argument("--config", "-c", required=True, help="Path to the configuration file")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    # Setup logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Load configuration
    config = load_config(args.config)

    # A configuration file without a Config class is reported as a
    # configuration error rather than crashing with AttributeError.
    config_class = getattr(config, "Config", None)
    if config_class is None:
        logging.error("Configuration file must define a Config class")
        return 1

    # Get required configuration from Config class
    if not hasattr(config_class, "DATASET_DB_URL"):
        logging.error("Config class must define DATASET_DB_URL")
        return 1

    endpoint = config_class.DATASET_DB_URL

    # Check if database is Virtuoso directly from config instead of using is_virtuoso().
    # A missing or non-string setting is treated as "not Virtuoso".
    triplestore = getattr(config_class, "DATASET_DB_TRIPLESTORE", None)
    is_virtuoso = isinstance(triplestore, str) and triplestore.lower() == "virtuoso"

    logging.info(f"Starting missing entity detection and cleanup using endpoint: {endpoint}")

    results = clean_missing_entities(endpoint=endpoint, is_virtuoso=is_virtuoso)

    if not results:
        logging.info("No missing entity references found")
        return 0

    failed = [result for result in results if not result["success"]]
    if not failed:
        logging.info(f"Successfully cleaned up missing entity references from the dataset. Processed {len(results)} missing entities.")
        return 0

    logging.error(f"Failed to clean up some missing entity references from the dataset. {len(failed)} entities had errors.")
    return 1
if __name__ == "__main__":  # pragma: no cover
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())