Coverage for heritrace/scripts/clean_missing_entities.py: 100%

111 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-04-18 11:10 +0000

1#!/usr/bin/env python3 

2 

3import argparse 

4import importlib.util 

5import logging 

6import sys 

7from typing import Dict, List, Tuple 

8 

9from SPARQLWrapper import JSON, SPARQLWrapper 

10 

11from heritrace.utils.sparql_utils import VIRTUOSO_EXCLUDED_GRAPHS 

12 

13 

class MissingEntityCleaner:
    """
    Detect dangling entity references in a dataset and delete them.

    An entity counts as "missing" when its URI shows up as the object of at
    least one triple but never as the subject of any triple. Every triple
    that points at such a URI is removed through a SPARQL update.
    """

    def __init__(self, endpoint: str, is_virtuoso: bool = False):
        """
        Set up the SPARQL client used for both detection and cleanup.

        Args:
            endpoint: URL of the SPARQL endpoint to query and update
            is_virtuoso: True when the endpoint is Virtuoso; selects the
                GRAPH-aware variants of the queries
        """
        self.endpoint = endpoint
        self.is_virtuoso = is_virtuoso
        self.logger = logging.getLogger(__name__)
        self.sparql = SPARQLWrapper(endpoint)
        self.sparql.setReturnFormat(JSON)

    def _find_missing_entities_with_references(self) -> Dict[str, List[Dict[str, str]]]:
        """
        Query the endpoint for IRIs that are referenced but never described.

        A row is reported when the object of a triple is an IRI and no triple
        has that IRI as its subject. Triples whose predicate is rdf:type,
        pro:withRole or datacite:usesIdentifierScheme are ignored, so their
        objects are never reported as missing.

        Returns:
            Dictionary keyed by missing entity URI; each value is a list of
            {"subject": ..., "predicate": ...} dictionaries, one per
            referencing triple returned by the query
        """
        # Predicates whose objects must never be flagged as missing entities
        skipped_predicates = (
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
            "http://purl.org/spar/pro/withRole",
            "http://purl.org/spar/datacite/usesIdentifierScheme",
        )
        excluded_predicates_filter = " && ".join(f"?p != <{iri}>" for iri in skipped_predicates)

        if not self.is_virtuoso:
            # Plain triple store: no GRAPH clauses required
            query = f"""
            SELECT DISTINCT ?entity ?s ?p
            WHERE {{
                # Entity is referenced as an object
                ?s ?p ?entity .
                FILTER(isIRI(?entity))

                # Exclude specified predicates
                FILTER({excluded_predicates_filter})

                # But has no triples where it is the subject
                FILTER NOT EXISTS {{
                    ?entity ?anyPredicate ?anyObject .
                }}
            }}
            """
        else:
            # Quad store: search every graph except the excluded system graphs
            system_graphs = ">, <".join(VIRTUOSO_EXCLUDED_GRAPHS)
            query = f"""
            SELECT DISTINCT ?entity ?s ?p
            WHERE {{
                # Entity is referenced as an object
                GRAPH ?g1 {{
                    ?s ?p ?entity .
                    FILTER(isIRI(?entity))

                    # Exclude specified predicates
                    FILTER({excluded_predicates_filter})
                }}

                # But has no triples where it is the subject
                FILTER NOT EXISTS {{
                    GRAPH ?g2 {{
                        ?entity ?anyPredicate ?anyObject .
                    }}
                }}

                # Exclude system graphs
                FILTER(?g1 NOT IN (<{system_graphs}>))
            }}
            """

        self.sparql.setQuery(query)
        bindings = self.sparql.queryAndConvert()["results"]["bindings"]

        # Group the (subject, predicate) pairs under the entity they point at
        grouped = {}
        for row in bindings:
            entity_uri, subject, predicate = (row[key]["value"] for key in ("entity", "s", "p"))
            grouped.setdefault(entity_uri, []).append({
                "subject": subject,
                "predicate": predicate
            })

        return grouped

    def _remove_references(self, entity_uri: str, references: List[Dict[str, str]]) -> bool:
        """
        Delete every listed triple that points at a missing entity.

        One DELETE/WHERE update is sent per reference. A failing update is
        logged and does not stop the remaining references from being tried.

        Args:
            entity_uri: The URI of the missing entity (object of the triples)
            references: Dictionaries with "subject" and "predicate" keys
                identifying the triples to delete

        Returns:
            bool: True when no update raised an exception, False otherwise
        """
        use_graphs = self.is_virtuoso
        failures = 0

        for ref in references:
            subject, predicate = ref["subject"], ref["predicate"]

            if use_graphs:
                # Quad store: delete the triple from whichever non-system graph holds it
                query = f"""
                DELETE {{
                    GRAPH ?g {{
                        <{subject}> <{predicate}> <{entity_uri}> .
                    }}
                }}
                WHERE {{
                    GRAPH ?g {{
                        <{subject}> <{predicate}> <{entity_uri}> .
                    }}
                    FILTER(?g NOT IN (<{'>, <'.join(VIRTUOSO_EXCLUDED_GRAPHS)}>))
                }}
                """
            else:
                # Plain triple store
                query = f"""
                DELETE {{
                    <{subject}> <{predicate}> <{entity_uri}> .
                }}
                WHERE {{
                    <{subject}> <{predicate}> <{entity_uri}> .
                }}
                """

            try:
                self.sparql.setQuery(query)
                # The update is sent with the POST method
                self.sparql.method = "POST"
                self.sparql.query()
                self.logger.info(f"Removed reference from {subject} to {entity_uri} via {predicate}")
            except Exception as e:
                self.logger.error(f"Error removing reference from {subject} to {entity_uri} via {predicate}: {e}")
                failures += 1

        return failures == 0

    def process_missing_entities(self) -> List[Dict]:
        """
        Find every missing entity and remove all references to it.

        Returns:
            List[Dict]: One dictionary per missing entity, with the keys
                - uri: the URI of the missing entity
                - references: the references that were processed
                - success: True if every reference was removed without error
            An empty list is returned when nothing is missing.
        """
        found = self._find_missing_entities_with_references()

        if not found:
            self.logger.info("No missing entity references found.")
            return []

        num_missing_entities = len(found)
        self.logger.info(f"Found {num_missing_entities} missing entity references.")

        total_references = sum(map(len, found.values()))

        results = []
        for entity_uri, references in found.items():
            self.logger.info(f"Processing missing entity: {entity_uri}")
            self.logger.info(f"Found {len(references)} references to missing entity {entity_uri}")

            removed = self._remove_references(entity_uri, references)
            if not removed:
                self.logger.error(f"Failed to remove references to missing entity {entity_uri}")

            results.append({
                "uri": entity_uri,
                "references": references,
                "success": removed
            })

        # The summary line is only logged when every entity was cleaned without errors
        if all(entry["success"] for entry in results):
            self.logger.info(f"Successfully processed all missing entities. Found {num_missing_entities} missing entities and removed {total_references} references.")

        return results

239 

240 

def clean_missing_entities(endpoint: str, is_virtuoso: bool = False) -> List[Dict]:
    """
    Remove every reference to a missing entity from the dataset.

    Convenience wrapper that builds a MissingEntityCleaner for the given
    endpoint and runs its full find-and-remove pass.

    Args:
        endpoint: The SPARQL endpoint for the database
        is_virtuoso: Boolean indicating if the endpoint is Virtuoso

    Returns:
        List[Dict]: Results of processing each missing entity
    """
    return MissingEntityCleaner(endpoint=endpoint, is_virtuoso=is_virtuoso).process_missing_entities()

254 

255 

def load_config(config_path):
    """
    Load configuration from a Python file.

    The file is imported as a module named "config". Any failure (missing
    file, unsupported file type, error raised while executing the file) is
    logged and terminates the process with exit status 1.

    Args:
        config_path: Path to the configuration file

    Returns:
        module: The loaded configuration module
    """
    try:
        spec = importlib.util.spec_from_file_location("config", config_path)
        # spec_from_file_location returns None when no loader matches the
        # path (e.g. an unrecognised file suffix); report that explicitly
        # instead of failing later with an opaque AttributeError on None.
        if spec is None or spec.loader is None:
            raise ImportError(f"cannot load a Python module from '{config_path}'")
        config = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(config)
        return config
    except Exception as e:
        logging.error(f"Error loading configuration file: {e}")
        sys.exit(1)

274 

275 

def main():
    """
    Main entry point for the script when run from the command line.

    Parses the command-line arguments, loads the configuration file, runs
    the missing-entity cleanup against the configured endpoint and logs a
    summary.

    Returns:
        int: 0 when nothing was missing or every missing entity was cleaned,
        1 when the configuration is incomplete or any entity had errors
    """
    parser = argparse.ArgumentParser(description="Detect and clean up references to missing entities from the dataset")
    parser.add_argument("--config", "-c", required=True, help="Path to the configuration file")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    # Setup logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Load configuration (exits the process on failure)
    config = load_config(args.config)

    # The settings live on a Config class inside the loaded module; fail
    # with a clear message instead of an AttributeError if it is absent.
    config_class = getattr(config, "Config", None)
    if config_class is None:
        logging.error("Configuration file must define a Config class")
        return 1

    if not hasattr(config_class, "DATASET_DB_URL"):
        logging.error("Config class must define DATASET_DB_URL")
        return 1

    endpoint = config_class.DATASET_DB_URL

    # Check if database is Virtuoso directly from config instead of using is_virtuoso().
    # A missing or non-string value is treated as "not Virtuoso".
    triplestore = getattr(config_class, "DATASET_DB_TRIPLESTORE", None)
    is_virtuoso = isinstance(triplestore, str) and triplestore.lower() == "virtuoso"

    logging.info(f"Starting missing entity detection and cleanup using endpoint: {endpoint}")

    results = clean_missing_entities(endpoint=endpoint, is_virtuoso=is_virtuoso)

    if not results:
        logging.info("No missing entity references found")
        return 0

    failed_count = sum(1 for result in results if not result["success"])
    if failed_count == 0:
        logging.info(f"Successfully cleaned up missing entity references from the dataset. Processed {len(results)} missing entities.")
        return 0

    logging.error(f"Failed to clean up some missing entity references from the dataset. {failed_count} entities had errors.")
    return 1

320 

321 

# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())