Coverage for heritrace / scripts / clean_missing_entities.py: 100%

112 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-21 12:56 +0000

1#!/usr/bin/env python3 

2 

3# SPDX-FileCopyrightText: 2025 Arcangelo Massari <arcangelo.massari@unibo.it> 

4# 

5# SPDX-License-Identifier: ISC 

6 

7import argparse 

8import importlib.util 

9import logging 

10import sys 

11from typing import Dict, List 

12 

13from heritrace.extensions import SPARQLWrapperWithRetry 

14from heritrace.utils.sparql_utils import VIRTUOSO_EXCLUDED_GRAPHS 

15from SPARQLWrapper import JSON 

16 

17 

class MissingEntityCleaner:
    """
    Detect and clean up references to missing entities from the dataset.

    Missing entities are URIs that are referenced by triples but don't actually
    exist in the dataset (they have no triples where they are the subject). The
    cleaner identifies these missing references and removes all triples that
    reference them.
    """

    # Predicates whose objects are never reported as missing entities:
    # rdf:type, pro:withRole and datacite:usesIdentifierScheme.
    _EXCLUDED_PREDICATES = (
        "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
        "http://purl.org/spar/pro/withRole",
        "http://purl.org/spar/datacite/usesIdentifierScheme",
    )

    def __init__(self, endpoint: str, is_virtuoso: bool = False):
        """
        Initialize the MissingEntityCleaner.

        Args:
            endpoint: The SPARQL endpoint for the database
            is_virtuoso: Boolean indicating if the endpoint is Virtuoso; when
                True the queries are written against named graphs
        """
        self.endpoint = endpoint
        self.is_virtuoso = is_virtuoso
        self.sparql = SPARQLWrapperWithRetry(endpoint)
        self.sparql.setReturnFormat(JSON)
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _excluded_graphs_filter(graph_var: str) -> str:
        """
        Build the SPARQL FILTER that keeps a graph variable out of the
        graphs listed in VIRTUOSO_EXCLUDED_GRAPHS.

        Args:
            graph_var: The SPARQL variable bound to the graph (e.g. "?g")

        Returns:
            str: A "FILTER(<var> NOT IN (...))" clause
        """
        excluded_graphs = ">, <".join(VIRTUOSO_EXCLUDED_GRAPHS)
        return f"FILTER({graph_var} NOT IN (<{excluded_graphs}>))"

    def _find_missing_entities_with_references(self) -> Dict[str, List[Dict[str, str]]]:
        """
        Find missing entity references in the dataset along with their references.

        A missing entity is one that:
        1. Is referenced as an object in at least one triple
        2. Has no triples where it is the subject (completely missing)

        The following are excluded from being considered missing entities:
        - Objects of rdf:type triples (types are not considered entities)
        - Objects of ProWithRole triples
        - Objects of datacite:usesIdentifierScheme triples

        Returns:
            Dictionary mapping missing entity URIs to lists of reference
            dictionaries, each with a "subject" and a "predicate" key
        """
        # Format the excluded predicates for SPARQL
        excluded_predicates_filter = " && ".join(
            f"?p != <{pred}>" for pred in self._EXCLUDED_PREDICATES
        )

        if self.is_virtuoso:
            # For quad stores like Virtuoso, we need to query across all graphs
            graph_filter = self._excluded_graphs_filter("?g1")
            query = f"""
                SELECT DISTINCT ?entity ?s ?p
                WHERE {{
                    # Entity is referenced as an object
                    GRAPH ?g1 {{
                        ?s ?p ?entity .
                        FILTER(isIRI(?entity))

                        # Exclude specified predicates
                        FILTER({excluded_predicates_filter})
                    }}

                    # But has no triples where it is the subject
                    FILTER NOT EXISTS {{
                        GRAPH ?g2 {{
                            ?entity ?anyPredicate ?anyObject .
                        }}
                    }}

                    # Exclude system graphs
                    {graph_filter}
                }}
            """
        else:
            # For regular triple stores
            query = f"""
                SELECT DISTINCT ?entity ?s ?p
                WHERE {{
                    # Entity is referenced as an object
                    ?s ?p ?entity .
                    FILTER(isIRI(?entity))

                    # Exclude specified predicates
                    FILTER({excluded_predicates_filter})

                    # But has no triples where it is the subject
                    FILTER NOT EXISTS {{
                        ?entity ?anyPredicate ?anyObject .
                    }}
                }}
            """

        self.sparql.setQuery(query)
        results = self.sparql.queryAndConvert()

        # Group the (subject, predicate) pairs by the missing entity they point to
        missing_entities: Dict[str, List[Dict[str, str]]] = {}
        for binding in results["results"]["bindings"]:
            missing_entities.setdefault(binding["entity"]["value"], []).append({
                "subject": binding["s"]["value"],
                "predicate": binding["p"]["value"],
            })

        return missing_entities

    def _remove_references(self, entity_uri: str, references: List[Dict[str, str]]) -> bool:
        """
        Remove all references to a missing entity.

        One DELETE is issued per reference. A failing DELETE is logged and
        does not stop the remaining references from being processed.

        Args:
            entity_uri: The URI of the missing entity
            references: List of references to the missing entity, each with a
                "subject" and a "predicate" key

        Returns:
            bool: True if all references were successfully removed, False otherwise
        """
        # The store type and the excluded-graph clause are the same for every
        # reference, so they are resolved once before the loop.
        is_quad_store = self.is_virtuoso
        graph_filter = self._excluded_graphs_filter("?g") if is_quad_store else None

        success = True

        for reference in references:
            subject = reference["subject"]
            predicate = reference["predicate"]
            triple = f"<{subject}> <{predicate}> <{entity_uri}> ."

            if is_quad_store:
                # For quad stores, we need to specify the graph
                query = f"""
                    DELETE {{
                        GRAPH ?g {{
                            {triple}
                        }}
                    }}
                    WHERE {{
                        GRAPH ?g {{
                            {triple}
                        }}
                        {graph_filter}
                    }}
                """
            else:
                # For regular triple stores
                query = f"""
                    DELETE {{
                        {triple}
                    }}
                    WHERE {{
                        {triple}
                    }}
                """

            try:
                self.sparql.setQuery(query)
                self.sparql.method = "POST"
                self.sparql.query()
                self.logger.info(f"Removed reference from {subject} to {entity_uri} via {predicate}")
            except Exception as e:
                self.logger.error(f"Error removing reference from {subject} to {entity_uri} via {predicate}: {e}")
                success = False

        return success

    def process_missing_entities(self) -> List[Dict]:
        """
        Process all missing entity references in the dataset.

        This method:
        1. Finds all missing entity references along with their references
        2. For each missing entity, removes all references to it

        Returns:
            List[Dict]: A list of dictionaries containing results for each missing entity processed
            Each dictionary includes:
            - uri: the URI of the missing entity
            - references: list of references that were processed
            - success: boolean indicating if all references were successfully removed
        """
        missing_entities_with_refs = self._find_missing_entities_with_references()

        if not missing_entities_with_refs:
            self.logger.info("No missing entity references found.")
            return []

        num_missing_entities = len(missing_entities_with_refs)
        self.logger.info(f"Found {num_missing_entities} missing entity references.")

        total_references = sum(len(refs) for refs in missing_entities_with_refs.values())
        results = []

        for entity_uri, references in missing_entities_with_refs.items():
            self.logger.info(f"Processing missing entity: {entity_uri}")
            self.logger.info(f"Found {len(references)} references to missing entity {entity_uri}")

            # Remove references to the missing entity
            success = self._remove_references(entity_uri, references)

            if not success:
                self.logger.error(f"Failed to remove references to missing entity {entity_uri}")

            results.append({
                "uri": entity_uri,
                "references": references,
                "success": success,
            })

        if all(result["success"] for result in results):
            self.logger.info(f"Successfully processed all missing entities. Found {num_missing_entities} missing entities and removed {total_references} references.")

        return results

243 

244 

def clean_missing_entities(endpoint: str, is_virtuoso: bool = False) -> List[Dict]:
    """
    Remove every reference to a missing entity from the dataset.

    Convenience wrapper that builds a MissingEntityCleaner for the given
    endpoint and runs its full processing pass.

    Args:
        endpoint: The SPARQL endpoint for the database
        is_virtuoso: Boolean indicating if the endpoint is Virtuoso

    Returns:
        List[Dict]: Results of processing each missing entity
    """
    return MissingEntityCleaner(
        endpoint=endpoint,
        is_virtuoso=is_virtuoso,
    ).process_missing_entities()

258 

259 

def load_config(config_path):
    """
    Load configuration from a Python file.

    The file is executed as a module named "config"; it is not registered
    in sys.modules by this function.

    Args:
        config_path: Path to the configuration file

    Returns:
        module: The loaded configuration module

    Raises:
        SystemExit: With exit code 1 when the file cannot be loaded; the
            underlying error is logged first.
    """
    try:
        spec = importlib.util.spec_from_file_location("config", config_path)
        # spec_from_file_location returns None when no loader matches the
        # path (for example an unrecognised file extension). Report that
        # explicitly instead of failing later with an opaque AttributeError.
        if spec is None or spec.loader is None:
            raise ImportError(f"cannot build an import spec for {config_path!r}")
        config = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(config)
        return config
    except Exception as e:
        # Executing a configuration file can raise anything, so this
        # boundary deliberately catches broadly, logs and exits.
        logging.error(f"Error loading configuration file: {e}")
        sys.exit(1)

278 

279 

def main():
    """
    Main entry point for the script when run from the command line.

    Parses the command-line arguments, loads the configuration file, and
    runs the missing-entity cleanup against the configured endpoint.

    Returns:
        int: 0 when nothing had to be cleaned or every missing entity was
        cleaned successfully, 1 when the configuration is incomplete or at
        least one entity could not be cleaned
    """
    parser = argparse.ArgumentParser(description="Detect and clean up references to missing entities from the dataset")
    parser.add_argument("--config", "-c", required=True, help="Path to the configuration file")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    # Setup logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Load configuration
    config = load_config(args.config)

    # A configuration module without a Config class is reported the same
    # way as a missing setting, rather than surfacing as an AttributeError.
    config_class = getattr(config, "Config", None)
    if config_class is None:
        logging.error("Configuration file must define a Config class")
        return 1

    # Get required configuration from Config class
    if not hasattr(config_class, "DATASET_DB_URL"):
        logging.error("Config class must define DATASET_DB_URL")
        return 1

    endpoint = config_class.DATASET_DB_URL

    # Check if database is Virtuoso directly from config instead of using is_virtuoso()
    is_virtuoso = False
    if hasattr(config_class, "DATASET_DB_TRIPLESTORE"):
        is_virtuoso = config_class.DATASET_DB_TRIPLESTORE.lower() == "virtuoso"

    logging.info(f"Starting missing entity detection and cleanup using endpoint: {endpoint}")

    results = clean_missing_entities(endpoint=endpoint, is_virtuoso=is_virtuoso)

    if not results:
        logging.info("No missing entity references found")
        return 0

    # Entities for which at least one reference could not be removed
    failed = [result for result in results if not result["success"]]
    if not failed:
        logging.info(f"Successfully cleaned up missing entity references from the dataset. Processed {len(results)} missing entities.")
        return 0

    logging.error(f"Failed to clean up some missing entity references from the dataset. {len(failed)} entities had errors.")
    return 1

324 

325 

# Run the command-line entry point and use its return value as the
# process exit status.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())