Coverage for heritrace / utils / sparql_utils.py: 94%

436 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-21 12:56 +0000

1# SPDX-FileCopyrightText: 2024-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5import os 

6from collections import defaultdict 

7from concurrent.futures import ProcessPoolExecutor, as_completed 

8from typing import List 

9 

10from rdflib import RDF, Dataset, Graph, Literal, URIRef 

11from rdflib.term import Node 

12from rdflib.plugins.sparql.algebra import translateUpdate 

13from rdflib.plugins.sparql.parser import parseUpdate 

14from rdflib.util import from_n3 

15from SPARQLWrapper import JSON 

16from time_agnostic_library.agnostic_entity import AgnosticEntity 

17 

18from heritrace.editor import Editor 

19from heritrace.extensions import (get_change_tracking_config, 

20 get_classes_with_multiple_shapes, 

21 get_custom_filter, get_dataset_is_quadstore, 

22 get_display_rules, get_provenance_sparql, 

23 get_shacl_graph, get_sparql) 

24from heritrace.utils.converters import convert_to_datetime 

25from heritrace.utils.display_rules_utils import (find_matching_rule, 

26 get_highest_priority_class, 

27 get_sortable_properties, 

28 is_entity_type_visible) 

29from heritrace.utils.shacl_utils import (determine_shape_for_classes, 

30 determine_shape_for_entity_triples) 

31from heritrace.utils.virtuoso_utils import (VIRTUOSO_EXCLUDED_GRAPHS, 

32 is_virtuoso) 

33 

# Module-level cache for get_available_classes(); populated at startup by
# precompute_available_classes_cache() and reset to None by
# get_available_classes() when the dataset is small (< COUNT_LIMIT) so that
# counts stay exact.
_AVAILABLE_CLASSES_CACHE = None

35 

36 

def _parse_n3(value: str) -> Node:
    """Parse an N3-serialized term into an rdflib Node, raising ValueError on failure."""
    term = from_n3(value)
    if isinstance(term, Node):
        return term
    raise ValueError(f"Cannot parse N3 value: {value}")

42 

43 

def n3_set_to_graph(n3_set: set[tuple[str, ...]], is_quadstore: bool) -> Graph | Dataset:
    """Materialize a set of N3-encoded tuples into a Graph (triples) or Dataset (quads)."""
    if not is_quadstore:
        graph = Graph()
        for tup in n3_set:
            graph.add((_parse_n3(tup[0]), _parse_n3(tup[1]), _parse_n3(tup[2])))
        return graph

    dataset = Dataset(default_union=True)
    for tup in n3_set:
        # Quad tuples carry the graph name in the fourth position.
        quad = (_parse_n3(tup[0]), _parse_n3(tup[1]), _parse_n3(tup[2]), _parse_n3(tup[3]))
        dataset.add(quad)  # type: ignore[arg-type]
    return dataset

54 

55 

def convert_to_rdflib_graphs(snapshots: dict, is_quadstore: bool) -> dict:
    """Convert nested {entity: {timestamp: n3_set}} snapshots into rdflib graphs."""
    return {
        entity_uri: {
            ts: n3_set_to_graph(n3_set, is_quadstore)
            for ts, n3_set in timestamps.items()
        }
        for entity_uri, timestamps in snapshots.items()
    }

63 

64 

def get_triples_from_graph(graph_or_dataset, pattern):
    """
    Yield (s, p, o) triples matching *pattern* from a Graph or Dataset.

    Datasets (quadstore) are queried via quads() with the graph component
    dropped; plain Graphs use triples() directly.

    Args:
        graph_or_dataset: Graph or Dataset instance
        pattern: Triple pattern tuple (s, p, o) where each can be None

    Returns:
        Generator of triples (s, p, o)
    """
    if not isinstance(graph_or_dataset, Dataset):
        yield from graph_or_dataset.triples(pattern)
        return
    for quad in graph_or_dataset.quads(pattern):
        # Drop the graph component so callers always see plain triples.
        yield quad[:3]

# Upper bound used when counting class instances (counts beyond this are shown
# as "N+"); configurable via the COUNT_LIMIT environment variable.
COUNT_LIMIT = int(os.getenv("COUNT_LIMIT", "10000"))

87 

88 

def precompute_available_classes_cache():
    """Populate the module-level available-classes cache at startup and return it."""
    global _AVAILABLE_CLASSES_CACHE
    cache = get_available_classes()
    _AVAILABLE_CLASSES_CACHE = cache
    return cache

94 

95 

def _wrap_virtuoso_graph_pattern(pattern: str) -> str:
    """Wrap a SPARQL pattern with Virtuoso GRAPH clause if needed."""
    if not is_virtuoso():
        return pattern
    excluded = '>, <'.join(VIRTUOSO_EXCLUDED_GRAPHS)
    return f"""
        GRAPH ?g {{
            {pattern}
        }}
        FILTER(?g NOT IN (<{excluded}>))
    """

106 

107 

def _build_count_query_with_limit(class_uri: str, limit: int) -> str:
    """Build a COUNT query with LIMIT for a specific class.

    The DISTINCT subjects are gathered in an inner subquery capped at *limit*
    so the outer COUNT never scans more than *limit* matches; callers pass
    limit + 1 to detect whether more instances exist than the displayed cap.
    """

    return f"""
    SELECT (COUNT(?subject) as ?count)
    WHERE {{
        {{
            SELECT DISTINCT ?subject
            WHERE {{
                ?subject a <{class_uri}> .
            }}
            LIMIT {limit}
        }}
    }}
    """

123 

124 

def _count_class_instances(class_uri: str, limit: int = COUNT_LIMIT) -> tuple:
    """
    Count instances of *class_uri* up to *limit*.

    Returns:
        tuple: (display_count, numeric_count) where display_count becomes
        "<limit>+" when more than *limit* instances exist.
    """
    sparql = get_sparql()
    # Query one past the limit so "exactly limit" and "more than limit"
    # can be told apart.
    sparql.setQuery(_build_count_query_with_limit(class_uri, limit + 1))
    sparql.setReturnFormat(JSON)
    response = sparql.query().convert()

    total = int(response["results"]["bindings"][0]["count"]["value"])

    if total <= limit:
        return str(total), total
    return f"{limit}+", limit

144 

145 

def _get_entities_with_enhanced_shape_detection(class_uri: str, classes_with_multiple_shapes: set, limit: int = COUNT_LIMIT):
    """
    Get entities for a class using enhanced shape detection for classes with multiple shapes.
    Uses LIMIT to avoid loading all entities.

    Args:
        class_uri: URI of the class whose entities should be grouped by shape
        classes_with_multiple_shapes: set of class URIs known to have more than one shape
        limit: maximum number of subjects fetched (default COUNT_LIMIT)

    Returns:
        defaultdict(list): mapping shape URI -> list of {"uri", "class", "shape"}
        dicts for visible entities; empty when the class is not in
        classes_with_multiple_shapes.
    """
    # Early exit if no classes have multiple shapes
    if not classes_with_multiple_shapes or class_uri not in classes_with_multiple_shapes:
        return defaultdict(list)

    sparql = get_sparql()

    # First pass: fetch up to `limit` distinct subjects of the class.
    subjects_query = f"""
    SELECT DISTINCT ?subject
    WHERE {{
        ?subject a <{class_uri}> .
    }}
    LIMIT {limit}
    """

    sparql.setQuery(subjects_query)
    sparql.setReturnFormat(JSON)
    subjects_results = sparql.query().convert()

    subjects = [r["subject"]["value"] for r in subjects_results["results"]["bindings"]]

    if not subjects:
        return defaultdict(list)

    # Fetch triples only for these specific subjects
    subjects_filter = " ".join([f"(<{s}>)" for s in subjects])
    pattern_with_filter = f"?subject a <{class_uri}> . ?subject ?p ?o . VALUES (?subject) {{ {subjects_filter} }}"

    triples_query = f"""
    SELECT ?subject ?p ?o
    WHERE {{
        {pattern_with_filter}
    }}
    """

    sparql.setQuery(triples_query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # Group the returned triples per subject so each entity's full outgoing
    # property set can be inspected for shape detection.
    entities_triples = defaultdict(list)
    for binding in results["results"]["bindings"]:
        subject = binding["subject"]["value"]
        predicate = binding["p"]["value"]
        obj = binding["o"]["value"]
        entities_triples[subject].append((subject, predicate, obj))

    # Classify each entity by its matching shape; keep only entities whose
    # (class, shape) pair is visible per the display configuration.
    shape_to_entities = defaultdict(list)
    for subject_uri, triples in entities_triples.items():
        shape_uri = determine_shape_for_entity_triples(triples)
        if shape_uri:
            entity_key = (class_uri, shape_uri)
            if is_entity_type_visible(entity_key):
                shape_to_entities[shape_uri].append({
                    "uri": subject_uri,
                    "class": class_uri,
                    "shape": shape_uri
                })

    return shape_to_entities

209 

210 

def get_classes_from_shacl_or_display_rules():
    """Collect target class URIs from the SHACL graph, falling back to display rules."""
    SH_TARGET_CLASS = URIRef("http://www.w3.org/ns/shacl#targetClass")
    collected = set()

    shacl_graph = get_shacl_graph()
    if shacl_graph:
        collected.update(
            str(target_class)
            for shape in shacl_graph.subjects(SH_TARGET_CLASS, None, unique=True)
            for target_class in shacl_graph.objects(shape, SH_TARGET_CLASS, unique=True)
        )

    # Display rules are only consulted when SHACL yielded nothing.
    if not collected:
        display_rules = get_display_rules()
        if display_rules:
            for rule in display_rules:
                if "target" in rule and "class" in rule["target"]:
                    collected.add(rule["target"]["class"])

    return list(collected)

230 

231 

def get_available_classes():
    """
    Fetch and format all available entity classes.
    Returns cached result if available (computed at startup).
    For small datasets (< COUNT_LIMIT), cache is invalidated to keep counts accurate.

    Returns:
        list[dict]: entries with "uri", "label", "count", "count_numeric" and
        "shape", sorted alphabetically by label.
    """
    global _AVAILABLE_CLASSES_CACHE

    # Invalidate the startup cache for small datasets so counts stay exact.
    if _AVAILABLE_CLASSES_CACHE is not None:
        total_count = sum(cls.get('count_numeric', 0) for cls in _AVAILABLE_CLASSES_CACHE)
        if total_count < COUNT_LIMIT:
            _AVAILABLE_CLASSES_CACHE = None

    if _AVAILABLE_CLASSES_CACHE is not None:
        return _AVAILABLE_CLASSES_CACHE

    custom_filter = get_custom_filter()
    classes_from_config = get_classes_from_shacl_or_display_rules()

    if classes_from_config:
        class_uris = classes_from_config
    else:
        # No SHACL/display-rules configuration: discover classes from the data.
        sparql = get_sparql()
        pattern = "?subject a ?class ."
        wrapped_pattern = _wrap_virtuoso_graph_pattern(pattern)

        query = f"""
        SELECT DISTINCT ?class
        WHERE {{
            {wrapped_pattern}
        }}
        """

        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        class_uris = [r["class"]["value"] for r in results["results"]["bindings"]]

    # Count instances for each class
    classes_with_counts = []
    for class_uri in class_uris:
        display_count, numeric_count = _count_class_instances(class_uri)
        classes_with_counts.append({
            "uri": class_uri,
            "display_count": display_count,
            "numeric_count": numeric_count
        })

    # Sort by count descending
    classes_with_counts.sort(key=lambda x: x["numeric_count"], reverse=True)

    available_classes = []
    classes_with_multiple_shapes = get_classes_with_multiple_shapes()

    for class_data in classes_with_counts:
        class_uri = class_data["uri"]

        if classes_with_multiple_shapes and class_uri in classes_with_multiple_shapes:
            # Classes with several shapes are split into one entry per shape;
            # visibility filtering happens inside the helper.
            shape_to_entities = _get_entities_with_enhanced_shape_detection(
                class_uri, classes_with_multiple_shapes, limit=COUNT_LIMIT
            )

            for shape_uri, entities in shape_to_entities.items():
                if entities:
                    entity_key = (class_uri, shape_uri)
                    available_classes.append({
                        "uri": class_uri,
                        "label": custom_filter.human_readable_class(entity_key),
                        # Counts at the fetch limit are shown as open-ended ("N+").
                        "count": f"{len(entities)}+" if len(entities) >= COUNT_LIMIT else str(len(entities)),
                        "count_numeric": len(entities),
                        "shape": shape_uri
                    })
        else:
            shape_uri = determine_shape_for_classes([class_uri])
            entity_key = (class_uri, shape_uri)

            if is_entity_type_visible(entity_key):
                available_classes.append({
                    "uri": class_uri,
                    "label": custom_filter.human_readable_class(entity_key),
                    "count": class_data["display_count"],
                    "count_numeric": class_data["numeric_count"],
                    "shape": shape_uri
                })

    available_classes.sort(key=lambda x: x["label"].lower())
    return available_classes

319 

320 

321def build_sort_clause(sort_property: str, entity_type: str, shape_uri: str = None) -> str: 

322 """ 

323 Build a SPARQL sort clause based on the sortableBy configuration. 

324 

325 Args: 

326 sort_property: The property to sort by 

327 entity_type: The entity type URI 

328 shape_uri: Optional shape URI for more specific sorting rules 

329 

330 Returns: 

331 SPARQL sort clause or empty string 

332 """ 

333 if not sort_property or not entity_type: 

334 return "" 

335 

336 rule = find_matching_rule(entity_type, shape_uri) 

337 

338 if not rule or "sortableBy" not in rule: 

339 return "" 

340 

341 sort_config = next( 

342 (s for s in rule["sortableBy"] if s.get("property") == sort_property), 

343 None 

344 ) 

345 

346 if not sort_config: 

347 return "" 

348 

349 return f"OPTIONAL {{ ?subject <{sort_property}> ?sortValue }}" 

350 

351 

def get_entities_for_class(
    selected_class, page, per_page, sort_property=None, sort_direction="ASC", selected_shape=None
):
    """
    Retrieve entities for a specific class with pagination and sorting.

    Args:
        selected_class (str): URI of the class to retrieve entities for
        page (int): Page number (1-indexed)
        per_page (int): Number of entities per page
        sort_property (str, optional): Property URI to sort by. Defaults to None.
        sort_direction (str, optional): Sort direction ("ASC" or "DESC"). Defaults to "ASC".
        selected_shape (str, optional): Shape URI for filtering entities. Defaults to None.

    Returns:
        tuple: (list of entities, total count)

    Performance Notes:
        - If sort_property is None, NO ORDER BY clause is applied to the SPARQL query.
          This significantly improves performance for large datasets by avoiding expensive
          sorting operations on URIs.
        - Without explicit ordering, the triplestore returns results in its natural order,
          which is deterministic within a session but may vary after database reloads.
        - For optimal performance with large datasets, configure display_rules.yaml without
          sortableBy properties to prevent users from triggering expensive sort operations.
    """
    sparql = get_sparql()
    custom_filter = get_custom_filter()
    classes_with_multiple_shapes = get_classes_with_multiple_shapes()

    # Shape filtering is needed only when a shape was requested AND the class
    # actually maps to several shapes.
    use_shape_filtering = (selected_shape and selected_class in classes_with_multiple_shapes)

    if use_shape_filtering:
        # For shape filtering, we need to fetch entities and check their shape
        # Use a larger LIMIT to ensure we get enough entities after filtering
        offset = (page - 1) * per_page
        fetch_limit = per_page * 5  # Safety margin for filtering

        subjects_query = f"""
        SELECT DISTINCT ?subject
        WHERE {{
            ?subject a <{selected_class}> .
        }}
        LIMIT {fetch_limit}
        OFFSET {offset}
        """

        sparql.setQuery(subjects_query)
        sparql.setReturnFormat(JSON)
        subjects_results = sparql.query().convert()

        subjects = [r["subject"]["value"] for r in subjects_results["results"]["bindings"]]

        if not subjects:
            return [], 0

        # Now fetch triples for these specific subjects
        subjects_filter = " ".join([f"(<{s}>)" for s in subjects])

        triples_query = f"""
        SELECT ?subject ?p ?o
        WHERE {{
            ?subject a <{selected_class}> . ?subject ?p ?o . VALUES (?subject) {{ {subjects_filter} }}
        }}
        """

        sparql.setQuery(triples_query)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()

        # Group triples per subject so each entity's shape can be determined.
        entities_triples = defaultdict(list)
        for binding in results["results"]["bindings"]:
            subject = binding["subject"]["value"]
            predicate = binding["p"]["value"]
            obj = binding["o"]["value"]
            entities_triples[subject].append((subject, predicate, obj))

        # Keep only entities whose detected shape matches the requested one.
        filtered_entities = []
        for subject_uri, triples in entities_triples.items():
            entity_shape = determine_shape_for_entity_triples(list(triples))
            if entity_shape == selected_shape:
                entity_label = custom_filter.human_readable_entity(
                    subject_uri, (selected_class, selected_shape), None
                )
                filtered_entities.append({"uri": subject_uri, "label": entity_label})

        # Shape-filtered results are sorted in Python by label (no SPARQL ORDER BY).
        if sort_property and sort_direction:
            reverse_sort = sort_direction.upper() == "DESC"
            filtered_entities.sort(key=lambda x: x["label"].lower(), reverse=reverse_sort)

        # For shape-filtered results, we can't accurately determine total_count without scanning all entities
        # Return the number of filtered entities as an approximation
        total_count = len(filtered_entities)
        return filtered_entities[:per_page], total_count

    # Standard pagination path
    offset = (page - 1) * per_page
    sort_clause = ""
    order_clause = ""

    if sort_property:
        sort_clause = build_sort_clause(sort_property, selected_class, selected_shape)
        if sort_clause:
            order_clause = f"ORDER BY {sort_direction}(?sortValue)"

    entities_query = f"""
    SELECT ?subject {f"?sortValue" if sort_property else ""}
    WHERE {{
        ?subject a <{selected_class}> . {sort_clause}
    }}
    {order_clause}
    LIMIT {per_page}
    OFFSET {offset}
    """

    # Total count comes from the cached class list instead of a COUNT query.
    available_classes = get_available_classes()

    class_info = next(
        (c for c in available_classes
         if c["uri"] == selected_class and c.get("shape") == selected_shape),
        None
    )
    total_count = class_info.get("count_numeric", 0) if class_info else 0

    sparql.setQuery(entities_query)
    sparql.setReturnFormat(JSON)
    entities_results = sparql.query().convert()

    entities = []
    shape = selected_shape if selected_shape else determine_shape_for_classes([selected_class])

    for result in entities_results["results"]["bindings"]:
        subject_uri = result["subject"]["value"]
        entity_label = custom_filter.human_readable_entity(
            subject_uri, (selected_class, shape), None
        )
        entities.append({"uri": subject_uri, "label": entity_label})

    return entities, total_count

491 

492 

493def get_catalog_data( 

494 selected_class: str, 

495 page: int, 

496 per_page: int, 

497 sort_property: str = None, 

498 sort_direction: str = "ASC", 

499 selected_shape: str = None 

500) -> dict: 

501 """ 

502 Get catalog data with pagination and sorting. 

503 

504 Args: 

505 selected_class (str): Selected class URI 

506 page (int): Current page number 

507 per_page (int): Items per page 

508 sort_property (str, optional): Property to sort by 

509 sort_direction (str, optional): Sort direction ('ASC' or 'DESC') 

510 selected_shape (str, optional): URI of the shape to use for sorting rules 

511 

512 Returns: 

513 dict: Catalog data including entities, pagination info, and sort settings 

514 """ 

515 

516 entities = [] 

517 total_count = 0 

518 sortable_properties = [] 

519 

520 if selected_class: 

521 sortable_properties = get_sortable_properties( 

522 (selected_class, selected_shape) 

523 ) 

524 

525 if not sort_property and sortable_properties: 

526 sort_property = sortable_properties[0]["property"] 

527 

528 entities, total_count = get_entities_for_class( 

529 selected_class, page, per_page, sort_property, sort_direction, selected_shape 

530 ) 

531 

532 return { 

533 "entities": entities, 

534 "total_pages": ( 

535 (total_count + per_page - 1) // per_page if total_count > 0 else 0 

536 ), 

537 "current_page": page, 

538 "per_page": per_page, 

539 "total_count": total_count, 

540 "sort_property": sort_property, 

541 "sort_direction": sort_direction, 

542 "sortable_properties": sortable_properties, 

543 "selected_class": selected_class, 

544 "selected_shape": selected_shape, 

545 } 

546 

547 

def fetch_data_graph_for_subject(subject: str) -> Graph | Dataset:
    """
    Fetch all triples/quads associated with a subject from the dataset.
    Handles both triplestore and quadstore cases appropriately.

    Args:
        subject (str): The URI of the subject to fetch data for

    Returns:
        Graph|Dataset: A graph containing all triples/quads for the subject
    """
    g = Dataset() if get_dataset_is_quadstore() else Graph()
    sparql = get_sparql()

    if is_virtuoso():
        # For virtuoso we need to explicitly query the graph
        # NOTE(review): this branch always selects ?g, so it assumes a
        # Virtuoso backend implies a quadstore — confirm against configuration.
        query = f"""
        SELECT ?predicate ?object ?g WHERE {{
            GRAPH ?g {{
                <{subject}> ?predicate ?object.
            }}
            FILTER(?g NOT IN (<{'>, <'.join(VIRTUOSO_EXCLUDED_GRAPHS)}>))
        }}
        """
    else:
        if get_dataset_is_quadstore():
            # For non-virtuoso quadstore, we need to query all graphs
            query = f"""
            SELECT ?predicate ?object ?g WHERE {{
                GRAPH ?g {{
                    <{subject}> ?predicate ?object.
                }}
            }}
            """
        else:
            # For regular triplestore
            query = f"""
            SELECT ?predicate ?object WHERE {{
                <{subject}> ?predicate ?object.
            }}
            """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    query_results = sparql.query().convert()
    results = query_results.get("results", {}).get("bindings", [])

    for result in results:
        # Create the appropriate value (Literal or URIRef)
        obj_data = result["object"]
        if obj_data["type"] in {"literal", "typed-literal"}:
            if "datatype" in obj_data:
                value = Literal(
                    obj_data["value"], datatype=URIRef(obj_data["datatype"])
                )
            else:
                # Create literal without explicit datatype to match Reader.import_entities_from_triplestore
                value = Literal(obj_data["value"])
        else:
            value = URIRef(obj_data["value"])

        # Add triple/quad based on store type
        if get_dataset_is_quadstore():
            graph_uri = URIRef(result["g"]["value"])
            g.add(
                (
                    URIRef(subject),
                    URIRef(result["predicate"]["value"]),
                    value,
                    graph_uri,
                )
            )
        else:
            g.add((URIRef(subject), URIRef(result["predicate"]["value"]), value))

    return g

624 

625 

def parse_sparql_update(query) -> dict:
    """
    Extract added/deleted triples from a SPARQL INSERT DATA / DELETE DATA update.

    Returns a dict with optional "Additions" and "Deletions" keys, each holding
    a list of (s, p, o) triples; other update operations are ignored.
    """
    algebra = translateUpdate(parseUpdate(query)).algebra

    def flatten(operation):
        # Quad-form payloads are grouped by graph; collapse them to bare triples.
        if hasattr(operation, "quads") and operation.quads:
            return [
                (triple[0], triple[1], triple[2])
                for triples in operation.quads.values()
                for triple in triples
            ]
        return operation.triples

    labels = {"DeleteData": "Deletions", "InsertData": "Additions"}
    modifications = {}

    for operation in algebra:
        label = labels.get(operation.name)
        if label is None:
            continue
        payload = flatten(operation)
        if payload:
            modifications.setdefault(label, list()).extend(payload)

    return modifications

655 

656 

def fetch_current_state_with_related_entities(
    provenance: dict,
) -> Graph | Dataset:
    """
    Fetch the current state of an entity and all its related entities known from provenance.

    Args:
        provenance (dict): Dictionary containing provenance metadata for main entity and related entities

    Returns:
        Dataset: A graph containing the current state of all entities
    """
    combined = Dataset() if get_dataset_is_quadstore() else Graph()

    # Merge the current state of every entity mentioned in the provenance dict.
    for entity_uri in provenance.keys():
        entity_graph = fetch_data_graph_for_subject(entity_uri)

        if get_dataset_is_quadstore():
            for quad in entity_graph.quads():
                combined.add(quad)
        else:
            for triple in entity_graph:
                combined.add(triple)

    return combined

683 

684 

def get_deleted_entities_with_filtering(
    page=1,
    per_page=50,
    sort_property="deletionTime",
    sort_direction="DESC",
    selected_class=None,
    selected_shape=None,
):
    """
    Fetch and process deleted entities from the provenance graph, with filtering and sorting.

    Args:
        page: 1-indexed page number
        per_page: number of entities per page
        sort_property: key to sort by ("deletionTime" or a sortable property URI)
        sort_direction: "ASC" or "DESC"
        selected_class: class URI used to filter entities (defaults to the first available)
        selected_shape: shape URI for sorting rules (derived from the class when None)

    Returns:
        tuple: (paginated_entities, available_classes, selected_class,
        selected_shape, sortable_properties, total_count)
    """
    sortable_properties = [
        {"property": "deletionTime", "displayName": "Deletion Time", "sortType": "date"}
    ]
    provenance_sparql = get_provenance_sparql()
    custom_filter = get_custom_filter()

    # A deleted entity is one whose latest snapshot carries an invalidation
    # time and has no later snapshot derived from it.
    prov_query = """
    SELECT DISTINCT ?entity ?lastSnapshot ?deletionTime ?agent ?lastValidSnapshotTime
    WHERE {
        ?lastSnapshot a <http://www.w3.org/ns/prov#Entity> ;
                    <http://www.w3.org/ns/prov#specializationOf> ?entity ;
                    <http://www.w3.org/ns/prov#generatedAtTime> ?deletionTime ;
                    <http://www.w3.org/ns/prov#invalidatedAtTime> ?invalidationTime ;
                    <http://www.w3.org/ns/prov#wasDerivedFrom> ?lastValidSnapshot.

        ?lastValidSnapshot <http://www.w3.org/ns/prov#generatedAtTime> ?lastValidSnapshotTime .

        OPTIONAL { ?lastSnapshot <http://www.w3.org/ns/prov#wasAttributedTo> ?agent . }

        FILTER NOT EXISTS {
            ?laterSnapshot <http://www.w3.org/ns/prov#wasDerivedFrom> ?lastSnapshot .
        }
    }
    """
    provenance_sparql.setQuery(prov_query)
    provenance_sparql.setReturnFormat(JSON)
    prov_results = provenance_sparql.query().convert()

    results_bindings = prov_results["results"]["bindings"]
    if not results_bindings:
        return [], [], None, None, [], 0

    # Reconstructing each entity's last valid state is expensive, so the rows
    # are processed in a process pool (process_deleted_entity may return None).
    deleted_entities = []
    max_workers = max(1, min(os.cpu_count() or 4, len(results_bindings)))
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_entity = {
            executor.submit(process_deleted_entity, result, sortable_properties): result
            for result in results_bindings
        }
        for future in as_completed(future_to_entity):
            entity_info = future.result()
            if entity_info is not None:
                deleted_entities.append(entity_info)

    # Per-class counts for the class filter list.
    class_counts = {}
    for entity in deleted_entities:
        for type_uri in entity["entity_types"]:
            class_counts[type_uri] = class_counts.get(type_uri, 0) + 1

    available_classes = [
        {
            "uri": class_uri,
            "label": custom_filter.human_readable_class((class_uri, determine_shape_for_classes([class_uri]))),
            "count": count,
        }
        for class_uri, count in class_counts.items()
    ]

    reverse_sort = sort_direction.upper() == "DESC"
    if sort_property == "deletionTime":
        deleted_entities.sort(key=lambda e: e["deletionTime"], reverse=reverse_sort)
    else:
        # Other sort keys come from the per-entity sort_values extracted earlier.
        deleted_entities.sort(
            key=lambda e: e["sort_values"].get(sort_property, "").lower(),
            reverse=reverse_sort,
        )

    available_classes.sort(key=lambda x: x["label"].lower())
    if not selected_class and available_classes:
        selected_class = available_classes[0]["uri"]

    if selected_class:
        if selected_shape is None:
            selected_shape = determine_shape_for_classes([selected_class])
        entity_key = (selected_class, selected_shape)
        sortable_properties.extend(
            get_sortable_properties(entity_key)
        )

    if selected_class:
        filtered_entities = [
            entity
            for entity in deleted_entities
            if selected_class in entity["entity_types"]
        ]
    else:
        filtered_entities = deleted_entities

    total_count = len(filtered_entities)
    offset = (page - 1) * per_page
    paginated_entities = filtered_entities[offset : offset + per_page]

    return paginated_entities, available_classes, selected_class, selected_shape, sortable_properties, total_count

789 

790 

def process_deleted_entity(result: dict, sortable_properties: list) -> dict | None:
    """
    Process a single deleted entity, filtering by visible classes.

    Args:
        result: one SPARQL binding row with "entity", "deletionTime",
            "lastValidSnapshotTime" and optionally "agent"
        sortable_properties: property descriptors used to extract sort values

    Returns:
        dict | None: display-ready metadata for the deleted entity, or None
        when its state cannot be reconstructed or none of its types are visible.
    """
    change_tracking_config = get_change_tracking_config()
    custom_filter = get_custom_filter()

    entity_uri = result["entity"]["value"]
    last_valid_snapshot_time = result["lastValidSnapshotTime"]["value"]

    # Reconstruct the entity state as it was at the last valid snapshot.
    agnostic_entity = AgnosticEntity(
        res=entity_uri, config=change_tracking_config, include_related_objects=True, include_merged_entities=True, include_reverse_relations=True
    )
    state, _, _ = agnostic_entity.get_state_at_time(
        (last_valid_snapshot_time, last_valid_snapshot_time)
    )
    state = convert_to_rdflib_graphs(state, get_dataset_is_quadstore())

    if entity_uri not in state:
        return None

    last_valid_time = convert_to_datetime(last_valid_snapshot_time, stringify=True)
    last_valid_state: Graph | Dataset = state[entity_uri][last_valid_time]

    # Determine the entity's rdf:type values from the reconstructed state.
    entity_types = [
        str(o)
        for s, p, o in get_triples_from_graph(last_valid_state, (URIRef(entity_uri), RDF.type, None))
    ]
    highest_priority_type = get_highest_priority_class(entity_types)
    shape = determine_shape_for_classes([highest_priority_type])
    # Drop the entity when every type is hidden by the display configuration.
    visible_types = [t for t in entity_types if is_entity_type_visible((t, determine_shape_for_classes([t])))]
    if not visible_types:
        return None

    # Extract one value per sortable property (first match wins, "" if absent).
    sort_values = {}
    for prop in sortable_properties:
        prop_uri = prop["property"]
        values = [
            str(o)
            for s, p, o in get_triples_from_graph(
                last_valid_state, (URIRef(entity_uri), URIRef(prop_uri), None)
            )
        ]
        sort_values[prop_uri] = values[0] if values else ""

    return {
        "uri": entity_uri,
        "deletionTime": result["deletionTime"]["value"],
        "deletedBy": custom_filter.format_agent_reference(
            result.get("agent", {}).get("value", "")
        ),
        "lastValidSnapshotTime": last_valid_snapshot_time,
        "type": custom_filter.human_readable_predicate(
            highest_priority_type, (highest_priority_type, shape)
        ),
        "label": custom_filter.human_readable_entity(
            entity_uri, (highest_priority_type, shape), last_valid_state
        ),
        "entity_types": visible_types,
        "sort_values": sort_values,
    }

852 

853 

854def find_orphaned_entities(subject, entity_type, predicate=None, object_value=None): 

855 """ 

856 Find entities that would become orphaned after deleting a triple or an entire entity, 

857 including intermediate relation entities. 

858 

859 An entity is considered orphaned if: 

860 1. It has no incoming references from other entities (except from the entity being deleted) 

861 2. It does not reference any entities that are subjects of other triples 

862 

863 For intermediate relations, an entity is also considered orphaned if: 

864 1. It connects to the entity being deleted 

865 2. It has no other valid connections after the deletion 

866 3. It is directly involved in the deletion operation (if predicate and object_value are specified) 

867 

868 Args: 

869 subject (str): The URI of the subject being deleted 

870 entity_type (str): The type of the entity being deleted 

871 predicate (str, optional): The predicate being deleted 

872 object_value (str, optional): The object value being deleted 

873 

874 Returns: 

875 tuple: Lists of (orphaned_entities, intermediate_orphans) 

876 """ 

877 sparql = get_sparql() 

878 display_rules = get_display_rules() 

879 

880 intermediate_classes = set() 

881 

882 for rule in display_rules: 

883 if "target" in rule and "class" in rule["target"] and rule["target"]["class"] == entity_type: 

884 for prop in rule.get("displayProperties", []): 

885 if "intermediateRelation" in prop: 

886 intermediate_classes.add(prop["intermediateRelation"]["class"]) 

887 

888 orphan_query = f""" 

889 SELECT DISTINCT ?entity ?type 

890 WHERE {{ 

891 {f"<{subject}> <{predicate}> ?entity ." if predicate and object_value else ""} 

892 {f"FILTER(?entity = <{object_value}>)" if predicate and object_value else ""} 

893  

894 # If no specific predicate, get all connected entities 

895 {f"<{subject}> ?p ?entity ." if not predicate else ""} 

896  

897 FILTER(isIRI(?entity)) 

898 ?entity a ?type . 

899  

900 # No incoming references from other entities 

901 FILTER NOT EXISTS {{ 

902 ?other ?anyPredicate ?entity . 

903 FILTER(?other != <{subject}>) 

904 }} 

905  

906 # No outgoing references to active entities 

907 FILTER NOT EXISTS {{ 

908 ?entity ?outgoingPredicate ?connectedEntity . 

909 ?connectedEntity ?furtherPredicate ?furtherObject . 

910 {f"FILTER(?connectedEntity != <{subject}>)" if not predicate else ""} 

911 }} 

912  

913 # Exclude intermediate relation entities 

914 FILTER(?type NOT IN (<{f">, <".join(intermediate_classes)}>)) 

915 }} 

916 """ 

917 

918 # Query to find orphaned intermediate relations 

919 if predicate and object_value: 

920 intermediate_query = f""" 

921 SELECT DISTINCT ?entity ?type 

922 WHERE {{ 

923 <{object_value}> a ?type . 

924 FILTER(?type IN (<{f">, <".join(intermediate_classes)}>))  

925 BIND(<{object_value}> AS ?entity) 

926 }} 

927 """ 

928 else: 

 929 # If we are deleting the entire entity, find all connected intermediate entities 

930 intermediate_query = f""" 

931 SELECT DISTINCT ?entity ?type 

932 WHERE {{ 

933 # Find intermediate relations connected to the entity being deleted 

934 {{ 

935 <{subject}> ?p ?entity . 

936 ?entity a ?type . 

937 FILTER(?type IN (<{f">, <".join(intermediate_classes)}>)) 

938 }} UNION {{ 

939 ?entity ?p <{subject}> . 

940 ?entity a ?type . 

941 FILTER(?type IN (<{f">, <".join(intermediate_classes)}>)) 

942 }}  

943 }} 

944 """ 

945 

946 orphaned = [] 

947 intermediate_orphans = [] 

948 

949 # Execute queries and process results 

950 for query, result_list in [ 

951 (orphan_query, orphaned), 

952 (intermediate_query, intermediate_orphans), 

953 ]: 

954 sparql.setQuery(query) 

955 sparql.setReturnFormat(JSON) 

956 results = sparql.query().convert() 

957 

958 for result in results["results"]["bindings"]: 

959 result_list.append( 

960 {"uri": result["entity"]["value"], "type": result["type"]["value"]} 

961 ) 

962 

963 return orphaned, intermediate_orphans 

964 

965 

def import_entity_graph(editor: Editor, subject: str, max_depth: int = 5, include_referencing_entities: bool = False):
    """
    Import the subject and every entity reachable from it, up to a maximum depth.

    Performs a depth-first traversal over IRI objects (rdf:type links are
    skipped) and imports each distinct entity into the editor exactly once.

    Args:
        editor (Editor): The Editor instance to use for importing.
        subject (str): The URI of the subject to start the import from.
        max_depth (int): The maximum depth of recursion (default is 5).
        include_referencing_entities (bool): Whether to also import entities that
            have the subject as their object (default False). Useful when deleting
            an entity to ensure all references are properly removed.

    Returns:
        Editor: The updated Editor instance with all imported entities.
    """
    visited = set()

    # Optionally pull in every entity that points at the subject, before the
    # outward traversal starts, so inbound references are captured too.
    if include_referencing_entities:
        endpoint = get_sparql()

        # Quadstores require an explicit GRAPH pattern; triplestores do not.
        if editor.dataset_is_quadstore:
            query = f"""
            SELECT DISTINCT ?s
            WHERE {{
                GRAPH ?g {{
                    ?s ?p <{subject}> .
                }}
                FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>)
            }}
            """
        else:
            query = f"""
            SELECT DISTINCT ?s
            WHERE {{
                ?s ?p <{subject}> .
                FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>)
            }}
            """

        endpoint.setQuery(query)
        endpoint.setReturnFormat(JSON)
        response = endpoint.query().convert()

        for binding in response["results"]["bindings"]:
            referencing = binding["s"]["value"]
            if referencing == subject or referencing in visited:
                continue
            visited.add(referencing)
            editor.import_entity(URIRef(referencing))

    def _walk(node: str, depth: int):
        # Stop at the depth limit or when the entity was already imported.
        if depth > max_depth or node in visited:
            return

        visited.add(node)
        editor.import_entity(URIRef(node))

        query = f"""
        SELECT ?p ?o
        WHERE {{
            <{node}> ?p ?o .
            FILTER(isIRI(?o))
            FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>)
        }}
        """

        endpoint = get_sparql()
        endpoint.setQuery(query)
        endpoint.setReturnFormat(JSON)
        response = endpoint.query().convert()

        for binding in response["results"]["bindings"]:
            _walk(binding["o"]["value"], depth + 1)

    _walk(subject, 1)
    return editor

1048 

1049 

def get_entity_types(subject_uri: str) -> List[str]:
    """
    Get all RDF types for an entity.

    Args:
        subject_uri: URI of the entity

    Returns:
        List of type URIs
    """
    endpoint = get_sparql()

    endpoint.setQuery(
        f"""
    SELECT ?type WHERE {{
        <{subject_uri}> a ?type .
    }}
    """
    )
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()

    bindings = response["results"]["bindings"]
    return [binding["type"]["value"] for binding in bindings]

1073 

1074 

def collect_referenced_entities(data, existing_entities=None):
    """
    Recursively collect all URIs of existing entities referenced in the structured data.

    Walks dicts and lists: a dict flagged with ``is_existing_entity: True`` that
    carries an ``entity_uri`` contributes that URI; a dict with ``entity_type``
    (a new entity being created) is traversed via its ``properties`` values;
    any other dict is traversed via all of its values. These URIs must be
    imported into the editor before calling preexisting_finished().

    Args:
        data: The structured data (can be dict, list, or string)
        existing_entities: Set to collect URIs (created if None)

    Returns:
        Set of URIs (strings) of existing entities that should be imported
    """
    refs = set() if existing_entities is None else existing_entities

    if isinstance(data, dict):
        is_reference = data.get("is_existing_entity") is True and "entity_uri" in data
        if is_reference:
            refs.add(data["entity_uri"])
        elif "entity_type" in data:
            # New entity under construction: only its property values can
            # contain references to existing entities.
            for values in data.get("properties", {}).values():
                collect_referenced_entities(values, refs)
        else:
            for nested in data.values():
                collect_referenced_entities(nested, refs)
    elif isinstance(data, list):
        for element in data:
            collect_referenced_entities(element, refs)

    return refs

1111 

1112 

def import_referenced_entities(editor, structured_data):
    """
    Import all existing entities referenced in structured data into the editor.

    This function should be called before editor.preexisting_finished() to ensure
    that all existing entities that will be linked have their snapshots created.

    Args:
        editor: The Editor instance
        structured_data: The structured data containing entity references
    """
    referenced_entities = collect_referenced_entities(structured_data)
    for entity_uri in referenced_entities:
        try:
            # Wrap in URIRef for consistency with the other import_entity call
            # sites in this module (see import_entity_graph), which always pass
            # rdflib terms rather than plain strings.
            editor.import_entity(URIRef(entity_uri))
        except Exception as e:
            # Best-effort: one failed import must not abort the whole batch.
            print(f"Warning: Could not import entity {entity_uri}: {e}")