Coverage for heritrace / utils / sparql_utils.py: 93%

488 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-07-02 10:16 +0000

1# SPDX-FileCopyrightText: 2024-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5import logging 

6import os 

7import time 

8from collections import defaultdict, deque 

9from collections.abc import Generator 

10from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed 

11from dataclasses import dataclass 

12 

13from flask import current_app 

14from rdflib import RDF, Dataset, Graph, Literal, URIRef 

15from rdflib.plugins.sparql.algebra import translateUpdate 

16from rdflib.plugins.sparql.parser import parseUpdate 

17from rdflib.term import Node 

18from rdflib.util import from_n3 

19from SPARQLWrapper import JSON 

20from SPARQLWrapper.SPARQLExceptions import SPARQLWrapperException 

21from time_agnostic_library.agnostic_entity import AgnosticEntity 

22 

23from heritrace.editor import Editor 

24from heritrace.extensions import ( 

25 get_change_tracking_config, 

26 get_classes_with_multiple_shapes, 

27 get_custom_filter, 

28 get_dataset_is_quadstore, 

29 get_display_rules, 

30 get_provenance_sparql, 

31 get_shacl_graph, 

32 get_sparql, 

33) 

34from heritrace.sparql import get_sparql_bindings 

35from heritrace.utils.converters import convert_to_datetime 

36from heritrace.utils.display_rules_utils import ( 

37 find_matching_rule, 

38 get_highest_priority_class, 

39 get_sortable_properties, 

40 is_entity_type_visible, 

41) 

42from heritrace.utils.shacl_utils import ( 

43 determine_shape_for_classes, 

44 determine_shape_for_entity_triples, 

45) 

46from heritrace.utils.virtuoso_utils import VIRTUOSO_EXCLUDED_GRAPHS, is_virtuoso 

47 

48_cache: dict[str, tuple[list[dict[str, str | int]], float] | None] = { 

49 "available_classes": None 

50} 

51AVAILABLE_CLASSES_TTL_SECONDS = 60 

52 

53 

def _parse_n3(value: str) -> Node:
    """Parse an N3-serialised term and return it as an rdflib ``Node``.

    Args:
        value: The N3 text of a single term.

    Returns:
        The term produced by ``from_n3``.

    Raises:
        TypeError: If ``from_n3`` returns something that is not a ``Node``.
    """
    parsed = from_n3(value)
    if isinstance(parsed, Node):
        return parsed
    msg = f"Cannot parse N3 value: {value}"
    raise TypeError(msg)

60 

61 

def n3_set_to_graph(
    n3_set: set[tuple[str, ...]],
    *,
    is_quadstore: bool,
) -> Graph | Dataset:
    """Materialise a set of N3-encoded statements as an rdflib container.

    Args:
        n3_set: Tuples of N3 strings; the first three items are read as
            subject, predicate and object, and for a quadstore the fourth
            item is read as the graph.
        is_quadstore: When true a ``Dataset(default_union=True)`` is filled
            with quads, otherwise a plain ``Graph`` is filled with triples.

    Returns:
        The populated ``Dataset`` or ``Graph``.
    """
    target = Dataset(default_union=True) if is_quadstore else Graph()
    # Number of leading tuple positions that form one statement.
    arity = 4 if is_quadstore else 3
    for n3_terms in n3_set:
        statement = tuple(_parse_n3(n3_terms[position]) for position in range(arity))
        target.add(statement)  # type: ignore[arg-type]
    return target

82 

83 

def convert_to_rdflib_graphs(snapshots: dict, *, is_quadstore: bool) -> dict:
    """Convert every N3 statement set in ``snapshots`` into an rdflib container.

    Args:
        snapshots: Mapping of entity URI to a mapping of timestamp to a set of
            N3 tuples.
        is_quadstore: Forwarded to ``n3_set_to_graph`` for each statement set.

    Returns:
        A new mapping with the same two levels of keys whose leaves are the
        objects returned by ``n3_set_to_graph``.
    """
    return {
        entity_uri: {
            ts: n3_set_to_graph(n3_set, is_quadstore=is_quadstore)
            for ts, n3_set in timestamps.items()
        }
        for entity_uri, timestamps in snapshots.items()
    }

93 

94 

def get_triples_from_graph(
    graph_or_dataset: Graph | Dataset,
    pattern: tuple[URIRef | None, URIRef | None, Node | None],
) -> Generator[tuple[Node, Node, Node]]:
    """
    Yield the triples matching ``pattern`` from either kind of container.

    A ``Dataset`` is queried through ``quads()`` and the graph component of
    each quad is discarded; anything else is queried through ``triples()``.

    Args:
        graph_or_dataset: Graph or Dataset instance
        pattern: Triple pattern tuple (s, p, o) where each can be None

    Returns:
        Generator of triples (s, p, o)
    """
    if not isinstance(graph_or_dataset, Dataset):
        # Plain graph: the matches are already triples.
        yield from graph_or_dataset.triples(pattern)
        return

    # Dataset: drop the fourth (graph) component of every matching quad.
    for subj, pred, obj, _context in graph_or_dataset.quads(pattern):
        yield (subj, pred, obj)

119 

120 

# Default cap used when counting or listing class instances; read once at import
# time from the COUNT_LIMIT environment variable and falling back to 10000.
COUNT_LIMIT = int(os.environ.get("COUNT_LIMIT", "10000"))

122 

123 

def _wrap_virtuoso_graph_pattern(pattern: str) -> str:
    """Return ``pattern`` unchanged, or scoped to ``GRAPH ?g`` on Virtuoso.

    When ``is_virtuoso()`` is true the pattern is wrapped in a ``GRAPH ?g``
    block followed by a filter that rejects every graph listed in
    ``VIRTUOSO_EXCLUDED_GRAPHS``.
    """
    if not is_virtuoso():
        return pattern

    return f"""
        GRAPH ?g {{
            {pattern}
        }}
        FILTER(?g NOT IN (<{">, <".join(VIRTUOSO_EXCLUDED_GRAPHS)}>))
    """

134 

135 

def _build_count_query_with_limit(class_uri: str, limit: int) -> str:
    """Return a SPARQL query counting at most ``limit`` instances of a class.

    The inner sub-select lists distinct subjects typed as ``class_uri`` and is
    capped with ``LIMIT``; the outer select counts the rows it produced, so the
    reported count never exceeds ``limit``.

    Args:
        class_uri: IRI of the class, inserted between angle brackets.
        limit: Value placed after ``LIMIT`` in the sub-select.

    Returns:
        The query text.
    """
    return f"""
        SELECT (COUNT(?subject) as ?count)
        WHERE {{
            {{
                SELECT DISTINCT ?subject
                WHERE {{
                    ?subject a <{class_uri}> .
                }}
                LIMIT {limit}
            }}
        }}
    """

151 

152 

def _count_class_instances(class_uri: str, limit: int = COUNT_LIMIT) -> tuple:
    """
    Count the instances of ``class_uri``, saturating at ``limit``.

    Args:
        class_uri: IRI of the class to count.
        limit: Largest count that is reported exactly.

    Returns:
        tuple: (display_count, numeric_count). When more than ``limit``
        instances exist the pair is ("<limit>+", limit); otherwise it is the
        exact count as a string and as an int.
    """
    endpoint = get_sparql()
    # Query for limit + 1 rows so that "more than limit" can be told apart
    # from "exactly limit".
    endpoint.setQuery(_build_count_query_with_limit(class_uri, limit + 1))
    endpoint.setReturnFormat(JSON)
    rows = get_sparql_bindings(endpoint.query().convert())

    found = int(rows[0]["count"]["value"])
    if found <= limit:
        return str(found), found
    return f"{limit}+", limit

172 

173 

def _get_entities_with_enhanced_shape_detection(
    class_uri: str, classes_with_multiple_shapes: set[str], limit: int = COUNT_LIMIT
) -> defaultdict[str, list[dict[str, str]]]:
    """
    Group up to ``limit`` instances of ``class_uri`` by their detected shape.

    Only classes listed in ``classes_with_multiple_shapes`` are processed; for
    any other class an empty mapping is returned without querying. The shape of
    each instance is decided by ``determine_shape_for_entity_triples`` from the
    instance's own triples, and instances whose (class, shape) pair is not
    visible are left out.

    Returns:
        Mapping of shape URI to a list of ``{"uri", "class", "shape"}`` dicts.
    """
    grouped_by_shape = defaultdict(list)

    # Skip classes that are not configured with more than one shape.
    if not (classes_with_multiple_shapes and class_uri in classes_with_multiple_shapes):
        return grouped_by_shape

    endpoint = get_sparql()

    # Step 1: a bounded list of subjects of this class.
    endpoint.setQuery(f"""
        SELECT DISTINCT ?subject
        WHERE {{
            ?subject a <{class_uri}> .
        }}
        LIMIT {limit}
    """)
    endpoint.setReturnFormat(JSON)
    subject_rows = get_sparql_bindings(endpoint.query().convert())
    subject_uris = [row["subject"]["value"] for row in subject_rows]
    if not subject_uris:
        return grouped_by_shape

    # Step 2: all triples of exactly those subjects, restricted through VALUES.
    values_rows = " ".join(f"(<{uri}>)" for uri in subject_uris)
    where_body = (
        f"?subject a <{class_uri}> . ?subject ?p ?o"
        f" . VALUES (?subject) {{ {values_rows} }}"
    )
    endpoint.setQuery(f"""
        SELECT ?subject ?p ?o
        WHERE {{
            {where_body}
        }}
    """)
    endpoint.setReturnFormat(JSON)
    triple_rows = get_sparql_bindings(endpoint.query().convert())

    triples_by_subject = defaultdict(list)
    for row in triple_rows:
        uri = row["subject"]["value"]
        triples_by_subject[uri].append((uri, row["p"]["value"], row["o"]["value"]))

    # Step 3: detect each subject's shape and keep only visible entity types.
    for subject_uri, subject_triples in triples_by_subject.items():
        shape_uri = determine_shape_for_entity_triples(subject_triples)
        if not shape_uri:
            continue
        if not is_entity_type_visible((class_uri, shape_uri)):
            continue
        grouped_by_shape[shape_uri].append(
            {"uri": subject_uri, "class": class_uri, "shape": shape_uri}
        )

    return grouped_by_shape

244 

245 

def get_classes_from_shacl_or_display_rules() -> list[str]:
    """Collect class IRIs from the SHACL graph, else from the display rules.

    Every ``sh:targetClass`` object in the SHACL graph is collected first. The
    display rules are consulted only when that yields nothing, in which case
    the ``target.class`` value of each rule that has one is used.

    Returns:
        The distinct class IRIs, in no particular order.
    """
    sh_target_class = URIRef("http://www.w3.org/ns/shacl#targetClass")
    found = set()

    shacl_graph = get_shacl_graph()
    if shacl_graph:
        found.update(
            str(target_class)
            for shape in shacl_graph.subjects(sh_target_class, None, unique=True)
            for target_class in shacl_graph.objects(
                shape, sh_target_class, unique=True
            )
        )

    if found:
        return list(found)

    # Fallback: no SHACL target classes were found.
    for rule in get_display_rules() or ():
        if "target" in rule and "class" in rule["target"]:
            found.add(rule["target"]["class"])

    return list(found)

267 

268 

def _get_classes_from_config() -> list[str]:
    """Return the configured classes, querying the store only as a fallback.

    The classes declared through SHACL or display rules win; when that list is
    empty the classes are discovered with ``_get_classes_from_sparql``.
    """
    return get_classes_from_shacl_or_display_rules() or _get_classes_from_sparql()

275 

276 

def _get_classes_from_sparql() -> list[str]:
    """Return every distinct ``rdf:type`` object reported by the endpoint.

    The type pattern is passed through ``_wrap_virtuoso_graph_pattern`` before
    being embedded in the query.
    """
    endpoint = get_sparql()
    type_pattern = _wrap_virtuoso_graph_pattern("?subject a ?class .")

    endpoint.setQuery(f"""
        SELECT DISTINCT ?class
        WHERE {{
            {type_pattern}
        }}
    """)
    endpoint.setReturnFormat(JSON)
    rows = get_sparql_bindings(endpoint.query().convert())
    return [row["class"]["value"] for row in rows]

293 

294 

def get_available_classes() -> list[dict[str, str | int]]:
    """Return the visible classes with labels and instance counts.

    The result is memoised in ``_cache["available_classes"]`` and reused while
    it is younger than ``AVAILABLE_CLASSES_TTL_SECONDS`` (measured with
    ``time.monotonic``).

    Classes that have several shapes produce one entry per detected shape,
    counted from the entities returned by
    ``_get_entities_with_enhanced_shape_detection``. Every other class produces
    at most one entry, using the count from ``_count_class_instances``, and only
    when its (class, shape) pair is visible.

    Returns:
        Dicts with keys ``uri``, ``label``, ``count``, ``count_numeric`` and
        ``shape``, sorted case-insensitively by label.
    """
    entry = _cache["available_classes"]
    if entry is not None:
        cached_classes, computed_at = entry
        if time.monotonic() - computed_at < AVAILABLE_CLASSES_TTL_SECONDS:
            return cached_classes

    custom_filter = get_custom_filter()

    counted = []
    for class_uri in _get_classes_from_config():
        display_count, numeric_count = _count_class_instances(class_uri)
        counted.append(
            {
                "uri": class_uri,
                "display_count": display_count,
                "numeric_count": numeric_count,
            }
        )
    # Largest classes first; the later label sort is stable, so this order
    # still decides between entries whose labels compare equal.
    counted.sort(key=lambda item: item["numeric_count"], reverse=True)

    multi_shape_classes = get_classes_with_multiple_shapes()
    result = []

    for item in counted:
        class_uri = item["uri"]

        if not (multi_shape_classes and class_uri in multi_shape_classes):
            # Single-shape class: one entry, if visible.
            shape_uri = determine_shape_for_classes([class_uri])
            entity_key = (class_uri, shape_uri)
            if is_entity_type_visible(entity_key):
                result.append(
                    {
                        "uri": class_uri,
                        "label": custom_filter.human_readable_class(entity_key),
                        "count": item["display_count"],
                        "count_numeric": item["numeric_count"],
                        "shape": shape_uri,
                    }
                )
            continue

        # Multi-shape class: one entry per shape that has entities.
        shape_to_entities = _get_entities_with_enhanced_shape_detection(
            class_uri, multi_shape_classes, limit=COUNT_LIMIT
        )
        for shape_uri, entities in shape_to_entities.items():
            if not entities:
                continue
            entity_total = len(entities)
            result.append(
                {
                    "uri": class_uri,
                    "label": custom_filter.human_readable_class(
                        (class_uri, shape_uri)
                    ),
                    "count": (
                        f"{entity_total}+"
                        if entity_total >= COUNT_LIMIT
                        else str(entity_total)
                    ),
                    "count_numeric": entity_total,
                    "shape": shape_uri,
                }
            )

    result.sort(key=lambda item: item["label"].lower())
    _cache["available_classes"] = (result, time.monotonic())
    return result

361 

362 

363def build_sort_clause( 

364 sort_property: str, entity_type: str, shape_uri: str | None = None 

365) -> str: 

366 """ 

367 Build a SPARQL sort clause based on the sortableBy configuration. 

368 

369 Args: 

370 sort_property: The property to sort by 

371 entity_type: The entity type URI 

372 shape_uri: Optional shape URI for more specific sorting rules 

373 

374 Returns: 

375 SPARQL sort clause or empty string 

376 """ 

377 if not sort_property or not entity_type: 

378 return "" 

379 

380 rule = find_matching_rule(entity_type, shape_uri) 

381 

382 if not rule or "sortableBy" not in rule: 

383 return "" 

384 

385 sort_config = next( 

386 (s for s in rule["sortableBy"] if s.get("property") == sort_property), None 

387 ) 

388 

389 if not sort_config: 

390 return "" 

391 

392 return f"OPTIONAL {{ ?subject <{sort_property}> ?sortValue }}" 

393 

394 

395@dataclass(frozen=True, slots=True) 

396class CatalogQuery: 

397 selected_class: str | None 

398 page: int 

399 per_page: int 

400 sort_property: str | None = None 

401 sort_direction: str = "ASC" 

402 selected_shape: str | None = None 

403 

404 

405def _fetch_entity_labels( 

406 subject_uris: list[str], entity_key: tuple[str | None, str | None] 

407) -> list[str]: 

408 if not subject_uris: 

409 return [] 

410 custom_filter = get_custom_filter() 

411 app = current_app._get_current_object() # type: ignore[attr-defined] # noqa: SLF001 

412 

413 def fetch_label(uri: str) -> str: 

414 with app.app_context(): 

415 return custom_filter.human_readable_entity(uri, entity_key, None) 

416 

417 with ThreadPoolExecutor(max_workers=8) as executor: 

418 return list(executor.map(fetch_label, subject_uris)) 

419 

420 

def _get_entities_with_shape_filtering(
    query: CatalogQuery,
) -> tuple[list[dict[str, str]], int]:
    """Return the entities of the selected class that match the selected shape.

    A window of ``per_page * 5`` subjects is read starting at the page offset,
    the triples of those subjects are fetched, and only subjects whose detected
    shape equals ``query.selected_shape`` are kept.

    Returns:
        ``(entities, total_count)``: at most ``per_page`` ``{"uri", "label"}``
        dicts, and the number of matching entities found inside the window.
    """
    endpoint = get_sparql()
    selected_class = query.selected_class
    selected_shape = query.selected_shape
    offset = (query.page - 1) * query.per_page
    # Over-fetch, since part of the window is discarded by the shape filter.
    fetch_limit = query.per_page * 5

    endpoint.setQuery(f"""
        SELECT DISTINCT ?subject
        WHERE {{
            ?subject a <{selected_class}> .
        }}
        LIMIT {fetch_limit}
        OFFSET {offset}
    """)
    endpoint.setReturnFormat(JSON)
    subject_rows = get_sparql_bindings(endpoint.query().convert())
    subject_uris = [row["subject"]["value"] for row in subject_rows]
    if not subject_uris:
        return [], 0

    subjects_filter = " ".join(f"(<{uri}>)" for uri in subject_uris)
    endpoint.setQuery(f"""
        SELECT ?subject ?p ?o
        WHERE {{
            ?subject a <{selected_class}> . ?subject ?p ?o . VALUES (?subject) {{
            {subjects_filter} }}
        }}
    """)
    endpoint.setReturnFormat(JSON)
    triple_rows = get_sparql_bindings(endpoint.query().convert())

    triples_by_subject = defaultdict(list)
    for row in triple_rows:
        uri = row["subject"]["value"]
        triples_by_subject[uri].append((uri, row["p"]["value"], row["o"]["value"]))

    matching_uris = []
    for subject_uri, subject_triples in triples_by_subject.items():
        if determine_shape_for_entity_triples(list(subject_triples)) == selected_shape:
            matching_uris.append(subject_uri)

    labels = _fetch_entity_labels(matching_uris, (selected_class, selected_shape))
    labelled = [
        {"uri": uri, "label": label}
        for uri, label in zip(matching_uris, labels, strict=True)
    ]

    # Whatever property was requested, this path orders by label.
    if query.sort_property and query.sort_direction:
        descending = query.sort_direction.upper() == "DESC"
        labelled.sort(key=lambda item: item["label"].lower(), reverse=descending)

    return labelled[: query.per_page], len(labelled)

486 

487 

def get_entities_for_class(
    query: CatalogQuery,
    available_classes: list[dict[str, str | int]],
) -> tuple[list[dict[str, str]], int]:
    """Return one page of entities of the selected class and their total count.

    When a shape is selected and the class has multiple shapes, the work is
    delegated to ``_get_entities_with_shape_filtering``. Otherwise a single
    paginated query is run, optionally ordered by the requested property when
    ``build_sort_clause`` yields a clause for it.

    Args:
        query: Page, sort and class/shape selection.
        available_classes: Entries as produced by ``get_available_classes``;
            the entry matching the class and shape supplies ``count_numeric``.

    Returns:
        ``(entities, total_count)`` where ``entities`` is a list of
        ``{"uri", "label"}`` dicts and ``total_count`` is 0 when no matching
        entry exists in ``available_classes``.

    Raises:
        ValueError: If ``query.selected_class`` is ``None``.
    """
    if query.selected_class is None:
        msg = "selected_class must not be None"
        raise ValueError(msg)
    sparql = get_sparql()
    classes_with_multiple_shapes = get_classes_with_multiple_shapes()

    selected_class: str = query.selected_class
    selected_shape = query.selected_shape
    page = query.page
    per_page = query.per_page
    sort_property = query.sort_property
    sort_direction = query.sort_direction

    use_shape_filtering = (
        selected_shape and selected_class in classes_with_multiple_shapes
    )

    if use_shape_filtering:
        return _get_entities_with_shape_filtering(query)

    offset = (page - 1) * per_page
    sort_clause = ""
    order_clause = ""

    if sort_property:
        sort_clause = build_sort_clause(sort_property, selected_class, selected_shape)
        if sort_clause:
            # Interpolate only one of the two SPARQL order modifiers. Anything
            # other than "DESC" (case-insensitive), including None or an
            # arbitrary string, falls back to ascending, mirroring how
            # _get_entities_with_shape_filtering reads the direction.
            direction = "DESC" if (sort_direction or "").upper() == "DESC" else "ASC"
            order_clause = f"ORDER BY {direction}(?sortValue)"

    entities_query = f"""
        SELECT ?subject {"?sortValue" if sort_property else ""}
        WHERE {{
            ?subject a <{selected_class}> . {sort_clause}
        }}
        {order_clause}
        LIMIT {per_page}
        OFFSET {offset}
    """

    # The total comes from the precomputed class list, not from a new query.
    class_info = next(
        (
            c
            for c in available_classes
            if c["uri"] == selected_class and c.get("shape") == selected_shape
        ),
        None,
    )
    total_count = int(class_info["count_numeric"]) if class_info else 0

    sparql.setQuery(entities_query)
    sparql.setReturnFormat(JSON)
    entities_bindings = get_sparql_bindings(sparql.query().convert())

    shape = selected_shape or determine_shape_for_classes([selected_class])
    subject_uris = [result["subject"]["value"] for result in entities_bindings]
    labels = _fetch_entity_labels(subject_uris, (selected_class, shape))
    entities = [
        {"uri": uri, "label": label}
        for uri, label in zip(subject_uris, labels, strict=True)
    ]

    return entities, total_count

554 

555 

def get_catalog_data(
    query: CatalogQuery,
    available_classes: list[dict[str, str | int]],
) -> dict:
    """Assemble the catalogue page payload for ``query``.

    When a class is selected, its sortable properties are looked up and, if no
    sort property was requested, the first sortable property is used. Entities
    are then fetched through ``get_entities_for_class``. Without a selected
    class the payload has no entities and a zero count.

    Returns:
        Dict with the entities, pagination figures, the effective sort settings
        and the class/shape selection.
    """
    sort_property = query.sort_property
    entities = []
    total_count = 0
    sortable_properties = []

    if query.selected_class:
        sortable_properties = get_sortable_properties(
            (query.selected_class, query.selected_shape)
        )
        # Default to the first sortable property when none was requested.
        if not sort_property and sortable_properties:
            sort_property = sortable_properties[0]["property"]

        effective_query = CatalogQuery(
            selected_class=query.selected_class,
            page=query.page,
            per_page=query.per_page,
            sort_property=sort_property,
            sort_direction=query.sort_direction,
            selected_shape=query.selected_shape,
        )
        entities, total_count = get_entities_for_class(
            effective_query, available_classes
        )

    if total_count > 0:
        # Ceiling division: a partially filled last page still counts.
        total_pages = (total_count + query.per_page - 1) // query.per_page
    else:
        total_pages = 0

    return {
        "entities": entities,
        "total_pages": total_pages,
        "current_page": query.page,
        "per_page": query.per_page,
        "total_count": total_count,
        "sort_property": sort_property,
        "sort_direction": query.sort_direction,
        "sortable_properties": sortable_properties,
        "selected_class": query.selected_class,
        "selected_shape": query.selected_shape,
    }

599 

600 

def fetch_data_graph_for_subject(subject: URIRef) -> Graph | Dataset:
    """Load every statement whose subject is ``subject`` from the data endpoint.

    The query shape depends on the store: Virtuoso and other quadstores are
    asked per named graph (Virtuoso additionally skips the graphs listed in
    ``VIRTUOSO_EXCLUDED_GRAPHS``), a plain triplestore is asked without a graph.

    Args:
        subject: IRI of the entity to load.

    Returns:
        A ``Dataset`` of quads when the store is configured as a quadstore,
        otherwise a ``Graph`` of triples.
    """
    # Read the store kind once; it is needed for the container, for the query
    # choice and for every result row.
    is_quadstore = get_dataset_is_quadstore()
    g = Dataset() if is_quadstore else Graph()
    sparql = get_sparql()

    if is_virtuoso():
        # For virtuoso we need to explicitly query the graph
        query = f"""
            SELECT ?predicate ?object ?g WHERE {{
                GRAPH ?g {{
                    <{subject}> ?predicate ?object.
                }}
                FILTER(?g NOT IN (<{">, <".join(VIRTUOSO_EXCLUDED_GRAPHS)}>))
            }}
        """
    elif is_quadstore:
        # For non-virtuoso quadstore, we need to query all graphs
        query = f"""
            SELECT ?predicate ?object ?g WHERE {{
                GRAPH ?g {{
                    <{subject}> ?predicate ?object.
                }}
            }}
        """
    else:
        # For regular triplestore
        query = f"""
            SELECT ?predicate ?object WHERE {{
                <{subject}> ?predicate ?object.
            }}
        """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    bindings = get_sparql_bindings(sparql.query().convert())

    for result in bindings:
        # Create the appropriate value (Literal or URIRef)
        obj_data = result["object"]
        if obj_data["type"] in {"literal", "typed-literal"}:
            if "datatype" in obj_data:
                value = Literal(
                    obj_data["value"], datatype=URIRef(obj_data["datatype"])
                )
            elif "xml:lang" in obj_data:
                # Keep the language tag: without it the rebuilt literal is a
                # different RDF term from the one held by the store.
                value = Literal(obj_data["value"], lang=obj_data["xml:lang"])
            else:
                # Omit explicit datatype to match Reader's import behavior
                value = Literal(obj_data["value"])
        else:
            # NOTE(review): every non-literal binding, including a possible
            # "bnode", is wrapped as a URIRef here; confirm blank nodes
            # cannot occur as objects in this dataset.
            value = URIRef(obj_data["value"])

        # Add triple/quad based on store type
        predicate = URIRef(result["predicate"]["value"])
        if is_quadstore:
            graph_uri = URIRef(result["g"]["value"])
            g.add((subject, predicate, value, graph_uri))  # type: ignore[arg-type]
        else:
            g.add((subject, predicate, value))

    return g

665 

666 

def parse_sparql_update(query: str) -> dict[str, list[tuple[Node, Node, Node]]]:
    """Extract the triples added and removed by a SPARQL update request.

    Only ``INSERT DATA`` and ``DELETE DATA`` operations are inspected. For each
    one, the triples are taken from its quads when it has any, otherwise from
    its plain triples.

    Args:
        query: SPARQL update text.

    Returns:
        Dict that may contain the keys ``"Additions"`` and ``"Deletions"``,
        each mapping to a list of (s, p, o) triples. A key is present only if
        at least one matching operation carried triples.
    """
    section_by_operation = {"DeleteData": "Deletions", "InsertData": "Additions"}
    operations = translateUpdate(parseUpdate(query)).algebra
    modifications = {}

    def _flatten(
        quads: defaultdict[Node, list[tuple[Node, Node, Node]]],
    ) -> list[tuple[Node, Node, Node]]:
        # Drop the graph keys and concatenate the per-graph triple lists.
        flattened = []
        for graph_triples in quads.values():
            for triple in graph_triples:
                flattened.append((triple[0], triple[1], triple[2]))
        return flattened

    for operation in operations:
        section = section_by_operation.get(operation.name)
        if section is None:
            continue
        if hasattr(operation, "quads") and operation.quads:
            changed = _flatten(operation.quads)
        else:
            changed = operation.triples
        if changed:
            modifications.setdefault(section, []).extend(changed)

    return modifications

698 

699 

def fetch_current_state_with_related_entities(
    provenance: dict,
) -> Graph | Dataset:
    """
    Merge the current statements of every entity named in ``provenance``.

    Args:
        provenance (dict): Dictionary containing provenance metadata for main
            entity and related entities; only its keys (entity URIs) are read

    Returns:
        A ``Dataset`` (quadstore) or ``Graph`` (triplestore) holding the
        current state of all those entities
    """
    merged = Dataset() if get_dataset_is_quadstore() else Graph()

    # Each key of the provenance mapping is an entity whose state is loaded.
    for entity_uri in provenance:
        entity_graph = fetch_data_graph_for_subject(URIRef(entity_uri))
        if get_dataset_is_quadstore():
            statements = entity_graph.quads()  # type: ignore[union-attr]
        else:
            statements = entity_graph
        for statement in statements:
            merged.add(statement)  # type: ignore[call-overload]

    return merged

728 

729 

730@dataclass(frozen=True, slots=True) 

731class DeletedEntitiesQuery: 

732 page: int = 1 

733 per_page: int = 50 

734 sort_property: str = "deletionTime" 

735 sort_direction: str = "DESC" 

736 selected_class: str | None = None 

737 selected_shape: str | None = None 

738 

739 

def _filter_and_paginate_deleted_entities(
    deleted_entities: list[dict],
    query: DeletedEntitiesQuery,
    sortable_properties: list[dict[str, str]],
) -> tuple[
    list[dict],
    str | None,
    str | None,
    list[dict[str, str]],
    int,
]:
    """Sort, filter by class and slice the deleted entities for one page.

    Both list arguments are modified in place: ``deleted_entities`` is sorted,
    and ``sortable_properties`` is extended with the sortable properties of the
    selected class when a class is selected.

    Returns:
        ``(page_items, selected_class, selected_shape, sortable_properties,
        total_count)``, where ``total_count`` is the number of entities left
        after the class filter and ``selected_shape`` is resolved from the
        class when the query did not specify one.
    """
    selected_class = query.selected_class
    selected_shape = query.selected_shape
    descending = query.sort_direction.upper() == "DESC"

    def _by_deletion_time(entity: dict) -> str:
        return entity["deletionTime"]

    def _by_property(entity: dict) -> str:
        # Entities lacking the property sort as the empty string.
        return entity["sort_values"].get(query.sort_property, "").lower()

    sort_key = (
        _by_deletion_time if query.sort_property == "deletionTime" else _by_property
    )
    deleted_entities.sort(key=sort_key, reverse=descending)

    if not selected_class:
        visible = deleted_entities
    else:
        if selected_shape is None:
            selected_shape = determine_shape_for_classes([selected_class])
        sortable_properties.extend(
            get_sortable_properties((selected_class, selected_shape))
        )
        visible = [
            entity
            for entity in deleted_entities
            if selected_class in entity["entity_types"]
        ]

    total_count = len(visible)
    start = (query.page - 1) * query.per_page
    page_items = visible[start : start + query.per_page]

    return (
        page_items,
        selected_class,
        selected_shape,
        sortable_properties,
        total_count,
    )

789 

790 

def get_deleted_entities_with_filtering(
    query: DeletedEntitiesQuery,
) -> tuple[
    list[dict[str, str | list[str] | dict[str, str]]],
    list[dict[str, str | int]],
    str | None,
    str | None,
    list[dict[str, str]],
    int,
]:
    """List deleted entities from provenance, with class facets and pagination.

    The provenance endpoint is queried for snapshots that have an invalidation
    time and from which no later snapshot is derived. Each hit is expanded by
    ``process_deleted_entity`` in a process pool. When the query names no class,
    the first available class (by label) is selected.

    Returns:
        ``(page_items, available_classes, selected_class, selected_shape,
        sortable_properties, total_count)``; all-empty values when provenance
        holds no such snapshot.
    """
    sortable_properties = [
        {"property": "deletionTime", "displayName": "Deletion Time", "sortType": "date"}
    ]
    provenance_sparql = get_provenance_sparql()
    custom_filter = get_custom_filter()

    prov_query = """
        SELECT DISTINCT ?entity ?lastSnapshot ?deletionTime ?agent ?lastValidSnapshotTime
        WHERE {
            ?lastSnapshot a <http://www.w3.org/ns/prov#Entity> ;
                <http://www.w3.org/ns/prov#specializationOf> ?entity ;
                <http://www.w3.org/ns/prov#generatedAtTime> ?deletionTime ;
                <http://www.w3.org/ns/prov#invalidatedAtTime> ?invalidationTime ;
                <http://www.w3.org/ns/prov#wasDerivedFrom> ?lastValidSnapshot.

            ?lastValidSnapshot <http://www.w3.org/ns/prov#generatedAtTime>
                ?lastValidSnapshotTime .

            OPTIONAL { ?lastSnapshot <http://www.w3.org/ns/prov#wasAttributedTo> ?agent . }

            FILTER NOT EXISTS {
                ?laterSnapshot <http://www.w3.org/ns/prov#wasDerivedFrom> ?lastSnapshot .
            }
        }
    """
    provenance_sparql.setQuery(prov_query)
    provenance_sparql.setReturnFormat(JSON)
    snapshot_rows = get_sparql_bindings(provenance_sparql.query().convert())
    if not snapshot_rows:
        return [], [], None, None, [], 0

    # One worker per row, capped by the CPU count (4 when it is unknown).
    worker_count = max(1, min(os.cpu_count() or 4, len(snapshot_rows)))
    deleted_entities = []
    with ProcessPoolExecutor(max_workers=worker_count) as pool:
        pending = {
            pool.submit(process_deleted_entity, row, sortable_properties): row
            for row in snapshot_rows
        }
        for finished in as_completed(pending):
            entity_info = finished.result()
            # None marks an entity that was filtered out while processing.
            if entity_info is not None:
                deleted_entities.append(entity_info)

    # Facet counts: how many deleted entities carry each visible type.
    class_counts = defaultdict(int)
    for entity in deleted_entities:
        for type_uri in entity["entity_types"]:
            class_counts[type_uri] += 1

    available_classes = []
    for class_uri, count in class_counts.items():
        label = custom_filter.human_readable_class(
            (class_uri, determine_shape_for_classes([class_uri]))
        )
        available_classes.append({"uri": class_uri, "label": label, "count": count})
    available_classes.sort(key=lambda item: item["label"].lower())

    # Without an explicit class, fall back to the first one in label order.
    resolved_query = query
    if not query.selected_class and available_classes:
        resolved_query = DeletedEntitiesQuery(
            page=query.page,
            per_page=query.per_page,
            sort_property=query.sort_property,
            sort_direction=query.sort_direction,
            selected_class=available_classes[0]["uri"],
            selected_shape=query.selected_shape,
        )

    (
        paginated_entities,
        selected_class,
        selected_shape,
        sortable_properties,
        total_count,
    ) = _filter_and_paginate_deleted_entities(
        deleted_entities, resolved_query, sortable_properties
    )

    return (
        paginated_entities,
        available_classes,
        selected_class,
        selected_shape,
        sortable_properties,
        total_count,
    )

891 

892 

def process_deleted_entity(result: dict, sortable_properties: list) -> dict | None:
    """
    Build the display record of one deleted entity from its last valid state.

    The entity is reconstructed at ``lastValidSnapshotTime`` through
    ``AgnosticEntity``. ``None`` is returned when the entity is absent from the
    reconstructed state, when no highest-priority class can be determined, or
    when none of its types is visible.

    Args:
        result: One binding row of the deleted-entities provenance query.
        sortable_properties: Dicts whose ``"property"`` IRI is read from the
            last valid state to fill ``sort_values``.

    Raises:
        AssertionError: If the snapshot time cannot be converted to a datetime.
    """
    change_tracking_config = get_change_tracking_config()
    custom_filter = get_custom_filter()

    entity_uri = result["entity"]["value"]
    last_valid_snapshot_time = result["lastValidSnapshotTime"]["value"]

    agnostic_entity = AgnosticEntity(
        res=entity_uri,
        config=change_tracking_config,
        include_related_objects=True,
        include_merged_entities=True,
        include_reverse_relations=True,
    )
    # The same instant is passed as both ends of the requested interval.
    snapshot_interval = (last_valid_snapshot_time, last_valid_snapshot_time)
    raw_state, _, _ = agnostic_entity.get_state_at_time(snapshot_interval)
    state = convert_to_rdflib_graphs(
        raw_state, is_quadstore=get_dataset_is_quadstore()
    )

    if entity_uri not in state:
        return None

    last_valid_dt = convert_to_datetime(last_valid_snapshot_time)
    if last_valid_dt is None:
        msg = "last_valid_dt must not be None"
        raise AssertionError(msg)
    last_valid_state: Graph | Dataset = state[entity_uri][last_valid_dt.isoformat()]

    entity_ref = URIRef(entity_uri)
    entity_types = [
        str(type_node)
        for _, _, type_node in get_triples_from_graph(
            last_valid_state, (entity_ref, RDF.type, None)
        )
    ]
    highest_priority_type = get_highest_priority_class(entity_types)
    if not highest_priority_type:
        return None
    shape = determine_shape_for_classes([highest_priority_type])
    entity_key = (highest_priority_type, shape)

    visible_types = []
    for type_uri in entity_types:
        if is_entity_type_visible((type_uri, determine_shape_for_classes([type_uri]))):
            visible_types.append(type_uri)
    if not visible_types:
        return None

    # First value found for each sortable property, or "" when it has none.
    sort_values = {}
    for prop in sortable_properties:
        prop_uri = prop["property"]
        matches = get_triples_from_graph(
            last_valid_state, (entity_ref, URIRef(prop_uri), None)
        )
        sort_values[prop_uri] = next((str(obj) for _, _, obj in matches), "")

    return {
        "uri": entity_uri,
        "deletionTime": result["deletionTime"]["value"],
        "deletedBy": custom_filter.format_agent_reference(
            result.get("agent", {}).get("value", "")
        ),
        "lastValidSnapshotTime": last_valid_snapshot_time,
        "type": custom_filter.human_readable_predicate(
            highest_priority_type, entity_key
        ),
        "label": custom_filter.human_readable_entity(
            entity_uri, entity_key, last_valid_state
        ),
        "entity_types": visible_types,
        "sort_values": sort_values,
    }

969 

970 

def find_orphaned_entities(
    subject: URIRef,
    entity_type: str,
    predicate: URIRef | None = None,
    object_value: str | None = None,
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Find entities that would be left orphaned by a deletion on ``subject``.

    With both ``predicate`` and ``object_value`` the check is limited to that
    single object; with no predicate every entity linked from ``subject`` is
    considered. Intermediate-relation classes are taken from the display rules
    whose target class equals ``entity_type``.

    Args:
        subject: Entity being deleted or edited.
        entity_type: Class IRI used to select the display rules.
        predicate: Predicate of the triple being removed, if a single one.
        object_value: Object IRI of the triple being removed, if a single one.

    Returns:
        ``(orphaned, intermediate_orphans)``, two lists of
        ``{"uri", "type"}`` dicts.
    """
    sparql = get_sparql()
    display_rules = get_display_rules()

    # Classes declared as intermediate relations for this entity type.
    intermediate_classes = {
        prop["intermediateRelation"]["class"]
        for rule in display_rules
        if "target" in rule
        and "class" in rule["target"]
        and rule["target"]["class"] == entity_type
        for prop in rule.get("displayProperties", [])
        if "intermediateRelation" in prop
    }
    intermediate_iris = ">, <".join(intermediate_classes)

    is_single_triple = bool(predicate and object_value)
    # NOTE(review): with a predicate but no object_value none of the three
    # link patterns below is emitted, so ?entity is not tied to the subject.
    # Confirm callers never pass that combination.
    specific_link = f"<{subject}> <{predicate}> ?entity ." if is_single_triple else ""
    specific_filter = f"FILTER(?entity = <{object_value}>)" if is_single_triple else ""
    any_link = f"<{subject}> ?p ?entity ." if not predicate else ""
    skip_subject = f"FILTER(?connectedEntity != <{subject}>)" if not predicate else ""

    orphan_query = f"""
        SELECT DISTINCT ?entity ?type
        WHERE {{
            {specific_link}
            {specific_filter}

            # If no specific predicate, get all connected entities
            {any_link}

            FILTER(isIRI(?entity))
            ?entity a ?type .

            # No incoming references from other entities
            FILTER NOT EXISTS {{
                ?other ?anyPredicate ?entity .
                FILTER(?other != <{subject}>)
            }}

            # No outgoing references to active entities
            FILTER NOT EXISTS {{
                ?entity ?outgoingPredicate ?connectedEntity .
                ?connectedEntity ?furtherPredicate ?furtherObject .
                {skip_subject}
            }}

            # Exclude intermediate relation entities
            FILTER(?type NOT IN (<{intermediate_iris}>))
        }}
    """

    # Query to find orphaned intermediate relations
    if is_single_triple:
        intermediate_query = f"""
            SELECT DISTINCT ?entity ?type
            WHERE {{
                <{object_value}> a ?type .
                FILTER(?type IN (<{intermediate_iris}>))
                BIND(<{object_value}> AS ?entity)
            }}
        """
    else:
        # When the whole entity is being deleted, find every intermediate
        # entity connected to it
        intermediate_query = f"""
            SELECT DISTINCT ?entity ?type
            WHERE {{
                # Find intermediate relations connected to the entity being deleted
                {{
                    <{subject}> ?p ?entity .
                    ?entity a ?type .
                    FILTER(?type IN (<{intermediate_iris}>))
                }} UNION {{
                    ?entity ?p <{subject}> .
                    ?entity a ?type .
                    FILTER(?type IN (<{intermediate_iris}>))
                }}
            }}
        """

    def _collect(query_text: str) -> list[dict[str, str]]:
        # Run one query and reduce each row to its entity IRI and type IRI.
        sparql.setQuery(query_text)
        sparql.setReturnFormat(JSON)
        rows = get_sparql_bindings(sparql.query().convert())
        return [
            {"uri": row["entity"]["value"], "type": row["type"]["value"]}
            for row in rows
        ]

    orphaned = _collect(orphan_query)
    intermediate_orphans = _collect(intermediate_query)
    return orphaned, intermediate_orphans

1069 

1070 

def import_entity_graph(
    editor: Editor,
    subject: URIRef,
    max_depth: int = 5,
    *,
    include_referencing_entities: bool = False,
) -> Editor:
    """Import ``subject`` and the entities reachable from it into ``editor``.

    Starting from ``subject`` (depth 1), outgoing IRI-valued triples other
    than ``rdf:type`` are followed breadth-first, and every entity found at a
    depth not greater than ``max_depth`` is passed to
    ``editor.import_entity``. Each entity is imported at most once.

    Args:
        editor: Editor that receives the imported entities.
        subject: Root entity of the traversal.
        max_depth: Maximum depth to import; the root counts as depth 1, so a
            value below 1 imports nothing during the traversal.
        include_referencing_entities: When true, entities that point to
            ``subject`` through any predicate other than ``rdf:type`` are
            imported first. They are not traversed further.

    Returns:
        The same ``editor`` instance, for call chaining.
    """
    imported_subjects: set[str] = set()
    subject_str = str(subject)

    if include_referencing_entities:
        sparql = get_sparql()

        if editor.dataset_is_quadstore:
            query = f"""
                SELECT DISTINCT ?s
                WHERE {{
                    GRAPH ?g {{
                        ?s ?p <{subject}> .
                    }}
                    FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>)
                }}
            """
        else:
            query = f"""
                SELECT DISTINCT ?s
                WHERE {{
                    ?s ?p <{subject}> .
                    FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>)
                }}
            """

        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        ref_bindings = get_sparql_bindings(sparql.query().convert())

        for result in ref_bindings:
            referencing_subject = result["s"]["value"]
            if (
                referencing_subject != subject_str
                and referencing_subject not in imported_subjects
            ):
                imported_subjects.add(referencing_subject)
                editor.import_entity(URIRef(referencing_subject))

    # Breadth-first traversal so each entity is visited at its minimal distance
    # from the subject: a depth-first walk would consume one level per hop along
    # ordering chains (e.g. oco:hasNext) and silently skip entities pushed beyond
    # max_depth, leaving them without provenance snapshots.
    queue: deque[tuple[str, int]] = deque([(subject_str, 1)])
    while queue:
        current_subject, current_depth = queue.popleft()
        if current_depth > max_depth or current_subject in imported_subjects:
            continue

        imported_subjects.add(current_subject)
        editor.import_entity(URIRef(current_subject))

        # Children of an entity sitting at max_depth would be discarded on
        # dequeue, so querying for them is a wasted round-trip. The deepest
        # level is usually the widest one, which makes this worth skipping.
        if current_depth >= max_depth:
            continue

        query = f"""
            SELECT ?p ?o
            WHERE {{
                <{current_subject}> ?p ?o .
                FILTER(isIRI(?o))
                FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>)
            }}
        """

        sparql = get_sparql()
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        inner_bindings = get_sparql_bindings(sparql.query().convert())

        for result in inner_bindings:
            linked_entity = result["o"]["value"]
            # Already-imported entities would be dropped on dequeue anyway;
            # not enqueuing them keeps the queue small on dense graphs.
            if linked_entity not in imported_subjects:
                queue.append((linked_entity, current_depth + 1))

    return editor

1147 

1148 

def get_entity_types(subject_uri: str) -> list[str]:
    """Return the ``rdf:type`` values asserted for ``subject_uri``.

    Args:
        subject_uri: IRI of the entity, without angle brackets.

    Returns:
        The value of the ``?type`` binding of every result row, in the order
        the rows are yielded.
    """
    endpoint = get_sparql()
    endpoint.setQuery(
        f"""
        SELECT ?type WHERE {{
            <{subject_uri}> a ?type .
        }}
    """
    )
    endpoint.setReturnFormat(JSON)

    rows = get_sparql_bindings(endpoint.query().convert())
    return [row["type"]["value"] for row in rows]

1163 

1164 

1165def collect_referenced_entities( 

1166 data: dict[str, str | dict | list] | list | str, 

1167 existing_entities: set[str] | None = None, 

1168) -> set[str]: 

1169 """ 

1170 Recursively collect all URIs of existing entities referenced in the structured data. 

1171 

1172 This function traverses the structured data to find explicit references to existing 

1173 entities 

1174 that need to be imported into the editor before calling preexisting_finished(). 

1175 

1176 Args: 

1177 data: The structured data (can be dict, list, or string) 

1178 existing_entities: Set to collect URIs (created if None) 

1179 

1180 Returns: 

1181 Set of URIs (strings) of existing entities that should be imported 

1182 """ 

1183 

1184 if existing_entities is None: 

1185 existing_entities = set() 

1186 

1187 if isinstance(data, dict): 

1188 if data.get("is_existing_entity") is True and "entity_uri" in data: 

1189 existing_entities.add(str(data["entity_uri"])) 

1190 

1191 # If it's an entity with entity_type, it's a new entity being created 

1192 elif "entity_type" in data: 

1193 properties = data.get("properties", {}) 

1194 if isinstance(properties, dict): 

1195 for prop_values in properties.values(): 

1196 collect_referenced_entities(prop_values, existing_entities) 

1197 else: 

1198 for value in data.values(): 

1199 collect_referenced_entities(value, existing_entities) 

1200 

1201 elif isinstance(data, list): 

1202 for item in data: 

1203 collect_referenced_entities(item, existing_entities) 

1204 

1205 return existing_entities 

1206 

1207 

def import_referenced_entities(
    editor: Editor,
    structured_data: dict[str, str | dict | list] | list | str,
) -> None:
    """Import into ``editor`` every existing entity referenced by the data.

    The URIs are obtained from ``collect_referenced_entities``. The import is
    best-effort: a ``SPARQLWrapperException``, ``OSError`` or ``ValueError``
    raised for one entity is logged at debug level and the remaining entities
    are still processed.

    Args:
        editor: Editor that receives the imported entities.
        structured_data: Structured data to scan for references.
    """
    log = logging.getLogger(__name__)

    for uri in collect_referenced_entities(structured_data):
        try:
            editor.import_entity(URIRef(uri))
        except (SPARQLWrapperException, OSError, ValueError):  # noqa: PERF203
            log.debug("Failed to import referenced entity %s", uri)