Coverage for heritrace / utils / sparql_utils.py: 93%
488 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-07-02 10:16 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-07-02 10:16 +0000
1# SPDX-FileCopyrightText: 2024-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
2#
3# SPDX-License-Identifier: ISC
5import logging
6import os
7import time
8from collections import defaultdict, deque
9from collections.abc import Generator
10from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
11from dataclasses import dataclass
13from flask import current_app
14from rdflib import RDF, Dataset, Graph, Literal, URIRef
15from rdflib.plugins.sparql.algebra import translateUpdate
16from rdflib.plugins.sparql.parser import parseUpdate
17from rdflib.term import Node
18from rdflib.util import from_n3
19from SPARQLWrapper import JSON
20from SPARQLWrapper.SPARQLExceptions import SPARQLWrapperException
21from time_agnostic_library.agnostic_entity import AgnosticEntity
23from heritrace.editor import Editor
24from heritrace.extensions import (
25 get_change_tracking_config,
26 get_classes_with_multiple_shapes,
27 get_custom_filter,
28 get_dataset_is_quadstore,
29 get_display_rules,
30 get_provenance_sparql,
31 get_shacl_graph,
32 get_sparql,
33)
34from heritrace.sparql import get_sparql_bindings
35from heritrace.utils.converters import convert_to_datetime
36from heritrace.utils.display_rules_utils import (
37 find_matching_rule,
38 get_highest_priority_class,
39 get_sortable_properties,
40 is_entity_type_visible,
41)
42from heritrace.utils.shacl_utils import (
43 determine_shape_for_classes,
44 determine_shape_for_entity_triples,
45)
46from heritrace.utils.virtuoso_utils import VIRTUOSO_EXCLUDED_GRAPHS, is_virtuoso
# Module-level memo used by get_available_classes(): the single key
# "available_classes" maps to None until the first computation, then to a
# (class entries, time.monotonic() timestamp) pair.
_cache: dict[str, tuple[list[dict[str, str | int]], float] | None] = {
    "available_classes": None
}
# Maximum age, in seconds, after which the cached class list is recomputed.
AVAILABLE_CLASSES_TTL_SECONDS = 60
def _parse_n3(value: str) -> Node:
    """Parse an N3-serialised term and insist that the result is an rdflib Node.

    Args:
        value: The N3 string handed to ``rdflib.util.from_n3``.

    Returns:
        The parsed term.

    Raises:
        TypeError: If ``from_n3`` returns something that is not a ``Node``.
    """
    term = from_n3(value)
    if isinstance(term, Node):
        return term
    msg = f"Cannot parse N3 value: {value}"
    raise TypeError(msg)
def n3_set_to_graph(
    n3_set: set[tuple[str, ...]],
    *,
    is_quadstore: bool,
) -> Graph | Dataset:
    """Materialise a set of N3-encoded statements as an rdflib container.

    Args:
        n3_set: Tuples of N3 strings; the first three items are read as
            subject, predicate and object, and (for quadstores) the fourth
            as the graph term.
        is_quadstore: When true a ``Dataset(default_union=True)`` is built
            from quads, otherwise a plain ``Graph`` from triples.

    Returns:
        The populated ``Dataset`` or ``Graph``.
    """
    if not is_quadstore:
        triple_graph = Graph()
        for terms in n3_set:
            # Only the first three positions are used for a triplestore.
            triple_graph.add(
                tuple(_parse_n3(terms[i]) for i in range(3))  # type: ignore[arg-type]
            )
        return triple_graph

    dataset = Dataset(default_union=True)
    for terms in n3_set:
        dataset.add(
            tuple(_parse_n3(terms[i]) for i in range(4))  # type: ignore[arg-type]
        )
    return dataset
def convert_to_rdflib_graphs(snapshots: dict, *, is_quadstore: bool) -> dict:
    """Convert every N3 snapshot in a nested mapping into an rdflib container.

    Args:
        snapshots: Mapping ``entity_uri -> {timestamp -> set of N3 tuples}``.
        is_quadstore: Forwarded to ``n3_set_to_graph`` to choose between
            ``Dataset`` and ``Graph``.

    Returns:
        A new mapping with the same keys whose leaves are the graphs produced
        by ``n3_set_to_graph``.
    """
    return {
        entity_uri: {
            timestamp: n3_set_to_graph(n3_set, is_quadstore=is_quadstore)
            for timestamp, n3_set in by_timestamp.items()
        }
        for entity_uri, by_timestamp in snapshots.items()
    }
def get_triples_from_graph(
    graph_or_dataset: Graph | Dataset,
    pattern: tuple[URIRef | None, URIRef | None, Node | None],
) -> Generator[tuple[Node, Node, Node]]:
    """
    Yield the triples matching ``pattern`` from either container type.

    A ``Dataset`` is queried through ``quads()`` and the graph component of
    each quad is dropped; anything else is queried through ``triples()``.

    Args:
        graph_or_dataset: Graph or Dataset instance
        pattern: Triple pattern tuple (s, p, o) where each can be None

    Returns:
        Generator of triples (s, p, o)
    """
    if not isinstance(graph_or_dataset, Dataset):
        # Plain graph: the triples iterator already has the right shape.
        yield from graph_or_dataset.triples(pattern)
        return

    # Quadstore: strip the context so callers always receive 3-tuples.
    for subj, pred, obj, _context in graph_or_dataset.quads(pattern):
        yield (subj, pred, obj)
# Cap used when counting or listing instances of a class. Read once at import
# time from the COUNT_LIMIT environment variable, defaulting to 10000.
COUNT_LIMIT = int(os.getenv("COUNT_LIMIT", "10000"))
def _wrap_virtuoso_graph_pattern(pattern: str) -> str:
    """Return ``pattern`` scoped to non-excluded named graphs on Virtuoso.

    When ``is_virtuoso()`` is false the pattern is returned untouched;
    otherwise it is wrapped in ``GRAPH ?g { ... }`` followed by a filter that
    rejects every graph listed in ``VIRTUOSO_EXCLUDED_GRAPHS``.
    """
    if not is_virtuoso():
        return pattern

    excluded_graphs = ">, <".join(VIRTUOSO_EXCLUDED_GRAPHS)
    return f"""
        GRAPH ?g {{
            {pattern}
        }}
        FILTER(?g NOT IN (<{excluded_graphs}>))
    """
136def _build_count_query_with_limit(class_uri: str, limit: int) -> str:
137 """Build a COUNT query with LIMIT for a specific class."""
139 return f"""
140 SELECT (COUNT(?subject) as ?count)
141 WHERE {{
142 {{
143 SELECT DISTINCT ?subject
144 WHERE {{
145 ?subject a <{class_uri}> .
146 }}
147 LIMIT {limit}
148 }}
149 }}
150 """
def _count_class_instances(class_uri: str, limit: int = COUNT_LIMIT) -> tuple:
    """
    Count the instances of ``class_uri``, saturating at ``limit``.

    Returns:
        tuple: ``(display_count, numeric_count)``. When more than ``limit``
        instances exist the pair is ``(f"{limit}+", limit)``; otherwise it is
        the exact count as a string and as an int.
    """
    endpoint = get_sparql()
    # Ask for one row beyond the limit so that "exactly limit" can be told
    # apart from "more than limit".
    endpoint.setQuery(_build_count_query_with_limit(class_uri, limit + 1))
    endpoint.setReturnFormat(JSON)
    rows = get_sparql_bindings(endpoint.query().convert())

    total = int(rows[0]["count"]["value"])
    return (str(total), total) if total <= limit else (f"{limit}+", limit)
def _get_entities_with_enhanced_shape_detection(
    class_uri: str, classes_with_multiple_shapes: set[str], limit: int = COUNT_LIMIT
) -> defaultdict[str, list[dict[str, str]]]:
    """
    Group up to ``limit`` instances of ``class_uri`` by their detected shape.

    Only classes listed in ``classes_with_multiple_shapes`` are processed;
    for any other class an empty mapping is returned without querying.

    Returns:
        Mapping ``shape_uri -> [{"uri", "class", "shape"}, ...]`` restricted
        to (class, shape) pairs that ``is_entity_type_visible`` accepts.
    """
    # Nothing to disambiguate unless this class really has several shapes.
    if not (
        classes_with_multiple_shapes and class_uri in classes_with_multiple_shapes
    ):
        return defaultdict(list)

    endpoint = get_sparql()

    # Step 1: a bounded list of subjects of the class.
    endpoint.setQuery(
        f"""
        SELECT DISTINCT ?subject
        WHERE {{
            ?subject a <{class_uri}> .
        }}
        LIMIT {limit}
    """
    )
    endpoint.setReturnFormat(JSON)
    subject_rows = get_sparql_bindings(endpoint.query().convert())
    subjects = [row["subject"]["value"] for row in subject_rows]
    if not subjects:
        return defaultdict(list)

    # Step 2: all triples of exactly those subjects, selected via VALUES.
    subjects_filter = " ".join([f"(<{s}>)" for s in subjects])
    pattern_with_filter = (
        f"?subject a <{class_uri}> . ?subject ?p ?o"
        f" . VALUES (?subject) {{ {subjects_filter} }}"
    )
    endpoint.setQuery(
        f"""
        SELECT ?subject ?p ?o
        WHERE {{
            {pattern_with_filter}
        }}
    """
    )
    endpoint.setReturnFormat(JSON)
    triple_rows = get_sparql_bindings(endpoint.query().convert())

    triples_by_subject = defaultdict(list)
    for row in triple_rows:
        subject_uri = row["subject"]["value"]
        triples_by_subject[subject_uri].append(
            (subject_uri, row["p"]["value"], row["o"]["value"])
        )

    # Step 3: let the SHACL helper pick a shape per entity and keep the
    # visible ones.
    entities_by_shape = defaultdict(list)
    for subject_uri, triples in triples_by_subject.items():
        shape_uri = determine_shape_for_entity_triples(triples)
        if not shape_uri:
            continue
        if is_entity_type_visible((class_uri, shape_uri)):
            entities_by_shape[shape_uri].append(
                {"uri": subject_uri, "class": class_uri, "shape": shape_uri}
            )

    return entities_by_shape
def get_classes_from_shacl_or_display_rules() -> list[str]:
    """Return the class URIs declared in SHACL, else in the display rules.

    Every ``sh:targetClass`` object in the SHACL graph is collected. Only if
    that yields nothing are the ``target.class`` values of the display rules
    used instead.
    """
    sh_target_class = URIRef("http://www.w3.org/ns/shacl#targetClass")
    found: set[str] = set()

    shacl_graph = get_shacl_graph()
    if shacl_graph:
        found.update(
            str(target_class)
            for shape in shacl_graph.subjects(sh_target_class, None, unique=True)
            for target_class in shacl_graph.objects(
                shape, sh_target_class, unique=True
            )
        )

    if found:
        return list(found)

    # Fallback: no SHACL target classes, so read them from the display rules.
    found.update(
        rule["target"]["class"]
        for rule in get_display_rules() or []
        if "target" in rule and "class" in rule["target"]
    )
    return list(found)
def _get_classes_from_config() -> list[str]:
    """Return configured class URIs, querying the store only if none exist."""
    return get_classes_from_shacl_or_display_rules() or _get_classes_from_sparql()
def _get_classes_from_sparql() -> list[str]:
    """Return every distinct ``rdf:type`` object found in the dataset.

    The triple pattern goes through ``_wrap_virtuoso_graph_pattern`` so that
    the Virtuoso-specific graph restriction is applied when relevant.
    """
    endpoint = get_sparql()
    class_pattern = _wrap_virtuoso_graph_pattern("?subject a ?class .")
    endpoint.setQuery(
        f"""
        SELECT DISTINCT ?class
        WHERE {{
            {class_pattern}
        }}
    """
    )
    endpoint.setReturnFormat(JSON)
    rows = get_sparql_bindings(endpoint.query().convert())
    return [row["class"]["value"] for row in rows]
def get_available_classes() -> list[dict[str, str | int]]:
    """Return the catalogue's class entries, memoised for a short time.

    The result is served from ``_cache`` while it is younger than
    ``AVAILABLE_CLASSES_TTL_SECONDS``. Otherwise it is rebuilt:

    * classes come from ``_get_classes_from_config`` and are counted with
      ``_count_class_instances``;
    * a class with several shapes contributes one entry per shape that has
      visible entities, others contribute a single entry if visible;
    * entries are sorted case-insensitively by label.

    Returns:
        A list of dicts with keys ``uri``, ``label``, ``count`` (display
        string), ``count_numeric`` and ``shape``. The cached list object
        itself is returned, not a copy.
    """
    cache_entry = _cache["available_classes"]
    if cache_entry is not None:
        cached_classes, computed_at = cache_entry
        if time.monotonic() - computed_at < AVAILABLE_CLASSES_TTL_SECONDS:
            return cached_classes

    custom_filter = get_custom_filter()

    counted_classes = []
    for uri in _get_classes_from_config():
        display_count, numeric_count = _count_class_instances(uri)
        counted_classes.append(
            {
                "uri": uri,
                "display_count": display_count,
                "numeric_count": numeric_count,
            }
        )
    # Most populated classes are processed first.
    counted_classes.sort(key=lambda entry: entry["numeric_count"], reverse=True)

    multi_shape_classes = get_classes_with_multiple_shapes()
    entries = []
    for counted in counted_classes:
        class_uri = counted["uri"]

        if not (multi_shape_classes and class_uri in multi_shape_classes):
            # Single-shape class: one entry carrying the count computed above.
            shape_uri = determine_shape_for_classes([class_uri])
            entity_key = (class_uri, shape_uri)
            if is_entity_type_visible(entity_key):
                entries.append(
                    {
                        "uri": class_uri,
                        "label": custom_filter.human_readable_class(entity_key),
                        "count": counted["display_count"],
                        "count_numeric": counted["numeric_count"],
                        "shape": shape_uri,
                    }
                )
            continue

        # Multi-shape class: one entry per shape, counted from the sample.
        entities_by_shape = _get_entities_with_enhanced_shape_detection(
            class_uri, multi_shape_classes, limit=COUNT_LIMIT
        )
        for shape_uri, entities in entities_by_shape.items():
            if not entities:
                continue
            entity_key = (class_uri, shape_uri)
            found = len(entities)
            entries.append(
                {
                    "uri": class_uri,
                    "label": custom_filter.human_readable_class(entity_key),
                    "count": f"{found}+" if found >= COUNT_LIMIT else str(found),
                    "count_numeric": found,
                    "shape": shape_uri,
                }
            )

    entries.sort(key=lambda entry: entry["label"].lower())
    _cache["available_classes"] = (entries, time.monotonic())
    return entries
363def build_sort_clause(
364 sort_property: str, entity_type: str, shape_uri: str | None = None
365) -> str:
366 """
367 Build a SPARQL sort clause based on the sortableBy configuration.
369 Args:
370 sort_property: The property to sort by
371 entity_type: The entity type URI
372 shape_uri: Optional shape URI for more specific sorting rules
374 Returns:
375 SPARQL sort clause or empty string
376 """
377 if not sort_property or not entity_type:
378 return ""
380 rule = find_matching_rule(entity_type, shape_uri)
382 if not rule or "sortableBy" not in rule:
383 return ""
385 sort_config = next(
386 (s for s in rule["sortableBy"] if s.get("property") == sort_property), None
387 )
389 if not sort_config:
390 return ""
392 return f"OPTIONAL {{ ?subject <{sort_property}> ?sortValue }}"
@dataclass(frozen=True, slots=True)
class CatalogQuery:
    """Immutable set of parameters describing one catalogue page request."""

    # Class URI whose entities are listed; None when no class is selected.
    selected_class: str | None
    # Page number; consumers compute the offset as (page - 1) * per_page.
    page: int
    # Number of entities returned per page.
    per_page: int
    # Property URI to sort by, if any.
    sort_property: str | None = None
    # Sort direction; defaults to ascending.
    sort_direction: str = "ASC"
    # Optional shape URI narrowing the selected class.
    selected_shape: str | None = None
405def _fetch_entity_labels(
406 subject_uris: list[str], entity_key: tuple[str | None, str | None]
407) -> list[str]:
408 if not subject_uris:
409 return []
410 custom_filter = get_custom_filter()
411 app = current_app._get_current_object() # type: ignore[attr-defined] # noqa: SLF001
413 def fetch_label(uri: str) -> str:
414 with app.app_context():
415 return custom_filter.human_readable_entity(uri, entity_key, None)
417 with ThreadPoolExecutor(max_workers=8) as executor:
418 return list(executor.map(fetch_label, subject_uris))
def _get_entities_with_shape_filtering(
    query: CatalogQuery,
) -> tuple[list[dict[str, str]], int]:
    """List entities of the selected class whose detected shape matches.

    A window of ``per_page * 5`` subjects starting at the page offset is
    fetched, their triples are loaded, and only subjects whose shape equals
    ``query.selected_shape`` are kept and labelled.

    Returns:
        ``(entities, total_count)`` where ``entities`` holds at most
        ``per_page`` ``{"uri", "label"}`` dicts and ``total_count`` is the
        number of matches found inside the fetched window.
    """
    endpoint = get_sparql()
    selected_class = query.selected_class
    selected_shape = query.selected_shape
    # NOTE(review): offset and limit apply before shape filtering, and
    # total_count below only reflects this window. Confirm that is intended.
    window_start = (query.page - 1) * query.per_page
    window_size = query.per_page * 5

    endpoint.setQuery(
        f"""
        SELECT DISTINCT ?subject
        WHERE {{
            ?subject a <{selected_class}> .
        }}
        LIMIT {window_size}
        OFFSET {window_start}
    """
    )
    endpoint.setReturnFormat(JSON)
    subject_rows = get_sparql_bindings(endpoint.query().convert())
    subjects = [row["subject"]["value"] for row in subject_rows]
    if not subjects:
        return [], 0

    subjects_filter = " ".join([f"(<{s}>)" for s in subjects])
    endpoint.setQuery(
        f"""
        SELECT ?subject ?p ?o
        WHERE {{
            ?subject a <{selected_class}> . ?subject ?p ?o . VALUES (?subject) {{
            {subjects_filter} }}
        }}
    """
    )
    endpoint.setReturnFormat(JSON)
    triple_rows = get_sparql_bindings(endpoint.query().convert())

    triples_by_subject = defaultdict(list)
    for row in triple_rows:
        subject_uri = row["subject"]["value"]
        triples_by_subject[subject_uri].append(
            (subject_uri, row["p"]["value"], row["o"]["value"])
        )

    matching_uris = []
    for subject_uri, triples in triples_by_subject.items():
        if determine_shape_for_entity_triples(list(triples)) == selected_shape:
            matching_uris.append(subject_uri)

    labels = _fetch_entity_labels(matching_uris, (selected_class, selected_shape))
    matches = [
        {"uri": uri, "label": label}
        for uri, label in zip(matching_uris, labels, strict=True)
    ]

    # Whatever the sort property is, the in-memory ordering is by label.
    if query.sort_property and query.sort_direction:
        descending = query.sort_direction.upper() == "DESC"
        matches.sort(key=lambda item: item["label"].lower(), reverse=descending)

    return matches[: query.per_page], len(matches)
def get_entities_for_class(
    query: CatalogQuery,
    available_classes: list[dict[str, str | int]],
) -> tuple[list[dict[str, str]], int]:
    """Return one page of labelled entities for the selected class.

    When a shape is selected and the class has multiple shapes, the work is
    delegated to ``_get_entities_with_shape_filtering``. Otherwise a single
    paginated SPARQL query is run and the total is read from the matching
    entry of ``available_classes``.

    Args:
        query: Page, sorting and class/shape selection.
        available_classes: Entries as produced by ``get_available_classes``;
            the one matching the class and shape supplies ``count_numeric``.

    Returns:
        ``(entities, total_count)`` where ``entities`` is a list of
        ``{"uri", "label"}`` dicts and ``total_count`` is 0 when no matching
        entry exists in ``available_classes``.

    Raises:
        ValueError: If ``query.selected_class`` is ``None``.
    """
    if query.selected_class is None:
        msg = "selected_class must not be None"
        raise ValueError(msg)
    sparql = get_sparql()
    classes_with_multiple_shapes = get_classes_with_multiple_shapes()

    selected_class: str = query.selected_class
    selected_shape = query.selected_shape
    page = query.page
    per_page = query.per_page
    sort_property = query.sort_property

    use_shape_filtering = (
        selected_shape and selected_class in classes_with_multiple_shapes
    )
    if use_shape_filtering:
        return _get_entities_with_shape_filtering(query)

    offset = (page - 1) * per_page
    sort_clause = ""
    order_clause = ""

    if sort_property:
        sort_clause = build_sort_clause(sort_property, selected_class, selected_shape)
        if sort_clause:
            # The direction is spliced into the query text, so only the two
            # valid SPARQL order modifiers are allowed through. Anything else
            # falls back to ascending instead of reaching the endpoint.
            requested_direction = (query.sort_direction or "").upper()
            sort_direction = "DESC" if requested_direction == "DESC" else "ASC"
            order_clause = f"ORDER BY {sort_direction}(?sortValue)"

    entities_query = f"""
        SELECT ?subject {"?sortValue" if sort_property else ""}
        WHERE {{
            ?subject a <{selected_class}> . {sort_clause}
        }}
        {order_clause}
        LIMIT {per_page}
        OFFSET {offset}
    """

    class_info = next(
        (
            c
            for c in available_classes
            if c["uri"] == selected_class and c.get("shape") == selected_shape
        ),
        None,
    )
    total_count = int(class_info["count_numeric"]) if class_info else 0

    sparql.setQuery(entities_query)
    sparql.setReturnFormat(JSON)
    entities_bindings = get_sparql_bindings(sparql.query().convert())

    shape = selected_shape or determine_shape_for_classes([selected_class])
    subject_uris = [result["subject"]["value"] for result in entities_bindings]
    labels = _fetch_entity_labels(subject_uris, (selected_class, shape))
    entities = [
        {"uri": uri, "label": label}
        for uri, label in zip(subject_uris, labels, strict=True)
    ]

    return entities, total_count
def get_catalog_data(
    query: CatalogQuery,
    available_classes: list[dict[str, str | int]],
) -> dict:
    """Assemble the payload describing one catalogue page.

    With no selected class the payload has no entities and zero counts.
    Otherwise the sortable properties of the class/shape are looked up, the
    first one is used as sort property when the query names none, and the
    entities are fetched via ``get_entities_for_class``.

    Returns:
        A dict with keys ``entities``, ``total_pages``, ``current_page``,
        ``per_page``, ``total_count``, ``sort_property``, ``sort_direction``,
        ``sortable_properties``, ``selected_class`` and ``selected_shape``.
    """
    entities = []
    total_count = 0
    sortable_properties = []
    effective_sort = query.sort_property

    if query.selected_class:
        sortable_properties = get_sortable_properties(
            (query.selected_class, query.selected_shape)
        )
        # Fall back to the first configured sortable property.
        if not effective_sort and sortable_properties:
            effective_sort = sortable_properties[0]["property"]

        entities, total_count = get_entities_for_class(
            CatalogQuery(
                selected_class=query.selected_class,
                page=query.page,
                per_page=query.per_page,
                sort_property=effective_sort,
                sort_direction=query.sort_direction,
                selected_shape=query.selected_shape,
            ),
            available_classes,
        )

    if total_count > 0:
        # Ceiling division without floats.
        total_pages = (total_count + query.per_page - 1) // query.per_page
    else:
        total_pages = 0

    return {
        "entities": entities,
        "total_pages": total_pages,
        "current_page": query.page,
        "per_page": query.per_page,
        "total_count": total_count,
        "sort_property": effective_sort,
        "sort_direction": query.sort_direction,
        "sortable_properties": sortable_properties,
        "selected_class": query.selected_class,
        "selected_shape": query.selected_shape,
    }
def fetch_data_graph_for_subject(subject: URIRef) -> Graph | Dataset:
    """Load every statement whose subject is ``subject`` from the dataset.

    The query shape depends on the backend: Virtuoso (graph-scoped, minus the
    excluded graphs), another quadstore (graph-scoped) or a plain
    triplestore. The result is a ``Dataset`` of quads for quadstores and a
    ``Graph`` of triples otherwise.
    """
    container = Dataset() if get_dataset_is_quadstore() else Graph()
    endpoint = get_sparql()

    if is_virtuoso():
        # Virtuoso needs the graph to be queried explicitly.
        excluded_graphs = ">, <".join(VIRTUOSO_EXCLUDED_GRAPHS)
        query = f"""
            SELECT ?predicate ?object ?g WHERE {{
                GRAPH ?g {{
                    <{subject}> ?predicate ?object.
                }}
                FILTER(?g NOT IN (<{excluded_graphs}>))
            }}
        """
    elif get_dataset_is_quadstore():
        # Any other quadstore: look in all named graphs.
        query = f"""
            SELECT ?predicate ?object ?g WHERE {{
                GRAPH ?g {{
                    <{subject}> ?predicate ?object.
                }}
            }}
        """
    else:
        # Regular triplestore.
        query = f"""
            SELECT ?predicate ?object WHERE {{
                <{subject}> ?predicate ?object.
            }}
        """

    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    rows = get_sparql_bindings(endpoint.query().convert())

    for row in rows:
        obj_data = row["object"]
        # NOTE(review): language tags are not read from the binding, and any
        # non-literal (including blank nodes) becomes a URIRef. Confirm this
        # matches what the rest of the pipeline expects.
        if obj_data["type"] not in {"literal", "typed-literal"}:
            value = URIRef(obj_data["value"])
        elif "datatype" in obj_data:
            value = Literal(obj_data["value"], datatype=URIRef(obj_data["datatype"]))
        else:
            # Omit explicit datatype to match Reader's import behavior
            value = Literal(obj_data["value"])

        # Store as quad or triple depending on the configured store type.
        if get_dataset_is_quadstore():
            graph_uri = URIRef(row["g"]["value"])
            predicate = URIRef(row["predicate"]["value"])
            container.add((subject, predicate, value, graph_uri))  # type: ignore[arg-type]
        else:
            container.add((subject, URIRef(row["predicate"]["value"]), value))

    return container
def parse_sparql_update(query: str) -> dict[str, list[tuple[Node, Node, Node]]]:
    """Split a SPARQL update into the triples it inserts and deletes.

    Only ``INSERT DATA`` and ``DELETE DATA`` operations are considered;
    quads are flattened to triples.

    Returns:
        A dict that may contain ``"Additions"`` and/or ``"Deletions"``, each
        a list of triples. A key is absent when no such triples were found.
    """
    operations = translateUpdate(parseUpdate(query)).algebra
    bucket_for_operation = {"DeleteData": "Deletions", "InsertData": "Additions"}
    modifications = {}

    def flatten_quads(
        quads: defaultdict[Node, list[tuple[Node, Node, Node]]],
    ) -> list[tuple[Node, Node, Node]]:
        # Drop the graph key and keep the plain (s, p, o) tuples.
        flattened = []
        for graph_triples in quads.values():
            for triple in graph_triples:
                flattened.append((triple[0], triple[1], triple[2]))
        return flattened

    for operation in operations:
        bucket = bucket_for_operation.get(operation.name)
        if bucket is None:
            continue
        if hasattr(operation, "quads") and operation.quads:
            changed = flatten_quads(operation.quads)
        else:
            changed = operation.triples
        if changed:
            modifications.setdefault(bucket, []).extend(changed)

    return modifications
def fetch_current_state_with_related_entities(
    provenance: dict,
) -> Graph | Dataset:
    """
    Merge the current data of every entity that appears in ``provenance``.

    Args:
        provenance (dict): Mapping whose keys are entity URIs (the main
            entity and its related entities); only the keys are used here.

    Returns:
        A ``Dataset`` for quadstores or a ``Graph`` otherwise, holding the
        statements returned by ``fetch_data_graph_for_subject`` for each key.
    """
    merged = Dataset() if get_dataset_is_quadstore() else Graph()

    for entity_uri in provenance:
        entity_graph = fetch_data_graph_for_subject(URIRef(entity_uri))
        if get_dataset_is_quadstore():
            statements = entity_graph.quads()  # type: ignore[union-attr]
        else:
            statements = entity_graph
        for statement in statements:
            merged.add(statement)  # type: ignore[call-overload]

    return merged
@dataclass(frozen=True, slots=True)
class DeletedEntitiesQuery:
    """Immutable set of parameters for listing deleted entities."""

    # Page number; the offset is computed as (page - 1) * per_page.
    page: int = 1
    # Number of entities returned per page.
    per_page: int = 50
    # Sort key: "deletionTime" or a key of each entity's "sort_values".
    sort_property: str = "deletionTime"
    # Sort direction; "DESC" (case-insensitive) sorts in reverse.
    sort_direction: str = "DESC"
    # Optional class URI used to filter the deleted entities.
    selected_class: str | None = None
    # Optional shape URI; derived from the class when left as None.
    selected_shape: str | None = None
def _filter_and_paginate_deleted_entities(
    deleted_entities: list[dict],
    query: DeletedEntitiesQuery,
    sortable_properties: list[dict[str, str]],
) -> tuple[
    list[dict],
    str | None,
    str | None,
    list[dict[str, str]],
    int,
]:
    """Sort, filter by class and slice the deleted-entity records.

    Note that ``deleted_entities`` is sorted in place and
    ``sortable_properties`` is extended in place when a class is selected.

    Returns:
        ``(page_items, selected_class, selected_shape, sortable_properties,
        total_count)`` where ``total_count`` is the number of records left
        after class filtering and before pagination.
    """

    def by_deletion_time(record: dict) -> str:
        return record["deletionTime"]

    def by_sort_value(record: dict) -> str:
        return record["sort_values"].get(query.sort_property, "").lower()

    descending = query.sort_direction.upper() == "DESC"
    sort_key = (
        by_deletion_time if query.sort_property == "deletionTime" else by_sort_value
    )
    deleted_entities.sort(key=sort_key, reverse=descending)

    selected_class = query.selected_class
    selected_shape = query.selected_shape
    candidates = deleted_entities
    if selected_class:
        # Resolve the shape from the class when the caller did not pick one.
        if selected_shape is None:
            selected_shape = determine_shape_for_classes([selected_class])
        sortable_properties.extend(
            get_sortable_properties((selected_class, selected_shape))
        )
        candidates = [
            record
            for record in deleted_entities
            if selected_class in record["entity_types"]
        ]

    start = (query.page - 1) * query.per_page
    return (
        candidates[start : start + query.per_page],
        selected_class,
        selected_shape,
        sortable_properties,
        len(candidates),
    )
def get_deleted_entities_with_filtering(
    query: DeletedEntitiesQuery,
) -> tuple[
    list[dict[str, str | list[str] | dict[str, str]]],
    list[dict[str, str | int]],
    str | None,
    str | None,
    list[dict[str, str]],
    int,
]:
    """Return one page of deleted entities plus the data needed to filter them.

    Deletion snapshots are read from the provenance endpoint, each one is
    expanded by ``process_deleted_entity`` in a process pool, and the results
    are counted per class, then sorted, filtered and paginated. When the
    query names no class, the first available class (by label) is selected.

    Returns:
        ``(entities, available_classes, selected_class, selected_shape,
        sortable_properties, total_count)``. If the provenance query returns
        no rows the result is ``([], [], None, None, [], 0)``.
    """
    sortable_properties = [
        {"property": "deletionTime", "displayName": "Deletion Time", "sortType": "date"}
    ]
    provenance_endpoint = get_provenance_sparql()
    custom_filter = get_custom_filter()

    # A snapshot with an invalidation time that no later snapshot derives
    # from is treated as the deletion snapshot of its entity.
    prov_query = """
    SELECT DISTINCT ?entity ?lastSnapshot ?deletionTime ?agent ?lastValidSnapshotTime
    WHERE {
        ?lastSnapshot a <http://www.w3.org/ns/prov#Entity> ;
            <http://www.w3.org/ns/prov#specializationOf> ?entity ;
            <http://www.w3.org/ns/prov#generatedAtTime> ?deletionTime ;
            <http://www.w3.org/ns/prov#invalidatedAtTime> ?invalidationTime ;
            <http://www.w3.org/ns/prov#wasDerivedFrom> ?lastValidSnapshot.

        ?lastValidSnapshot <http://www.w3.org/ns/prov#generatedAtTime>
            ?lastValidSnapshotTime .

        OPTIONAL { ?lastSnapshot <http://www.w3.org/ns/prov#wasAttributedTo> ?agent . }

        FILTER NOT EXISTS {
            ?laterSnapshot <http://www.w3.org/ns/prov#wasDerivedFrom> ?lastSnapshot .
        }
    }
    """
    provenance_endpoint.setQuery(prov_query)
    provenance_endpoint.setReturnFormat(JSON)
    rows = get_sparql_bindings(provenance_endpoint.query().convert())
    if not rows:
        return [], [], None, None, [], 0

    # One task per snapshot row, with never more workers than rows or CPUs.
    worker_count = max(1, min(os.cpu_count() or 4, len(rows)))
    deleted_entities = []
    with ProcessPoolExecutor(max_workers=worker_count) as pool:
        pending = [
            pool.submit(process_deleted_entity, row, sortable_properties)
            for row in rows
        ]
        for finished in as_completed(pending):
            record = finished.result()
            if record is not None:
                deleted_entities.append(record)

    occurrences = defaultdict(int)
    for record in deleted_entities:
        for type_uri in record["entity_types"]:
            occurrences[type_uri] += 1

    available_classes = []
    for class_uri, count in occurrences.items():
        label = custom_filter.human_readable_class(
            (class_uri, determine_shape_for_classes([class_uri]))
        )
        available_classes.append({"uri": class_uri, "label": label, "count": count})
    available_classes.sort(key=lambda entry: entry["label"].lower())

    effective_query = query
    if not query.selected_class and available_classes:
        # Default to the first class in label order.
        effective_query = DeletedEntitiesQuery(
            page=query.page,
            per_page=query.per_page,
            sort_property=query.sort_property,
            sort_direction=query.sort_direction,
            selected_class=available_classes[0]["uri"],
            selected_shape=query.selected_shape,
        )

    (
        page_items,
        selected_class,
        selected_shape,
        sortable_properties,
        total_count,
    ) = _filter_and_paginate_deleted_entities(
        deleted_entities, effective_query, sortable_properties
    )

    return (
        page_items,
        available_classes,
        selected_class,
        selected_shape,
        sortable_properties,
        total_count,
    )
def process_deleted_entity(result: dict, sortable_properties: list) -> dict | None:
    """
    Build the display record of one deleted entity from a provenance row.

    The entity's state at its last valid snapshot time is reconstructed with
    ``AgnosticEntity`` and used to derive its types, label and sort values.

    Returns:
        The record dict, or ``None`` when the entity is missing from the
        reconstructed state, has no highest-priority class, or has no
        visible type.

    Raises:
        AssertionError: If the snapshot time cannot be converted to a datetime.
    """
    change_tracking_config = get_change_tracking_config()
    custom_filter = get_custom_filter()

    entity_uri = result["entity"]["value"]
    snapshot_time = result["lastValidSnapshotTime"]["value"]

    history = AgnosticEntity(
        res=entity_uri,
        config=change_tracking_config,
        include_related_objects=True,
        include_merged_entities=True,
        include_reverse_relations=True,
    )
    raw_state, _, _ = history.get_state_at_time((snapshot_time, snapshot_time))
    state = convert_to_rdflib_graphs(raw_state, is_quadstore=get_dataset_is_quadstore())
    if entity_uri not in state:
        return None

    snapshot_dt = convert_to_datetime(snapshot_time)
    if snapshot_dt is None:
        msg = "last_valid_dt must not be None"
        raise AssertionError(msg)
    last_valid_state: Graph | Dataset = state[entity_uri][snapshot_dt.isoformat()]

    entity_ref = URIRef(entity_uri)
    entity_types = [
        str(type_node)
        for _, _, type_node in get_triples_from_graph(
            last_valid_state, (entity_ref, RDF.type, None)
        )
    ]
    main_type = get_highest_priority_class(entity_types)
    if not main_type:
        return None
    shape = determine_shape_for_classes([main_type])

    visible_types = []
    for type_uri in entity_types:
        if is_entity_type_visible((type_uri, determine_shape_for_classes([type_uri]))):
            visible_types.append(type_uri)
    if not visible_types:
        return None

    # First value of each sortable property, or "" when the entity has none.
    sort_values = {}
    for prop in sortable_properties:
        prop_uri = prop["property"]
        matches = get_triples_from_graph(
            last_valid_state, (entity_ref, URIRef(prop_uri), None)
        )
        sort_values[prop_uri] = next((str(obj) for _, _, obj in matches), "")

    entity_key = (main_type, shape)
    return {
        "uri": entity_uri,
        "deletionTime": result["deletionTime"]["value"],
        "deletedBy": custom_filter.format_agent_reference(
            result.get("agent", {}).get("value", "")
        ),
        "lastValidSnapshotTime": snapshot_time,
        "type": custom_filter.human_readable_predicate(main_type, entity_key),
        "label": custom_filter.human_readable_entity(
            entity_uri, entity_key, last_valid_state
        ),
        "entity_types": visible_types,
        "sort_values": sort_values,
    }
def find_orphaned_entities(
    subject: URIRef,
    entity_type: str,
    predicate: URIRef | None = None,
    object_value: str | None = None,
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    """Find entities that would be left orphaned by a deletion on ``subject``.

    Three scopes are supported:

    * ``predicate`` and ``object_value`` given: only that object is checked;
    * only ``predicate`` given: every object of that predicate is checked;
    * neither given: every entity linked from ``subject`` is checked.

    An entity counts as orphaned when nothing except ``subject`` references
    it and it has no outgoing link to a described entity. Entities typed with
    an intermediate-relation class (as declared in the display rules for
    ``entity_type``) are reported separately.

    Returns:
        ``(orphaned, intermediate_orphans)``, two lists of
        ``{"uri", "type"}`` dicts.
    """
    sparql = get_sparql()
    # Tolerate missing display rules instead of failing on iteration.
    display_rules = get_display_rules() or []

    intermediate_classes = set()
    for rule in display_rules:
        if (
            "target" in rule
            and "class" in rule["target"]
            and rule["target"]["class"] == entity_type
        ):
            for prop in rule.get("displayProperties", []):
                if "intermediateRelation" in prop:
                    intermediate_classes.add(prop["intermediateRelation"]["class"])
    intermediate_iris = ">, <".join(intermediate_classes)

    # Always anchor ?entity to the subject. Without this, a predicate passed
    # with no object_value would make the query look for orphans across the
    # whole dataset.
    if predicate and object_value:
        candidate_pattern = (
            f"<{subject}> <{predicate}> ?entity .\n"
            f"            FILTER(?entity = <{object_value}>)"
        )
    elif predicate:
        candidate_pattern = f"<{subject}> <{predicate}> ?entity ."
    else:
        candidate_pattern = f"<{subject}> ?p ?entity ."

    connected_filter = (
        f"FILTER(?connectedEntity != <{subject}>)" if not predicate else ""
    )

    orphan_query = f"""
        SELECT DISTINCT ?entity ?type
        WHERE {{
            {candidate_pattern}

            FILTER(isIRI(?entity))
            ?entity a ?type .

            # No incoming references from other entities
            FILTER NOT EXISTS {{
                ?other ?anyPredicate ?entity .
                FILTER(?other != <{subject}>)
            }}

            # No outgoing references to active entities
            FILTER NOT EXISTS {{
                ?entity ?outgoingPredicate ?connectedEntity .
                ?connectedEntity ?furtherPredicate ?furtherObject .
                {connected_filter}
            }}

            # Exclude intermediate relation entities
            FILTER(?type NOT IN (<{intermediate_iris}>))
        }}
    """

    # Query to find orphaned intermediate relations
    if predicate and object_value:
        intermediate_query = f"""
            SELECT DISTINCT ?entity ?type
            WHERE {{
                <{object_value}> a ?type .
                FILTER(?type IN (<{intermediate_iris}>))
                BIND(<{object_value}> AS ?entity)
            }}
        """
    else:
        # When the whole entity is being deleted, find every intermediate
        # entity connected to it in either direction.
        intermediate_query = f"""
            SELECT DISTINCT ?entity ?type
            WHERE {{
                # Find intermediate relations connected to the entity being deleted
                {{
                    <{subject}> ?p ?entity .
                    ?entity a ?type .
                    FILTER(?type IN (<{intermediate_iris}>))
                }} UNION {{
                    ?entity ?p <{subject}> .
                    ?entity a ?type .
                    FILTER(?type IN (<{intermediate_iris}>))
                }}
            }}
        """

    orphaned: list[dict[str, str]] = []
    intermediate_orphans: list[dict[str, str]] = []

    # Run both queries and collect their rows into the matching list.
    for query, result_list in [
        (orphan_query, orphaned),
        (intermediate_query, intermediate_orphans),
    ]:
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        query_bindings = get_sparql_bindings(sparql.query().convert())
        result_list.extend(
            {"uri": result["entity"]["value"], "type": result["type"]["value"]}
            for result in query_bindings
        )

    return orphaned, intermediate_orphans
def import_entity_graph(
    editor: Editor,
    subject: URIRef,
    max_depth: int = 5,
    *,
    include_referencing_entities: bool = False,
) -> Editor:
    """Import ``subject`` and the entities reachable from it into ``editor``.

    Args:
        editor: Editor whose ``import_entity`` is called for each entity.
        subject: Starting entity, treated as depth 1.
        max_depth: Deepest level (inclusive) that is still imported.
        include_referencing_entities: When true, entities that point at
            ``subject`` through any predicate other than ``rdf:type`` are
            imported first; they are not traversed further.

    Returns:
        The same ``editor`` instance.
    """
    imported_subjects: set[str] = set()
    subject_str = str(subject)

    if include_referencing_entities:
        sparql = get_sparql()

        if editor.dataset_is_quadstore:
            query = f"""
                SELECT DISTINCT ?s
                WHERE {{
                    GRAPH ?g {{
                        ?s ?p <{subject}> .
                    }}
                    FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>)
                }}
            """
        else:
            query = f"""
                SELECT DISTINCT ?s
                WHERE {{
                    ?s ?p <{subject}> .
                    FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>)
                }}
            """

        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        ref_bindings = get_sparql_bindings(sparql.query().convert())

        for result in ref_bindings:
            referencing_subject = result["s"]["value"]
            if (
                referencing_subject != subject_str
                and referencing_subject not in imported_subjects
            ):
                imported_subjects.add(referencing_subject)
                editor.import_entity(URIRef(referencing_subject))

    # Breadth-first traversal so each entity is visited at its minimal distance
    # from the subject: a depth-first walk would consume one level per hop along
    # ordering chains (e.g. oco:hasNext) and silently skip entities pushed beyond
    # max_depth, leaving them without provenance snapshots.
    queue: deque[tuple[str, int]] = deque([(subject_str, 1)])
    while queue:
        current_subject, current_depth = queue.popleft()
        if current_depth > max_depth or current_subject in imported_subjects:
            continue

        imported_subjects.add(current_subject)
        editor.import_entity(URIRef(current_subject))

        # Neighbours of a node at max_depth would sit beyond the limit and be
        # discarded, so skip the query for them.
        if current_depth >= max_depth:
            continue

        query = f"""
            SELECT ?p ?o
            WHERE {{
                <{current_subject}> ?p ?o .
                FILTER(isIRI(?o))
                FILTER(?p != <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>)
            }}
        """

        sparql = get_sparql()
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        inner_bindings = get_sparql_bindings(sparql.query().convert())

        for result in inner_bindings:
            neighbour = result["o"]["value"]
            # Entities already imported would be rejected when dequeued.
            if neighbour not in imported_subjects:
                queue.append((neighbour, current_depth + 1))

    return editor
def get_entity_types(subject_uri: str) -> list[str]:
    """Return the ``rdf:type`` values asserted for ``subject_uri``."""
    endpoint = get_sparql()
    endpoint.setQuery(
        f"""
        SELECT ?type WHERE {{
            <{subject_uri}> a ?type .
        }}
    """
    )
    endpoint.setReturnFormat(JSON)
    rows = get_sparql_bindings(endpoint.query().convert())
    return [row["type"]["value"] for row in rows]
1165def collect_referenced_entities(
1166 data: dict[str, str | dict | list] | list | str,
1167 existing_entities: set[str] | None = None,
1168) -> set[str]:
1169 """
1170 Recursively collect all URIs of existing entities referenced in the structured data.
1172 This function traverses the structured data to find explicit references to existing
1173 entities
1174 that need to be imported into the editor before calling preexisting_finished().
1176 Args:
1177 data: The structured data (can be dict, list, or string)
1178 existing_entities: Set to collect URIs (created if None)
1180 Returns:
1181 Set of URIs (strings) of existing entities that should be imported
1182 """
1184 if existing_entities is None:
1185 existing_entities = set()
1187 if isinstance(data, dict):
1188 if data.get("is_existing_entity") is True and "entity_uri" in data:
1189 existing_entities.add(str(data["entity_uri"]))
1191 # If it's an entity with entity_type, it's a new entity being created
1192 elif "entity_type" in data:
1193 properties = data.get("properties", {})
1194 if isinstance(properties, dict):
1195 for prop_values in properties.values():
1196 collect_referenced_entities(prop_values, existing_entities)
1197 else:
1198 for value in data.values():
1199 collect_referenced_entities(value, existing_entities)
1201 elif isinstance(data, list):
1202 for item in data:
1203 collect_referenced_entities(item, existing_entities)
1205 return existing_entities
def import_referenced_entities(
    editor: Editor,
    structured_data: dict[str, str | dict | list] | list | str,
) -> None:
    """Import into ``editor`` every existing entity referenced by the data.

    The import is best-effort: a ``SPARQLWrapperException``, ``OSError`` or
    ``ValueError`` raised for one entity is logged at debug level and the
    remaining entities are still processed.
    """
    for entity_uri in collect_referenced_entities(structured_data):
        try:
            editor.import_entity(URIRef(entity_uri))
        except (SPARQLWrapperException, OSError, ValueError):  # noqa: PERF203
            logging.getLogger(__name__).debug(
                "Failed to import referenced entity %s", entity_uri
            )