Coverage for src / time_agnostic_library / agnostic_query.py: 100%
688 statements
« prev ^ index » next coverage.py v7.13.3, created at 2026-03-21 11:54 +0000
« prev ^ index » next coverage.py v7.13.3, created at 2026-03-21 11:54 +0000
1#!/usr/bin/python
3# SPDX-FileCopyrightText: 2021-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
4#
5# SPDX-License-Identifier: ISC
8import atexit
9import json
10import os
11from concurrent.futures import ThreadPoolExecutor, as_completed
13from rdflib.plugins.sparql.parserutils import CompValue
14from rdflib.plugins.sparql.processor import prepareQuery
16from time_agnostic_library.agnostic_entity import (
17 AgnosticEntity,
18 _compose_update_queries,
19 _fast_parse_update,
20 _filter_timestamps_by_interval,
21 _parse_datetime,
22)
23from time_agnostic_library.prov_entity import ProvEntity
24from time_agnostic_library.sparql import Sparql, _binding_to_n3, _n3_to_binding
25from time_agnostic_library.support import convert_to_datetime
# Default location of the JSON configuration file, relative to the CWD.
CONFIG_PATH = "./config.json"

# Below this many work items the parallel helpers run sequentially:
# spinning up threads would cost more than it saves.
_PARALLEL_THRESHOLD = os.cpu_count() or 1

# Shared two-worker pool used to overlap pairs of independent SPARQL requests.
_IO_EXECUTOR = ThreadPoolExecutor(max_workers=2)
# Best-effort shutdown at interpreter exit; wait=False so exit is not blocked
# by still-running requests.
atexit.register(_IO_EXECUTOR.shutdown, wait=False)
def _run_in_parallel(worker_fn, args_list):
    """Yield ``worker_fn(*args)`` for every args tuple in *args_list*.

    Small batches (fewer items than ``_PARALLEL_THRESHOLD``) run sequentially
    on the calling thread, where thread startup would dominate. Larger
    batches fan out to a thread pool; results are then yielded in
    *completion* order, so callers must not rely on input order.

    Any exception raised by a worker propagates to the consumer via
    ``future.result()``.
    """
    if len(args_list) < _PARALLEL_THRESHOLD:
        for args in args_list:
            yield worker_fn(*args)
        return
    with ThreadPoolExecutor() as executor:
        # Submit everything up front, then stream results as they finish.
        # (The original kept a future -> index mapping that was never read;
        # a plain list is all as_completed needs.)
        pending = [executor.submit(worker_fn, *args) for args in args_list]
        for future in as_completed(pending):
            yield future.result()
def _reconstruct_entity_worker(entity, config, on_time, other_snapshots_flag):
    """Thread-pool worker: rebuild one entity's past state(s).

    Returns an ``(entity, graphs, other_snapshots)`` triple. With *on_time*
    set, *graphs* holds the state(s) inside that interval and
    *other_snapshots* the extra snapshot metadata (only populated when
    *other_snapshots_flag* is True). Without *on_time*, *graphs* is the full
    history and *other_snapshots* is empty.
    """
    agnostic_entity = AgnosticEntity(
        entity, config=config,
        # Restrict reconstruction to the entity itself: related, merged and
        # reverse material is handled separately by the query machinery.
        include_related_objects=False,
        include_merged_entities=False,
        include_reverse_relations=False,
    )
    if on_time:
        entity_graphs, _, other_snapshots = agnostic_entity.get_state_at_time(
            time=on_time, include_prov_metadata=other_snapshots_flag,
        )
        return entity, entity_graphs, other_snapshots
    entity_history = agnostic_entity.get_history(include_prov_metadata=True)
    # get_history returns (graphs, metadata); the metadata is not needed here.
    return entity, entity_history[0], {}
62def _sparql_values(uris: set[str]) -> str:
63 return " ".join(f"<{uri}>" for uri in uris)
66def _wrap_in_graph(body: str, is_quadstore: bool) -> str:
67 if is_quadstore:
68 return f"GRAPH ?g {{ {body} }}"
69 return body
def _batch_query_provenance_snapshots(entity_uris: set[str], config: dict) -> dict[str, list[dict]]:
    """Fetch every provenance snapshot of all *entity_uris* in one SELECT.

    Returns a dict mapping each entity URI to a (unsorted) list of
    ``{'time': str, 'updateQuery': str | None}`` entries; entities with no
    snapshots map to an empty list.
    """
    values = _sparql_values(entity_uris)
    body = f"""
    ?snapshot <{ProvEntity.iri_specialization_of}> ?entity;
        <{ProvEntity.iri_generated_at_time}> ?time.
    OPTIONAL {{ ?snapshot <{ProvEntity.iri_has_update_query}> ?updateQuery. }}
    VALUES ?entity {{ {values} }}
    """
    # Provenance may live in named graphs when the provenance store is a quad store.
    wrapped = _wrap_in_graph(body, config["provenance"]["is_quadstore"])
    query = f"SELECT ?entity ?time ?updateQuery WHERE {{ {wrapped} }}"
    results = Sparql(query, config).run_select_query()
    # Pre-seed so every requested entity appears in the output, even if empty.
    output: dict[str, list[dict]] = {uri: [] for uri in entity_uris}
    for binding in results['results']['bindings']:
        entity_uri = binding['entity']['value']
        entry = {
            'time': binding['time']['value'],
            # updateQuery is OPTIONAL in the pattern and may be absent.
            'updateQuery': binding['updateQuery']['value'] if 'updateQuery' in binding else None,
        }
        output[entity_uri].append(entry)
    return output
94def _sparql_filter_in(var: str, uris: set[str]) -> str:
95 return f"FILTER({var} IN ({', '.join(f'<{uri}>' for uri in uris)}))"
def _batch_query_dataset_triples(entity_uris: set[str], config: dict) -> dict[str, set[tuple]]:
    """Fetch the current triples of all *entity_uris* in one SELECT.

    Returns a dict mapping each entity URI to a set of N3 triples — or quads
    including the named graph when the dataset is a quad store. Entities with
    no data map to an empty set.
    """
    is_quadstore = config['dataset']['is_quadstore']
    body = f"?s ?p ?o. {_sparql_filter_in('?s', entity_uris)}"
    wrapped = _wrap_in_graph(body, is_quadstore)
    select_vars = "?s ?p ?o ?g" if is_quadstore else "?s ?p ?o"
    query = f"SELECT {select_vars} WHERE {{ {wrapped} }}"
    results = Sparql(query, config).run_select_query()
    # Pre-seed so every requested entity appears in the output, even if empty.
    output: dict[str, set[tuple]] = {uri: set() for uri in entity_uris}
    for binding in results['results']['bindings']:
        # Raw URI used for grouping; N3 forms stored in the quad itself.
        s_val = binding['s']['value']
        s = _binding_to_n3(binding['s'])
        p = _binding_to_n3(binding['p'])
        o = _binding_to_n3(binding['o'])
        if is_quadstore and 'g' in binding:
            output[s_val].add((s, p, o, _binding_to_n3(binding['g'])))
        else:
            output[s_val].add((s, p, o))
    return output
def _iter_versions_as_sets(
    prov_snapshots: list[dict],
    dataset_quads: set[tuple],
    relevant_times: set[str] | None = None,
) -> list[tuple[str, tuple]]:
    """Derive past entity states by rewinding provenance update queries.

    Starting from the current *dataset_quads*, walk the snapshots from newest
    to oldest, inverting each newer snapshot's update query (re-adding its
    deletions, removing its insertions) to obtain the state at each snapshot
    time. When *relevant_times* is given, only those timestamps are
    materialized and the walk stops as soon as all of them have been found.

    Returns ``[(normalized_timestamp, quads_tuple), ...]`` in newest-first order.
    """
    sorted_snaps = sorted(prov_snapshots, key=lambda x: _parse_datetime(x['time']), reverse=True)
    target_count = len(relevant_times) if relevant_times else None
    working = set(dataset_quads)
    results = []
    found = 0
    for i, snap in enumerate(sorted_snaps):
        if i > 0:
            # Rewind one step: invert the update query of the next-newer snapshot.
            prev_uq = sorted_snaps[i - 1]['updateQuery']
            if prev_uq is not None:
                for op_type, quads in _fast_parse_update(prev_uq):
                    if op_type == 'DeleteData':
                        # What the newer snapshot deleted existed before it.
                        for quad in quads:
                            working.add(quad)
                    elif op_type == 'InsertData':
                        # What it inserted did not exist before it.
                        for quad in quads:
                            working.discard(quad)
        if relevant_times is None or snap['time'] in relevant_times:
            normalized = str(convert_to_datetime(snap['time'], stringify=True))
            # Freeze the current working set; it keeps mutating on later steps.
            results.append((normalized, tuple(working)))
            found += 1
            if target_count is not None and found == target_count:
                break
    return results
def _reconstruct_at_time_as_sets(
    prov_snapshots: list[dict],
    dataset_quads: set[tuple],
    on_time: tuple[str | None, str | None],
) -> list[tuple[str, tuple]]:
    """Materialize the entity states generated inside the *on_time* interval.

    If no snapshot falls inside the interval, fall back to the newest
    snapshot generated at or before the interval start — the state still in
    force when the interval opened. Returns the same shape as
    ``_iter_versions_as_sets``, or ``[]`` when nothing applies.
    """
    if not prov_snapshots:
        return []
    sorted_snaps = sorted(prov_snapshots, key=lambda x: _parse_datetime(x['time']), reverse=True)
    relevant = _filter_timestamps_by_interval(
        on_time,
        # Adapt to the SPARQL-binding shape the filter helper expects.
        [{'time': {'value': s['time']}} for s in sorted_snaps],
        time_index='time',
    )
    if not relevant:
        interval_start = _parse_datetime(on_time[0]) if on_time[0] else None
        if interval_start:
            # Fallback: latest snapshot at or before the interval start.
            earlier = [s for s in sorted_snaps if _parse_datetime(s['time']) <= interval_start]
            if earlier:
                best = max(earlier, key=lambda x: _parse_datetime(x['time']))
                relevant = [{'time': {'value': best['time']}}]
            else:
                return []
        else:
            # Open-ended start with no snapshot in range: nothing to return.
            return []
    relevant_times = {r['time']['value'] for r in relevant}
    return _iter_versions_as_sets(prov_snapshots, dataset_quads, relevant_times)
def _match_single_pattern(triple_pattern: tuple, quads: tuple) -> list[dict]:
    """Match a single (s, p, o) pattern against *quads*.

    Constant pattern terms must equal the corresponding quad term exactly;
    each ``?var`` term yields an entry in the resulting binding dict
    (SPARQL-JSON value shape). Returns one binding dict per matching quad.
    """
    slots = list(enumerate(triple_pattern[:3]))
    constants = [(idx, term) for idx, term in slots if not term.startswith('?')]
    variables = [(idx, term[1:]) for idx, term in slots if term.startswith('?')]
    matches: list[dict] = []
    for quad in quads:
        if any(quad[idx] != term for idx, term in constants):
            continue
        matches.append({name: _n3_to_binding(quad[idx]) for idx, name in variables})
    return matches
def _merge_entity_bindings(entity_bindings: dict[str, dict[str, list[dict]]]) -> dict[str, list[dict]]:
    """Merge per-entity, per-timestamp bindings into one global timeline.

    Timestamps are visited chronologically; an entity with no bindings at a
    given timestamp carries its most recent earlier bindings forward, so each
    timestamp reflects the state of every entity, not only of those that
    changed at that moment.
    """
    all_timestamps: set[str] = set()
    for per_ts in entity_bindings.values():
        all_timestamps.update(per_ts.keys())
    sorted_timestamps = sorted(all_timestamps, key=_parse_datetime)
    result: dict[str, list[dict]] = {}
    # Most recent bindings seen so far for each entity (carried forward).
    last_known: dict[str, list[dict]] = {}
    for ts in sorted_timestamps:
        merged: list[dict] = []
        for entity_str, per_ts in entity_bindings.items():
            if ts in per_ts:
                last_known[entity_str] = per_ts[ts]
            if entity_str in last_known:
                merged.extend(last_known[entity_str])
        result[ts] = merged
    return result
def _batch_query_dm_provenance(entity_uris: set[str], config: dict) -> dict[str, list[dict]]:
    """Fetch delta-materialization provenance for all *entity_uris* at once.

    Like ``_batch_query_provenance_snapshots``, but also retrieves each
    snapshot's optional ``invalidatedAtTime``. Returns a dict mapping each
    entity URI to an (unsorted) list of
    ``{'time', 'updateQuery', 'invalidatedAtTime'}`` entries.
    """
    values = _sparql_values(entity_uris)
    query = f"""
    SELECT ?entity ?time ?updateQuery ?invalidatedAtTime
    WHERE {{
        ?se <{ProvEntity.iri_specialization_of}> ?entity;
            <{ProvEntity.iri_generated_at_time}> ?time.
        OPTIONAL {{
            ?se <{ProvEntity.iri_has_update_query}> ?updateQuery.
        }}
        OPTIONAL {{
            ?se <{ProvEntity.iri_invalidated_at_time}> ?invalidatedAtTime.
        }}
        VALUES ?entity {{ {values} }}
    }}
    """
    results = Sparql(query, config).run_select_query()
    # Pre-seed so every requested entity appears in the output, even if empty.
    output: dict[str, list[dict]] = {uri: [] for uri in entity_uris}
    for binding in results['results']['bindings']:
        entity_uri = binding['entity']['value']
        entry = {
            'time': binding['time']['value'],
            # Both fields are OPTIONAL in the pattern and may be absent.
            'updateQuery': binding['updateQuery']['value'] if 'updateQuery' in binding else None,
            'invalidatedAtTime': binding['invalidatedAtTime']['value'] if 'invalidatedAtTime' in binding else None,
        }
        output[entity_uri].append(entry)
    return output
def _build_delta_result(
    entity_str: str,
    snapshots: list[dict],
    on_time: tuple[str | None, str | None] | None,
    changed_properties: set[str],
) -> dict:
    """Summarize how *entity_str* changed within the optional *on_time* window.

    Returns ``{}`` when no snapshot falls in the window, otherwise
    ``{entity_str: {'created', 'deleted', 'additions', 'deletions'}}`` where
    additions/deletions are the net quad changes composed from the update
    queries inside the window, optionally restricted to *changed_properties*.

    NOTE(review): assumes *snapshots* is non-empty (indexing [0]/[-1] below)
    — confirm callers guarantee this.
    """
    output: dict[str, dict] = {}
    parsed_snaps = [(snap, _parse_datetime(snap['time'])) for snap in snapshots]
    parsed_snaps.sort(key=lambda x: x[1])  # oldest first
    after_dt = _parse_datetime(on_time[0]) if on_time and on_time[0] else None
    before_dt = _parse_datetime(on_time[1]) if on_time and on_time[1] else None
    creation_dt = parsed_snaps[0][1]
    update_queries: list[str] = []
    created = None
    has_relevant = False
    for snap, snap_dt in parsed_snaps:
        if after_dt and snap_dt < after_dt:
            continue
        if before_dt and snap_dt > before_dt:
            # Snapshots are sorted: nothing later can fall in the window.
            break
        has_relevant = True
        if snap_dt == creation_dt:
            # The very first snapshot marks entity creation, not an update.
            created = creation_dt.isoformat()
        elif snap['updateQuery']:
            update_queries.append(snap['updateQuery'])
    if not has_relevant:
        return output
    additions, deletions = _compose_update_queries(update_queries)
    if changed_properties:
        # Keep only quads whose predicate (index 1) is a requested property.
        prop_n3_set = {f"<{p}>" for p in changed_properties}
        additions = {q for q in additions if q[1] in prop_n3_set}
        deletions = {q for q in deletions if q[1] in prop_n3_set}
    last_snap = parsed_snaps[-1][0]
    # NOTE(review): 'deleted' reports the *generation* time of the last
    # snapshot whenever that snapshot carries an invalidatedAtTime — confirm
    # this is intended rather than using the invalidatedAtTime value itself.
    deleted = parsed_snaps[-1][1].isoformat() if last_snap['invalidatedAtTime'] else None
    output[entity_str] = {
        "created": created,
        "deleted": deleted,
        "additions": additions,
        "deletions": deletions,
    }
    return output
class AgnosticQuery:
    """Base machinery for time-agnostic SPARQL SELECT queries.

    Parses the query into triple patterns, reconstructs the past states of
    every entity those patterns can touch (via provenance snapshots), and
    materializes per-timestamp quad sets in ``self.relevant_graphs`` so the
    query can be answered against any version of the dataset.
    """

    # Full-text-index configuration; populated by __init_text_index from the
    # config. At most one indexing system may be enabled at a time.
    blazegraph_full_text_search: bool
    fuseki_full_text_search: bool
    virtuoso_full_text_search: bool
    graphdb_connector_name: str

    def __init__(self, query: str, on_time: tuple[str | None, str | None] | None = (None, None), other_snapshots: bool = False, config_path: str = CONFIG_PATH, config_dict: dict | None = None):
        """Load configuration, normalize the time interval and start reconstruction.

        :param query: the SPARQL SELECT string to run time-agnostically.
        :param on_time: optional (start, end) interval in ISO 8601; either
            edge may be None; passing None disables time filtering.
        :param other_snapshots: when True, also collect metadata about
            snapshots outside the requested interval.
        :param config_path: path of the JSON configuration file, used only
            when *config_dict* is None.
        :param config_dict: pre-loaded configuration; takes precedence over
            *config_path*.
        """
        self.query = query
        self.other_snapshots = other_snapshots
        self.config_path = config_path
        self.other_snapshots_metadata: dict = {}
        if config_dict is not None:
            self.config = config_dict
        else:
            with open(config_path, encoding="utf8") as json_file:
                self.config = json.load(json_file)
        self.__init_text_index(self.config)
        if on_time:
            # Normalize both interval edges to the library's canonical string form.
            after_time = convert_to_datetime(on_time[0], stringify=True)
            before_time = convert_to_datetime(on_time[1], stringify=True)
            self.on_time: tuple[str | None, str | None] | None = (after_time, before_time)  # type: ignore[assignment]
        else:
            self.on_time = None
        # URIs whose past states have already been rebuilt (avoids rework).
        self.reconstructed_entities: set[str] = set()
        # timestamp -> triples whose variables still need resolving.
        self.vars_to_explicit_by_time: dict = {}
        # entity URI -> {timestamp -> quad set} (per-entity view).
        self.relevant_entities_graphs: dict[str, dict[str, set]] = {}
        # timestamp -> merged quad set (global view used to answer the query).
        self.relevant_graphs: dict[str, set[tuple[str, ...]]] = {}
        self._rebuild_relevant_graphs()

    def __init_text_index(self, config: dict):
        """Validate the full-text-search settings in *config* and set them on self.

        :raises ValueError: for an unrecognized flag value, or when more than
            one indexing system is enabled simultaneously.
        """
        for full_text_search in ("blazegraph_full_text_search", "fuseki_full_text_search", "virtuoso_full_text_search"):
            ts_full_text_search: str = config[full_text_search]
            # NOTE(review): the bare ints 1 and 0 in these sets are unreachable
            # since str.lower() always returns a string — presumably leftovers.
            if ts_full_text_search.lower() in {"true", "1", 1, "t", "y", "yes", "ok"}:
                setattr(self, full_text_search, True)
            elif ts_full_text_search.lower() in {"false", "0", 0, "n", "f", "no"} or not ts_full_text_search:
                setattr(self, full_text_search, False)
            else:
                raise ValueError(f"Enter a valid value for '{full_text_search}' in the configuration file, for example 'yes' or 'no'.")
        self.graphdb_connector_name = config["graphdb_connector_name"]
        if len([index for index in [self.blazegraph_full_text_search, self.fuseki_full_text_search, self.virtuoso_full_text_search, self.graphdb_connector_name] if index]) > 1:
            raise ValueError("The use of multiple indexing systems simultaneously is currently not supported.")

    def _process_query(self) -> list[tuple[str, ...]]:
        """Parse ``self.query`` and return all triple patterns as N3 tuples.

        Side effects: populates ``self._optional_groups``,
        ``self._select_vars`` and ``self._mandatory_triples``.

        :raises ValueError: for non-SELECT queries or fully variable patterns.
        """
        # Parse the SPARQL string into an algebra tree via rdflib, then walk
        # the tree to extract triple patterns as N3 strings, separating
        # mandatory patterns from OPTIONAL groups.
        algebra = prepareQuery(self.query).algebra
        if algebra.name != "SelectQuery":
            raise ValueError("Only SELECT queries are allowed.")
        mandatory: list[tuple[str, ...]] = []
        self._optional_groups: list[list[tuple[str, ...]]] = []
        self._collect_patterns(algebra, mandatory)
        all_triples = list(mandatory)
        for group in self._optional_groups:
            all_triples.extend(group)
        # Reject triples made of only variables (e.g. ?s ?p ?o): they would
        # match every entity in the dataset, making the query too expensive.
        triples_without_hook = [t for t in all_triples if all(el.startswith('?') for el in t)]
        if triples_without_hook:
            raise ValueError("Could not perform a generic time agnostic query. Please, specify at least one URI or Literal within the query.")
        self._select_vars = [str(v) for v in algebra['PV']]
        self._mandatory_triples = mandatory
        return all_triples

    def _collect_patterns(self, node: CompValue, mandatory: list[tuple[str, ...]]) -> None:
        """Recursively walk the algebra tree, filling *mandatory* and
        ``self._optional_groups`` with N3 triple tuples."""
        name = node.name
        if name == 'LeftJoin':
            # OPTIONAL = left join: p1 (left, mandatory) must match, p2 (right,
            # optional) extends the binding if possible, otherwise it's ignored
            self._collect_patterns(node['p1'], mandatory)
            opt_group: list[tuple[str, ...]] = []
            self._collect_triples_flat(node['p2'], opt_group)
            if opt_group:
                self._optional_groups.append(opt_group)
        elif name == 'Join':
            # Both sides are mandatory (rdflib splits BGPs into Join nodes)
            self._collect_patterns(node['p1'], mandatory)
            self._collect_patterns(node['p2'], mandatory)
        elif 'triples' in node:
            # BGP leaf node: convert rdflib terms to N3 strings
            mandatory.extend(tuple(el.n3() for el in t) for t in node['triples'])
        else:
            # Transparent wrapper node (Project, Slice, ...): descend.
            for v in node.values():
                if isinstance(v, CompValue):
                    self._collect_patterns(v, mandatory)

    def _collect_triples_flat(self, node: CompValue, triples: list[tuple[str, ...]]) -> None:
        """Collect every triple under *node* without distinguishing OPTIONALs."""
        if 'triples' in node:
            triples.extend(tuple(el.n3() for el in t) for t in node['triples'])
        for v in node.values():
            if isinstance(v, CompValue):
                self._collect_triples_flat(v, triples)

    def _rebuild_relevant_graphs(self) -> None:
        """Reconstruct all entities referenced by the query's triple patterns.

        Isolated triples are additionally probed against provenance update
        queries to catch entities that matched only in the past; if any
        triple is not isolated, the variable-solving pass runs afterwards.
        """
        triples_checked = set()
        all_isolated = True
        self.triples = self._process_query()
        for triple in self.triples:
            if self._is_isolated(triple) and self._is_a_new_triple(triple, triples_checked):
                present_entities = self._get_present_entities(triple)
                self._rebuild_relevant_entity(triple[0])
                self._find_entities_in_update_queries(triple, present_entities)
            else:
                all_isolated = False
                self._rebuild_relevant_entity(triple[0])
            triples_checked.add(triple)
        self._align_snapshots()
        if not all_isolated:
            self._solve_variables()

    def _is_isolated(self, triple: tuple) -> bool:
        """Return True if *triple* shares no join chain with the rest of the query.

        A triple whose subject is a concrete IRI is never isolated, nor is one
        whose variables are reachable (transitively) from another triple.
        """
        if triple[0].startswith('<') and triple[0].endswith('>'):
            return False
        variables = [el for el in triple if el.startswith('?')]
        for variable in variables:
            other_triples = {t for t in self.triples if t != triple}
            if self._there_is_transitive_closure(variable, other_triples):
                return False
        return True

    def _there_is_transitive_closure(self, variable: str, triples: set[tuple]) -> bool:
        """Check whether *variable* appears as the object of a chain that is
        (transitively) anchored to a concrete IRI subject in *triples*."""
        there_is_transitive_closure = False
        for triple in triples:
            if variable in triple and triple.index(variable) == 2:
                if triple[0].startswith('<') and triple[0].endswith('>'):
                    return True
                elif triple[0].startswith('?'):
                    # Follow the chain through the variable subject.
                    other_triples = {t for t in triples if t != triple}
                    there_is_transitive_closure = self._there_is_transitive_closure(triple[0], other_triples)
        return there_is_transitive_closure

    def _rebuild_relevant_entity(self, entity_n3: str) -> None:
        """Reconstruct the entity named by the N3 IRI *entity_n3*, once.

        Variables and literals are ignored; already-reconstructed URIs are skipped.
        """
        if entity_n3.startswith('<') and entity_n3.endswith('>'):
            entity_uri = entity_n3[1:-1]
            if entity_uri not in self.reconstructed_entities:
                self.reconstructed_entities.add(entity_uri)
                result = self._reconstruct_entity_state(entity_uri)
                if result is not None:
                    self._merge_entity_result(entity_uri, *result)

    def _reconstruct_entity_state(self, entity_uri: str) -> tuple[dict, dict] | None:
        """Rebuild *entity_uri*'s state(s): interval-bounded when
        ``self.on_time`` is set, full history otherwise.

        Returns (graphs, other_snapshots_metadata).
        """
        agnostic_entity = AgnosticEntity(entity_uri, config=self.config, include_related_objects=False, include_merged_entities=False, include_reverse_relations=False)
        if self.on_time:
            entity_graphs, _, other_snapshots = agnostic_entity.get_state_at_time(time=self.on_time, include_prov_metadata=self.other_snapshots)
            return entity_graphs, other_snapshots
        entity_history = agnostic_entity.get_history(include_prov_metadata=True)
        return entity_history[0], {}

    def _merge_entity_result(self, entity_uri: str, entity_graphs: dict, other_snapshots: dict) -> None:
        """Fold one reconstruction result into the instance-level state.

        With ``self.on_time`` set, *entity_graphs* maps timestamps to quad
        sets; otherwise it is keyed by entity URI (get_history shape).
        """
        if other_snapshots:
            self.other_snapshots_metadata.update(other_snapshots)
        if self.on_time:
            if entity_graphs:
                for relevant_timestamp, quad_set in entity_graphs.items():
                    self.relevant_entities_graphs.setdefault(entity_uri, {})[relevant_timestamp] = quad_set
        else:
            if entity_graphs.get(entity_uri):
                self.relevant_entities_graphs.update(entity_graphs)

    def _get_present_entities(self, triple: tuple) -> set[str]:
        """Query the current dataset for the entity URIs matching *triple*.

        For an inverse predicate (``^``) the relevant entity sits in object
        position; otherwise it is the subject. Only URI bindings are kept.
        """
        variables = [el for el in triple if el.startswith('?')]
        if self.config['dataset']['is_quadstore']:
            # Exclude provenance named graphs from the match.
            query = f"SELECT {' '.join(variables)} WHERE {{GRAPH ?_g {{{triple[0]} {triple[1]} {triple[2]}}} FILTER(!CONTAINS(STR(?_g), '/prov/'))}}"
        else:
            query = f"SELECT {' '.join(variables)} WHERE {{{triple[0]} {triple[1]} {triple[2]}}}"
        results = Sparql(query, self.config).run_select_query()
        bindings = results['results']['bindings']
        if triple[1].startswith('^'):
            if triple[2].startswith('?'):
                var_name = triple[2][1:]
                return {b[var_name]['value'] for b in bindings if var_name in b and b[var_name]['type'] == 'uri'}
            # Concrete object with an inverse predicate: the object itself is the entity.
            return {triple[2][1:-1]} if bindings else set()
        var_name = triple[0][1:]
        return {b[var_name]['value'] for b in bindings if var_name in b and b[var_name]['type'] == 'uri'}

    def _is_a_new_triple(self, triple: tuple, triples_checked: set) -> bool:
        """Return True if *triple* mentions at least one IRI not already
        covered by a previously checked triple (avoids duplicate lookups)."""
        uris_in_triple = {el for el in triple if el.startswith('<') and el.endswith('>')}
        for triple_checked in triples_checked:
            uris_in_triple_checked = {el for el in triple_checked if el.startswith('<') and el.endswith('>')}
            if not uris_in_triple.difference(uris_in_triple_checked):
                return False
        return True

    def _get_query_to_update_queries(self, triple: tuple) -> str:
        """Build the provenance query that finds update queries mentioning
        every IRI of *triple*, using the configured full-text index."""
        uris_n3 = {el for el in triple if el.startswith('<') and el.endswith('>')}
        return self.get_full_text_search(uris_n3)

    def get_full_text_search(self, uris_in_triple: set) -> str:
        """Return a SELECT over ?updateQuery matching all *uris_in_triple*,
        phrased for whichever full-text index is configured (or plain
        FILTER CONTAINS when none is)."""
        # Strip angle brackets: the indexes search raw URI strings.
        uris_in_triple = {el[1:-1] if el.startswith('<') and el.endswith('>') else el for el in uris_in_triple}
        if self.blazegraph_full_text_search:
            query_to_identify = f'''
                PREFIX bds: <http://www.bigdata.com/rdf/search#>
                SELECT ?updateQuery
                WHERE {{
                    ?snapshot <{ProvEntity.iri_has_update_query}> ?updateQuery.
                    ?updateQuery bds:search "{' '.join(uris_in_triple)}";
                        bds:matchAllTerms 'true'.
                }}
            '''
        elif self.fuseki_full_text_search:
            # Jena text index: quoted terms joined with AND.
            query_obj = '\\" AND \\"'.join(uris_in_triple)
            query_to_identify = f'''
                PREFIX text: <http://jena.apache.org/text#>
                SELECT ?updateQuery WHERE {{
                    ?se text:query "\\"{query_obj}\\"";
                        <{ProvEntity.iri_has_update_query}> ?updateQuery.
                }}
            '''
        elif self.virtuoso_full_text_search:
            query_obj = "' AND '".join(uris_in_triple)
            query_to_identify = f'''
                PREFIX bif: <bif:>
                SELECT ?updateQuery
                WHERE {{
                    ?snapshot <{ProvEntity.iri_has_update_query}> ?updateQuery.
                    ?updateQuery bif:contains "'{query_obj}'".
                }}
            '''
        elif self.graphdb_connector_name:
            # NOTE(review): 'all' shadows the builtin; kept unchanged here.
            all = '\"'
            con_queries = f"con:query '{all}" + f"{all}'; con:query '{all}".join(uris_in_triple) + f"{all}'"
            query_to_identify = f'''
                PREFIX con: <http://www.ontotext.com/connectors/lucene#>
                PREFIX con-inst: <http://www.ontotext.com/connectors/lucene/instance#>
                SELECT ?updateQuery
                WHERE {{
                    ?snapshot <{ProvEntity.iri_has_update_query}> ?updateQuery.
                    [] a con-inst:{self.graphdb_connector_name};
                        {con_queries};
                        con:entities ?snapshot.
                }}
            '''
        else:
            # No index configured: fall back to (slow) substring filters.
            query_to_identify = f'''
                SELECT ?updateQuery
                WHERE {{
                    ?snapshot <{ProvEntity.iri_has_update_query}> ?updateQuery.
                    {').'.join([f"FILTER CONTAINS (?updateQuery, '{uri}'" for uri in uris_in_triple])}).
                }}
            '''
        return query_to_identify

    def _find_entity_uris_in_update_queries(self, triple: tuple, entities: set) -> None:
        """Add to *entities* (in place) the subjects of past quads matching *triple*.

        Without a full-text index, the store itself joins snapshots to update
        queries; with one, the matching update queries are fetched and parsed
        client-side.
        """
        uris_n3 = {el for el in triple if el.startswith('<') and el.endswith('>')}
        uris_str = {el[1:-1] for el in uris_n3}
        if not any([self.blazegraph_full_text_search, self.fuseki_full_text_search,
                    self.virtuoso_full_text_search, self.graphdb_connector_name]):
            filter_clauses = ".".join(
                f"FILTER CONTAINS (?uq, '{uri}')" for uri in uris_str
            )
            query = f"""
                SELECT ?entity WHERE {{
                    ?snapshot <{ProvEntity.iri_specialization_of}> ?entity;
                        <{ProvEntity.iri_has_update_query}> ?uq.
                    {filter_clauses}.
                }}
            """
            results = Sparql(query, self.config).run_select_query()
            for binding in results['results']['bindings']:
                entities.add(binding['entity']['value'])
            return
        query_to_identify = self._get_query_to_update_queries(triple)
        results = Sparql(query_to_identify, self.config).run_select_query()
        for binding in results['results']['bindings']:
            uq = binding.get('updateQuery')
            if uq and uq.get('value'):
                # Parse each update query and keep subjects of quads that
                # contain every IRI of the triple being matched.
                for _, quads in _fast_parse_update(uq['value']):
                    for quad in quads:
                        quad_uris = {el for el in quad[:3] if el.startswith('<') and el.endswith('>')}
                        if uris_n3.issubset(quad_uris):
                            entities.add(quad[0][1:-1])

    def _find_entities_in_update_queries(self, triple: tuple, present_entities: set | None = None):
        """Reconstruct every entity matching *triple* now or in the past.

        Past matches are discovered through provenance update queries and
        merged with *present_entities*; reconstruction fans out to a thread
        pool for large batches.
        """
        if present_entities is None:
            present_entities = set()
        relevant_entities_found = present_entities
        self._find_entity_uris_in_update_queries(triple, relevant_entities_found)
        if relevant_entities_found:
            args_list = [
                (entity_uri, self.config, self.on_time, self.other_snapshots)
                for entity_uri in relevant_entities_found
            ]
            for result in _run_in_parallel(_reconstruct_entity_worker, args_list):
                if result is not None:
                    entity, entity_graphs, other_snapshots = result
                    self.reconstructed_entities.add(entity)
                    self._merge_entity_result(entity, entity_graphs, other_snapshots)

    def _solve_variables(self) -> None:
        """Iteratively make query variables explicit until none remain or no
        further progress can be made."""
        self.vars_to_explicit_by_time = {}
        self._get_vars_to_explicit_by_time()
        while self._there_are_variables():
            solved_variables = self._explicit_solvable_variables()
            self._align_snapshots()
            if not solved_variables:
                # No progress this round: stop to avoid an infinite loop.
                return
            self._update_vars_to_explicit(solved_variables)
            self._get_vars_to_explicit_by_time()

    def _there_are_variables(self) -> bool:
        """Return True while any pending triple still contains a variable."""
        for triples in self.vars_to_explicit_by_time.values():
            for triple in triples:
                if any(el.startswith('?') for el in triple):
                    return True
        return False

    def _explicit_solvable_variables(self) -> dict:
        """Resolve triples with exactly one variable, in object position.

        Values come from the already-materialized graphs; each newly
        discovered IRI is reconstructed (in parallel) so later rounds can
        continue solving. Returns ``{timestamp: {variable: {solved triples}}}``.
        """
        explicit_triples: dict[str, dict[str, set]] = {}
        for se, triples in self.vars_to_explicit_by_time.items():
            for triple in triples:
                variables = [el for el in triple if el.startswith('?')]
                if len(variables) == 1:
                    variable = variables[0]
                    variable_index = triple.index(variable)
                    if variable_index == 2:
                        matching = [q for q in self.relevant_graphs[se]
                                    if q[0] == triple[0] and q[1] == triple[1]]
                        query_results = [(triple[0], triple[1], q[2]) for q in matching]
                        for row in query_results:
                            explicit_triples.setdefault(se, {})
                            explicit_triples[se].setdefault(variable, set())
                            explicit_triples[se][variable].add(row)
                        # Reconstruct any newly discovered IRI objects.
                        args_list = [
                            (row[2][1:-1], self.config, self.on_time, self.other_snapshots)
                            for row in query_results
                            if row[2].startswith('<') and row[2].endswith('>') and row[2][1:-1] not in self.reconstructed_entities
                        ]
                        for result_data in _run_in_parallel(_reconstruct_entity_worker, args_list):
                            if result_data is not None:
                                entity, entity_graphs, other_snapshots = result_data
                                self.reconstructed_entities.add(entity)
                                self._merge_entity_result(entity, entity_graphs, other_snapshots)
        return explicit_triples

    def _align_snapshots(self) -> None:
        """Merge per-entity graphs into ``self.relevant_graphs`` and carry
        unchanged entities' quads forward across timestamps.

        After merging, timestamps are walked chronologically; an entity that
        has no snapshot of its own at a timestamp inherits its quads from the
        previous timestamp, so every timestamp holds the full picture.
        """
        for snapshots in self.relevant_entities_graphs.values():
            for snapshot, quad_set in snapshots.items():
                if snapshot in self.relevant_graphs:
                    self.relevant_graphs[snapshot].update(quad_set)
                else:
                    self.relevant_graphs[snapshot] = set(quad_set)
        if len(self.relevant_graphs) <= 1:
            # Nothing to carry forward with zero or one timestamp.
            return
        ordered_data = sorted(
            self.relevant_graphs.items(),
            key=lambda x: _parse_datetime(x[0]),
        )
        for index, (se, quad_set) in enumerate(ordered_data):
            if index > 0:
                previous_se = ordered_data[index - 1][0]
                prev_subjects = {q[0] for q in self.relevant_graphs[previous_se]}
                cur_subjects = {q[0] for q in quad_set}
                for subject_n3 in prev_subjects:
                    subject_uri = subject_n3[1:-1] if subject_n3.startswith('<') else subject_n3
                    # Carry forward only entities we track that did not change
                    # at this timestamp (no snapshot of their own here).
                    if subject_n3 not in cur_subjects and subject_uri in self.relevant_entities_graphs and se not in self.relevant_entities_graphs[subject_uri]:
                        for quad in self.relevant_graphs[previous_se]:
                            if quad[0] == subject_n3:
                                self.relevant_graphs[se].add(quad)

    def _update_vars_to_explicit(self, solved_variables: dict):
        """Substitute *solved_variables* into the pending triples, producing
        the next round's (partially) explicit triple sets per timestamp."""
        vars_to_explicit_by_time: dict = {}
        for se, triples in self.vars_to_explicit_by_time.items():
            vars_to_explicit_by_time.setdefault(se, set())
            new_triples = set()
            for triple in triples:
                if se in solved_variables:
                    for solved_var, solved_triples in solved_variables[se].items():
                        if solved_var in triple:
                            for solved_triple in solved_triples:
                                new_triple = None
                                if solved_triple[0] != triple[0] and solved_triple[1] == triple[1]:
                                    # Same predicate but different subject: not this triple's solution.
                                    continue
                                elif solved_triple[0] == triple[0] and solved_triple[1] == triple[1]:
                                    # Exact pattern solved: adopt it as-is.
                                    new_triple = solved_triple
                                else:
                                    # The solved object becomes this triple's subject (chaining).
                                    new_triple = (solved_triple[2], triple[1], triple[2])
                                new_triples.add(new_triple)
                        elif not any(el.startswith('?') for el in triple) or not any(var for var in solved_variables[se] if var in triple):
                            # Fully explicit already, or untouched by this round's solutions.
                            new_triples.add(triple)
            vars_to_explicit_by_time[se] = new_triples
        self.vars_to_explicit_by_time = vars_to_explicit_by_time

    def _get_vars_to_explicit_by_time(self) -> None:
        """Seed each new timestamp in ``self.relevant_graphs`` with the triples
        whose variables still need resolving (computed once, then shared)."""
        relevant_triples = None
        for se in self.relevant_graphs:
            if se not in self.vars_to_explicit_by_time:
                if relevant_triples is None:
                    # Lazily compute the triple set once; reused for every timestamp.
                    relevant_triples = set()
                    for triple in self.triples:
                        if any(el for el in triple if el.startswith('?') and not self._is_a_dead_end(el, triple)) and not self._is_isolated(triple):
                            relevant_triples.add(triple)
                self.vars_to_explicit_by_time[se] = set(relevant_triples)

    def _is_a_dead_end(self, el: str, triple: tuple) -> bool:
        """A variable in object position that never reappears as a subject is
        a dead end: nothing further can be solved through it."""
        return el.startswith('?') and triple.index(el) == 2 and not any(t for t in self.triples if el in t if t.index(el) == 0)
688class VersionQuery(AgnosticQuery):
689 """
690 This class allows time-travel queries, both on a single version and all versions of the dataset.
692 :param query: The SPARQL query string.
693 :type query: str
694 :param on_time: If you want to query a specific version, specify the time interval here. The format is (START, END). If one of the two values is None, only the other is considered. Dates must be in ISO 8601 format.
695 :type on_time: Tuple[Union[str, None]], optional
696 :param config_path: The path to the configuration file.
697 :type config_path: str, optional
698 """
    def __init__(self, query: str, on_time: tuple[str | None, str | None] | None = None, other_snapshots: bool = False, config_path: str = CONFIG_PATH, config_dict: dict | None = None):
        """Initialize the streaming result store, then run the base-class setup.

        See the class docstring for parameter semantics; ``_streaming_results``
        (timestamp -> SELECT bindings) must exist before the base __init__
        triggers _rebuild_relevant_graphs.
        """
        self._streaming_results: dict[str, list[dict]] = {}
        super().__init__(query, on_time, other_snapshots, config_path, config_dict)
    def _rebuild_relevant_graphs(self) -> None:
        """Choose a reconstruction strategy based on the query shape.

        - Single isolated pattern on a bounded interval (no inverse predicate,
          no extra snapshot metadata): batch fast path.
        - Any non-isolated pattern: generic base-class machinery, then convert
          its per-timestamp graphs into streaming bindings.
        - Otherwise: per-entity streaming reconstruction.
        """
        self.triples = self._process_query()
        if self.on_time is not None:
            if (len(self.triples) == 1
                    and self._is_isolated(self.triples[0])
                    and not self.triples[0][1].startswith('^')
                    and not self.other_snapshots):
                self._rebuild_vm_batch()
                return
            super()._rebuild_relevant_graphs()
            return
        if not all(self._is_isolated(t) for t in self.triples):
            # Interconnected patterns need the full variable-solving pipeline;
            # derive SELECT bindings from the materialized graphs afterwards.
            super()._rebuild_relevant_graphs()
            self._streaming_results = {
                str(convert_to_datetime(ts, stringify=True)): self._extract_bindings(g)
                for ts, g in self.relevant_graphs.items()
            }
            return
        self._rebuild_streaming()
    def _discover_entities_parallel(self, triple: tuple) -> set[str]:
        """Collect entity URIs matching *triple* now or in any past version.

        Runs the current-dataset lookup and the provenance (update-query)
        scan concurrently on the shared I/O pool and unions the results.
        """
        fut_present = _IO_EXECUTOR.submit(self._get_present_entities, triple)
        entities_set: set = set()
        # The provenance helper mutates entities_set in place and returns None.
        fut_prov = _IO_EXECUTOR.submit(self._find_entity_uris_in_update_queries, triple, entities_set)
        present_entities = fut_present.result()
        fut_prov.result()
        all_entities = set(present_entities)
        all_entities.update(entities_set)
        return all_entities
    def _rebuild_vm_batch(self) -> None:
        """Fast path for a single isolated pattern on a bounded interval.

        Discovers candidate entities, batch-fetches their provenance and
        current triples in two concurrent SELECTs, rewinds each entity to the
        timestamps inside ``self.on_time``, matches the pattern per version,
        and merges the per-entity bindings into ``self._streaming_results``.
        """
        assert self.on_time is not None
        triple = self.triples[0]
        all_entity_strs = self._discover_entities_parallel(triple)
        if not all_entity_strs:
            return
        # Overlap the two batch SELECTs on the shared I/O pool.
        fut_prov = _IO_EXECUTOR.submit(_batch_query_provenance_snapshots, all_entity_strs, self.config)
        fut_data = _IO_EXECUTOR.submit(_batch_query_dataset_triples, all_entity_strs, self.config)
        prov_data = fut_prov.result()
        dataset_data = fut_data.result()
        entity_bindings: dict[str, dict[str, list[dict]]] = {}
        for entity_str in all_entity_strs:
            per_ts: dict[str, list[dict]] = {}
            for ts, quad_set in _reconstruct_at_time_as_sets(
                prov_data[entity_str], dataset_data[entity_str], self.on_time,
            ):
                per_ts[ts] = _match_single_pattern(triple, quad_set)
            entity_bindings[entity_str] = per_ts
        self._streaming_results = _merge_entity_bindings(entity_bindings)
    def _extract_bindings(self, quads: set[tuple[str, ...]]) -> list[dict]:
        """Evaluate the parsed query patterns directly against *quads*,
        returning deduplicated SPARQL-JSON-style bindings for the SELECT
        variables."""
        # Match the SPARQL query patterns against the quad set.
        # Phase 1: mandatory triples. Start with an empty binding and for each
        # pattern keep only the quads that are compatible with what was already
        # matched.
        bindings: list[dict[str, str]] = [{}]
        for pattern in self._mandatory_triples:
            new_bindings: list[dict[str, str]] = []
            for binding in bindings:
                for quad in quads:
                    new_binding = self._try_match(pattern, quad, binding)
                    if new_binding is not None:
                        new_bindings.append(new_binding)
            bindings = new_bindings
        # Phase 2: OPTIONAL groups. Try to extend each binding, but if nothing
        # matches, keep the binding as-is (no data is lost).
        for opt_group in self._optional_groups:
            bindings = self._left_join(bindings, opt_group, quads)
        # Phase 3: project to SELECT variables and deduplicate.
        seen: set[frozenset] = set()
        result: list[dict] = []
        for b in bindings:
            projected_n3: dict[str, str] = {}
            for var in self._select_vars:
                # Internal bindings are keyed '?var'; projection drops the '?'.
                key = '?' + var
                if key in b:
                    projected_n3[var] = b[key]
            frozen = frozenset(projected_n3.items())
            if frozen not in seen:
                seen.add(frozen)
                result.append({var: _n3_to_binding(val) for var, val in projected_n3.items()})
        return result
786 def _left_join(self, bindings: list[dict], opt_triples: list[tuple], quads: set[tuple[str, ...]]) -> list[dict]:
787 # For each binding, try to add values from the optional patterns.
788 # If a quad matches, the binding grows with new variables.
789 # If nothing matches, the binding is kept unchanged.
790 result: list[dict] = []
791 for binding in bindings:
792 extended: list[dict] = [dict(binding)]
793 for pattern in opt_triples:
794 new_extended: list[dict] = []
795 for b in extended:
796 matched = False
797 for quad in quads:
798 new_b = self._try_match(pattern, quad, b)
799 if new_b is not None:
800 new_extended.append(new_b)
801 matched = True
802 if not matched:
803 new_extended.append(b)
804 extended = new_extended
805 result.extend(extended)
806 return result
808 @staticmethod
809 def _try_match(pattern: tuple, quad: tuple, binding: dict) -> dict | None:
810 # Check if a triple pattern (s, p, o) matches a quad.
811 new_binding = dict(binding)
812 for expected, actual in zip(pattern[:3], quad[:3], strict=True):
813 is_variable = expected.startswith('?')
814 if is_variable and expected in new_binding:
815 # Variable already bound: check consistency
816 if new_binding[expected] != actual:
817 return None
818 elif is_variable:
819 # New variable: bind it
820 new_binding[expected] = actual
821 elif expected != actual:
822 # Fixed term: must match exactly
823 return None
824 return new_binding
826 def _rebuild_streaming(self) -> None:
827 triples_checked = set()
828 all_entity_strs: set[str] = set()
829 use_fast_path = (
830 len(self.triples) == 1
831 and self._is_isolated(self.triples[0])
832 and not self.triples[0][1].startswith('^')
833 )
834 for triple in self.triples:
835 if self._is_a_new_triple(triple, triples_checked):
836 present_entities = self._get_present_entities(triple)
837 prov_entities: set = set()
838 self._find_entity_uris_in_update_queries(triple, prov_entities)
839 all_entity_strs.update(present_entities)
840 all_entity_strs.update(prov_entities)
841 triples_checked.add(triple)
842 if not all_entity_strs:
843 self._streaming_results = {}
844 return
845 if use_fast_path:
846 fut_prov = _IO_EXECUTOR.submit(_batch_query_provenance_snapshots, all_entity_strs, self.config)
847 fut_data = _IO_EXECUTOR.submit(_batch_query_dataset_triples, all_entity_strs, self.config)
848 prov_data = fut_prov.result()
849 dataset_data = fut_data.result()
850 triple = self.triples[0]
851 entity_bindings: dict[str, dict[str, list[dict]]] = {}
852 for entity_str in all_entity_strs:
853 per_ts: dict[str, list[dict]] = {}
854 for ts, quad_set in _iter_versions_as_sets(prov_data[entity_str], dataset_data[entity_str]):
855 per_ts[ts] = _match_single_pattern(triple, quad_set)
856 entity_bindings[entity_str] = per_ts
857 else:
858 entity_bindings = {}
859 for entity_str in all_entity_strs:
860 ae = AgnosticEntity(entity_str, config=self.config, include_related_objects=False, include_merged_entities=False, include_reverse_relations=False)
861 per_ts = {}
862 for ts, quad_set in ae.iter_versions():
863 per_ts[ts] = self._extract_bindings(quad_set)
864 entity_bindings[entity_str] = per_ts
865 self._streaming_results = _merge_entity_bindings(entity_bindings)
867 def run_agnostic_query(self, include_all_timestamps: bool = False) -> tuple[dict[str, list[dict]], set]:
868 if self.on_time is None or self._streaming_results:
869 agnostic_result = self._streaming_results
870 if include_all_timestamps:
871 agnostic_result = self._fill_timestamp_gaps(agnostic_result)
872 return agnostic_result, set()
873 agnostic_result: dict[str, list[dict]] = {}
874 for timestamp, graph in self.relevant_graphs.items():
875 normalized = str(convert_to_datetime(timestamp, stringify=True))
876 agnostic_result[normalized] = self._extract_bindings(graph)
877 return agnostic_result, {data["generatedAtTime"] for _, data in self.other_snapshots_metadata.items()}
    def _get_all_provenance_timestamps(self) -> set:
        """Return every prov:generatedAtTime value found in the triplestore.

        Used by _fill_timestamp_gaps to know the complete timeline of
        snapshots, including timestamps at which the queried entities did
        not change.
        """
        query = f"""
        SELECT ?time WHERE {{
            ?snapshot <{ProvEntity.iri_generated_at_time}> ?time .
        }}
        """
        results = Sparql(query, self.config).run_select_query()
        # SPARQL JSON results format: each binding row maps the variable
        # name to a dict carrying its 'value'.
        return {r['time']['value'] for r in results['results']['bindings']}
888 def _fill_timestamp_gaps(self, result: dict) -> dict:
889 all_timestamps = self._get_all_provenance_timestamps()
890 sorted_result_ts = sorted(result.keys(), key=_parse_datetime)
891 if not sorted_result_ts:
892 return result
893 min_ts = _parse_datetime(sorted_result_ts[0])
894 relevant_timestamps = sorted(
895 [t for t in all_timestamps if min_ts <= _parse_datetime(t)],
896 key=_parse_datetime
897 )
898 filled = dict(result)
899 last_known = None
900 for ts in relevant_timestamps:
901 normalized = convert_to_datetime(ts, stringify=True)
902 if normalized in filled:
903 last_known = normalized
904 elif last_known is not None:
905 filled[normalized] = filled[last_known]
906 return filled
class DeltaQuery(AgnosticQuery):
    """
    Structured queries over the differences between snapshots, either at a
    single time or across time.

    :param query: A SPARQL query string. It is useful to identify the entities whose change you want to investigate.
    :type query: str
    :param on_time: If you want to query specific snapshots, specify the time interval here. The format is (START, END). If one of the two values is None, only the other is considered. Dates must be in ISO 8601 format.
    :type on_time: Tuple[Union[str, None]], optional
    :param changed_properties: A set of properties. It narrows the field to those entities where the properties specified in the set have changed.
    :type changed_properties: Set[str], optional
    :param config_path: The path to the configuration file.
    :type config_path: str, optional
    """
    def __init__(self, query:str, on_time: tuple[str | None, str | None] | None = None, changed_properties:set[str] | None=None, config_path:str = CONFIG_PATH, config_dict: dict | None = None):
        super().__init__(query=query, on_time=on_time, config_path=config_path, config_dict=config_dict)
        # Normalize the mutable default: an absent filter means "any property".
        self.changed_properties = set() if changed_properties is None else changed_properties

    def _rebuild_relevant_graphs(self) -> None:
        """Collect the entities named by the query's triple patterns; only
        non-isolated patterns require full graph alignment."""
        self.triples = self._process_query()
        checked = set()
        alignment_needed = False
        for pattern in self.triples:
            if self._is_isolated(pattern) and self._is_a_new_triple(pattern, checked):
                # Isolated pattern: entity URIs can be gathered directly
                # from the dataset and the provenance update queries.
                self.reconstructed_entities.update(self._get_present_entities(pattern))
                from_updates: set = set()
                self._find_entity_uris_in_update_queries(pattern, from_updates)
                self.reconstructed_entities.update(from_updates)
            else:
                alignment_needed = True
                self._rebuild_relevant_entity(pattern[0])
            checked.add(pattern)
        if alignment_needed:
            self._align_snapshots()
        self._solve_variables()

    def run_agnostic_query(self) -> dict:
        """Run the delta query and return the per-entity change description."""
        uris = set(self.reconstructed_entities)
        if not uris:
            return {}
        provenance = _batch_query_dm_provenance(uris, self.config)
        merged: dict = {}
        for uri in uris:
            entity_snapshots = provenance[uri]
            # Entities without snapshots contribute nothing to the delta.
            if entity_snapshots:
                merged.update(_build_delta_result(
                    uri, entity_snapshots,
                    self.on_time, self.changed_properties,
                ))
        return merged
def get_insert_query(graph_iri: str, data: set[tuple[str, ...]]) -> tuple[str, int]:
    """Build a SPARQL ``INSERT DATA`` query placing *data* into *graph_iri*.

    Each element of *data* supplies subject, predicate and object in its
    first three positions (any further positions are ignored). Returns the
    query string and the triple count, or ``("", 0)`` for empty input.
    """
    if not data:
        return "", 0
    triple_lines = [f"{subj} {pred} {obj} ." for subj, pred, obj, *_ in data]
    statements = "\n".join(triple_lines)
    return f"INSERT DATA {{ GRAPH <{graph_iri}> {{ {statements} }} }}", len(data)