Coverage for oc_ocdm/storer.py: 86%

1#!/usr/bin/python

3# SPDX-FileCopyrightText: 2020-2022 Simone Persiani <iosonopersia@gmail.com>

4# SPDX-FileCopyrightText: 2022-2025 Arcangelo Massari <arcangelo.massari@unibo.it>

6# SPDX-License-Identifier: ISC

8# -*- coding: utf-8 -*-

9from __future__ import annotations

11import hashlib

12import json

13import os

14from collections.abc import Iterable

15from datetime import datetime

16from typing import TYPE_CHECKING, cast

17from zipfile import ZIP_DEFLATED, ZipFile

19import orjson

20from filelock import FileLock

21from rdflib import Dataset, Graph, Literal, URIRef

22from rdflib.term import Node

23from triplelite import RDFTerm, TripleLite

25from oc_ocdm._types import ContextMap, JsonLdDocument, JsonObject, JsonValue, RdfLibObject, RdfLibQuad

26from oc_ocdm.constants import RDF_TYPE, XSD_STRING

27from oc_ocdm.graph.graph_entity import GraphEntity

28from oc_ocdm.metadata.metadata_entity import MetadataEntity

29from oc_ocdm.prov.prov_entity import ProvEntity

30from oc_ocdm.reader import Reader, transform_jsonld_graphs

31from oc_ocdm.support.query_utils import get_update_query

32from oc_ocdm.support.reporter import Reporter

33from oc_ocdm.support.sparql import SPARQLEndpointError, sparql_update

34from oc_ocdm.support.support import find_paths

36if TYPE_CHECKING:

37 from typing import List, Tuple

39 from oc_ocdm.abstract_entity import AbstractEntity

40 from oc_ocdm.graph.graph_set import GraphSet

41 from oc_ocdm.metadata.metadata_set import MetadataSet

42 from oc_ocdm.prov.prov_set import ProvSet

def _entity_to_jsonld_dict(entity: AbstractEntity) -> JsonObject:
    """Build a JSON-LD node object from the triples whose subject is ``entity.res``.

    ``rdf:type`` objects are collected under ``"@type"``; every other predicate
    becomes a key holding a list of value objects (``{"@id": ...}`` for URIs,
    ``{"@language": ..., "@value": ...}`` for language-tagged literals, and
    ``{"@type": ..., "@value": ...}`` for the remaining literals, falling back
    to ``XSD_STRING`` when the term carries no datatype).
    """
    rdf_types: list[JsonValue] = []
    properties: dict[str, list[JsonValue]] = {}

    for _, predicate, obj in entity.g.triples((entity.res, None, None)):
        if predicate == RDF_TYPE:
            rdf_types.append(obj.value)
            continue

        node: JsonObject
        if obj.type == "uri":
            node = {"@id": obj.value}
        elif obj.lang:
            node = {"@language": obj.lang, "@value": obj.value}
        else:
            node = {"@type": obj.datatype or XSD_STRING, "@value": obj.value}
        properties.setdefault(predicate, []).append(node)

    # "@id" first, then "@type" (only when present), then the properties:
    # this keeps the key order of the produced dictionary stable.
    document: JsonObject = {"@id": entity.res}
    if rdf_types:
        document["@type"] = rdf_types
    document.update(properties)
    return document

66def _compact_uri(uri: str, ns_to_prefix: list[tuple[str, str]]) -> str:

67 for ns, prefix in ns_to_prefix:

68 if uri.startswith(ns):

69 return prefix + ":" + uri[len(ns) :]

70 return uri

def _compact_jsonld(
    data: JsonLdDocument, context_path: str, ns_to_prefix: list[tuple[str, str]]
) -> JsonObject | JsonLdDocument:
    """Compact the URIs of ``data`` and tag each graph object with ``context_path``.

    URIs are rewritten through ``transform_jsonld_graphs`` using
    ``_compact_uri``; every resulting graph object gets ``"@context"`` set to
    ``context_path``. A single graph object is returned bare, otherwise the
    whole list is returned.
    """

    def _shorten(uri: str) -> str:
        return _compact_uri(uri, ns_to_prefix)

    graphs = transform_jsonld_graphs(data, _shorten)
    for graph in graphs:
        graph["@context"] = context_path
    return graphs[0] if len(graphs) == 1 else graphs

def _add_quads(dataset: Dataset, quads: list[RdfLibQuad]) -> None:
    """Add all ``quads`` to ``dataset`` with a single ``addN`` call."""
    # The cast only adapts the project quad alias to the signature that
    # ``Dataset.addN`` declares; it has no runtime effect on the data.
    typed_quads = cast(Iterable[tuple[Node, Node, Node, Graph]], quads)
    dataset.addN(typed_quads)

88class _JsonLdDoc:

89 __slots__ = ("_entities",)

91 def __init__(self, data: JsonLdDocument) -> None:

92 self._entities: dict[str, dict[str, JsonObject]] = {}

93 for graph_obj in data:

94 graph_iri = cast(str, graph_obj["@id"])

95 entity_index: dict[str, JsonObject] = {}

96 for entity_dict in cast(list[JsonObject], graph_obj["@graph"]):

97 entity_index[cast(str, entity_dict["@id"])] = entity_dict

98 self._entities[graph_iri] = entity_index

100 def upsert_entity(self, graph_iri: str, entity_uri: str, entity_dict: JsonObject) -> None:

101 if graph_iri not in self._entities:

102 self._entities[graph_iri] = {}

103 self._entities[graph_iri][entity_uri] = entity_dict

104

105 def merge_entity(self, graph_iri: str, entity_uri: str, entity_dict: JsonObject) -> None:

106 if graph_iri not in self._entities:

107 self._entities[graph_iri] = {}

108 existing = self._entities[graph_iri].get(entity_uri)

109 if existing is None:

110 self._entities[graph_iri][entity_uri] = entity_dict

111 return

112 for key, value in entity_dict.items():

113 if key == "@id":

114 continue

115 if key not in existing:

116 existing[key] = value

117 else:

118 existing_values = cast(list[JsonValue], existing[key])

119 for v in cast(list[JsonValue], value):

120 if v not in existing_values:

121 existing_values.append(v)

122

123 def remove_entity(self, graph_iri: str, entity_uri: str) -> None:

124 if graph_iri in self._entities and entity_uri in self._entities[graph_iri]:

125 del self._entities[graph_iri][entity_uri]

126

127 def to_list(self) -> JsonLdDocument:

128 return [

129 {"@id": graph_iri, "@graph": cast(JsonValue, list(entities.values()))}

130 for graph_iri, entities in self._entities.items()

131 if entities

132 ]

133

134

class Storer:
    """Write the entities of a GraphSet, MetadataSet or ProvSet to files and to a triplestore.

    Files are produced either as JSON-LD (built directly from the entity
    triples) or as N-Triples/N-Quads (serialised through rdflib), optionally
    wrapped in a zip archive. SPARQL updates are sent in batches or saved to
    disk.
    """

    def __init__(
        self,
        abstract_set: GraphSet | MetadataSet | ProvSet,
        repok: Reporter | None = None,
        reperr: Reporter | None = None,
        context_map: ContextMap | None = None,
        default_dir: str = "_",
        dir_split: int = 0,
        n_file_item: int = 1,
        output_format: str = "json-ld",
        zip_output: bool = False,
        modified_entities: set[str] | None = None,
    ) -> None:
        """Configure the storer.

        Args:
            abstract_set: The set whose ``res_to_entity`` entities are stored.
            repok: Reporter for informational messages (a default one is created when omitted).
            reperr: Reporter for error messages (a default one is created when omitted).
            context_map: Maps a context URL to a context object or to the path of a
                JSON file containing it. File paths are replaced in place by the
                loaded JSON when ``output_format`` is ``"json-ld"``.
            default_dir: Passed to ``find_paths``; an empty string is replaced by ``"_"``.
            dir_split: Passed to ``find_paths``.
            n_file_item: Passed to ``find_paths``.
            output_format: One of the supported format strings listed below.
            zip_output: When true, each output file is written inside a ``.zip`` archive.
            modified_entities: When given, only entities whose URI (without the
                ``/prov/se/...`` suffix) is in this set are stored/uploaded.

        Raises:
            ValueError: If ``output_format`` is not supported.
        """
        # We only accept format strings that:
        # 1. are supported by rdflib
        # 2. correspond to an output format which is effectively either NT or NQ
        # The only exception to this rule is the 'json-ld' format, which is the default value of 'output_format'.
        supported_formats: set[str] = {
            "application/n-triples",
            "ntriples",
            "nt",
            "nt11",
            "application/n-quads",
            "nquads",
            "json-ld",
        }
        if output_format not in supported_formats:
            raise ValueError(
                f"Given output_format '{output_format}' is not supported. Available formats: {supported_formats}."
            )

        self.output_format: str = output_format
        self.zip_output = zip_output
        self.dir_split: int = dir_split
        self.n_file_item: int = n_file_item
        self.default_dir: str = default_dir if default_dir != "" else "_"
        self.a_set: GraphSet | MetadataSet | ProvSet = abstract_set
        self.modified_entities = modified_entities
        self.context_map: ContextMap = context_map if context_map is not None else {}

        if self.output_format == "json-ld":
            for context_url in self.context_map:
                ctx_file_path = self.context_map[context_url]
                if isinstance(ctx_file_path, str) and os.path.isfile(ctx_file_path):
                    # This expensive operation is done only when it's really needed
                    with open(ctx_file_path, "rt", encoding="utf-8") as ctx_f:
                        self.context_map[context_url] = cast("JsonObject", json.load(ctx_f))

        self.repok: Reporter = repok if repok is not None else Reporter(prefix="[Storer: INFO] ")
        self.reperr: Reporter = reperr if reperr is not None else Reporter(prefix="[Storer: ERROR] ")

    @staticmethod
    def _zip_path(file_path: str) -> str:
        """Return ``file_path`` with its extension replaced by ``.zip``.

        Only the trailing extension is replaced. A plain ``str.replace`` of the
        extension would also rewrite earlier occurrences of it in the path and,
        for a path without extension, insert ``.zip`` between every character.
        """
        return os.path.splitext(file_path)[0] + ".zip"

    @staticmethod
    def _to_rdflib_obj(o: RDFTerm) -> RdfLibObject:
        """Convert a triple object term into the matching rdflib ``Literal`` or ``URIRef``."""
        if o.type == "literal":
            if o.lang:
                return Literal(o.value, lang=o.lang)
            # Same fallback as the JSON-LD path: a literal without datatype is
            # treated as an xsd:string instead of producing an empty/None datatype.
            return Literal(o.value, datatype=URIRef(o.datatype if o.datatype else XSD_STRING))
        return URIRef(o.value)

    @staticmethod
    def _entity_quads(entity_g: TripleLite) -> list[RdfLibQuad]:
        """Return every triple of ``entity_g`` as an rdflib quad in the graph ``entity_g.identifier``."""
        graph_id = URIRef(entity_g.identifier) if entity_g.identifier else None
        return [(URIRef(s), URIRef(p), Storer._to_rdflib_obj(o), graph_id) for s, p, o in entity_g]

    def store_graphs_in_file(self, file_path: str, context_path: str | None = None) -> None:
        """Write all snapshot entities of the set into the single file ``file_path``."""
        self.repok.new_article()
        self.reperr.new_article()
        self.repok.add_sentence("Store the graphs into a file: starting process")

        if self.output_format == "json-ld":
            self._store_graphs_in_file_jsonld_fast(file_path, context_path)
            return

        cg: Dataset = Dataset()
        for entity in self._snapshot_entities():
            _add_quads(cg, self._entity_quads(entity.g))

        self._store_in_file(cg, file_path, context_path)

    @staticmethod
    def _is_snapshot_entity(entity: AbstractEntity) -> bool:
        """Tell whether ``entity`` has triples and is not a graph/metadata entity marked for deletion."""
        if len(entity.g) == 0:
            return False
        if isinstance(entity, (GraphEntity, MetadataEntity)):
            return not entity.to_be_deleted
        return True

    def _snapshot_entities(self) -> list[AbstractEntity]:
        """Return the entities of the set accepted by ``_is_snapshot_entity``."""
        return [entity for entity in self.a_set.res_to_entity.values() if self._is_snapshot_entity(entity)]

    def _store_in_file(self, cur_g: Dataset, cur_file_path: str, context_path: str | None = None) -> None:
        """Serialise ``cur_g`` to ``cur_file_path``, or into the sibling ``.zip`` when ``zip_output`` is set."""
        if self.zip_output:
            zip_file_path = self._zip_path(cur_file_path)
            with ZipFile(zip_file_path, mode="w", compression=ZIP_DEFLATED, allowZip64=True) as zip_file:
                self._write_graph(cur_g, zip_file, cur_file_path, context_path)
        else:
            self._write_graph(cur_g, None, cur_file_path, context_path)

        self.repok.add_sentence(f"File '{cur_file_path}' added.")

    def _write_graph(
        self, graph: Dataset, zip_file: ZipFile | None, cur_file_path: str, context_path: str | None
    ) -> None:
        """Serialise ``graph`` in ``self.output_format``.

        The output goes into ``zip_file`` (as an entry named after the basename
        of ``cur_file_path``) when an archive is given, otherwise directly to
        ``cur_file_path``.
        """
        if self.output_format == "json-ld":
            if context_path is not None and context_path in self.context_map:
                cur_json_ld = json.loads(graph.serialize(format="json-ld", context=self.context_map[context_path]))
                # Replace the inlined context with its URL.
                if isinstance(cur_json_ld, dict):
                    cur_json_ld["@context"] = context_path
                else:
                    for item in cur_json_ld:
                        item["@context"] = context_path
                if zip_file is not None:
                    data = json.dumps(cur_json_ld, ensure_ascii=False).encode("utf-8")
                    zip_file.writestr(zinfo_or_arcname=os.path.basename(cur_file_path), data=data)
                else:
                    with open(cur_file_path, "wt", encoding="utf-8") as f:
                        json.dump(cur_json_ld, f, ensure_ascii=False)
            else:
                if zip_file is not None:
                    data = graph.serialize(format="json-ld").encode("utf-8")
                    zip_file.writestr(zinfo_or_arcname=os.path.basename(cur_file_path), data=data)
                else:
                    graph.serialize(destination=cur_file_path, format="json-ld")
        else:
            # Handle other RDF formats
            if zip_file is not None:
                rdf_serialization = graph.serialize(destination=None, format=self.output_format, encoding="utf-8")
                zip_file.writestr(zinfo_or_arcname=os.path.basename(cur_file_path), data=rdf_serialization)
            else:
                graph.serialize(destination=cur_file_path, format=self.output_format, encoding="utf-8")

    def store_all(
        self, base_dir: str, base_iri: str, context_path: str | None = None, process_id: int | str | None = None
    ) -> List[str]:
        """Store every relevant entity in the file computed for it by ``_dir_and_file_paths``.

        Existing output files are loaded and updated under a file lock.

        Returns:
            The list of file paths that were targeted.
        """
        self.repok.new_article()
        self.reperr.new_article()

        self.repok.add_sentence("Starting the process")

        relevant_paths: dict[str, list[AbstractEntity]] = {}
        created_dirs: set[str] = set()
        for entity in self.a_set.res_to_entity.values():
            if self.modified_entities is not None and entity.res.split("/prov/se/")[0] not in self.modified_entities:
                continue
            cur_dir_path, cur_file_path = self._dir_and_file_paths(entity.res, base_dir, base_iri, process_id)
            if cur_dir_path not in created_dirs:
                os.makedirs(cur_dir_path, exist_ok=True)
                created_dirs.add(cur_dir_path)
            relevant_paths.setdefault(cur_file_path, []).append(entity)

        if self.output_format == "json-ld":
            return self._store_all_jsonld_fast(relevant_paths, context_path)

        reader = Reader(context_map=self.context_map)
        for relevant_path, entities_in_path in relevant_paths.items():
            stored_g = None
            output_filepath = self._zip_path(relevant_path) if self.zip_output else relevant_path
            lock = FileLock(f"{output_filepath}.lock")
            with lock:
                if os.path.exists(output_filepath):
                    stored_g = reader.load(output_filepath)
                if stored_g is None:
                    stored_g = Dataset()
                for entity_in_path in entities_in_path:
                    self.store(entity_in_path, stored_g, relevant_path, context_path, False)
                self._store_in_file(stored_g, relevant_path, context_path)

        return list(relevant_paths.keys())

    def _entity_triples_as_rdflib_quads(self, entity: AbstractEntity) -> list[RdfLibQuad]:
        """Return the triples having ``entity.res`` as subject, as rdflib quads."""
        graph_id = URIRef(entity.g.identifier) if entity.g.identifier else None
        return [
            (URIRef(s), URIRef(p), self._to_rdflib_obj(o), graph_id)
            for s, p, o in entity.g.triples((entity.res, None, None))
        ]

    def store(
        self,
        entity: AbstractEntity,
        destination_g: Dataset,
        cur_file_path: str,
        context_path: str | None = None,
        store_now: bool = True,
    ) -> Dataset | None:
        """Apply ``entity`` to ``destination_g`` and optionally write the dataset to disk.

        Provenance entities are added; graph/metadata entities are removed when
        marked for deletion, otherwise replaced (old statements are dropped
        first when the entity has preexisting triples).

        Returns:
            ``destination_g``, or ``None`` when an exception occurred (the
            error is reported through ``self.reperr``).
        """
        self.repok.new_article()
        self.reperr.new_article()

        try:
            if isinstance(entity, ProvEntity):
                _add_quads(destination_g, self._entity_triples_as_rdflib_quads(entity))
            elif isinstance(entity, (GraphEntity, MetadataEntity)):
                if entity.to_be_deleted:
                    destination_g.remove((URIRef(entity.res), None, None, None))  # type: ignore[arg-type]
                else:
                    if len(entity.preexisting_triples) > 0:
                        destination_g.remove((URIRef(entity.res), None, None, None))  # type: ignore[arg-type]
                    _add_quads(destination_g, self._entity_triples_as_rdflib_quads(entity))

            if store_now:
                self._store_in_file(destination_g, cur_file_path, context_path)

            return destination_g
        except Exception as e:
            self.reperr.add_sentence(f"[1] It was impossible to store the RDF statements in {cur_file_path}. {e}")
            return None

    def _build_ns_to_prefix(self, context_path: str) -> list[tuple[str, str]]:
        """Return ``(namespace, prefix)`` pairs of the given context, longest namespace first.

        Only string-valued entries whose key does not start with ``"@"`` are kept.
        """
        ctx = cast("JsonObject", self.context_map[context_path])
        if "@context" in ctx:
            ctx = cast("JsonObject", ctx["@context"])
        pairs = [(ns, prefix) for prefix, ns in ctx.items() if isinstance(ns, str) and not prefix.startswith("@")]
        # Longest namespaces first, so that the most specific prefix wins.
        pairs.sort(key=lambda x: len(x[0]), reverse=True)
        return pairs

    def _write_jsonld_fast(self, json_bytes: bytes, relevant_path: str) -> None:
        """Write ``json_bytes`` to ``relevant_path``, or into the sibling ``.zip`` when ``zip_output`` is set."""
        if self.zip_output:
            zip_file_path = self._zip_path(relevant_path)
            with ZipFile(zip_file_path, mode="w", compression=ZIP_DEFLATED, allowZip64=True) as zf:
                zf.writestr(os.path.basename(relevant_path), json_bytes)
        else:
            with open(relevant_path, "wb") as f:
                f.write(json_bytes)
        self.repok.add_sentence(f"File '{relevant_path}' added.")

    def _store_all_jsonld_fast(
        self, relevant_paths: dict[str, list[AbstractEntity]], context_path: str | None
    ) -> List[str]:
        """JSON-LD implementation of ``store_all`` that edits the documents as plain dictionaries.

        Returns:
            The list of file paths that were targeted.
        """
        reader = Reader(context_map=self.context_map)
        ns_to_prefix: list[tuple[str, str]] | None = None
        if context_path is not None and context_path in self.context_map:
            ns_to_prefix = self._build_ns_to_prefix(context_path)

        for relevant_path, entities_in_path in relevant_paths.items():
            output_filepath = self._zip_path(relevant_path) if self.zip_output else relevant_path
            lock = FileLock(f"{output_filepath}.lock")
            with lock:
                existing_data: JsonLdDocument | None = None
                if os.path.exists(output_filepath):
                    existing_data = reader.load_jsonld_dict(output_filepath)
                doc = _JsonLdDoc(existing_data if existing_data is not None else [])

                for entity in entities_in_path:
                    graph_iri = cast(str, entity.g.identifier)
                    if isinstance(entity, ProvEntity):
                        doc.merge_entity(graph_iri, entity.res, _entity_to_jsonld_dict(entity))
                    elif isinstance(entity, (GraphEntity, MetadataEntity)):
                        if entity.to_be_deleted:
                            doc.remove_entity(graph_iri, entity.res)
                        else:
                            if len(entity.preexisting_triples) > 0:
                                doc.remove_entity(graph_iri, entity.res)
                            doc.upsert_entity(graph_iri, entity.res, _entity_to_jsonld_dict(entity))

                output_data: JsonLdDocument | JsonObject = doc.to_list()
                if context_path is not None and ns_to_prefix is not None:
                    output_data = _compact_jsonld(output_data, context_path, ns_to_prefix)
                json_bytes = orjson.dumps(output_data)
                self._write_jsonld_fast(json_bytes, relevant_path)

        return list(relevant_paths.keys())

    def _store_graphs_in_file_jsonld_fast(self, file_path: str, context_path: str | None) -> None:
        """JSON-LD implementation of ``store_graphs_in_file``."""
        doc = _JsonLdDoc([])
        for entity in self._snapshot_entities():
            graph_iri = cast(str, entity.g.identifier)
            doc.upsert_entity(graph_iri, entity.res, _entity_to_jsonld_dict(entity))

        output_data: JsonLdDocument | JsonObject = doc.to_list()
        if context_path is not None and context_path in self.context_map:
            ns_to_prefix = self._build_ns_to_prefix(context_path)
            output_data = _compact_jsonld(output_data, context_path, ns_to_prefix)
        json_bytes = orjson.dumps(output_data)
        self._write_jsonld_fast(json_bytes, file_path)

    def upload_and_store(
        self, base_dir: str, triplestore_url: str, base_iri: str, context_path: str | None = None, batch_size: int = 10
    ) -> None:
        """Run ``store_all`` followed by ``upload_all``; their return values are discarded."""
        self.store_all(base_dir, base_iri, context_path)
        self.upload_all(triplestore_url, base_dir, batch_size)

    def _dir_and_file_paths(
        self, res: str, base_dir: str, base_iri: str, process_id: int | str | None = None
    ) -> Tuple[str, str]:
        """Delegate to ``find_paths`` with this storer's layout settings."""
        is_json: bool = self.output_format == "json-ld"
        return find_paths(
            res,
            base_dir,
            base_iri,
            self.default_dir,
            self.dir_split,
            self.n_file_item,
            is_json=is_json,
            process_id=process_id,
        )

    @staticmethod
    def _class_to_entity_type(entity: AbstractEntity) -> str:
        """Map an entity to ``"graph"``, ``"prov"`` or (for anything else) ``"metadata"``."""
        if isinstance(entity, GraphEntity):
            return "graph"
        if isinstance(entity, ProvEntity):
            return "prov"
        return "metadata"

    def _flush_batch(
        self,
        query_batch: list[str],
        triplestore_url: str,
        base_dir: str | None,
        to_be_uploaded_dir: str,
        added_statements: int,
        removed_statements: int,
        save_queries: bool,
    ) -> bool:
        """Join ``query_batch`` into one update and either save it to disk or send it.

        Returns:
            ``True`` when the batch was saved, otherwise the result of ``_query``.
        """
        query_string = " ; ".join(query_batch)
        if save_queries:
            self._save_query(query_string, to_be_uploaded_dir, added_statements, removed_statements)
            return True
        return self._query(query_string, triplestore_url, base_dir, added_statements, removed_statements)

    def upload_all(
        self, triplestore_url: str, base_dir: str | None = None, batch_size: int = 10, save_queries: bool = False
    ) -> bool:
        """
        Upload SPARQL update queries to the triplestore in batches, or save them to disk.

        Args:
            triplestore_url: SPARQL endpoint URL
            base_dir: Base directory for output files (required when save_queries is True;
                when it is missing the queries are written relative to the current directory)
            batch_size: Number of queries per SPARQL batch (non-positive values fall back to 10)
            save_queries: If True, save combined SPARQL queries to disk instead of uploading

        Returns:
            True if all batches were processed successfully, False otherwise
        """
        self.repok.new_article()
        self.reperr.new_article()

        if batch_size <= 0:
            batch_size = 10

        query_batch: list[str] = []
        added_statements: int = 0
        removed_statements: int = 0
        result: bool = True
        to_be_uploaded_dir: str = ""

        if base_dir:
            to_be_uploaded_dir = os.path.join(base_dir, "to_be_uploaded")
            os.makedirs(to_be_uploaded_dir, exist_ok=True)

        entities_to_process: Iterable[AbstractEntity] = self.a_set.res_to_entity.values()
        if self.modified_entities is not None:
            entities_to_process = [
                entity
                for entity in entities_to_process
                if str(entity.res).split("/prov/se/")[0] in self.modified_entities
            ]

        for entity in entities_to_process:
            entity_type = self._class_to_entity_type(entity)
            update_queries, n_added, n_removed = get_update_query(entity, entity_type=entity_type)

            if not update_queries:
                continue

            # Spread the entity's statement counts over its queries. The
            # remainder goes to the first queries, so the per-batch totals add
            # up to n_added/n_removed instead of silently dropping it.
            added_share, added_extra = divmod(n_added, len(update_queries))
            removed_share, removed_extra = divmod(n_removed, len(update_queries))

            for idx, query in enumerate(update_queries):
                query_batch.append(query)
                added_statements += added_share + (1 if idx < added_extra else 0)
                removed_statements += removed_share + (1 if idx < removed_extra else 0)

                if len(query_batch) >= batch_size:
                    result &= self._flush_batch(
                        query_batch,
                        triplestore_url,
                        base_dir,
                        to_be_uploaded_dir,
                        added_statements,
                        removed_statements,
                        save_queries,
                    )
                    query_batch = []
                    added_statements = 0
                    removed_statements = 0

        if query_batch:
            result &= self._flush_batch(
                query_batch,
                triplestore_url,
                base_dir,
                to_be_uploaded_dir,
                added_statements,
                removed_statements,
                save_queries,
            )

        return result

    def _save_query(self, query_string: str, directory: str, added_statements: int, removed_statements: int) -> None:
        """Write ``query_string`` to a ``.sparql`` file in ``directory``.

        The file name is built from the first 16 hex digits of the SHA-256 of
        the query and from the two statement counts.
        """
        content_hash = hashlib.sha256(query_string.encode("utf-8")).hexdigest()[:16]
        file_name = f"{content_hash}_add{added_statements}_remove{removed_statements}.sparql"
        file_path = os.path.join(directory, file_name)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(query_string)

    def upload(self, entity: AbstractEntity, triplestore_url: str, base_dir: str | None = None) -> bool:
        """Send the update queries of a single ``entity`` as one SPARQL update."""
        self.repok.new_article()
        self.reperr.new_article()

        entity_type = self._class_to_entity_type(entity)
        update_queries, n_added, n_removed = get_update_query(entity, entity_type=entity_type)
        query_string = " ; ".join(update_queries) if update_queries else ""
        return self._query(query_string, triplestore_url, base_dir, n_added, n_removed)

    def execute_query(self, query_string: str, triplestore_url: str) -> bool:
        """Send ``query_string`` to the triplestore; failed queries are not saved to disk."""
        self.repok.new_article()
        self.reperr.new_article()

        return self._query(query_string, triplestore_url)

    def _query(
        self,
        query_string: str,
        triplestore_url: str,
        base_dir: str | None = None,
        added_statements: int = 0,
        removed_statements: int = 0,
    ) -> bool:
        """Run a SPARQL update and report the outcome.

        On ``SPARQLEndpointError`` the error is reported and, when ``base_dir``
        is given, the query is saved in a timestamped file under
        ``<base_dir>/tp_err``.

        Returns:
            ``True`` when the update succeeded; ``False`` when it failed or
            when ``query_string`` is empty.
        """
        if query_string == "":
            return False

        try:
            sparql_update(triplestore_url, query_string, max_retries=3, backoff_factor=2.5)
        except SPARQLEndpointError as e:
            self.reperr.add_sentence(
                f"[3] Graph was not loaded into the triplestore due to communication problems: {e}"
            )
            if base_dir is not None:
                # os.path.join keeps an empty base_dir relative instead of
                # resolving to the filesystem root.
                tp_err_dir: str = os.path.join(base_dir, "tp_err")
                os.makedirs(tp_err_dir, exist_ok=True)
                cur_file_err: str = os.path.join(
                    tp_err_dir, datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f_not_uploaded.txt")
                )
                with open(cur_file_err, "wt", encoding="utf-8") as f:
                    f.write(query_string)
            return False

        self.repok.add_sentence(
            f"Triplestore updated with {added_statements} added statements and "
            f"with {removed_statements} removed statements."
        )
        return True

Coverage for oc_ocdm / storer.py: 86%

368 statements