Coverage for oc_ocdm/reader.py: 87%

1#!/usr/bin/python

3# SPDX-FileCopyrightText: 2020-2022 Simone Persiani <iosonopersia@gmail.com>

4# SPDX-FileCopyrightText: 2022-2026 Arcangelo Massari <arcangelo.massari@unibo.it>

6# SPDX-License-Identifier: ISC

8# -*- coding: utf-8 -*-

9from __future__ import annotations

11import json

12import os

13from collections.abc import Callable

14from importlib import import_module

15from typing import TYPE_CHECKING, BinaryIO, TextIO, cast

16from zipfile import ZipFile

18import orjson

19from rdflib import Dataset, Graph, URIRef

20from rdflib.term import Node

21from triplelite import TripleLite, from_rdflib

23from oc_ocdm._types import ContextMap, JsonLdDocument, JsonObject, JsonValue, SparqlResultRows

24from oc_ocdm.constants import RDF_TYPE

25from oc_ocdm.graph.graph_entity import GraphEntity

26from oc_ocdm.support.reporter import Reporter

27from oc_ocdm.support.sparql import SPARQLEndpointError, sparql_query

28from oc_ocdm.support.support import build_graph_from_results, normalize_graph_literals

30if TYPE_CHECKING:

31 from typing import List, Optional

33 from oc_ocdm.graph.graph_set import GraphSet

# Look up pyshacl's `validate` by name at import time and cast it to a callable
# returning a 3-tuple, which is how `Reader.graph_validation` unpacks its result.
_validate = cast(Callable[..., tuple[object, object, object]], getattr(import_module("pyshacl"), "validate"))

def _transform_jsonld_value(value: JsonValue, uri_fn: Callable[[str], str]) -> JsonValue:
    """Apply ``uri_fn`` to the IRIs carried by a single JSON-LD property value.

    Non-dict values are returned untouched. A dict holding ``@id`` is reduced
    to ``{"@id": uri_fn(id)}``. Any other dict is rebuilt from its ``@value``,
    ``@type`` and ``@language`` members only (in that order), with ``uri_fn``
    applied to ``@type``; every other member is dropped.
    """
    if not isinstance(value, dict):
        return value
    if "@id" in value:
        return {"@id": uri_fn(cast(str, value["@id"]))}

    rebuilt: JsonObject = {}
    for keyword in ("@value", "@type", "@language"):
        if keyword not in value:
            continue
        member = value[keyword]
        # Only the datatype is an IRI; the lexical value and language tag are copied as-is.
        rebuilt[keyword] = uri_fn(cast(str, member)) if keyword == "@type" else member
    return rebuilt

def _transform_jsonld_entity(entity: JsonObject, uri_fn: Callable[[str], str]) -> JsonObject:
    """Return a copy of a JSON-LD node object with ``uri_fn`` applied to its IRIs.

    ``@id`` is mapped through ``uri_fn``; ``@type`` is always emitted as a list
    of mapped IRIs; any other ``@``-keyword is dropped. Every remaining key is
    treated as a predicate: the key itself is mapped and its value(s) are
    passed through ``_transform_jsonld_value``.
    """
    converted: JsonObject = {}
    for name, payload in entity.items():
        if name == "@id":
            converted["@id"] = uri_fn(cast(str, payload))
            continue
        if name == "@type":
            # A scalar @type is normalised to a one-element list.
            declared = payload if isinstance(payload, list) else [payload]
            converted["@type"] = [uri_fn(cast(str, type_iri)) for type_iri in declared]
            continue
        if name.startswith("@"):
            continue

        predicate = uri_fn(name)
        if isinstance(payload, list):
            converted[predicate] = [_transform_jsonld_value(item, uri_fn) for item in payload]
        else:
            converted[predicate] = _transform_jsonld_value(payload, uri_fn)
    return converted

def transform_jsonld_graphs(data: JsonLdDocument, uri_fn: Callable[[str], str]) -> JsonLdDocument:
    """Apply ``uri_fn`` to every IRI of a list of JSON-LD named-graph objects.

    Each output object keeps only ``@id`` (mapped through ``uri_fn``) and
    ``@graph`` (each entity rewritten by ``_transform_jsonld_entity``), and only
    when the corresponding key is present in the input object. The input is not
    modified.
    """

    def _convert(graph_obj: JsonObject) -> JsonObject:
        converted: JsonObject = {}
        if "@id" in graph_obj:
            converted["@id"] = uri_fn(cast(str, graph_obj["@id"]))
        if "@graph" in graph_obj:
            entities = cast(list[JsonObject], graph_obj["@graph"])
            converted["@graph"] = [_transform_jsonld_entity(entity, uri_fn) for entity in entities]
        return converted

    return [_convert(graph_obj) for graph_obj in data]

def _expand_uri(curie: str, prefix_to_ns: dict[str, str]) -> str:
    """Expand a ``prefix:local`` string using ``prefix_to_ns``.

    The text before the first colon is looked up as a prefix; when it is
    non-empty and mapped, the namespace is prepended to everything after that
    colon. In every other case the input is returned unchanged.
    """
    prefix, separator, local_part = curie.partition(":")
    if separator and prefix:
        namespace = prefix_to_ns.get(prefix)
        if namespace is not None:
            return namespace + local_part
    return curie

def _expand_jsonld(data: JsonLdDocument, prefix_to_ns: dict[str, str]) -> JsonLdDocument:
    """Rewrite ``data`` with every prefixed name expanded through ``prefix_to_ns``."""

    def _expand(uri: str) -> str:
        return _expand_uri(uri, prefix_to_ns)

    return transform_jsonld_graphs(data, _expand)

100

class Reader(object):
    """Load RDF data from files or a SPARQL endpoint and import it into a ``GraphSet``.

    Attributes:
        context_map: maps a JSON-LD context URL either to a file path or to the
            already-loaded context document. Paths that point to existing files
            are replaced by their parsed JSON content during construction.
        repok: reporter receiving informational messages.
        reperr: reporter receiving error messages.
    """

    # Parser names to try first for each known file extension.
    _EXT_TO_FORMATS: dict[str, list[str]] = {
        ".json": ["json-ld"],
        ".jsonld": ["json-ld"],
        ".xml": ["rdfxml"],
        ".rdf": ["rdfxml"],
        ".ttl": ["turtle"],
        ".trig": ["trig"],
        ".nt": ["nt11"],
        ".nq": ["nquads"],
    }
    # Every parser name that may be attempted, in default order.
    _ALL_FORMATS: list[str] = ["json-ld", "rdfxml", "turtle", "trig", "nt11", "nquads"]

    def __init__(
        self,
        repok: Optional[Reporter] = None,
        reperr: Optional[Reporter] = None,
        context_map: Optional[ContextMap] = None,
    ) -> None:
        """Create a reader.

        Args:
            repok: reporter for informational messages; a default one with the
                ``[Reader: INFO] `` prefix is created when omitted.
            reperr: reporter for error messages; a default one with the
                ``[Reader: ERROR] `` prefix is created when omitted.
            context_map: JSON-LD context URL -> file path or context document.
                The mapping is used as-is (not copied): entries whose value is
                the path of an existing file are overwritten in place with the
                parsed JSON content of that file.
        """
        self.context_map: ContextMap = context_map if context_map is not None else {}
        for context_url, ctx_file_path in list(self.context_map.items()):
            if isinstance(ctx_file_path, str) and os.path.isfile(ctx_file_path):
                # This expensive operation is done only when it's really needed
                with open(ctx_file_path, "rt", encoding="utf-8") as ctx_f:
                    self.context_map[context_url] = cast(JsonObject, json.load(ctx_f))

        self.repok: Reporter = repok if repok is not None else Reporter(prefix="[Reader: INFO] ")
        self.reperr: Reporter = reperr if reperr is not None else Reporter(prefix="[Reader: ERROR] ")

    def load(self, rdf_file_path: str) -> Optional[Dataset]:
        """Load an RDF file, reporting failures instead of raising.

        Args:
            rdf_file_path: path of the file (optionally a ``.zip`` archive).

        Returns:
            The loaded dataset, or ``None`` when the file does not exist or
            could not be parsed; in both cases a sentence is added to
            ``self.reperr``.
        """
        self.repok.new_article()
        self.reperr.new_article()

        if not os.path.isfile(rdf_file_path):
            self.reperr.add_sentence(f"[2] The file specified ('{rdf_file_path}') doesn't exist.")
            return None

        try:
            return self._load_graph(rdf_file_path)
        except Exception as e:
            self.reperr.add_sentence(
                "[1] "
                "It was impossible to handle the format used for "
                "storing the file (stored in the temporary path) "
                f"'{rdf_file_path}'. Additional details: {e}"
            )
            return None

    @staticmethod
    def _formats_for_file(file_name: str) -> list[str]:
        """Return the parser names to try for ``file_name``, most likely first.

        The formats associated with the (case-insensitive) extension come
        first, followed by all remaining formats. A fresh list is returned on
        every call so callers can never mutate the class-level tables.
        """
        ext = os.path.splitext(file_name)[1].lower()
        preferred = Reader._EXT_TO_FORMATS.get(ext)
        if preferred is None:
            return list(Reader._ALL_FORMATS)
        return preferred + [f for f in Reader._ALL_FORMATS if f not in preferred]

    def _load_graph(self, file_path: str) -> Dataset:
        """Parse ``file_path`` into a new dataset and normalize its literals.

        For a ``.zip`` archive the members are tried in archive order and the
        first one that parses successfully is returned. Directory entries are
        skipped because they carry no data.

        Raises:
            IOError: when the file cannot be opened or read (the original
                exception is chained), or when no supported format could
                parse it.
        """
        loaded_graph = Dataset()

        if file_path.endswith(".zip"):
            try:
                with ZipFile(file=file_path, mode="r") as archive:
                    for zf_name in archive.namelist():
                        if zf_name.endswith("/"):
                            # Directory entry: an empty stream could be
                            # accepted as an empty document by a lenient parser.
                            continue
                        formats = self._formats_for_file(zf_name)
                        with archive.open(zf_name) as f:
                            if self._try_parse(loaded_graph, cast(BinaryIO, f), formats):
                                for graph in loaded_graph.graphs():
                                    normalize_graph_literals(graph)
                                return loaded_graph
            except Exception as e:
                raise IOError(f"Error opening or reading zip file '{file_path}': {e}") from e
        else:
            formats = self._formats_for_file(file_path)
            try:
                with open(file_path, "rt", encoding="utf-8") as f:
                    if self._try_parse(loaded_graph, f, formats):
                        for graph in loaded_graph.graphs():
                            normalize_graph_literals(graph)
                        return loaded_graph
            except Exception as e:
                raise IOError(f"Error opening or reading file '{file_path}': {e}") from e

        raise IOError(f"It was impossible to load the file '{file_path}' with supported formats.")

    def _try_parse(self, graph: Dataset, file_obj: TextIO | BinaryIO, formats: List[str]) -> bool:
        """Try each format in turn until one parses ``file_obj`` into ``graph``.

        The stream is rewound before every attempt. For JSON-LD, a ``@context``
        given as a URL known to ``self.context_map`` is replaced by the locally
        stored context before parsing.

        Returns:
            ``True`` as soon as one format succeeds, ``False`` if all fail.
            Failures of individual formats are deliberately swallowed: trying
            the next format is the whole point of this method.
        """
        for cur_format in formats:
            file_obj.seek(0)
            try:
                if cur_format == "json-ld":
                    json_ld_file = cast(JsonObject | JsonLdDocument, json.load(file_obj))
                    if isinstance(json_ld_file, dict):
                        json_ld_file = [json_ld_file]
                    for json_ld_resource in json_ld_file:
                        context_url = json_ld_resource["@context"] if "@context" in json_ld_resource else None
                        if isinstance(context_url, str) and context_url in self.context_map:
                            context_data = self.context_map[context_url]
                            if isinstance(context_data, dict) and "@context" in context_data:
                                json_ld_resource["@context"] = context_data["@context"]
                    graph.parse(data=json.dumps(json_ld_file, ensure_ascii=False), format=cur_format)
                else:
                    graph.parse(file=file_obj, format=cur_format)
                return True
            except Exception:
                continue
        return False

    def load_jsonld_dict(self, rdf_file_path: str) -> JsonLdDocument:
        """Load a JSON-LD file as plain Python data, expanding prefixed names.

        For a ``.zip`` archive, the first member with a ``.json`` or
        ``.jsonld`` extension is read. A single top-level object is wrapped in
        a list. When one of the graph objects references a context URL known
        to ``self.context_map``, the string-valued, non-keyword entries of that
        context are used as a prefix map and the document is rewritten with
        those prefixes expanded; otherwise the data is returned as loaded.

        Raises:
            IOError: when a ZIP archive contains no JSON/JSON-LD member.
        """
        if rdf_file_path.endswith(".zip"):
            with ZipFile(file=rdf_file_path, mode="r") as archive:
                for zf_name in archive.namelist():
                    ext = os.path.splitext(zf_name)[1].lower()
                    if ext in (".json", ".jsonld"):
                        with archive.open(zf_name) as f:
                            data = cast(JsonObject | JsonLdDocument, orjson.loads(f.read()))
                        break
                else:
                    raise IOError(f"No JSON/JSON-LD file found inside ZIP archive '{rdf_file_path}'.")
        else:
            with open(rdf_file_path, "rb") as f:
                data = cast(JsonObject | JsonLdDocument, orjson.loads(f.read()))
        if isinstance(data, dict):
            data = [data]

        prefix_to_ns: dict[str, str] | None = None
        for graph_obj in data:
            ctx_url = graph_obj["@context"] if "@context" in graph_obj else None
            if not (isinstance(ctx_url, str) and ctx_url in self.context_map):
                continue
            ctx = self.context_map[ctx_url]
            # Unwrap a {"@context": {...}} document down to the mapping itself.
            if isinstance(ctx, dict) and "@context" in ctx:
                context_value = ctx["@context"]
                if isinstance(context_value, dict):
                    ctx = context_value
            if isinstance(ctx, dict):
                prefix_to_ns = {k: v for k, v in ctx.items() if isinstance(v, str) and not k.startswith("@")}
            # Only the first graph object with a known context is considered.
            break

        if prefix_to_ns is not None:
            data = _expand_jsonld(data, prefix_to_ns)
        return data

    @staticmethod
    def _shacl_shapes_path(closed: bool) -> str:
        """Locate the SHACL shapes file used by ``graph_validation``.

        The historical path relative to the current working directory is kept
        when it exists. Otherwise the ``resources`` directory next to this
        module is tried, so validation no longer depends on the process being
        started from the repository root. If neither exists the historical
        path is returned so the resulting error is unchanged.
        """
        file_name = "shacle_closed.ttl" if closed else "shacle.ttl"
        legacy_path = os.path.join("oc_ocdm", "resources", file_name)
        if os.path.isfile(legacy_path):
            return legacy_path
        module_file = globals().get("__file__")
        if isinstance(module_file, str):
            packaged_path = os.path.join(os.path.dirname(os.path.abspath(module_file)), "resources", file_name)
            if os.path.isfile(packaged_path):
                return packaged_path
        return legacy_path

    def graph_validation(self, graph: Graph, closed: bool = False) -> Graph:
        """Return the part of ``graph`` whose subjects pass SHACL validation.

        Args:
            graph: the graph to validate.
            closed: use the ``shacle_closed.ttl`` shapes instead of
                ``shacle.ttl``.

        Returns:
            A new graph (same identifier) containing every triple whose subject
            is a ``URIRef`` that is not reported as a ``sh:focusNode`` in the
            validation report.

        Raises:
            TypeError: when the validator does not return a report graph.
        """
        valid_graph: Graph = Graph(identifier=graph.identifier)
        sg = Graph()
        sg.parse(self._shacl_shapes_path(closed))
        _, report_result, _ = _validate(
            graph,
            shacl_graph=sg,
            ont_graph=None,
            inference=None,
            abort_on_first=False,
            allow_infos=False,
            allow_warnings=False,
            meta_shacl=False,
            advanced=False,
            js=False,
            debug=False,
        )
        if not isinstance(report_result, Graph):
            raise TypeError(f"Expected Graph from SHACL validation, got {type(report_result)}")

        focus_node = URIRef("http://www.w3.org/ns/shacl#focusNode")
        invalid_nodes: set[Node] = {triple[2] for triple in report_result.triples((None, focus_node, None))}
        for s in graph.subjects(unique=True):
            if isinstance(s, URIRef) and s not in invalid_nodes:
                for valid_subject_triple in graph.triples((s, None, None)):
                    valid_graph.add(valid_subject_triple)
        return valid_graph

    @staticmethod
    def import_entities_from_graph(
        g_set: GraphSet,
        results: SparqlResultRows | TripleLite | Graph | Dataset,
        resp_agent: str,
        enable_validation: bool = False,
        closed: bool = False,
    ) -> List[GraphEntity]:
        """Register in ``g_set`` every subject of ``results`` that has a known OCDM type.

        Args:
            g_set: the graph set receiving the entities.
            results: SPARQL result rows (a list), a ``TripleLite``, an rdflib
                ``Graph`` or an rdflib ``Dataset`` (whose graphs are merged).
            resp_agent: responsible agent passed to every ``g_set.add_*`` call.
            enable_validation: run ``graph_validation`` first and import only
                what it returns.
            closed: forwarded to ``graph_validation``.

        Returns:
            The entities created, in iteration order of the subjects. Subjects
            without a recognised type are skipped.
        """
        if isinstance(results, list):
            graph: TripleLite | Graph = build_graph_from_results(results)
        elif isinstance(results, Dataset):
            merged = TripleLite()
            for tl in from_rdflib(results):
                for triple in tl.triples((None, None, None)):
                    merged.add(triple)
            graph = merged
        else:
            graph = results

        if enable_validation:
            reader = Reader()
            if not isinstance(graph, Graph):
                graph = graph.to_rdflib()
            graph = reader.graph_validation(graph, closed)
        if isinstance(graph, Graph):
            graph = from_rdflib(graph)[0]

        # Ordered (type IRI, GraphSet factory name) pairs: when a subject has
        # several types, the first matching pair wins.
        importers: tuple[tuple[str, str], ...] = (
            (GraphEntity.iri_note, "add_an"),
            (GraphEntity.iri_role_in_time, "add_ar"),
            (GraphEntity.iri_bibliographic_reference, "add_be"),
            (GraphEntity.iri_expression, "add_br"),
            (GraphEntity.iri_citation, "add_ci"),
            (GraphEntity.iri_discourse_element, "add_de"),
            (GraphEntity.iri_identifier, "add_id"),
            (GraphEntity.iri_singleloc_pointer_list, "add_pl"),
            (GraphEntity.iri_agent, "add_ra"),
            (GraphEntity.iri_manifestation, "add_re"),
            (GraphEntity.iri_intextref_pointer, "add_rp"),
        )

        imported_entities: List[GraphEntity] = []
        for subject in graph.subjects():
            types: List[str] = [o.value for o in graph.objects(subject, RDF_TYPE)]
            preexisting = graph.subgraph(subject)
            for type_iri, factory_name in importers:
                if type_iri in types:
                    add_entity = getattr(g_set, factory_name)
                    imported_entities.append(
                        add_entity(resp_agent=resp_agent, res=subject, preexisting_graph=preexisting)
                    )
                    break
        return imported_entities

    @staticmethod
    def import_entity_from_triplestore(
        g_set: GraphSet, ts_url: str, res: str, resp_agent: str, enable_validation: bool = False
    ) -> GraphEntity:
        """Fetch one entity from a SPARQL endpoint and import it into ``g_set``.

        Args:
            g_set: the graph set receiving the entity.
            ts_url: URL of the SPARQL endpoint.
            res: IRI of the entity. It is inserted verbatim into the query, so
                it must be a trusted, well-formed IRI.
            resp_agent: responsible agent for the import.
            enable_validation: validate the triples before importing.

        Returns:
            The first entity imported from the query result.

        Raises:
            ValueError: when the endpoint returns no triples for ``res`` or
                none of them identifies a known OCDM entity.
            SPARQLEndpointError: on communication problems (after printing a
                diagnostic message).
        """
        query: str = f"SELECT ?s ?p ?o WHERE {{BIND (<{res}> AS ?s). ?s ?p ?o.}}"
        try:
            result = sparql_query(ts_url, query, max_retries=3, backoff_factor=2.5)["results"]["bindings"]

            if not result:
                raise ValueError(f"The requested entity {res} was not found in the triplestore.")

            imported_entities: List[GraphEntity] = Reader.import_entities_from_graph(
                g_set, result, resp_agent, enable_validation
            )
            if len(imported_entities) <= 0:
                raise ValueError("The requested entity was not recognized as a proper OCDM entity.")
            return imported_entities[0]

        except ValueError:
            raise
        except SPARQLEndpointError as e:
            print(f"[3] Could not import entity due to communication problems: {e}")
            raise

    @staticmethod
    def import_entities_from_triplestore(
        g_set: GraphSet,
        ts_url: str,
        entities: List[str],
        resp_agent: str,
        enable_validation: bool = False,
        batch_size: int = 1000,
    ) -> List[GraphEntity]:
        """Fetch several entities from a SPARQL endpoint, in batches, and import them.

        Args:
            g_set: the graph set receiving the entities.
            ts_url: URL of the SPARQL endpoint.
            entities: IRIs of the entities. They are inserted verbatim into the
                query, so they must be trusted, well-formed IRIs.
            resp_agent: responsible agent for the import.
            enable_validation: validate the triples before importing.
            batch_size: maximum number of entities per query; must be positive.

        Returns:
            All imported entities, batch after batch.

        Raises:
            ValueError: when ``entities`` is empty, ``batch_size`` is not
                positive, a batch returns no triples, some requested entity is
                missing from a batch result, or nothing could be imported.
            SPARQLEndpointError: on communication problems (after printing a
                diagnostic message).
        """
        if not entities:
            raise ValueError("No entities provided for import")
        if batch_size <= 0:
            # Fail early with a clear message instead of a range() error (0)
            # or a misleading "nothing found" error (negative values).
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        imported_entities: List[GraphEntity] = []

        try:
            for i in range(0, len(entities), batch_size):
                batch = entities[i : i + batch_size]
                not_found_entities = set(batch)

                union_patterns = [f"{{ BIND(<{entity}> AS ?s) ?s ?p ?o }}" for entity in batch]
                query = f"""
 SELECT ?s ?p ?o
 WHERE {{
 {" UNION ".join(union_patterns)}
 }}
 """

                results = sparql_query(ts_url, query, max_retries=3, backoff_factor=2.5)["results"]["bindings"]

                if not results:
                    # Sorted so the message is deterministic.
                    entities_str = ", ".join(sorted(not_found_entities))
                    raise ValueError(f"The requested entities were not found in the triplestore: {entities_str}")

                for result in results:
                    if "s" in result and "value" in result["s"]:
                        not_found_entities.discard(result["s"]["value"])

                batch_entities = Reader.import_entities_from_graph(
                    g_set=g_set, results=results, resp_agent=resp_agent, enable_validation=enable_validation
                )
                imported_entities.extend(batch_entities)

                if not_found_entities:
                    entities_str = ", ".join(sorted(not_found_entities))
                    raise ValueError(
                        f"The following entities were not recognized as proper OCDM entities: {entities_str}"
                    )

        except ValueError:
            raise
        except SPARQLEndpointError as e:
            print(f"[3] Could not import batch due to communication problems: {e}")
            raise

        if not imported_entities:
            raise ValueError("None of the requested entities were found or recognized as proper OCDM entities.")

        return imported_entities

Coverage for oc_ocdm / reader.py: 87%

287 statements