Coverage for src / time_agnostic_library / ocdm_converter.py: 100%

247 statements  

« prev     ^ index     » next       coverage.py v7.13.3, created at 2026-03-21 11:54 +0000

1# SPDX-FileCopyrightText: 2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5import gzip 

6import re 

7from collections import defaultdict 

8from collections.abc import Callable 

9from concurrent.futures import ThreadPoolExecutor 

10from datetime import datetime 

11from pathlib import Path 

12 

# Namespace IRIs used when emitting provenance quads.
PROV_NS = "http://www.w3.org/ns/prov#"  # W3C PROV-O: specializationOf, generatedAtTime, ...
OCO_NS = "https://w3id.org/oc/ontology/"  # OpenCitations ontology: hasUpdateQuery
DCTERMS_NS = "http://purl.org/dc/terms/"  # Dublin Core: description
XSD_NS = "http://www.w3.org/2001/XMLSchema#"  # XSD datatypes: dateTime

17 

18# Regex to parse an N-Triples line into (subject, predicate, object) in a single 

19# C-level pass. Falls back to the character-by-character parser on mismatch. 

20# 

21# N-Triples line format: <subject> <predicate> <object> . 

22# 

23# Group 1 - subject: URI <http://...> or blank node _:id 

24# Group 2 - predicate: always a URI <http://...> 

25# Group 3 - object, one of: 

26# - URI: <http://...> 

27# - literal: "text" optionally followed by @lang or ^^<datatype> 

28# - blank node: _:id 

29_NT_RE = re.compile( 

30 r"(<[^>]+>|_:\S+)\s+" # group 1: subject (URI or blank node) 

31 r"(<[^>]+>)\s+" # group 2: predicate (URI) 

32 r"(<[^>]+>" # group 3 option a: URI object 

33 r'|"(?:[^"\\]|\\.)*"' # group 3 option b: quoted literal (handles escapes) 

34 r"(?:@[a-zA-Z-]+|\^\^<[^>]+>)?" # optional language tag or datatype 

35 r"|_:\S+)" # group 3 option c: blank node object 

36 r"\s*\.\s*$" # trailing dot and whitespace 

37) 

38 

39 

40def parse_ntriples_line( 

41 line: str, 

42 object_normalizer: Callable[[str], str] | None = None, 

43) -> tuple[str, str, str] | None: 

44 line = line.strip() 

45 if not line or line.startswith("#"): 

46 return None 

47 m = _NT_RE.match(line) 

48 if m: 

49 obj = m.group(3) 

50 if object_normalizer: 

51 obj = object_normalizer(obj) 

52 return (m.group(1), m.group(2), obj) 

53 if line.endswith(" ."): 

54 line = line[:-2] 

55 elif line.endswith("."): 

56 line = line[:-1] 

57 line = line.strip() 

58 parts = [] 

59 i = 0 

60 while i < len(line) and len(parts) < 3: 

61 if line[i] == "<": 

62 end = line.index(">", i) 

63 parts.append(line[i:end + 1]) 

64 i = end + 1 

65 elif line[i] == '"': 

66 j = i + 1 

67 while j < len(line): 

68 if line[j] == "\\" and j + 1 < len(line): 

69 j += 2 

70 continue 

71 if line[j] == '"': 

72 break 

73 j += 1 

74 end_quote = j 

75 rest_start = end_quote + 1 

76 if rest_start < len(line) and line[rest_start:rest_start + 2] == "^^": 

77 dt_start = rest_start + 2 

78 if dt_start < len(line) and line[dt_start] == "<": 

79 dt_end = line.index(">", dt_start) 

80 parts.append(line[i:dt_end + 1]) 

81 i = dt_end + 1 

82 else: 

83 space = line.find(" ", dt_start) 

84 if space == -1: 

85 parts.append(line[i:]) 

86 i = len(line) 

87 else: 

88 parts.append(line[i:space]) 

89 i = space 

90 elif rest_start < len(line) and line[rest_start] == "@": 

91 space = line.find(" ", rest_start) 

92 if space == -1: 

93 parts.append(line[i:]) 

94 i = len(line) 

95 else: 

96 parts.append(line[i:space]) 

97 i = space 

98 else: 

99 parts.append(line[i:end_quote + 1]) 

100 i = end_quote + 1 

101 elif line[i] == "_": 

102 space = line.find(" ", i) 

103 if space == -1: 

104 parts.append(line[i:]) 

105 i = len(line) 

106 else: 

107 parts.append(line[i:space]) 

108 i = space 

109 elif line[i] == " " or line[i] == "\t": 

110 i += 1 

111 else: 

112 space = line.find(" ", i) 

113 if space == -1: 

114 parts.append(line[i:]) 

115 i = len(line) 

116 else: 

117 parts.append(line[i:space]) 

118 i = space 

119 if len(parts) == 3: 

120 obj = parts[2] 

121 if object_normalizer: 

122 obj = object_normalizer(obj) 

123 return (parts[0], parts[1], obj) 

124 return None 

125 

126 

def extract_subject_uri(s_term: str) -> str:
    """Strip the angle brackets from a URI term; pass other terms through."""
    is_uri = s_term.startswith("<") and s_term.endswith(">")
    return s_term[1:-1] if is_uri else s_term

131 

132 

133def _open_ntriples(filepath: Path): 

134 if filepath.suffix == ".gz": 

135 return gzip.open(filepath, "rt", encoding="utf-8", errors="replace") 

136 return open(filepath, "r", encoding="utf-8", errors="replace") 

137 

138 

def read_ntriples_file(
    filepath: Path,
    object_normalizer: Callable[[str], str] | None = None,
) -> list[tuple[str, str, str]]:
    """Parse every triple in *filepath*, skipping blanks, comments and bad lines."""
    with _open_ntriples(filepath) as stream:
        return [
            triple
            for raw_line in stream
            if (triple := parse_ntriples_line(raw_line, object_normalizer)) is not None
        ]

150 

151 

def group_triples_by_subject(
    triples: list[tuple[str, str, str]],
) -> dict[str, set[tuple[str, str]]]:
    """Index triples as ``{subject_uri: {(predicate, object), ...}}``."""
    grouped: dict[str, set[tuple[str, str]]] = defaultdict(set)
    for subj, pred, obj in triples:
        # Inline of extract_subject_uri: strip <> from URI subjects, leave
        # blank nodes untouched.
        key = subj[1:-1] if subj.startswith("<") and subj.endswith(">") else subj
        grouped[key].add((pred, obj))
    return grouped

160 

161 

def _read_and_group(
    filepath: Path,
    object_normalizer: Callable[[str], str] | None = None,
) -> dict[str, set[tuple[str, str]]]:
    """Read an N-Triples file and group its (predicate, object) pairs by subject.

    Single-pass equivalent of group_triples_by_subject(read_ntriples_file(...)):
    the regex fast path is tried inline per line and only mismatching lines
    fall back to parse_ntriples_line.  The object_normalizer check is hoisted
    out of the per-line loop, which is why the two branches below are
    near-identical copies.
    """
    by_subject: dict[str, set[tuple[str, str]]] = defaultdict(set)
    match = _NT_RE.match  # hoist the attribute lookup out of the hot loop
    with _open_ntriples(filepath) as f:
        if object_normalizer:
            for line in f:
                m = match(line)
                if m:
                    s, p, obj = m.groups()
                    obj = object_normalizer(obj)
                    # The regex guarantees URIs end with ">", so checking only
                    # the first character is sufficient here.
                    uri = s[1:-1] if s[0] == "<" else s
                    by_subject[uri].add((p, obj))
                else:
                    # Slow path: comments, blanks, or unusually formatted
                    # lines are handled (or rejected) by the full parser.
                    parsed = parse_ntriples_line(line, object_normalizer)
                    if parsed:
                        s, p, o = parsed
                        uri = s[1:-1] if s[0] == "<" and s[-1] == ">" else s
                        by_subject[uri].add((p, o))
        else:
            # Same loop as above without the per-line normalizer call.
            for line in f:
                m = match(line)
                if m:
                    s, p, obj = m.groups()
                    uri = s[1:-1] if s[0] == "<" else s
                    by_subject[uri].add((p, obj))
                else:
                    parsed = parse_ntriples_line(line, object_normalizer)
                    if parsed:
                        s, p, o = parsed
                        uri = s[1:-1] if s[0] == "<" and s[-1] == ">" else s
                        by_subject[uri].add((p, o))
    return by_subject

197 

198 

199def _format_timestamp(dt: datetime) -> str: 

200 return dt.strftime("%Y-%m-%dT%H:%M:%S+00:00") 

201 

202 

203def _build_update_query( 

204 entity_uri: str, 

205 data_graph_uri: str, 

206 deleted_po: set[tuple[str, str]], 

207 added_po: set[tuple[str, str]], 

208) -> str: 

209 parts = [] 

210 if deleted_po: 

211 triples = " ".join(f"<{entity_uri}> {p} {o} ." for p, o in deleted_po) 

212 parts.append(f"DELETE DATA {{ GRAPH <{data_graph_uri}> {{ {triples} }} }}") 

213 if added_po: 

214 triples = " ".join(f"<{entity_uri}> {p} {o} ." for p, o in added_po) 

215 parts.append(f"INSERT DATA {{ GRAPH <{data_graph_uri}> {{ {triples} }} }}") 

216 return "; ".join(parts) 

217 

218 

219def _escape_sparql_for_nquads(query: str) -> str: 

220 escaped = query.replace("\\", "\\\\") 

221 escaped = escaped.replace('"', '\\"') 

222 escaped = escaped.replace("\n", "\\n") 

223 escaped = escaped.replace("\r", "\\r") 

224 escaped = escaped.replace("\t", "\\t") 

225 return escaped 

226 

227 

228class OCDMConverter: 

229 def __init__( 

230 self, 

231 data_graph_uri: str, 

232 agent_uri: str, 

233 object_normalizer: Callable[[str], str] | None = None, 

234 ): 

235 self.data_graph_uri = data_graph_uri 

236 self.agent_uri = agent_uri 

237 self.object_normalizer = object_normalizer 

238 

239 def convert_from_ic( 

240 self, 

241 ic_files: list[Path], 

242 timestamps: list[datetime], 

243 dataset_output: Path, 

244 provenance_output: Path, 

245 ) -> None: 

246 all_entities: set[str] = set() 

247 entity_changes: dict[str, list[tuple[int, set[tuple[str, str]], set[tuple[str, str]]]]] = defaultdict(list) 

248 prev_by_subject: dict[str, set[tuple[str, str]]] = {} 

249 latest_by_subject: dict[str, set[tuple[str, str]]] = {} 

250 

251 # Prefetch pipeline: a single background thread reads and parses the 

252 # next IC file while the main thread diffs the current pair. 

253 with ThreadPoolExecutor(max_workers=1) as executor: 

254 future = executor.submit(_read_and_group, ic_files[0], self.object_normalizer) 

255 

256 for version_idx in range(len(ic_files)): 

257 cur_by_subject = future.result() 

258 

259 if version_idx + 1 < len(ic_files): 

260 future = executor.submit( 

261 _read_and_group, ic_files[version_idx + 1], self.object_normalizer 

262 ) 

263 

264 all_entities.update(cur_by_subject.keys()) 

265 

266 if version_idx > 0: 

267 for entity_uri in prev_by_subject.keys() | cur_by_subject.keys(): 

268 prev_po = prev_by_subject.get(entity_uri, set()) 

269 cur_po = cur_by_subject.get(entity_uri, set()) 

270 deleted_po = prev_po - cur_po 

271 added_po = cur_po - prev_po 

272 if deleted_po or added_po: 

273 entity_changes[entity_uri].append((version_idx, deleted_po, added_po)) 

274 

275 prev_by_subject = cur_by_subject 

276 if version_idx == len(ic_files) - 1: 

277 latest_by_subject = cur_by_subject 

278 

279 self._write_ocdm_output( 

280 all_entities, entity_changes, latest_by_subject, 

281 timestamps, dataset_output, provenance_output, 

282 ) 

283 

284 def convert_from_cb( 

285 self, 

286 initial_snapshot: Path, 

287 changesets: list[tuple[Path, Path]], 

288 timestamps: list[datetime], 

289 dataset_output: Path, 

290 provenance_output: Path, 

291 ) -> None: 

292 all_entities: set[str] = set() 

293 entity_changes: dict[str, list[tuple[int, set[tuple[str, str]], set[tuple[str, str]]]]] = defaultdict(list) 

294 

295 current_state: dict[str, set[tuple[str, str]]] = defaultdict( 

296 set, _read_and_group(initial_snapshot, self.object_normalizer) 

297 ) 

298 all_entities.update(current_state.keys()) 

299 

300 # Read the added and deleted files of each changeset in parallel. 

301 with ThreadPoolExecutor(max_workers=2) as executor: 

302 for changeset_idx, (added_file, deleted_file) in enumerate(changesets): 

303 version_idx = changeset_idx + 1 

304 

305 fut_del = executor.submit(_read_and_group, deleted_file, self.object_normalizer) 

306 fut_add = executor.submit(_read_and_group, added_file, self.object_normalizer) 

307 deleted_by_subject = fut_del.result() 

308 added_by_subject = fut_add.result() 

309 

310 changed_entities = deleted_by_subject.keys() | added_by_subject.keys() 

311 all_entities.update(changed_entities) 

312 

313 for entity_uri in changed_entities: 

314 deleted_po = deleted_by_subject.get(entity_uri, set()) 

315 added_po = added_by_subject.get(entity_uri, set()) 

316 

317 current_state[entity_uri] -= deleted_po 

318 current_state[entity_uri] |= added_po 

319 

320 if not current_state[entity_uri]: 

321 del current_state[entity_uri] 

322 

323 if deleted_po or added_po: 

324 entity_changes[entity_uri].append((version_idx, deleted_po, added_po)) 

325 

326 self._write_ocdm_output( 

327 all_entities, entity_changes, current_state, 

328 timestamps, dataset_output, provenance_output, 

329 ) 

330 

331 def _write_ocdm_output( 

332 self, 

333 all_entities: set[str], 

334 entity_changes: dict[str, list[tuple[int, set[tuple[str, str]], set[tuple[str, str]]]]], 

335 latest_by_subject: dict[str, set[tuple[str, str]]], 

336 timestamps: list[datetime], 

337 dataset_output: Path, 

338 provenance_output: Path, 

339 ) -> None: 

340 dataset_output.parent.mkdir(parents=True, exist_ok=True) 

341 provenance_output.parent.mkdir(parents=True, exist_ok=True) 

342 sorted_entities = sorted(all_entities) 

343 

344 lines = [] 

345 for entity_uri in sorted_entities: 

346 po_set = latest_by_subject.get(entity_uri, set()) 

347 for p, o in sorted(po_set): 

348 lines.append(f"<{entity_uri}> {p} {o} <{self.data_graph_uri}> .\n") 

349 with open(dataset_output, "w", encoding="utf-8") as f: 

350 f.writelines(lines) 

351 

352 lines = [] 

353 for entity_uri in sorted_entities: 

354 prov_graph = f"<{entity_uri}/prov/>" 

355 changes = entity_changes.get(entity_uri, []) 

356 

357 se1_uri = f"<{entity_uri}/prov/se/1>" 

358 t0 = _format_timestamp(timestamps[0]) 

359 

360 lines.append(f'{se1_uri} <{PROV_NS}specializationOf> <{entity_uri}> {prov_graph} .\n') 

361 lines.append(f'{se1_uri} <{PROV_NS}generatedAtTime> "{t0}"^^<{XSD_NS}dateTime> {prov_graph} .\n') 

362 lines.append(f'{se1_uri} <{PROV_NS}wasAttributedTo> <{self.agent_uri}> {prov_graph} .\n') 

363 lines.append(f'{se1_uri} <{DCTERMS_NS}description> "The entity has been created." {prov_graph} .\n') 

364 

365 for change_idx, (version_idx, deleted_po, added_po) in enumerate(changes): 

366 se_num = change_idx + 2 

367 se_uri = f"<{entity_uri}/prov/se/{se_num}>" 

368 timestamp = _format_timestamp(timestamps[version_idx]) 

369 

370 lines.append(f'{se_uri} <{PROV_NS}specializationOf> <{entity_uri}> {prov_graph} .\n') 

371 lines.append(f'{se_uri} <{PROV_NS}generatedAtTime> "{timestamp}"^^<{XSD_NS}dateTime> {prov_graph} .\n') 

372 lines.append(f'{se_uri} <{PROV_NS}wasAttributedTo> <{self.agent_uri}> {prov_graph} .\n') 

373 

374 update_query = _build_update_query(entity_uri, self.data_graph_uri, deleted_po, added_po) 

375 escaped_query = _escape_sparql_for_nquads(update_query) 

376 lines.append(f'{se_uri} <{OCO_NS}hasUpdateQuery> "{escaped_query}" {prov_graph} .\n') 

377 lines.append(f'{se_uri} <{DCTERMS_NS}description> "The entity has been modified." {prov_graph} .\n') 

378 

379 prev_se_uri = f"<{entity_uri}/prov/se/{se_num - 1}>" 

380 lines.append(f'{se_uri} <{PROV_NS}wasDerivedFrom> {prev_se_uri} {prov_graph} .\n') 

381 

382 if change_idx == len(changes) - 1 and entity_uri not in latest_by_subject: 

383 lines.append(f'{se_uri} <{PROV_NS}invalidatedAtTime> "{timestamp}"^^<{XSD_NS}dateTime> {prov_graph} .\n') 

384 

385 with open(provenance_output, "w", encoding="utf-8") as f: 

386 f.writelines(lines)