Coverage for oc_ocdm/support/support.py: 90%

1#!/usr/bin/python

3# SPDX-FileCopyrightText: 2022-2026 Arcangelo Massari <arcangelo.massari@unibo.it>

5# SPDX-License-Identifier: ISC

7# -*- coding: utf-8 -*-

8from __future__ import annotations

10import os

11import re

12from dataclasses import dataclass

13from datetime import datetime

14from functools import lru_cache

15from typing import TYPE_CHECKING

16from urllib.parse import quote

18from rdflib import Graph, Literal

19from rdflib.namespace import XSD as _RDFLIB_XSD

20from rdflib.term import Node

21from triplelite import XSD_STRING, RDFTerm, TripleLite

23from oc_ocdm._types import SparqlBinding, SparqlResultRows

24from oc_ocdm.constants import RDF_TYPE, XSD_DATE, XSD_GYEAR, XSD_GYEARMONTH, XSD_STRING

26_RDFLIB_XSD_STRING = _RDFLIB_XSD.string

28if TYPE_CHECKING:

29 from typing import Dict, List, Optional, Set, Tuple

31 from oc_ocdm.graph.entities.bibliographic.agent_role import AgentRole

32 from oc_ocdm.graph.entities.bibliographic.bibliographic_resource import BibliographicResource

33 from oc_ocdm.graph.entities.bibliographic.responsible_agent import ResponsibleAgent

@dataclass
class ParsedURI:
    """Components extracted from an entity or provenance URI by parse_uri.

    When a URI matches neither pattern, parse_uri returns an instance whose
    string fields are all empty and whose ``is_prov`` is False.
    """

    # Everything that precedes "/<short name>/" in the URI.
    base_iri: str
    # Two-letter lowercase code of the entity the URI identifies.
    short_name: str
    # Optional supplier prefix (pattern ``0[1-9]+0``); "" when absent.
    # parse_uri always sets it to "" for provenance URIs.
    prefix: str
    # Trailing number of the URI, kept as text.
    count: str
    # True when the URI matched the provenance pattern (".../prov/<xx>/<n>").
    is_prov: bool
    # The next three fields describe the entity a provenance URI is attached
    # to; parse_uri leaves them empty for non-provenance URIs.
    prov_subject_short_name: str
    prov_subject_prefix: str
    prov_subject_count: str

@dataclass
class ContributorSubList:
    """A partial, already ordered run of contributors.

    get_ordered_contributors_from_br builds several of these while scanning
    the agent roles and later concatenates them into the final list.
    """

    # Sequential identifier assigned when the sub-list is created.
    id: int
    # Agents of this run, in chain order (first element is the head).
    agents: list[ResponsibleAgent]

def sparql_binding_to_rdfterm(binding: SparqlBinding) -> RDFTerm:
    """Convert one SPARQL result binding into an RDFTerm.

    A binding whose "type" is "uri" becomes a "uri" term. Anything else
    becomes a "literal" term carrying the binding's "datatype" and
    "xml:lang" entries (each defaulting to ""). A literal with neither a
    datatype nor a language is given the XSD_STRING datatype.
    """
    if binding["type"] == "uri":
        return RDFTerm("uri", binding["value"])

    literal_datatype = binding.get("datatype", "")
    literal_lang = binding.get("xml:lang", "")
    if not (literal_datatype or literal_lang):
        # Plain literal: make the implicit string datatype explicit.
        literal_datatype = XSD_STRING
    return RDFTerm("literal", binding["value"], literal_datatype, literal_lang)

def normalize_graph_literals(g: Graph) -> None:
    """Give every plain literal object in *g* an explicit xsd:string datatype.

    Only literals that have neither a datatype nor a language tag are
    touched. The graph is modified in place.
    """
    # Collect the rewrites first so the graph is not mutated while iterated.
    pending: list[tuple[Node, Node, Literal, Literal]] = [
        (subj, pred, obj, Literal(str(obj), datatype=_RDFLIB_XSD_STRING))
        for subj, pred, obj in g
        if isinstance(obj, Literal) and obj.datatype is None and obj.language is None
    ]
    for subj, pred, plain, typed in pending:
        g.remove((subj, pred, plain))
        g.add((subj, pred, typed))

def create_date(date_list: Optional[List[Optional[int]]] = None) -> Optional[str]:
    """Format a ``[year, month, day]`` list as the most specific date string.

    Returns None when *date_list* is None, empty, or has no year.
    Otherwise returns:

    * "YYYY-MM-DD" when the list has exactly three non-None items and the
      month/day pair is not (1, 1);
    * "YYYY-MM" when a month is available (this includes the 1 January case
      excluded above);
    * "YYYY" in every other case.
    """
    if date_list is None:
        return None

    size = len(date_list)
    if size == 0 or date_list[0] is None:
        return None

    year = date_list[0]
    month = date_list[1] if size >= 2 else None

    if size == 3 and month is not None and date_list[2] is not None:
        day = date_list[2]
        # Month 1 / day 1 falls through to the year-month form below.
        if month != 1 or day != 1:
            return datetime(year, month, day).strftime("%Y-%m-%d")

    if month is not None:
        return datetime(year, month, 1).strftime("%Y-%m")
    return datetime(year, 1, 1).strftime("%Y")

def get_datatype_from_iso_8601(string: str) -> Tuple[str, str]:
    """Return the XSD datatype and normalised text for an ISO-8601 date.

    Only the first ten characters ("yyyy-mm-dd") of *string* are examined.
    Three dash-separated parts give (XSD_DATE, "YYYY-MM-DD"), two give
    (XSD_GYEARMONTH, "YYYY-MM") and one gives (XSD_GYEAR, "YYYY").

    Raises ValueError when a part is not an integer; datetime() may also
    raise ValueError for out-of-range values.
    """
    date_text = string[:10]

    try:
        numbers: List[int] = [int(piece) for piece in date_text.split("-", 2)]
    except ValueError:
        raise ValueError("The provided date string is not ISO-8601 compliant!")

    if len(numbers) == 3:
        year, month, day = numbers
        return XSD_DATE, datetime(year, month, day).strftime("%Y-%m-%d")
    if len(numbers) == 2:
        year, month = numbers
        return XSD_GYEARMONTH, datetime(year, month, 1).strftime("%Y-%m")
    return XSD_GYEAR, datetime(numbers[0], 1, 1).strftime("%Y")

109

110

def get_ordered_contributors_from_br(br: BibliographicResource, contributor_type: str) -> List[ResponsibleAgent]:
    """Return the agents of *br* holding *contributor_type*, in chain order.

    The agent roles come from ``br.get_contributors()``. A role is kept when
    its ``get_role_type()`` equals ``str(contributor_type)`` and
    ``get_is_held_by()`` is not None. Roles are linked through
    ``get_next()``; because they may be visited in any order, partial runs
    are built first and then stitched together, starting from the run whose
    last role has no successor.

    Returns an empty list when no matching role exists. Raises ValueError
    when the roles do not form a single chain.
    """
    malformed = "A malformed list of AgentRole entities was given."
    wanted_role = str(contributor_type)

    next_list_id: int = 0
    # heads: resource of the first role of a run -> that run.
    heads: Dict[str, ContributorSubList] = {}
    # tails: resource a run expects as its next role -> that run.
    tails: Dict[str, ContributorSubList] = {}
    sub_lists: List[ContributorSubList] = []
    head_res_by_id: Dict[int, str] = {}

    for role in br.get_contributors():
        role_type: Optional[str] = role.get_role_type()
        agent: Optional[ResponsibleAgent] = role.get_is_held_by()
        successor: Optional[AgentRole] = role.get_next()
        successor_res: Optional[str] = None if successor is None else successor.res

        if role_type is None or role_type != wanted_role or agent is None:
            continue

        if successor_res is not None and successor_res in heads:
            # The successor already opens a run: this role becomes its new head.
            run = heads.pop(successor_res)
            run.agents.insert(0, agent)
            heads[role.res] = run
            head_res_by_id[run.id] = role.res
        elif role.res in tails:
            # Some run was waiting for this role: extend it at the end.
            run = tails.pop(role.res)
            run.agents.append(agent)
            if successor_res is not None:
                tails[successor_res] = run
        else:
            # This role fits no existing run, so it starts a new one.
            run = ContributorSubList(next_list_id, [agent])
            next_list_id += 1
            sub_lists.append(run)
            heads[role.res] = run
            head_res_by_id[run.id] = role.res
            if successor_res is not None:
                tails[successor_res] = run

    ids_in_heads: Set[int] = {run.id for run in heads.values()}
    ids_in_tails: Set[int] = {run.id for run in tails.values()}
    # Runs that are not waiting for any successor, i.e. candidates for the
    # end of the whole chain.
    closing_ids: Set[int] = ids_in_heads - ids_in_tails
    if not closing_ids:
        # No contributor was found!
        return []
    if len(closing_ids) != 1:
        raise ValueError(malformed)

    ordered: List[ResponsibleAgent] = []
    merged_ids: Set[int] = set()
    current_id: int = closing_ids.pop()
    while True:
        if current_id not in head_res_by_id:
            raise ValueError(malformed)
        head_res: str = head_res_by_id[current_id]
        run = heads[head_res]
        if run.id in merged_ids:
            raise ValueError(malformed)
        merged_ids.add(run.id)
        # Walking backwards from the end, so each run goes in front.
        ordered = run.agents + ordered

        if head_res not in tails:
            break
        # The run waiting for this head is the one that precedes it.
        current_id = tails[head_res].id

    if ids_in_heads - merged_ids:
        raise ValueError(malformed)

    return ordered

193

194

def encode_url(u: str) -> str:
    """Percent-encode *u*, leaving ':' and '/' characters untouched."""
    return quote(u, safe="://")

197

198

def create_literal(g: TripleLite, res: str, p: str, s: str, dt: str | None = None, nor: bool = True) -> None:
    """Add the triple ``(res, p, literal s)`` to *g* unless *s* is empty.

    The literal uses *dt* as its datatype, or XSD_STRING when *dt* is None.
    Nothing is added when is_string_empty(s) is true. The *nor* argument is
    accepted but not used by this function.
    """
    if is_string_empty(s):
        return
    datatype = XSD_STRING if dt is None else dt
    g.add((res, p, RDFTerm("literal", s, datatype)))

202

203

def create_type(g: TripleLite, res: str, res_type: str) -> None:
    """Add the triple ``(res, RDF_TYPE, <res_type>)`` to *g*."""
    type_term = RDFTerm("uri", res_type)
    g.add((res, RDF_TYPE, type_term))

206

207

def is_string_empty(string: Optional[str]) -> bool:
    """Return True when *string* is None or contains only whitespace."""
    if string is None:
        return True
    return not string.strip()

210

211

# URI patterns shared by several functions of this module.
#
# Entity URI:      <base>/<xx>/<optional prefix><count>
# Provenance URI:  <entity URI>/prov/<xx>/<count>
#
# <xx> is a two-letter lowercase code, the optional prefix has the form
# 0[1-9]+0 and the entity count is either a positive integer or "<n>-<m>".
entity_regex: str = r"^(.+)/([a-z][a-z])/(0[1-9]+0)?((?:[1-9][0-9]*)|(?:\d+-\d+))$"
_compiled_entity_regex = re.compile(entity_regex)

prov_regex: str = r"^(.+)/([a-z][a-z])/(0[1-9]+0)?((?:[1-9][0-9]*)|(?:\d+-\d+))/prov/([a-z][a-z])/([1-9][0-9]*)$"
_compiled_prov_regex = re.compile(prov_regex)

218

219

@lru_cache(maxsize=4096)
def parse_uri(res: str) -> ParsedURI:
    """Split an entity or provenance URI into its components.

    A URI containing "/prov/" is matched against the provenance pattern,
    any other URI against the entity pattern. When the applicable pattern
    does not match, a ParsedURI with empty strings and ``is_prov=False`` is
    returned.

    Results are memoised with lru_cache, so the same ParsedURI object may be
    handed to several callers.
    """
    iri = str(res)

    if "/prov/" in iri:
        found = _compiled_prov_regex.match(iri)
        if found is not None:
            base, subject_short, subject_prefix, subject_count, short, count = found.groups()
            return ParsedURI(
                base_iri=base,
                short_name=short,
                prefix="",
                count=count,
                is_prov=True,
                prov_subject_short_name=subject_short,
                # The optional prefix group is None when absent.
                prov_subject_prefix=subject_prefix or "",
                prov_subject_count=subject_count,
            )
    else:
        found = _compiled_entity_regex.match(iri)
        if found is not None:
            base, short, prefix, count = found.groups()
            return ParsedURI(
                base_iri=base,
                short_name=short,
                prefix=prefix or "",
                count=count,
                is_prov=False,
                prov_subject_short_name="",
                prov_subject_prefix="",
                prov_subject_count="",
            )

    # Neither pattern matched.
    return ParsedURI("", "", "", "", False, "", "", "")

250

251

def get_base_iri(res: str) -> str:
    """Return the ``base_iri`` component of *res* ("" if it cannot be parsed)."""
    parsed = parse_uri(res)
    return parsed.base_iri

254

255

def get_short_name(res: str) -> str:
    """Return the ``short_name`` component of *res* ("" if it cannot be parsed)."""
    parsed = parse_uri(res)
    return parsed.short_name

258

259

def get_prefix(res: str) -> str:
    """Return the ``prefix`` component of *res* ("" if absent or unparsable)."""
    parsed = parse_uri(res)
    return parsed.prefix

262

263

def get_count(res: str) -> str:
    """Return the ``count`` component of *res* ("" if it cannot be parsed)."""
    parsed = parse_uri(res)
    return parsed.count

266

267

def get_resource_number(res: str) -> int:
    """Return the numeric count of *res* as an integer.

    For a provenance URI the count of the subject entity is used. Returns 0
    when the count is empty, which is what parse_uri yields for a URI it
    cannot parse.
    """
    parsed = parse_uri(res)
    raw_count = parsed.prov_subject_count if parsed.is_prov else parsed.count
    if not raw_count:
        return 0
    return int(raw_count)

272

273

def find_local_line_id(res: str, n_file_item: int = 1) -> int:
    """Return the 1-based position of *res* inside its file of *n_file_item* items.

    The resource number (see get_resource_number) is reduced to the range
    ``1..n_file_item``: numbers 1..n map to 1..n, n+1 maps to 1 again, and so
    on. A resource number of 0 maps to ``n_file_item``.

    Raises ValueError when *n_file_item* is not positive. The previous
    step-by-step search never terminated in that case for any positive
    resource number.
    """
    if n_file_item <= 0:
        raise ValueError("n_file_item must be a positive integer")

    cur_number: int = get_resource_number(res)
    # Closed form of "subtract the largest multiple of n_file_item that is
    # strictly below cur_number"; Python's modulo keeps the 0 case at n.
    return (cur_number - 1) % n_file_item + 1

286

287

def find_paths(
    res: str,
    base_dir: str,
    base_iri: str,
    default_dir: str,
    dir_split: int,
    n_file_item: int,
    is_json: bool = True,
    process_id: int | str | None = None,
) -> Tuple[str, str]:
    """
    Compute the directory and the file in which the data about 'res' is stored.

    Dataset IRIs (see is_dataset) map to '<dir>/index<suffix>.json', where <dir> is 'base_dir' followed by
    the part of the IRI that comes after 'base_iri', minus its final character (presumably the trailing
    slash - confirm against callers).

    Every other IRI is parsed with parse_uri. Its number is rounded up to the next multiple of
    'n_file_item' (the file split) and, when 'dir_split' is non-zero and the IRI does not start with
    '<base_iri>prov/', to the next multiple of 'dir_split' (the directory split). The path is then built
    from the short name, the supplier prefix (or 'default_dir', or "_" when both are empty) and those
    splits. Provenance data go into a 'prov' sub-directory of the subject entity.

    <suffix> is '_<process_id>' when 'process_id' is truthy, otherwise empty. Files end in '.json' when
    'is_json' is true, otherwise in '.nq' or, for non-provenance data stored by file split, '.nt'.

    Returns the tuple (directory path, file path). Raises ValueError when the count of a non-dataset
    IRI is not a plain integer, e.g. because the IRI could not be parsed.
    """
    string_iri: str = str(res)
    process_id_str: str = f"_{process_id}" if process_id else ""

    if is_dataset(res):
        # base_iri is literal text: escape it so that '.', '+', '?' and the
        # like are not interpreted as regex operators.
        relative_part = re.sub(r"^%s(.*)$" % re.escape(base_iri), r"\1", string_iri)
        cur_dir_path: str = (base_dir + relative_part)[:-1]
        cur_file_path: str = cur_dir_path + os.sep + "index" + process_id_str + ".json"
        return cur_dir_path, cur_file_path

    parsed = parse_uri(res)
    cur_number: int = int(parsed.prov_subject_count) if parsed.is_prov else int(parsed.count)

    # Upper bound of the file that holds this number, e.g. 1..1000 -> 1000.
    cur_file_split: int = ((cur_number - 1) // n_file_item + 1) * n_file_item if cur_number > 0 else n_file_item

    use_dir_split = bool(dir_split) and not string_iri.startswith(base_iri + "prov/")

    if use_dir_split or dir_split == 0:
        # Both layouts are identical except for the extra directory-split level.
        if use_dir_split:
            cur_split: int = ((cur_number - 1) // dir_split + 1) * dir_split if cur_number > 0 else dir_split
            split_levels = [str(cur_split)]
        else:
            split_levels = []

        if parsed.is_prov:
            sub_folder = parsed.prov_subject_prefix or default_dir or "_"
            file_extension = ".json" if is_json else ".nq"
            cur_dir_path = base_dir + os.sep.join(
                [parsed.prov_subject_short_name, sub_folder, *split_levels, str(cur_file_split), "prov"]
            )
            cur_file_path = cur_dir_path + os.sep + parsed.short_name + process_id_str + file_extension
        else:
            sub_folder = parsed.prefix or default_dir or "_"
            file_extension = ".json" if is_json else ".nt"
            cur_dir_path = base_dir + os.sep.join([parsed.short_name, sub_folder, *split_levels])
            cur_file_path = cur_dir_path + os.sep + str(cur_file_split) + process_id_str + file_extension
    else:
        # dir_split is set but the IRI starts with '<base_iri>prov/': one
        # file per resource, directly under the short-name directory.
        file_extension = ".json" if is_json else ".nq"
        cur_dir_path = base_dir + parsed.short_name
        cur_file_path = cur_dir_path + os.sep + parsed.prefix + parsed.count + process_id_str + file_extension

    return cur_dir_path, cur_file_path

367

368

def has_supplier_prefix(res: str, base_iri: str) -> bool:
    """Return True when *res* is ``<base_iri><xx>/0...``.

    That is: *base_iri*, a two-letter lowercase short name, a slash and a
    number beginning with "0" (the first character of a supplier prefix).
    *base_iri* is matched literally; it is escaped before being placed in
    the pattern so characters such as '.' or '+' are not treated as regex
    operators.
    """
    pattern = r"^%s[a-z][a-z]/0" % re.escape(base_iri)
    return re.search(pattern, str(res)) is not None

372

373

def build_graph_from_results(results: SparqlResultRows) -> TripleLite:
    """Build a TripleLite graph from SPARQL result rows.

    Each row must provide "s", "p" and "o" bindings. The subject and
    predicate are taken from their "value" entries; the object is converted
    with sparql_binding_to_rdfterm.
    """
    graph = TripleLite()
    for row in results:
        subject = row["s"]["value"]
        predicate = row["p"]["value"]
        graph.add((subject, predicate, sparql_binding_to_rdfterm(row["o"])))
    return graph

379

380

def is_dataset(res: str) -> bool:
    """Return True when *res* does not look like a numbered resource.

    A numbered resource ends in "/<n>", "/<n>-<m>" or either of those
    followed by "/<k>"; every other IRI is treated as a dataset.
    """
    numbered = re.match(r"^.+/[0-9]+(-[0-9]+)?(/[0-9]+)?$", str(res))
    return numbered is None

Coverage for oc_ocdm / support / support.py: 90%

223 statements