Coverage for oc_meta/lib/finder.py: 80%

640 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-07-14 14:06 +0000

1from time import sleep 

2from typing import Dict, List, Tuple 

3 

4import yaml 

5from dateutil import parser 

6from oc_meta.plugins.editor import MetaEditor 

7from oc_ocdm.graph import GraphEntity 

8from oc_ocdm.graph.graph_entity import GraphEntity 

9from oc_ocdm.prov.prov_entity import ProvEntity 

10from oc_ocdm.support import get_count, get_resource_number 

11from rdflib import RDF, XSD, Graph, Literal, URIRef 

12from SPARQLWrapper import JSON, POST, SPARQLWrapper 

13from time_agnostic_library.agnostic_entity import AgnosticEntity 

14 

15 

16class ResourceFinder: 

17 

def __init__(self, ts_url, base_iri: str, local_g: Graph = None, settings: dict = None, meta_config_path: str = None):
    '''
    Set up the SPARQL endpoint wrapper and the state used for local lookups.

    :params ts_url: the URL of the triplestore SPARQL endpoint
    :type ts_url: str
    :params base_iri: the base IRI of the dataset; a single trailing slash, if present, is removed
    :type base_iri: str
    :params local_g: the graph used for local lookups; a new empty Graph is created when omitted
    :type local_g: Graph
    :params settings: optional settings; the keys 'blazegraph_full_text_search' and 'virtuoso_full_text_search' are read from it
    :type settings: dict
    :params meta_config_path: the configuration path later handed to MetaEditor
    :type meta_config_path: str
    '''
    self.ts = SPARQLWrapper(ts_url)
    self.ts.setMethod(POST)
    # endswith() also copes with an empty base IRI, where indexing [-1] would raise
    self.base_iri = base_iri[:-1] if base_iri.endswith('/') else base_iri
    # Defaults are created per instance: a default evaluated in the signature
    # would be one single object shared by every ResourceFinder
    self.local_g = Graph() if local_g is None else local_g
    self.ids_in_local_g = set()
    self.meta_config_path = meta_config_path
    self.meta_settings = dict() if settings is None else settings
    self.blazegraph_full_text_search = self.meta_settings.get('blazegraph_full_text_search', False)
    self.virtuoso_full_text_search = self.meta_settings.get('virtuoso_full_text_search', False)

28 

def __query(self, query, return_format = JSON):
    '''
    Execute a SPARQL query, retrying with exponential backoff when it fails.

    :params query: the text of the SPARQL query
    :type query: str
    :params return_format: the return format set on the SPARQL wrapper (JSON by default)
    :returns: the converted result of the query
    :raises Exception: if every attempt fails; the message reports the last error and the query, and the last error is attached as the cause
    '''
    self.ts.setReturnFormat(return_format)
    self.ts.setQuery(query)
    max_retries = 5
    base_wait = 5  # base waiting time in seconds, doubled after each failed attempt

    for attempt in range(max_retries):
        try:
            return self.ts.queryAndConvert()
        except Exception as e:
            if attempt < max_retries - 1:
                # Not the last attempt yet: wait 5, 10, 20, 40 seconds and try again
                sleep(base_wait * (2 ** attempt))
                continue
            # Last attempt failed: log the error and raise, keeping the original error as the cause
            error_msg = f"Failed to execute SPARQL query after {max_retries} attempts: {str(e)}\nQuery: {query}"
            print(error_msg)
            raise Exception(error_msg) from e

49 

50 # _______________________________BR_________________________________ # 

51 

def retrieve_br_from_id(self, schema: str, value: str) -> List[Tuple[str, str, list]]:
    '''
    Given an identifier, it retrieves from the local graph the bibliographic resources associated with that identifier, their titles and their other identifiers (MetaIDs and literal values).

    :params schema: an identifier schema
    :type schema: str
    :params value: an identifier literal value
    :type value: str
    :returns List[Tuple[str, str, list]]: -- it returns a list of three-element tuples. The first element is the MetaID of a resource associated with the input ID. The second element is the title of that resource, or an empty string. The third element is a list of MetaID-ID tuples describing the identifiers associated with that resource. The list is empty if the identifier is not in the local graph.
    '''
    schema_uri = URIRef(GraphEntity.DATACITE + schema)
    value = value.replace('\\', '\\\\')
    result_list = []
    identifier_uri = None

    # Search for both string-typed and untyped literals.
    # The first identifier entity using the requested scheme wins, as in retrieve_metaid_from_id.
    for literal_value in [Literal(value, datatype=XSD.string), Literal(value)]:
        for starting_triple in self.local_g.triples((None, GraphEntity.iri_has_literal_value, literal_value)):
            for known_id_triple in self.local_g.triples((starting_triple[0], None, None)):
                if known_id_triple[1] == GraphEntity.iri_uses_identifier_scheme and known_id_triple[2] == schema_uri:
                    identifier_uri = known_id_triple[0]
                    break
            if identifier_uri:
                break
        if identifier_uri:
            break

    if not identifier_uri:
        return result_list

    # NOTE(review): this one list object is placed in every result tuple, so the identifiers
    # of all the matching resources accumulate in it -- confirm that this is intended
    metaid_id_list = [(identifier_uri.replace(f'{self.base_iri}/id/', ''), f'{schema}:{value}')]
    for triple in self.local_g.triples((None, GraphEntity.iri_has_identifier, identifier_uri)):
        title = ''
        res = triple[0]
        for res_triple in self.local_g.triples((res, None, None)):
            if res_triple[1] == GraphEntity.iri_title:
                title = str(res_triple[2])
            elif res_triple[1] == GraphEntity.iri_has_identifier and res_triple[2] != identifier_uri:
                # Reset for every identifier, so that an incomplete identifier entity
                # neither raises nor inherits the values of the previous one
                id_schema = ''
                id_literal_value = ''
                for id_triple in self.local_g.triples((res_triple[2], None, None)):
                    if id_triple[1] == GraphEntity.iri_uses_identifier_scheme:
                        id_schema = str(id_triple[2]).replace(GraphEntity.DATACITE, '')
                    elif id_triple[1] == GraphEntity.iri_has_literal_value:
                        id_literal_value = str(id_triple[2])
                if id_schema and id_literal_value:
                    full_id = f'{id_schema}:{id_literal_value}'
                    metaid_id_list.append((res_triple[2].replace(f'{self.base_iri}/id/', ''), full_id))
        result_list.append((res.replace(f'{self.base_iri}/br/', ''), title, metaid_id_list))

    return result_list

95 

def retrieve_br_from_meta(self, metaid: str) -> Tuple[str, List[Tuple[str, str]], bool]:
    '''
    Given a MetaID, it retrieves from the local graph the title of the bibliographic resource having that MetaID and the identifiers of that entity.

    :params metaid: a MetaID
    :type metaid: str
    :returns Tuple[str, List[Tuple[str, str]], bool]: -- it returns a tuple of three elements. The first element is the title of the resource, or an empty string. The second element is a list of MetaID-ID tuples describing the identifiers associated with that entity. The third element is True if the local graph contains at least one triple about the resource, False otherwise.
    '''
    metaid_uri = f'{self.base_iri}/br/{metaid}'
    title = ''
    identifiers = []
    it_exists = False

    for triple in self.local_g.triples((URIRef(metaid_uri), None, None)):
        it_exists = True
        if triple[1] == GraphEntity.iri_title:
            title = str(triple[2])
        elif triple[1] == GraphEntity.iri_has_identifier:
            id_scheme = ''
            literal_value = ''
            identifier = triple[2]
            for triple_inner in self.local_g.triples((identifier, None, None)):
                if triple_inner[1] == GraphEntity.iri_uses_identifier_scheme:
                    id_scheme = str(triple_inner[2]).replace(GraphEntity.DATACITE, '')
                elif triple_inner[1] == GraphEntity.iri_has_literal_value:
                    literal_value = str(triple_inner[2])
            # Only identifiers having both a scheme and a literal value are reported
            if id_scheme and literal_value:
                full_id = f'{id_scheme}:{literal_value}'
                identifiers.append((str(identifier).replace(self.base_iri + '/id/', ''), full_id))

    if not it_exists:
        return "", [], False

    return title, identifiers, True

130 

131 # _______________________________ID_________________________________ # 

132 

def retrieve_metaid_from_id(self, schema: str, value: str) -> str:
    '''
    Look up, in the local graph, the MetaID of the identifier entity having the given schema and literal value.

    :params schema: an identifier schema
    :type schema: str
    :params value: an identifier literal value
    :type value: str
    :returns str: -- the MetaID of the first matching identifier entity, or None if there is no match.
    '''
    scheme_iri = URIRef(GraphEntity.DATACITE + schema)
    value = value.replace('\\', '\\\\')
    id_prefix = f'{self.base_iri}/id/'

    # The value may have been stored either as an xsd:string or as a plain literal
    candidates = [Literal(value, datatype=XSD.string), Literal(value)]
    for candidate in candidates:
        for id_node, _, _ in self.local_g.triples((None, GraphEntity.iri_has_literal_value, candidate)):
            for subject, predicate, obj in self.local_g.triples((id_node, None, None)):
                uses_requested_scheme = predicate == GraphEntity.iri_uses_identifier_scheme and obj == scheme_iri
                if uses_requested_scheme:
                    return subject.replace(id_prefix, '')

    # Nothing in the local graph matches both the schema and the value
    return None

155 

def retrieve_metaid_from_merged_entity(self, metaid_uri:str, prov_config:str) -> str:
    '''
    It looks for the MetaID in the provenance. If a snapshot of another entity was derived from the penultimate snapshot of the input entity (i.e. the input entity was merged into another one), this function returns the target entity. Otherwise, it returns None.

    :params metaid_uri: a MetaID URI
    :type metaid_uri: str
    :params prov_config: the path of the configuration file required by time-agnostic-library
    :type prov_config: str
    :returns str: -- It returns the value computed by get_count on the URI of the target entity after a merge. If there was no merge, it returns None.
    '''
    with open(prov_config, 'r', encoding='utf8') as f:
        prov_config_dict = yaml.safe_load(f)
    agnostic_meta = AgnosticEntity(res=metaid_uri, config=prov_config_dict, related_entities_history=False)
    agnostic_meta_history = agnostic_meta.get_history(include_prov_metadata=True)
    meta_history_data = agnostic_meta_history[0][metaid_uri]
    if not meta_history_data:
        return None

    meta_history_metadata = agnostic_meta_history[1][metaid_uri]
    # Snapshots from the most recent to the oldest
    snapshots = sorted(
        meta_history_metadata.items(),
        key=lambda x: parser.parse(x[1]['generatedAtTime']).replace(tzinfo=None),
        reverse=True
    )
    if len(snapshots) < 2:
        # A single snapshot means there is no penultimate one to look at, hence no merge
        return None
    penultimate_snapshot = snapshots[1][0]

    query_if_it_was_merged = f'''
        SELECT DISTINCT ?se
        WHERE {{
            ?se a <{ProvEntity.iri_entity}>;
                <{ProvEntity.iri_was_derived_from}> <{penultimate_snapshot}>.
        }}
    '''
    results = self.__query(query_if_it_was_merged)['results']['bindings']

    # Snapshots of the input entity itself are discarded. The comparison is made on the
    # '<entity>/prov/' prefix: a plain substring test would also discard entities whose
    # URI merely extends the input one (e.g. .../br/11 when looking for .../br/1)
    own_snapshot_prefix = f'{metaid_uri}/prov/'
    derived_snapshots = [
        se['se']['value'] for se in results
        if not se['se']['value'].startswith(own_snapshot_prefix)
    ]
    if not derived_snapshots:
        return None

    # The entity was merged into another one
    merged_entity = derived_snapshots[0].split('/prov/')[0]
    return get_count(merged_entity)

195 

196 # _______________________________RA_________________________________ # 

def retrieve_ra_from_meta(self, metaid: str) -> Tuple[str, List[Tuple[str, str]], bool]:
    '''
    Given a MetaID, it retrieves from the local graph the name and the identifiers of the responsible agent associated with it, whether it is an author or a publisher.
    The output has the following format:

        ('NAME', [('METAID_OF_THE_IDENTIFIER', 'LITERAL_VALUE')], IT_EXISTS)
        ('American Medical Association (ama)', [('4274', 'crossref:10')], True)

    :params metaid: a responsible agent's MetaID
    :type metaid: str
    :returns Tuple[str, List[Tuple[str, str]], bool]: -- it returns a tuple of three elements. The first element is the responsible agent's name as built by _construct_full_name. The second element is a list of MetaID-ID tuples describing its identifiers. The third element is True if the local graph contains at least one triple about the agent, False otherwise.
    '''
    metaid_uri = f'{self.base_iri}/ra/{metaid}'
    family_name = ''
    given_name = ''
    name = ''
    identifiers = []
    it_exists = False

    for triple in self.local_g.triples((URIRef(metaid_uri), None, None)):
        it_exists = True
        if triple[1] == GraphEntity.iri_family_name:
            family_name = str(triple[2])
        elif triple[1] == GraphEntity.iri_given_name:
            given_name = str(triple[2])
        elif triple[1] == GraphEntity.iri_name:
            name = str(triple[2])
        elif triple[1] == GraphEntity.iri_has_identifier:
            identifier = triple[2]
            id_scheme = ''
            literal_value = ''
            for triple_inner in self.local_g.triples((identifier, None, None)):
                if triple_inner[1] == GraphEntity.iri_uses_identifier_scheme:
                    id_scheme = str(triple_inner[2]).replace(GraphEntity.DATACITE, '')
                elif triple_inner[1] == GraphEntity.iri_has_literal_value:
                    literal_value = str(triple_inner[2])
            # Only identifiers having both a scheme and a literal value are reported
            if id_scheme and literal_value:
                full_id = f'{id_scheme}:{literal_value}'
                identifiers.append((str(identifier).replace(self.base_iri + '/id/', ''), full_id))

    full_name = self._construct_full_name(name, family_name, given_name)

    return full_name, identifiers, it_exists

240 

def retrieve_ra_from_id(self, schema: str, value: str, publisher: bool) -> List[Tuple[str, str, list]]:
    '''
    Given an identifier, it retrieves from the local graph the responsible agents associated with that identifier, their names and their other identifiers (MetaIDs and literal values).
    The output has the following format: ::

        [(METAID, NAME, [(METAID_OF_THE_IDENTIFIER, LITERAL_VALUE)])]
        [('3309', 'American Medical Association (ama)', [('4274', 'crossref:10')])]

    :params schema: an identifier schema
    :type schema: str
    :params value: an identifier literal value
    :type value: str
    :params publisher: True if the identifier is associated with a publisher, False otherwise. It is not read by this method.
    :type publisher: bool
    :returns List[Tuple[str, str, list]]: -- it returns a list of three-element tuples. The first element is the MetaID of a responsible agent associated with the input ID. The second element is the name of that responsible agent as built by _construct_full_name. The third element is a list of MetaID-ID tuples describing the identifiers associated with that responsible agent. The list is empty if the identifier is not in the local graph.
    '''
    schema_uri = URIRef(GraphEntity.DATACITE + schema)
    value = value.replace('\\', '\\\\')
    result_list = []
    identifier_uri = None

    # Search for both string-typed and untyped literals.
    # The first identifier entity using the requested scheme wins, as in retrieve_metaid_from_id.
    for literal_value in [Literal(value, datatype=XSD.string), Literal(value)]:
        for starting_triple in self.local_g.triples((None, GraphEntity.iri_has_literal_value, literal_value)):
            for known_id_triple in self.local_g.triples((starting_triple[0], None, None)):
                if known_id_triple[1] == GraphEntity.iri_uses_identifier_scheme and known_id_triple[2] == schema_uri:
                    identifier_uri = known_id_triple[0]
                    break
            if identifier_uri:
                break
        if identifier_uri:
            break

    if not identifier_uri:
        return result_list

    # NOTE(review): this one list object is placed in every result tuple, so the identifiers
    # of all the matching agents accumulate in it -- confirm that this is intended
    metaid_id_list = [(identifier_uri.replace(f'{self.base_iri}/id/', ''), f'{schema}:{value}')]
    for triple in self.local_g.triples((None, GraphEntity.iri_has_identifier, identifier_uri)):
        name = ''
        family_name = ''
        given_name = ''
        res = triple[0]
        for res_triple in self.local_g.triples((res, None, None)):
            if res_triple[1] == GraphEntity.iri_name:
                name = str(res_triple[2])
            elif res_triple[1] == GraphEntity.iri_family_name:
                family_name = str(res_triple[2])
            elif res_triple[1] == GraphEntity.iri_given_name:
                given_name = str(res_triple[2])
            elif res_triple[1] == GraphEntity.iri_has_identifier and res_triple[2] != identifier_uri:
                # Reset for every identifier, so that an incomplete identifier entity
                # neither raises nor inherits the values of the previous one
                id_schema = ''
                id_literal_value = ''
                for id_triple in self.local_g.triples((res_triple[2], None, None)):
                    if id_triple[1] == GraphEntity.iri_uses_identifier_scheme:
                        id_schema = str(id_triple[2]).replace(GraphEntity.DATACITE, '')
                    elif id_triple[1] == GraphEntity.iri_has_literal_value:
                        id_literal_value = str(id_triple[2])
                if id_schema and id_literal_value:
                    full_id = f'{id_schema}:{id_literal_value}'
                    metaid_id_list.append((res_triple[2].replace(f'{self.base_iri}/id/', ''), full_id))

        full_name = self._construct_full_name(name, family_name, given_name)
        result_list.append((res.replace(f'{self.base_iri}/ra/', ''), full_name, metaid_id_list))

    return result_list

299 

300 def _construct_full_name(self, name: str, family_name: str, given_name: str) -> str: 

301 if name and not family_name and not given_name: 

302 return name 

303 elif not name and family_name and not given_name: 

304 return f'{family_name},' 

305 elif not name and not family_name and given_name: 

306 return f', {given_name}' 

307 elif not name and family_name and given_name: 

308 return f'{family_name}, {given_name}' 

309 else: 

310 return '' 

311 

def retrieve_ra_sequence_from_br_meta(self, metaid: str, col_name: str) -> List[Dict[str, tuple]]:
    '''
    Given a bibliographic resource's MetaID and a field name, it returns its agent roles and responsible agents in the correct order according to the specified field.
    While doing so it repairs the chain of agent roles through MetaEditor: duplicated agent roles, self-references, secondary chains and dangling 'next' links are deleted or updated.
    The output has the following format: ::

        [
            {METAID_AR_1: (NAME_RA_1, [(METAID_ID_RA_1, LITERAL_VALUE_ID_RA_1)], METAID_RA_1)},
            {METAID_AR_2: (NAME_RA_2, [(METAID_ID_RA_2, LITERAL_VALUE_ID_RA_2)], METAID_RA_2)},
            {METAID_AR_N: (NAME_RA_N, [(METAID_ID_RA_N, LITERAL_VALUE_ID_RA_N)], METAID_RA_N)},
        ]
        [
            {'5343': ('Hodge, James G.', [], '3316')},
            {'5344': ('Anderson, Evan D.', [], '3317')},
            {'5345': ('Kirsch, Thomas D.', [], '3318')},
            {'5346': ('Kelen, Gabor D.', [('4278', 'orcid:0000-0002-3236-8286')], '3319')}
        ]

    :params metaid: a bibliographic resource's MetaID
    :type metaid: str
    :params col_name: the field name: 'author', 'editor', or any other value for the publisher
    :type col_name: str
    :returns: List[Dict[str, tuple]] -- the output is a list of single-key dictionaries. Each key is the MetaID of an agent role; each value is a three-element tuple with the name of the responsible agent, the list of its MetaID-ID identifier tuples and the MetaID of the responsible agent. The list is empty if the resource has no agent role of the requested kind.
    '''
    if col_name == 'author':
        role = GraphEntity.iri_author
    elif col_name == 'editor':
        role = GraphEntity.iri_editor
    else:
        role = GraphEntity.iri_publisher

    metaid_uri = URIRef(f'{self.base_iri}/br/{str(metaid)}')
    dict_ar = dict()
    changes_made = False

    # Collect the agent roles of the requested kind, with their 'next' role and responsible agent
    for triple in self.local_g.triples((metaid_uri, GraphEntity.iri_is_document_context_for, None)):
        for ar_triple in self.local_g.triples((triple[2], None, None)):
            if ar_triple[2] == role:
                role_value = str(triple[2]).replace(f'{self.base_iri}/ar/', '')
                next_role = ''
                for relevant_ar_triple in self.local_g.triples((triple[2], None, None)):
                    if relevant_ar_triple[1] == GraphEntity.iri_has_next:
                        next_role = str(relevant_ar_triple[2]).replace(f'{self.base_iri}/ar/', '')
                    elif relevant_ar_triple[1] == GraphEntity.iri_is_held_by:
                        ra = str(relevant_ar_triple[2]).replace(f'{self.base_iri}/ra/', '')
                # NOTE(review): 'ra' is only assigned when the agent role has an is_held_by triple -- confirm that this always holds
                dict_ar[role_value] = {'next': next_role, 'ra': ra}

    # Detect and handle duplicated RA
    ra_to_ars = {}
    for ar, details in dict_ar.items():
        ra_to_ars.setdefault(details['ra'], []).append(ar)

    # Identify and delete duplicate ARs
    ar_to_delete_list = []
    for ra, ars in ra_to_ars.items():
        if len(ars) > 1:
            # Keep the first AR and delete the rest
            for ar_to_delete in ars[1:]:
                meta_editor = MetaEditor(meta_config=self.meta_config_path, resp_agent='https://w3id.org/oc/meta/prov/pa/1', save_queries=True)
                meta_editor.delete(res=f"{self.base_iri}/ar/{ar_to_delete}")
                ar_to_delete_list.append(ar_to_delete)
                changes_made = True

    for ar in ar_to_delete_list:
        del dict_ar[ar]

    # Check for ARs that have themselves as 'next' and remove the 'next' relationship
    for ar, details in dict_ar.items():
        if details['next'] == ar:
            meta_editor = MetaEditor(meta_config=self.meta_config_path, resp_agent='https://w3id.org/oc/meta/prov/pa/1', save_queries=True)
            meta_editor.delete(res=f"{self.base_iri}/ar/{ar}", property=str(GraphEntity.iri_has_next))
            dict_ar[ar]['next'] = ''
            changes_made = True

    # Remove invalid 'next' references
    for role, details in list(dict_ar.items()):
        if details['next'] and details['next'] not in dict_ar:
            dict_ar[role]['next'] = ''
            changes_made = True

    # Find the start_role by excluding all roles that are "next" for others from the set of all roles.
    all_roles = set(dict_ar.keys())
    roles_with_next = set(details['next'] for details in dict_ar.values() if details['next'])
    start_role_candidates = all_roles - roles_with_next

    # Handle the edge cases for start role determination
    MAX_ITERATIONS = 1000  # maximum number of iterations allowed
    SAFETY_TIMER = 3600  # safety timer of 1 hour (in seconds)

    if len(all_roles) == 0:
        return []
    elif len(start_role_candidates) != 1:
        # If more than one start candidate exists or none exist in a multi-role situation, resolve automatically
        chains = []
        for start_candidate in start_role_candidates:
            current_role = start_candidate
            chain = []
            visited_roles = set()
            iteration_count = 0
            while current_role and current_role not in visited_roles and iteration_count < MAX_ITERATIONS:
                visited_roles.add(current_role)
                ra_info = self.retrieve_ra_from_meta(dict_ar[current_role]['ra'])[0:2]
                ra_tuple = ra_info + (dict_ar[current_role]['ra'],)
                chain.append({current_role: ra_tuple})
                current_role = dict_ar[current_role]['next']
                iteration_count += 1

            if iteration_count == MAX_ITERATIONS:
                print(f"Possible infinite loop detected for BR: {metaid}")
                print("Starting safety timer. Please stop the process if needed.")
                sleep(SAFETY_TIMER)
                return []  # return an empty list once the timer has elapsed

            chains.append(chain)
        # Sort chains by length, then by the lowest sequential number of the starting role
        chains.sort(key=lambda chain: (-len(chain), get_resource_number(f'{self.base_iri}/ar/{list(chain[0].keys())[0]}')))
        try:
            ordered_ar_list = chains[0]
        except Exception as e:
            print(f"\nProcessing BR: {metaid} for column: {col_name}")
            print(f"Initial dict_ar: {dict_ar}")
            print(f"All roles: {all_roles}")
            print(f"Start role candidates: {start_role_candidates}")
            print(f"Roles with next: {roles_with_next}")
            print(f"Error occurred while sorting or selecting chains: {str(e)}")
            print(f"Chains at time of error: {chains}")
            raise
        # Only the first chain is kept: the agent roles of the other chains are deleted
        for chain in chains[1:]:
            for ar_dict in chain:
                for ar in ar_dict.keys():
                    meta_editor = MetaEditor(meta_config=self.meta_config_path, resp_agent='https://w3id.org/oc/meta/prov/pa/1', save_queries=True)
                    meta_editor.delete(res=f"{self.base_iri}/ar/{ar}")
                    changes_made = True
    else:
        start_role = start_role_candidates.pop()
        # Follow the "next" chain from the start_role to construct an ordered list.
        ordered_ar_list = []
        current_role = start_role
        # A chain can still run into a cycle that does not include the start role
        # (start -> A -> B -> A): stop at the first role met twice instead of looping forever.
        # The dangling 'next' of the last role is removed further down.
        visited_roles = set()
        while current_role and current_role not in visited_roles:
            visited_roles.add(current_role)
            ra_info = self.retrieve_ra_from_meta(dict_ar[current_role]['ra'])[0:2]
            ra_tuple = ra_info + (dict_ar[current_role]['ra'],)
            ordered_ar_list.append({current_role: ra_tuple})
            current_role = dict_ar[current_role]['next']

    final_chain = [list(ar_dict.keys())[0] for ar_dict in ordered_ar_list]

    # Fill gaps in the AR chain
    for i in range(len(final_chain) - 1):
        current_ar = final_chain[i]
        next_ar = final_chain[i + 1]
        if dict_ar[current_ar]['next'] != next_ar:
            meta_editor = MetaEditor(meta_config=self.meta_config_path, resp_agent='https://w3id.org/oc/meta/prov/pa/1', save_queries=True)
            meta_editor.update_property(
                res=f"{self.base_iri}/ar/{current_ar}",
                property=str(GraphEntity.iri_has_next),
                new_value=URIRef(f"{self.base_iri}/ar/{next_ar}")
            )
            dict_ar[current_ar]['next'] = next_ar
            changes_made = True

    # Ensure the last AR doesn't have a 'next' relationship
    last_ar = final_chain[-1]
    if dict_ar[last_ar]['next']:
        meta_editor = MetaEditor(meta_config=self.meta_config_path, resp_agent='https://w3id.org/oc/meta/prov/pa/1', save_queries=True)
        meta_editor.delete(res=f"{self.base_iri}/ar/{last_ar}", property=str(GraphEntity.iri_has_next))
        dict_ar[last_ar]['next'] = ''
        changes_made = True

    if changes_made:
        print(f"\nChanges made to AR chain for BR: {metaid}")

    return ordered_ar_list

489 

def retrieve_re_from_br_meta(self, metaid:str) -> Tuple[str, str]:
    '''
    Given a bibliographic resource's MetaID, it returns its resource embodiment's MetaID and pages, read from the local graph.
    The output has the following format: ::

        (METAID, PAGES)
        ('2011', '391-397')

    :params metaid: a bibliographic resource's MetaID
    :type metaid: str
    :returns: Tuple[str, str] -- the output is a two-element tuple, where the first element is the MetaID of the resource embodiment, and the second is a pages' interval. When only one of the starting and ending pages is known, it is used for both ends of the interval; when neither is known, the interval is an empty string. None is returned if the resource has no embodiment.
    '''
    metaid_uri = URIRef(f'{self.base_iri}/br/{str(metaid)}')
    re_uri = None
    starting_page = None
    ending_page = None
    for triple in self.local_g.triples((metaid_uri, GraphEntity.iri_embodiment, None)):
        re_uri = triple[2].replace(f'{self.base_iri}/re/', '')
        for re_triple in self.local_g.triples((triple[2], None, None)):
            if re_triple[1] == GraphEntity.iri_starting_page:
                starting_page = str(re_triple[2])
            elif re_triple[1] == GraphEntity.iri_ending_page:
                ending_page = str(re_triple[2])

    if not re_uri:
        # No embodiment: there is nothing to report, stated explicitly
        return None

    # A missing end of the interval is replaced by the other one
    first_page = starting_page or ending_page
    last_page = ending_page or starting_page
    pages = f'{first_page}-{last_page}' if first_page else ''
    return re_uri, pages

523 

def retrieve_br_info_from_meta(self, metaid: str) -> dict:
    '''
    Given a bibliographic resource's MetaID, it returns all the information about that resource found in the local graph.
    The output has the following format: ::

        {
            'pub_date': PUB_DATE,
            'type': TYPE,
            'page': (METAID, PAGES),
            'issue': ISSUE,
            'volume': VOLUME,
            'venue': VENUE
        }
        {
            'pub_date': '2006-02-27',
            'type': 'journal article',
            'page': ('2011', '391-397'),
            'issue': '4',
            'volume': '166',
            'venue': 'Archives Of Internal Medicine [omid:br/4387]'
        }

    :param metaid: a bibliographic resource's MetaID, or its full URI
    :type metaid: str
    :returns: dict -- the output is a dictionary including the publication date, type, page, issue, volume, and venue of the specified bibliographic resource. Values that are not found are empty strings; 'page' holds whatever retrieve_re_from_br_meta returns.
    '''

    venue_iris = [
        GraphEntity.iri_archival_document,
        GraphEntity.iri_journal,
        GraphEntity.iri_book,
        GraphEntity.iri_book_series,
        GraphEntity.iri_series,
        GraphEntity.iri_academic_proceedings,
        GraphEntity.iri_proceedings_series,
        GraphEntity.iri_reference_book,
        GraphEntity.iri_expression
    ]

    def extract_identifiers(entity_uri):
        # The OMID comes first, followed by every identifier having both a scheme and a value
        identifiers = [f"omid:{entity_uri.replace(f'{self.base_iri}/', '')}"]
        for id_triple in self.local_g.triples((entity_uri, GraphEntity.iri_has_identifier, None)):
            id_obj = id_triple[2]
            scheme = value = None
            for detail_triple in self.local_g.triples((id_obj, None, None)):
                if detail_triple[1] == GraphEntity.iri_uses_identifier_scheme:
                    scheme = str(detail_triple[2])
                elif detail_triple[1] == GraphEntity.iri_has_literal_value:
                    value = str(detail_triple[2])
            if scheme and value:
                scheme = scheme.replace(GraphEntity.DATACITE, '')
                identifiers.append(f"{scheme}:{value}")
        return identifiers

    metaid = str(metaid)
    metaid_uri = URIRef(f'{self.base_iri}/br/{metaid}') if self.base_iri not in metaid else URIRef(metaid)
    # retrieve_re_from_br_meta prepends '<base_iri>/br/' by itself: it must receive
    # the bare MetaID even when this method was given a full URI
    bare_metaid = str(metaid_uri).replace(f'{self.base_iri}/br/', '')
    res_dict = {
        'pub_date': '',
        'type': '',
        'page': self.retrieve_re_from_br_meta(bare_metaid),
        'issue': '',
        'volume': '',
        'venue': ''
    }

    for triple in self.local_g.triples((metaid_uri, None, None)):
        predicate, obj = triple[1], triple[2]

        if predicate == GraphEntity.iri_has_publication_date:
            res_dict['pub_date'] = str(obj)
        elif predicate == RDF.type and obj != GraphEntity.iri_expression:
            res_dict['type'] = self._type_it(obj)
        elif predicate == GraphEntity.iri_has_sequence_identifier:
            # The resource itself is an issue or a volume: its own number is reported
            for inner_triple in self.local_g.triples((metaid_uri, None, None)):
                inner_obj = inner_triple[2]
                if inner_obj == GraphEntity.iri_journal_issue:
                    res_dict['issue'] = str(triple[2])
                elif inner_obj == GraphEntity.iri_journal_volume:
                    res_dict['volume'] = str(triple[2])
        elif predicate == GraphEntity.iri_part_of:
            # First container: it may be an issue, a volume or the venue itself
            for vvi_triple in self.local_g.triples((obj, None, None)):
                vvi_obj = vvi_triple[2]
                if vvi_obj == GraphEntity.iri_journal_issue:
                    for inner_vvi_triple in self.local_g.triples((obj, None, None)):
                        if inner_vvi_triple[1] == GraphEntity.iri_has_sequence_identifier:
                            res_dict['issue'] = str(inner_vvi_triple[2])
                elif vvi_obj == GraphEntity.iri_journal_volume:
                    for inner_vvi_triple in self.local_g.triples((obj, None, None)):
                        if inner_vvi_triple[1] == GraphEntity.iri_has_sequence_identifier:
                            res_dict['volume'] = str(inner_vvi_triple[2])
                elif vvi_obj in venue_iris:
                    for inner_vvi_triple in self.local_g.triples((obj, None, None)):
                        if inner_vvi_triple[1] == GraphEntity.iri_title:
                            # The venue is reported only when it has a title
                            venue_title = str(inner_vvi_triple[2])
                            venue_ids = extract_identifiers(obj)
                            res_dict['venue'] = f"{venue_title} [{' '.join(venue_ids)}]"

                if vvi_triple[1] == GraphEntity.iri_part_of:
                    # Second container: it may be a volume or the venue
                    for vi_triple in self.local_g.triples((vvi_obj, None, None)):
                        vi_obj = vi_triple[2]
                        if vi_obj == GraphEntity.iri_journal_volume:
                            for inner_vvi_triple in self.local_g.triples((vvi_obj, None, None)):
                                if inner_vvi_triple[1] == GraphEntity.iri_has_sequence_identifier:
                                    res_dict['volume'] = str(inner_vvi_triple[2])
                        elif vi_obj in venue_iris:
                            for inner_vvi_triple in self.local_g.triples((vvi_obj, None, None)):
                                if inner_vvi_triple[1] == GraphEntity.iri_title:
                                    venue_title = str(inner_vvi_triple[2])
                                    venue_ids = extract_identifiers(vvi_obj)
                                    res_dict['venue'] = f"{venue_title} [{' '.join(venue_ids)}]"

                        if vi_triple[1] == GraphEntity.iri_part_of:
                            # Third container: whatever has a title is taken as the venue
                            for venue_triple in self.local_g.triples((vi_obj, None, None)):
                                if venue_triple[1] == GraphEntity.iri_title:
                                    venue_title = str(venue_triple[2])
                                    venue_ids = extract_identifiers(vi_obj)
                                    res_dict['venue'] = f"{venue_title} [{' '.join(venue_ids)}]"
    return res_dict

644 

@staticmethod
def _type_it(br_type: URIRef) -> str:
    '''
    Translate the type IRI of a bibliographic resource into its textual label.

    :params br_type: the type IRI of a bibliographic resource
    :type br_type: URIRef
    :returns str: -- the label paired with the IRI in the table below, or an empty string if the IRI is not in the table.
    '''
    type_labels = (
        (GraphEntity.iri_archival_document, 'archival document'),
        (GraphEntity.iri_book, 'book'),
        (GraphEntity.iri_book_chapter, 'book chapter'),
        (GraphEntity.iri_part, 'book part'),
        (GraphEntity.iri_expression_collection, 'book section'),
        (GraphEntity.iri_book_series, 'book series'),
        (GraphEntity.iri_book_set, 'book set'),
        (GraphEntity.iri_data_file, 'data file'),
        (GraphEntity.iri_thesis, 'dissertation'),
        (GraphEntity.iri_journal, 'journal'),
        (GraphEntity.iri_journal_article, 'journal article'),
        (GraphEntity.iri_journal_issue, 'journal issue'),
        (GraphEntity.iri_journal_volume, 'journal volume'),
        (GraphEntity.iri_proceedings_paper, 'proceedings article'),
        (GraphEntity.iri_academic_proceedings, 'proceedings'),
        (GraphEntity.iri_reference_book, 'reference book'),
        (GraphEntity.iri_reference_entry, 'reference entry'),
        (GraphEntity.iri_series, 'series'),
        (GraphEntity.iri_report_document, 'report'),
        (GraphEntity.iri_specification_document, 'standard'),
    )
    output_type = ''
    # Every row is compared, so the last matching row decides the label
    for type_iri, label in type_labels:
        if br_type == type_iri:
            output_type = label
    return output_type

689 

def retrieve_publisher_from_br_metaid(self, metaid:str):
    """
    Build the publisher string of a bibliographic resource using only the local graph.

    Publisher roles are looked up on the resource itself and, following
    ``part_of`` links, on its containers up to two levels above it.

    :params metaid: the MetaID of the bibliographic resource, appended to ``<base_iri>/br/``
    :type metaid: str
    :returns: str -- the publishers joined by '; ', each formatted as 'name [id1 id2 ...]' (or '[id1 id2 ...]' when no name is found); an empty string if no publisher is found
    """
    graph = self.local_g
    publisher_roles = set()

    def has_publisher_role(agent_role):
        # An agent role counts as publisher when any of its triples has the publisher IRI as object
        return any(obj == GraphEntity.iri_publisher for _, _, obj in graph.triples((agent_role, None, None)))

    def collect_roles(resource, hops_left):
        # Depth-first walk: inspect the roles attached to the resource, then climb to its containers
        for _, predicate, obj in graph.triples((resource, None, None)):
            if predicate == GraphEntity.iri_is_document_context_for:
                if has_publisher_role(obj):
                    publisher_roles.add(obj)
            elif predicate == GraphEntity.iri_part_of and hops_left > 0:
                collect_roles(obj, hops_left - 1)

    # The resource itself plus at most two containers above it
    collect_roles(URIRef(f'{self.base_iri}/br/{metaid}'), 2)

    formatted_publishers = []
    for role_uri in publisher_roles:
        identifiers = []
        name = None
        for _, role_predicate, agent in graph.triples((role_uri, None, None)):
            if role_predicate == GraphEntity.iri_is_held_by:
                # The responsible agent's own OMID always comes first
                identifiers.append(agent.replace(f'{self.base_iri}/', 'omid:'))
                for _, agent_predicate, agent_obj in graph.triples((agent, None, None)):
                    scheme = None
                    literal = None
                    if agent_predicate == GraphEntity.iri_name:
                        name = agent_obj
                    elif agent_predicate == GraphEntity.iri_has_identifier:
                        for _, id_predicate, id_obj in graph.triples((agent_obj, None, None)):
                            if id_predicate == GraphEntity.iri_uses_identifier_scheme:
                                scheme = id_obj.replace(f'{str(GraphEntity.DATACITE)}', '')
                            elif id_predicate == GraphEntity.iri_has_literal_value:
                                literal = id_obj
                    if scheme is not None and literal is not None:
                        identifiers.append(f'{scheme}:{literal}')
        joined_identifiers = " ".join(identifiers)
        if name is None:
            formatted_publishers.append(f'[{joined_identifiers}]')
        else:
            formatted_publishers.append(f'{name} [{joined_identifiers}]')
    return '; '.join(formatted_publishers)

738 

def get_everything_about_res(self, metavals: set, identifiers: set, vvis: set, max_depth: int = 10) -> None:
    """
    Load into the local graph every triple about the given resources and about the resources they link to.

    The starting subjects are gathered from three sources, then the triplestore is
    queried level by level: all the triples of the current subjects are added to
    ``self.local_g`` and their URI objects become the subjects of the next level.

    :params metavals: OMIDs (e.g. 'omid:br/...'); the 'omid:' prefix is dropped and the rest is appended to the base IRI
    :type metavals: set
    :params identifiers: identifiers in the form 'scheme:literal', resolved to the resources holding them
    :type identifiers: set
    :params vvis: tuples (volume, issue, venue_metaid, venue_ids_tuple) describing volumes/issues of a venue
    :type vvis: set
    :params max_depth: last depth level to query (starting subjects are level 0); a falsy value disables the limit
    :type max_depth: int
    :returns: None -- the local graph is updated in place
    """
    BATCH_SIZE = 10
    # URI objects of these predicates are never followed
    non_traversed_predicates = {RDF.type, GraphEntity.iri_with_role, GraphEntity.iri_uses_identifier_scheme}

    def escape_literal(text):
        """Escape backslashes and double quotes so the text can be embedded in a SPARQL string literal."""
        return text.replace('\\', '\\\\').replace('"', '\\"')

    def rdf_string_forms(text):
        """Return the plain and the xsd:string-typed SPARQL literal for the text."""
        escaped = escape_literal(text)
        return f'"{escaped}"', f'"{escaped}"^^<{XSD.string}>'

    def batch_process(input_set, batch_size):
        """Generator to split input data into smaller batches if batch_size is not None."""
        if batch_size is None:
            yield input_set
        else:
            for i in range(0, len(input_set), batch_size):
                yield input_set[i:i + batch_size]

    def collect_subjects(query, subjects):
        """Run a SELECT ?s query and add every ?s binding to the given set."""
        result = self.__query(query)
        for row in result['results']['bindings']:
            subjects.add(str(row['s']['value']))

    def process_batch(subjects, cur_depth):
        """Process the subjects level by level, up to the specified depth."""
        # Subjects already queried are skipped: every subject is expanded at the
        # shallowest level it is reached, so the loaded triples are unchanged while
        # repeated queries (and endless loops on cyclic data with no depth limit) are avoided
        visited = set()
        frontier = set(subjects)
        while frontier and not (max_depth and cur_depth > max_depth):
            visited.update(frontier)
            next_subjects = set()
            for batch in batch_process(list(frontier), BATCH_SIZE):
                values = ' '.join(f"<{s}>" for s in batch)
                query = f'''
                    SELECT ?s ?p ?o
                    WHERE {{
                        VALUES ?s {{ {values} }}
                        ?s ?p ?o.
                    }}'''
                result = self.__query(query)
                if not result:
                    continue
                for row in result['results']['bindings']:
                    s = URIRef(row['s']['value'])
                    p = URIRef(row['p']['value'])
                    o = row['o']['value']
                    o_type = row['o']['type']
                    o_datatype = URIRef(row['o']['datatype']) if 'datatype' in row['o'] else None
                    o = URIRef(o) if o_type == 'uri' else Literal(lexical_or_value=o, datatype=o_datatype)
                    self.local_g.add((s, p, o))
                    if isinstance(o, URIRef) and p not in non_traversed_predicates:
                        next_subjects.add(str(o))
            # Once every batch of this level has been processed, move on to the next depth level
            frontier = next_subjects - visited
            cur_depth += 1

    def get_initial_subjects_from_metavals(metavals):
        """Convert metavals to a set of subjects."""
        return {f"{self.base_iri}/{mid.replace('omid:', '')}" for mid in metavals}

    def get_initial_subjects_from_identifiers(identifiers):
        """Convert identifiers to a set of subjects based on batch queries."""
        subjects = set()
        for batch in batch_process(list(identifiers), BATCH_SIZE):
            if not batch:
                continue

            if self.blazegraph_full_text_search:
                # Full-text index lookup: one query per identifier
                for identifier in batch:
                    scheme, literal = identifier.split(":", 1)
                    escaped_identifier = escape_literal(literal)
                    query = f'''
                        PREFIX bds: <http://www.bigdata.com/rdf/search#>
                        SELECT ?s WHERE {{
                            ?literal bds:search "{escaped_identifier}" ;
                                bds:matchAllTerms "true" ;
                                ^<{GraphEntity.iri_has_literal_value}> ?id.
                            ?id <{GraphEntity.iri_uses_identifier_scheme}> <{GraphEntity.DATACITE + scheme}>;
                                ^<{GraphEntity.iri_has_identifier}> ?s .
                        }}
                    '''
                    collect_subjects(query, subjects)
            elif self.virtuoso_full_text_search:
                # One UNION branch per identifier, matching both plain and xsd:string literals
                union_blocks = []
                for identifier in batch:
                    parts = identifier.split(':', maxsplit=1)
                    scheme, literal = parts[0], parts[1]
                    escaped_literal = escape_literal(literal)
                    union_blocks.append(f"""
                        {{
                            {{
                                ?id <{GraphEntity.iri_has_literal_value}> "{escaped_literal}" .
                            }}
                            UNION
                            {{
                                ?id <{GraphEntity.iri_has_literal_value}> "{escaped_literal}"^^<{XSD.string}> .
                            }}
                            ?id <{GraphEntity.iri_uses_identifier_scheme}> <{GraphEntity.DATACITE + scheme}> .
                            ?s <{GraphEntity.iri_has_identifier}> ?id .
                        }}
                    """)
                union_query = " UNION ".join(union_blocks)
                query = f'''
                    SELECT ?s WHERE {{
                        {union_query}
                    }}
                '''
                collect_subjects(query, subjects)
            else:
                # Generic triplestore: a single VALUES query per batch, comparing lexical forms
                identifiers_values = []
                for identifier in batch:
                    parts = identifier.split(':', maxsplit=1)
                    scheme, literal = parts[0], parts[1]
                    escaped_literal = escape_literal(literal)
                    identifiers_values.append(f"(<{GraphEntity.DATACITE + scheme}> \"{escaped_literal}\")")
                identifiers_values_str = " ".join(identifiers_values)
                query = f'''
                    SELECT DISTINCT ?s WHERE {{
                        VALUES (?scheme ?literal) {{ {identifiers_values_str} }}
                        ?id <{GraphEntity.iri_uses_identifier_scheme}> ?scheme .
                        ?id <{GraphEntity.iri_has_literal_value}> ?literalValue .
                        FILTER(str(?literalValue) = str(?literal))
                        ?s <{GraphEntity.iri_has_identifier}> ?id .
                    }}
                '''
                collect_subjects(query, subjects)
        return subjects

    def get_initial_subjects_from_vvis(vvis):
        """Convert vvis to a set of subjects based on batch queries, handling venue ID to metaid conversion."""
        subjects = set()

        for volume, issue, venue_metaid, venue_ids_tuple in vvis:
            venues_to_search = set()

            if venue_metaid:
                venues_to_search.add(venue_metaid)

            if venue_ids_tuple:
                venue_id_subjects = get_initial_subjects_from_identifiers(venue_ids_tuple)
                subjects.update(venue_id_subjects)

                # Convert venue URIs to metaid format for VVI search
                for venue_uri in venue_id_subjects:
                    if '/br/' in venue_uri:
                        metaid = venue_uri.replace(f'{self.base_iri}/br/', '')
                        venues_to_search.add(f"omid:br/{metaid}")

            if not issue and not volume:
                # Neither a volume nor an issue to look for: skip this VVI tuple.
                # Checked before any escaping so that None values cannot raise.
                continue

            sequence_forms = rdf_string_forms(issue if issue else volume)
            volume_forms = rdf_string_forms(volume) if issue and volume else ()

            # Search for VVI structures for each venue
            for venue_metaid_to_search in venues_to_search:
                venue_uri = f"{self.base_iri}/{venue_metaid_to_search.replace('omid:', '')}"

                if issue and volume:
                    # Issue within a specific volume: every combination of plain/typed literals
                    patterns = [
                        f'''
                            ?volume a <{GraphEntity.iri_journal_volume}> ;
                                <{GraphEntity.iri_part_of}> <{venue_uri}> ;
                                <{GraphEntity.iri_has_sequence_identifier}> {volume_form} .
                            ?s a <{GraphEntity.iri_journal_issue}> ;
                                <{GraphEntity.iri_part_of}> ?volume ;
                                <{GraphEntity.iri_has_sequence_identifier}> {sequence_form} .
                        '''
                        for sequence_form in sequence_forms
                        for volume_form in volume_forms
                    ]
                else:
                    # Issue (no volume specified) or volume directly under the venue
                    entity_type = GraphEntity.iri_journal_issue if issue else GraphEntity.iri_journal_volume
                    patterns = [
                        f'''
                            ?s a <{entity_type}> ;
                                <{GraphEntity.iri_part_of}> <{venue_uri}> ;
                                <{GraphEntity.iri_has_sequence_identifier}> {sequence_form} .
                        '''
                        for sequence_form in sequence_forms
                    ]

                union = ' UNION '.join(f'{{ {pattern} }}' for pattern in patterns)
                collect_subjects(f'SELECT ?s WHERE {{ {union} }}', subjects)

                # Also add the venue itself as a subject
                subjects.add(venue_uri)

        return subjects

    initial_subjects = set()

    if metavals:
        initial_subjects.update(get_initial_subjects_from_metavals(metavals))

    if identifiers:
        initial_subjects.update(get_initial_subjects_from_identifiers(identifiers))

    if vvis:
        initial_subjects.update(get_initial_subjects_from_vvis(vvis))

    process_batch(initial_subjects, 0)

985 

def get_subgraph(self, res: str, graphs_dict: dict) -> Graph|None:
    """
    Return the graph of the triples having the given resource as subject, caching it in graphs_dict.

    :params res: the resource used as subject when reading the local graph
    :type res: str
    :params graphs_dict: cache of already extracted subgraphs, keyed by resource; a newly built non-empty subgraph is stored in it
    :type graphs_dict: dict
    :returns: Graph|None -- the cached or newly built subgraph, None if the local graph has no triple about the resource
    """
    if res in graphs_dict:
        return graphs_dict[res]
    extracted = Graph()
    for statement in self.local_g.triples((res, None, None)):
        extracted.add(statement)
    if not len(extracted):
        # Nothing known about the resource: the cache is left untouched
        return None
    graphs_dict[res] = extracted
    return extracted

995 

def retrieve_venue_from_local_graph(self, meta_id: str) -> Dict[str, Dict[str, str]]:
    """
    Rebuild the volume/issue structure of a venue reading only the local graph.

    :params meta_id: a MetaID
    :type meta_id: str
    :returns: Dict[str, Dict[str, str]] -- a dictionary with two keys: 'volume' maps each volume sequence identifier to its id and its issues, 'issue' maps the sequence identifier of each issue directly contained in the venue to its id
    """
    graph = self.local_g
    br_prefix = f'{self.base_iri}/br/'
    venue_uri = URIRef(f'{self.base_iri}/br/{meta_id}')
    venue_str = str(venue_uri)
    structure = {'issue': {}, 'volume': {}}
    # Volume id -> volume sequence identifier, for the volumes of this venue only
    volume_seq_by_id = {}

    # First pass: volumes that are directly part of the venue
    for volume_node, _, _ in graph.triples((None, RDF.type, GraphEntity.iri_journal_volume)):
        for _ in graph.triples((volume_node, GraphEntity.iri_part_of, venue_uri)):
            volume_id = str(volume_node).replace(br_prefix, '')
            for _, _, seq_node in graph.triples((volume_node, GraphEntity.iri_has_sequence_identifier, None)):
                volume_seq = str(seq_node)
                volume_seq_by_id[volume_id] = volume_seq
                structure['volume'][volume_seq] = {'id': volume_id, 'issue': {}}

    # Second pass: issues, attached either to one of the volumes above or to the venue itself
    for issue_node, _, _ in graph.triples((None, RDF.type, GraphEntity.iri_journal_issue)):
        issue_id = str(issue_node).replace(br_prefix, '')

        # When several values are present the last one read wins
        issue_seq = None
        for _, _, seq_node in graph.triples((issue_node, GraphEntity.iri_has_sequence_identifier, None)):
            issue_seq = str(seq_node)

        parent = None
        for _, _, parent_node in graph.triples((issue_node, GraphEntity.iri_part_of, None)):
            parent = str(parent_node)

        if not issue_seq or not parent:
            continue

        parent_id = parent.replace(br_prefix, '')
        if parent_id in volume_seq_by_id:
            # The container is a volume of this venue
            structure['volume'][volume_seq_by_id[parent_id]]['issue'][issue_seq] = {'id': issue_id}
        elif parent == venue_str:
            # The container is the venue itself
            structure['issue'][issue_seq] = {'id': issue_id}

    return structure