Coverage for oc_ds_converter / datacite / datacite_processing.py: 81%

654 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2019-2020 Fabio Mariani <fabio.mariani555@gmail.com> 

2# SPDX-FileCopyrightText: 2021-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

3# SPDX-FileCopyrightText: 2023-2025 Arianna Moretti <arianna.moretti4@unibo.it> 

4# SPDX-FileCopyrightText: 2023-2026 Marta Soricetti <marta.soricetti@unibo.it> 

5# 

6# SPDX-License-Identifier: ISC 

7 

8import html 

9import re 

10import warnings 

11import os 

12from xml.etree.ElementInclude import include 

13 

14import fakeredis 

15import csv 

16import json 

17 

18from bs4 import BeautifulSoup 

19from pandas.core.apply import include_axis 

20from soupsieve.util import lower 

21 

22from oc_ds_converter.oc_idmanager import WikidataManager 

23from oc_ds_converter.oc_idmanager.doi import DOIManager 

24from oc_ds_converter.oc_idmanager.orcid import ORCIDManager 

25from oc_ds_converter.lib.master_of_regex import * 

26from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager 

27from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager 

28from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import SqliteStorageManager 

29from oc_ds_converter.oc_idmanager.issn import ISSNManager 

30from oc_ds_converter.oc_idmanager.isbn import ISBNManager 

31 

32#for publishers 

33from oc_ds_converter.oc_idmanager.arxiv import ArXivManager 

34from oc_ds_converter.oc_idmanager.ror import RORManager 

35from oc_ds_converter.oc_idmanager.viaf import ViafManager 

36from oc_ds_converter.oc_idmanager.crossref import CrossrefManager 

37 

38from oc_ds_converter.datasource.redis import RedisDataSource 

39from oc_ds_converter.ra_processor import RaProcessor 

40from typing import Dict, List, Tuple, Optional, Type, Callable 

41from pathlib import Path 

42from typing import List, Tuple 

43 

44from bs4 import BeautifulSoup 

45 

46from oc_ds_converter.datasource.redis import FakeRedisWrapper, RedisDataSource 

47from oc_ds_converter.lib.cleaner import Cleaner 

48from oc_ds_converter.oc_idmanager.doi import DOIManager 

49from oc_ds_converter.oc_idmanager.isbn import ISBNManager 

50from oc_ds_converter.oc_idmanager.issn import ISSNManager 

51from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager 

52from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager 

53from oc_ds_converter.oc_idmanager.oc_data_storage.batch_manager import BatchManager 

54from oc_ds_converter.oc_idmanager.orcid import ORCIDManager 

55from oc_ds_converter.ra_processor import RaProcessor 

56 

# Silence bs4's UserWarning chatter (e.g. "input looks more like a filename/URL")
# raised when titles parsed below happen to resemble paths or markup-less text.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

58 

59 

60class DataciteProcessing(RaProcessor): 

    def __init__(self, orcid_index: str = None, doi_csv: str = None, publishers_filepath_dc: str = None,
                 testing: bool = True, storage_manager: Optional[StorageManager] = None,
                 use_orcid_api: bool = True, use_ror_api: bool = True, use_viaf_api: bool = True, use_wikidata_api: bool = True,
                 exclude_existing: bool = False):
        """Processor that maps DataCite JSON records to OpenCitations Meta rows.

        :param orcid_index: path to the DOI->ORCID index (consumed by RaProcessor).
        :param doi_csv: path to a CSV of DOIs used as a filter (consumed by RaProcessor).
        :param publishers_filepath_dc: optional CSV or JSON file mapping DOI prefixes to
            publisher names and DataCite member ids (loaded into ``self.publishers_mapping``).
        :param testing: when True, fake in-memory Redis backends are used instead of real ones.
        :param storage_manager: persistent validity store; defaults to a RedisStorageManager.
        :param use_orcid_api: enable the external ORCID validation API.
        :param use_ror_api: enable the external ROR validation API.
        :param use_viaf_api: enable the external VIAF validation API.
        :param use_wikidata_api: enable the external Wikidata validation API.
        :param exclude_existing: stored as-is on the instance; its consumers are outside this block.
        """
        super(DataciteProcessing, self).__init__(orcid_index, doi_csv)
        # self.preprocessor = DatacitePreProcessing(inp_dir, out_dir, interval, filter)
        if storage_manager is None:
            self.storage_manager = RedisStorageManager(testing=testing)
        else:
            self.storage_manager = storage_manager

        self.exclude_existing = exclude_existing
        self.temporary_manager = BatchManager()

        # Fields a relatedIdentifier entry must expose to be considered a citation link.
        self.needed_info = ["relationType", "relatedIdentifierType", "relatedIdentifier"]
        # Relation types (lowercased) that express citations in either direction.
        self.filter = ["references", "isreferencedby", "cites", "iscitedby"]

        # Identifier schemes accepted for responsible agents (publishers, people).
        self.accepted_identifiers_ra = ['ror', 'viaf', 'orcid', 'wikidata']

        # Mapping tables from the various type vocabularies DataCite exposes
        # (RIS, BibTeX, Citeproc, schema.org, resourceTypeGeneral) to Meta types.
        self.RIS_types_map = {'abst': 'abstract',
                              'news': 'newspaper article',
                              'slide': 'presentation',
                              'book': 'book',
                              'data': 'dataset',
                              'thes': 'dissertation',
                              'jour': 'journal article',
                              'mgzn': 'journal article',
                              'gen': 'other',
                              'advs': 'other',
                              'video': 'other',
                              'unpb': 'other',
                              'ctlg': 'other',
                              'art': 'other',
                              'case': 'other',
                              'icomm': 'other',
                              'inpr': 'other',
                              'map': 'other',
                              'mpct': 'other',
                              'music': 'other',
                              'pamp': 'other',
                              'pat': 'other',
                              'pcomm': 'other',
                              'catalog': 'other',
                              'elec': 'other',
                              'hear': 'other',
                              'stat': 'other',
                              'bill': 'other',
                              'unbill': 'other',
                              'cpaper': 'proceedings article',
                              'rprt': 'report',
                              'chap': 'book chapter',
                              'ser': 'book series',
                              'jfull': 'journal',
                              'conf': 'proceedings',
                              'comp': 'computer program',
                              'sound': 'audio document'}
        self.BIBTEX_types_map = {'book': 'book',
                                 'mastersthesis': 'dissertation',
                                 'phdthesis': 'dissertation',
                                 'article': 'journal article',
                                 'misc': 'other',
                                 'unpublished': 'other',
                                 'manual': 'other',
                                 'booklet': 'other',
                                 'inproceedings': 'proceedings article',
                                 'techreport': 'report',
                                 'inbook': 'book chapter',
                                 'incollection': 'book part',
                                 'proceedings': 'proceedings'}
        self.CITEPROC_types_map = {'book': 'book',
                                   'dataset': 'dataset',
                                   'thesis': 'dissertation',
                                   'article-journal': 'journal article',
                                   'article': 'other',
                                   'graphic': 'other',
                                   'post-weblog': 'web content',
                                   'paper-conference': 'proceedings article',
                                   'report': 'report',
                                   'chapter': 'book chapter',
                                   'song': 'audio document'}
        self.SCHEMAORG_types_map = {'book': 'book',
                                    'dataset': 'dataset',
                                    'thesis': 'dissertation',
                                    'scholarlyarticle': 'journal article',
                                    'article': 'journal article',
                                    'creativework': 'other',
                                    'event': 'other',
                                    'service': 'other',
                                    'mediaobject': 'other',
                                    'review': 'other',
                                    'collection': 'other',
                                    'imageobject': 'other',
                                    'blogposting': 'web content',
                                    'report': 'report',
                                    'chapter': 'book chapter',
                                    'periodical': 'journal',
                                    'publicationissue': 'journal issue',
                                    'publicationvolume': 'journal volume',
                                    'softwaresourcecode': 'computer program',
                                    'audioobject': 'audio document'}
        self.RESOURCETYPEGENERAL_types_map = {'book': 'book',
                                              'dataset': 'dataset',
                                              'dissertation': 'dissertation',
                                              'journalarticle': 'journal article',
                                              'text': 'other',
                                              'other': 'other',
                                              'datapaper': 'other',
                                              'audiovisual': 'other',
                                              'interactiveresource': 'other',
                                              'physicalobject': 'other',
                                              'event': 'other',
                                              'service': 'other',
                                              'collection': 'other',
                                              'image': 'other',
                                              'model': 'other',
                                              'peerreview': 'peer review',
                                              'conferencepaper': 'proceedings article',
                                              'report': 'report',
                                              'bookchapter': 'book chapter',
                                              'journal': 'journal',
                                              'conferenceproceeding': 'proceedings',
                                              'standard': 'standard',
                                              'outputmanagementplan': 'data management plan',
                                              'preprint': 'preprint',
                                              'software': 'computer program',
                                              'sound': 'audio document',
                                              'workflow': 'workflow'}

        # def input_preprocessing(self):
        #     self.preprocessor.split_input()

        # Identifier managers backed by the persistent storage manager.
        self.doi_m = DOIManager(storage_manager=self.storage_manager, testing=testing)
        self.orcid_m = ORCIDManager(storage_manager=self.storage_manager, use_api_service=use_orcid_api, testing=testing)
        self.issn_m = ISSNManager()
        self.isbn_m = ISBNManager()
        self.ror_m = RORManager(use_api_service=use_ror_api, storage_manager=self.storage_manager)
        self.viaf_m = ViafManager(use_api_service=use_viaf_api, storage_manager=self.storage_manager)
        self.wikidata_m = WikidataManager(use_api_service=use_wikidata_api, storage_manager=self.storage_manager)
        self.use_orcid_api = use_orcid_api
        self.use_ror_api = use_ror_api
        self.use_viaf_api = use_viaf_api
        self.use_wikidata_api = use_wikidata_api
        self.ra_man_dict = {"orcid": self.orcid_m, "viaf": self.viaf_m, "wikidata": self.wikidata_m, "ror": self.ror_m}
        self.venue_id_man_dict = {"issn": self.issn_m, "isbn": self.isbn_m}
        # Temporary storage managers: all data must be stored in the tmp storage manager
        # and passed all together to the main storage_manager only once the full file is processed.
        self.tmp_doi_m = DOIManager(storage_manager=self.temporary_manager, testing=testing)
        self.tmp_orcid_m = ORCIDManager(storage_manager=self.temporary_manager, use_api_service=use_orcid_api, testing=testing)
        # NOTE(review): reuses the same ISSN/ISBN managers as venue_id_man_dict
        # (they have no storage backend) — confirm this aliasing is intentional.
        self.venue_tmp_id_man_dict = {"issn": self.issn_m, "isbn": self.isbn_m}
        self.tmp_ror_m = RORManager(use_api_service=use_ror_api, storage_manager=self.temporary_manager)
        self.tmp_viaf_m = ViafManager(use_api_service=use_viaf_api, storage_manager=self.temporary_manager)
        self.tmp_wikidata_m = WikidataManager(use_api_service=use_wikidata_api, storage_manager=self.temporary_manager)
        self.ra_tmp_man_dict = {"orcid": self.tmp_orcid_m, "viaf": self.tmp_viaf_m, "wikidata": self.tmp_wikidata_m, "ror": self.tmp_ror_m}

        # Redis handles for already-known bibliographic resources (BR) and agents (RA).
        if testing:
            self.BR_redis = FakeRedisWrapper()
            self.RA_redis = FakeRedisWrapper()
        else:
            self.BR_redis = RedisDataSource("DB-META-BR")
            self.RA_redis = RedisDataSource("DB-META-RA")

        # Per-batch caches of identifiers found in Redis (filled by update_redis_values).
        self._redis_values_ra = []
        self._redis_values_br = []

        if not publishers_filepath_dc:
            self.publishers_filepath = None
        else:
            self.publishers_filepath = publishers_filepath_dc

            # NOTE(review): indentation reconstructed — the existence check is assumed
            # to live inside this else branch, otherwise os.path.exists(None) would
            # raise TypeError on the default arguments; confirm against the repository.
            if os.path.exists(self.publishers_filepath):
                pfp = dict()
                csv_headers = ("id", "name", "prefix")
                if self.publishers_filepath.endswith(".csv"):
                    with open(self.publishers_filepath, encoding="utf8") as f:
                        csv_reader = csv.DictReader(f, csv_headers)
                        for row in csv_reader:
                            pfp[row["prefix"]] = {"name": row["name"], "datacite_member": row["id"]}
                    # NOTE(review): the path is retargeted to .json but the JSON file is
                    # never written here — presumably done elsewhere; verify.
                    self.publishers_filepath = self.publishers_filepath.replace(".csv", ".json")
                elif self.publishers_filepath.endswith(".json"):
                    with open(self.publishers_filepath, encoding="utf8") as f:
                        pfp = json.load(f)
                self.publishers_mapping = pfp

243 

    def get_agents_strings_list(self, doi: str, agents_list: List[dict]) -> Tuple[list, list]:
        """
        Aligned with the Crossref processing:
        - tries to enrich agents from the DOI->ORCID index when the ORCID is not
          among the nameIdentifiers
        - the DOI is normalised (input accepted with or without the 'doi:' prefix)

        Returns a pair (author display strings, editor display strings); each
        string is "Family, Given" optionally followed by " [orcid:...]".
        """
        authors_strings_list = []
        editors_string_list = []

        # If at least one agent does NOT already carry an 'orcid', load the
        # DOI->ORCID map from the index.
        dict_orcid = None
        norm_doi = self.doi_m.normalise(doi, include_prefix=True) if doi else None
        if not all(('orcid' in a or 'ORCID' in a) for a in agents_list):
            dict_orcid = self.orcid_finder(norm_doi) if norm_doi else None  # see notes in find_datacite_orcid

        # Basic cleaning as in Crossref (odd quotes, stray spaces, etc.).
        agents_list = [
            {k: Cleaner(v).remove_unwanted_characters() if k in {'family', 'given', 'name'} and v is not None else v
             for k, v in agent_dict.items()}
            for agent_dict in agents_list
        ]

        for agent in agents_list:
            cur_role = agent.get('role', '')
            f_name = None
            g_name = None

            # Build the display name "Family, Given" as in Crossref.
            agent_string = None
            if agent.get('family') and agent.get('given'):
                f_name = agent['family']
                g_name = agent['given']
                agent_string = f_name + ', ' + g_name
            elif agent.get('name'):
                agent_string = agent['name']
                # A comma in the free-form name is treated as "Family, Given".
                f_name = agent_string.split(",")[0].strip() if "," in agent_string else None
                g_name = agent_string.split(",")[-1].strip() if "," in agent_string else None
                if f_name and g_name:
                    agent_string = f_name + ', ' + g_name

            if agent_string is None:
                # Only one of family/given is present: keep the comma format anyway.
                if agent.get('family') and not agent.get('given'):
                    if g_name:
                        agent_string = agent['family'] + ', ' + g_name
                    else:
                        agent_string = agent['family'] + ', '
                elif agent.get('given') and not agent.get('family'):
                    if f_name:
                        agent_string = f_name + ', ' + agent['given']
                    else:
                        agent_string = ', ' + agent['given']

            # Direct ORCID on the agent?
            orcid = None
            if 'orcid' in agent:
                orcid = str(agent['orcid'][0]) if isinstance(agent['orcid'], list) else str(agent['orcid'])
            elif 'ORCID' in agent:
                orcid = str(agent['ORCID'][0]) if isinstance(agent['ORCID'], list) else str(agent['ORCID'])

            # If present, validate/correct it through our flow.
            if orcid:
                orcid = self.find_datacite_orcid([orcid], norm_doi)
            # Otherwise: try to recover it from the index by NAME (as in Crossref).
            elif dict_orcid and f_name:
                for ori in dict_orcid:
                    # dict_orcid[ori] is normally "Family, Given".
                    orc_n = dict_orcid[ori].split(', ')
                    orc_f = orc_n[0].lower()
                    orc_g = orc_n[1] if len(orc_n) == 2 else None

                    # NOTE(review): nesting below reconstructed from the coverage dump,
                    # mirroring the Crossref homonym-disambiguation logic — verify
                    # against the repository source.
                    if f_name.lower() in orc_f.lower() or orc_f.lower() in (f_name or '').lower():
                        if g_name and orc_g:
                            # Homonym discrimination (same logic as Crossref):
                            # several agents share the surname?
                            if len([p for p in agents_list if p.get('family') and (
                                    p['family'].lower() in orc_f or orc_f in p['family'].lower())]) > 1:
                                # ...and share the given-name initial too?
                                if len([p for p in agents_list if
                                        p.get('given') and p['given'][0].lower() == orc_g[0].lower()]) > 1:
                                    homonyms_list = [p for p in agents_list if
                                                     p.get('given') and p['given'].lower() == orc_g.lower()]
                                    if len(homonyms_list) > 1:
                                        # Full homonyms: assign only when roles differ
                                        # and the given name matches exactly.
                                        if [p for p in homonyms_list if p.get('role') != cur_role]:
                                            if orc_g.lower() == g_name.lower():
                                                orcid = ori
                                        else:
                                            if orc_g.lower() == g_name.lower():
                                                orcid = ori
                                            elif orc_g[0].lower() == g_name[0].lower():
                                                orcid = ori
                                    elif any([p for p in agents_list if
                                              p.get('given') and p['given'].lower() == f_name.lower()]):
                                        if orc_g.lower() == g_name.lower():
                                            orcid = ori
                        else:
                            orcid = ori
                    else:
                        orcid = ori

            # Normalise an index key that may lack the prefix.
            if orcid and not str(orcid).startswith("orcid:"):
                orcid = f"orcid:{orcid}"

            # Append " [orcid:...]" when found.
            if agent_string and orcid:
                agent_string += f" [{orcid}]"

            if agent_string:
                if cur_role == 'author':
                    authors_strings_list.append(agent_string)
                elif cur_role == 'editor':
                    editors_string_list.append(agent_string)

        return authors_strings_list, editors_string_list

356 

357 def _normalize_ra(self, r): 

358 """Metodo di supporto per normalizzare un Responsible Agent (per update_redis_values).""" 

359 r_schema = r.split(":")[0] 

360 id_manager = self.get_id_manager(r_schema, self.ra_man_dict) 

361 return id_manager.normalise(r, include_prefix=True) if id_manager else None 

362 

363 def update_redis_values(self, br, ra): 

364 self._redis_values_br = [ 

365 x for x in (self.doi_m.normalise(b, include_prefix=True) for b in (br or [])) if x 

366 ] 

367 self._redis_values_ra = [ 

368 x for x in (self._normalize_ra(r) for r in (ra or [])) if x 

369 ] 

370 

371 def validated_as(self, id_dict): 

372 """ Controllo nello storage temporaneo: Prima verifica se l'id è già stato validato nella memoria/coda temporanea (tmp_doi_m.validated_as_id). 

373 Controllo nello storage principale: Se non lo trova (is None), passa al manager principale (doi_m.validated_as_id). 

374 Validated_as_id chiede al database locale (lo storage_manager) se ha già una risposta chiara (Vero/Falso) sulla validità di una stringa.""" 

375 schema = id_dict["schema"].strip().lower() 

376 identifier = id_dict["identifier"] 

377 

378 if schema == 'doi': 

379 validity_value = self.tmp_doi_m.validated_as_id(identifier) 

380 if validity_value is None: 

381 validity_value = self.doi_m.validated_as_id(identifier) 

382 return validity_value 

383 else: 

384 if schema in self.ra_tmp_man_dict.keys(): 

385 tmp_id_man = self.get_id_manager(schema, self.ra_tmp_man_dict) 

386 id_man = self.get_id_manager(schema, self.ra_man_dict) 

387 validity_value = tmp_id_man.validated_as_id(identifier) 

388 if validity_value is None: 

389 validity_value = id_man.validated_as_id(identifier) 

390 return validity_value 

391 

392 

393 def get_id_manager(self, schema_or_id, id_man_dict): 

394 if ":" in schema_or_id: 

395 split_id_prefix = schema_or_id.split(":") 

396 schema = split_id_prefix[0] 

397 else: 

398 schema = schema_or_id 

399 id_man = id_man_dict.get(schema) 

400 return id_man 

401 

402 def normalise_any_id(self, id_with_prefix): 

403 id_man = self.get_id_manager(id_with_prefix, self.ra_man_dict) 

404 id_no_pref = ":".join(id_with_prefix.split(":")[1:]) 

405 norm_id_w_pref = id_man.normalise(id_no_pref, include_prefix=True) 

406 return norm_id_w_pref 

407 

408 def dict_to_cache(self, dict_to_be_saved, path): 

409 path = Path(path) 

410 parent_dir_path = path.parent.absolute() 

411 if not os.path.exists(parent_dir_path): 

412 Path(parent_dir_path).mkdir(parents=True, exist_ok=True) 

413 with open(path, "w", encoding="utf-8") as fd: 

414 json.dump(dict_to_be_saved, fd, ensure_ascii=False, indent=4) 

415 

416 def csv_creator_objects(self, doi_object: str): 

417 row = dict() 

418 keys = ['id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type', 'publisher', 'editor'] 

419 for k in keys: 

420 row[k] = '' 

421 row['id'] = doi_object 

422 try: 

423 return self.normalise_unicode(row) 

424 except TypeError: 

425 print(row) 

426 raise (TypeError) 

427 

    def csv_creator(self, item: dict) -> dict:
        """Map one DataCite JSON record to an OpenCitations Meta CSV row.

        :param item: a DataCite record with 'id' (the DOI) and 'attributes'.
        :return: the Unicode-normalised row dict, or {} when 'id' is empty.
        :raises TypeError: re-raised from normalise_unicode after printing the row.
        """
        row = dict()
        doi = str(item['id'])
        if doi:
            norm_id = self.doi_m.normalise(doi, include_prefix=True)
            keys = ['id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type',
                    'publisher', 'editor']
            for k in keys:
                row[k] = ''

            attributes = item['attributes']

            # row['type'] — first vocabulary entry (in dict iteration order) that
            # maps to a known Meta type wins, then the loop breaks.
            if attributes.get('types') is not None:
                types_dict = attributes['types']
                for k, v in types_dict.items():
                    if k.lower() == 'ris':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.RIS_types_map.keys():
                                row['type'] = self.RIS_types_map[norm_v]
                                break
                    if k.lower() == 'bibtex':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.BIBTEX_types_map.keys():
                                row['type'] = self.BIBTEX_types_map[norm_v]
                                break
                    if k.lower() == 'schemaorg':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.SCHEMAORG_types_map.keys():
                                row['type'] = self.SCHEMAORG_types_map[norm_v]
                                break
                    if k.lower() == 'citeproc':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.CITEPROC_types_map.keys():
                                row['type'] = self.CITEPROC_types_map[norm_v]
                                break
                    if k.lower() == 'resourcetypegeneral':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.RESOURCETYPEGENERAL_types_map.keys():
                                row['type'] = self.RESOURCETYPEGENERAL_types_map[norm_v]
                                break

            # row['id'] — the normalised DOI plus any ISBN/ISSN whose presence is
            # plausible for the resource type determined above.
            ids_list = list()
            ids_list.append(norm_id)

            if attributes.get('identifiers'):
                for other_id in attributes.get('identifiers'):
                    if other_id.get('identifier') and other_id.get('identifierType'):
                        o_id_type = other_id.get('identifierType')
                        o_id = other_id.get('identifier')

                        if o_id_type == 'ISBN':
                            if row['type'] in {'book', 'dissertation', 'edited book', 'monograph', 'reference book', 'report',
                                               'standard'}:
                                self.id_worker(o_id, ids_list, self.isbn_worker)

                        elif o_id_type == 'ISSN':
                            if row['type'] in {'book series', 'book set', 'journal', 'proceedings series', 'series',
                                               'standard series', 'report series'}:
                                self.id_worker(o_id, ids_list, self.issn_worker)

            row['id'] = ' '.join(ids_list)

            # row['title'] — strip HTML markup and collapse whitespace; the last
            # non-empty title in the list wins.
            pub_title = ""
            if attributes.get("titles"):
                for title in attributes.get("titles"):
                    if title.get("title"):
                        p_title = title.get("title")
                        soup = BeautifulSoup(p_title, 'html.parser')
                        title_soup = soup.get_text().replace('\n', '')
                        title_soup_space_replaced = ' '.join(title_soup.split())
                        title_soup_strip = title_soup_space_replaced.strip()
                        clean_tit = html.unescape(title_soup_strip)
                        pub_title = clean_tit if clean_tit else p_title

            row['title'] = pub_title

            # Agents: authors from 'creators', then editors from 'contributors'.
            agent_list_authors_only = self.add_authors_to_agent_list(attributes, [], doi)
            agents_list = self.add_editors_to_agent_list(attributes, agent_list_authors_only, doi)

            authors_strings_list, editors_string_list = self.get_agents_strings_list(doi, agents_list)

            # row['author']
            if 'creators' in attributes:
                row['author'] = '; '.join(authors_strings_list)

            # row['pub_date'] — prefer the 'Issued' date, fall back to publicationYear.
            cur_date = ""
            dates = attributes.get("dates")
            if dates:
                for date in dates:
                    if date.get("dateType") == "Issued":
                        cur_date = date.get("date")
                        break
            if cur_date == "":
                if attributes.get("publicationYear"):
                    cur_date = str(attributes.get("publicationYear"))
            row['pub_date'] = cur_date

            # row['venue']
            row['venue'] = self.get_venue_name(attributes, row)

            issue = ""
            volume = ""

            # volume/issue from the container, only when it is an ISSN/ISBN venue.
            if attributes.get("container"):
                container = attributes["container"]
                if container and (container.get("identifierType") in ("ISSN", "ISBN")):  # fixed and/or precedence
                    if container.get("issue"):
                        issue = container.get("issue")
                    if container.get("volume"):
                        volume = container.get("volume")

            # Fall back to an isPartOf related identifier for whichever is missing.
            if not issue or not volume:
                relatedIdentifiers = attributes.get("relatedIdentifiers")
                if relatedIdentifiers:
                    for related in relatedIdentifiers:
                        if related.get("relationType"):
                            if related.get("relationType").lower() == "ispartof":
                                if related.get("relatedIdentifierType") == "ISSN" or related.get("relatedIdentifierType") == "ISBN":
                                    if not issue and related.get("issue"):
                                        issue = related.get("issue")
                                    if not volume and related.get("volume"):
                                        volume = related.get("volume")
            row['volume'] = volume
            row['issue'] = issue
            row['page'] = self.get_datacite_pages(attributes)
            publisher_dict = attributes.get('publisher')
            if publisher_dict:
                row['publisher'] = self.get_publisher(doi, publisher_dict)
            else:
                row['publisher'] = ''

            # row['editor'] — only filled when the record actually lists Editor contributors.
            if attributes.get("contributors"):
                editors = [contributor for contributor in attributes.get("contributors") if
                           contributor.get("contributorType") == "Editor"]
                if editors:
                    row['editor'] = '; '.join(editors_string_list)

            try:
                return self.normalise_unicode(row)
            except TypeError:
                print(row)
                raise(TypeError)
        return {}

580 

581 def to_validated_id_list(self, norm_id_dict): 

582 """Questo metodo verifica la validità di un identificatore in base al suo schema (es. 'doi'). 

583 Ottimizza il processo interrogando prima una cache locale pre-caricata (`_redis_values_br`). 

584 Se l'ID viene trovato in questa cache, viene immediatamente contrassegnato come valido nel 

585 gestore di archiviazione (storage manager) temporaneo. In caso contrario, avvia un processo 

586 di validazione completo tramite il metodo `is_valid` del manager temporaneo. 

587 

588 Argomenti: 

589 norm_id_dict (dict): Un dizionario contenente l'identificatore normalizzato 

590 e il suo schema. Le chiavi attese sono "id" (str) e "schema" (str). 

591 

592 Restituisce: 

593 list: Una lista contenente l'identificatore valido se la validazione ha successo, 

594 oppure una lista vuota se l'identificatore risulta non valido.""" 

595 

596 valid_id_list = [] 

597 norm_id = norm_id_dict.get("id") 

598 schema = norm_id_dict.get("schema") 

599 if schema == "doi": 

600 if norm_id in self._redis_values_br: 

601 self.tmp_doi_m.storage_manager.set_value(norm_id, True) 

602 valid_id_list.append(norm_id) 

603 elif self.tmp_doi_m.is_valid(norm_id): 

604 valid_id_list.append(norm_id) 

605 

606 

607 elif schema in self.accepted_identifiers_ra: 

608 tmp_id_man = self.get_id_manager(schema, self.ra_tmp_man_dict) 

609 use_schema_api = getattr(self, f"use_{schema}_api", False) 

610 if norm_id in self._redis_values_ra: 

611 tmp_id_man.storage_manager.set_value(norm_id, True) 

612 valid_id_list.append(norm_id) 

613 elif not use_schema_api: 

614 pass 

615 elif tmp_id_man.is_valid(norm_id): 

616 valid_id_list.append(norm_id) 

617 

618 else: 

619 print("Schema not accepted:", norm_id_dict.get("schema"), "in ", norm_id_dict) 

620 return valid_id_list 

621 

622 #no modified 

623 def get_datacite_pages(self, item: dict) -> str: 

624 container_pages_list = list() 

625 related_pages_list = list() 

626 container = item.get("container") 

627 if container: 

628 if container.get("identifierType") == "ISSN" or container.get("identifierType") == "ISBN": 

629 if container.get("firstPage"): 

630 container_pages_list.append(container.get("firstPage")) 

631 if container.get("lastPage"): 

632 container_pages_list.append(container.get("lastPage")) 

633 

634 relatedIdentifiers = item.get("relatedIdentifiers") 

635 if relatedIdentifiers: 

636 for related in relatedIdentifiers: 

637 if related.get("relationType"): 

638 if related.get("relationType").lower() == "ispartof": 

639 if related.get("relatedIdentifierType") == "ISSN" or related.get("relatedIdentifierType") == "ISBN": 

640 if related.get("firstPage"): 

641 related_pages_list.append(related.get("firstPage")) 

642 if related.get("lastPage"): 

643 related_pages_list.append(related.get("lastPage")) 

644 

645 page_list = related_pages_list if len(related_pages_list)> len(container_pages_list) else container_pages_list 

646 return self.get_pages(page_list) 

647 

648 #modified 

    def get_publisher(self, doi: str, publisher_item: dict) -> str:
        """
        https://datacite-metadata-schema.readthedocs.io/en/4.6/properties/publisher/
        "publisher":
        {
            "name": "Dryad",
            "schemeUri": "https://ror.org",
            "publisherIdentifier": "https://ror.org/00x6h5n95",
            "publisherIdentifierScheme": "ROR",
            "lang": "en"
        }
        accepted identifiers for publishers in Meta: https://opencitations.github.io/oc_meta/reference/csv_format/#responsible-agents
        [orcid, viaf, crossref, wikidata, ror]
        """
        publisher_name = publisher_item.get("name") or ""
        # 1. Normalisation: blank out placeholder publisher names
        #    ("unknown", "(:unav)", "s.n.", "not specified", ...) via regexes.
        if publisher_name:
            txt = publisher_name.lower().strip()
            txt_no_spaces = txt.replace(' ', '')

            if (re.match(r"\(?:unav\)?", txt) or
                re.match(r"\(?:unkn\)?", txt) or
                re.match(r".*publ?isher not identified.*", txt) or
                re.match(r"^\[?unknown]?(:*\[?unknown]?)*$", txt_no_spaces) or
                re.match(r"^not yet(?: published)?$", txt) or
                re.match(r"[\[({]*s\.*[ln]\.*[)}\]]*([,:][\[({]*s\.*n\.*[)}\]]*)*", txt_no_spaces) or
                re.match(r"^(publisher )*not(?: specified\.*)|^(publisher )*not(?: provided\.*)$", txt) or
                re.match(r"^not known$", txt) or
                re.match(r"^(information )?not available.*", txt)):
                publisher_name = ""

        # 2. Extract the publisher identifier and compose the base "Name [id]" string.

        publisher_id = self.get_publisher_id(publisher_item)
        publisher_id = f"[{publisher_id}]" if publisher_id else ""
        publisher = f"{publisher_name} {publisher_id}".strip()

        # 3. Override with the DataCite member mapping (flat structure, keyed on
        #    the DOI prefix) when one is available.
        prefix = doi.split('/')[0] if doi else ""

        if prefix and self.publishers_mapping and prefix in self.publishers_mapping:
            mapped_data = self.publishers_mapping[prefix]
            name = mapped_data.get("name", "")
            member = mapped_data.get("datacite_member")

            return f"{name} [datacite:{member}]" if member else name

        # Fallback when there is no mapping for this prefix.
        return publisher

698 

699 def get_publisher_id(self, publisher_item: dict) -> str: 

700 publisher_id = publisher_item.get("publisherIdentifier") 

701 if not publisher_id: 

702 return "" 

703 

704 raw_scheme = publisher_item.get('publisherIdentifierScheme') 

705 if not raw_scheme: 

706 return "" 

707 

708 scheme = raw_scheme.lower().strip().replace(" ", "") 

709 

710 # Early return se lo schema non è accettato 

711 if scheme not in self.accepted_identifiers_ra: 

712 return "" 

713 

714 id_man = self.get_id_manager(scheme, self.ra_man_dict) 

715 tmp_id_man = self.get_id_manager(scheme, self.ra_tmp_man_dict) 

716 

717 if not id_man or not tmp_id_man: 

718 return "" 

719 

720 norm_id = id_man.normalise(publisher_id, include_prefix=True) 

721 if not norm_id: 

722 return "" 

723 

724 # --- Inizio fase di validazione --- 

725 #controllo sia nello storage temporaneo che in quello generale 

726 validity = self.validated_as({"identifier": norm_id, "schema": scheme}) 

727 if validity is True: 

728 return norm_id 

729 elif validity is False: 

730 return "" 

731 

732 # Recupero dinamico dell'impostazione API 

733 use_schema_api = getattr(self, f"use_{scheme}_api", False) 

734 

735 # Modalità Offline 

736 if not use_schema_api: 

737 if norm_id in self._redis_values_ra: 

738 tmp_id_man.storage_manager.set_value(norm_id, True) 

739 return norm_id 

740 return "" # Stop qui: se l'API è spenta e non è in Redis, è invalido 

741 

742 # Modalità Online (API) 

743 norm_id_dict = {"id": norm_id, "schema": scheme} 

744 if norm_id in self.to_validated_id_list(norm_id_dict): 

745 return norm_id 

746 

747 #Se arriva fin qui, la validazione API è fallita. 

748 return "" 

749 

750 #no modified 

def get_venue_name(self, item: dict, row: dict) -> str:
    """Return the venue (container) name of *item*, optionally followed by
    its ISBN/ISSN identifiers in square brackets.

    The container title is lower-cased, stripped of HTML markup and HTML
    entities; a bracketed "prefix:value" span already present in the title
    (e.g. "[issn:1234-5678]") is rewritten with round brackets so it cannot
    be confused with the identifier suffix appended by this method.

    :param item: the DataCite metadata record of the bibliographic resource.
    :param row: the output row being built; its 'type' value decides which
        identifier schemes are acceptable for the venue.
    :return: "title [ids]", "title", "[ids]" or "" depending on what is found.
    """
    cont_title = ""
    venids_list = []

    container = item.get("container")
    if container:
        if container.get("title"):
            cont_title = (container["title"].lower()).replace('\n', '')
            ven_soup = BeautifulSoup(cont_title, 'html.parser')
            ventit = html.unescape(ven_soup.get_text())
            # Replace a bracketed "prefix:value" span with round brackets:
            # square brackets are reserved for the ids appended below.
            ambiguous_brackets = re.search(r'\[\s*((?:[^\s]+:[^\s]+)?(?:\s+[^\s]+:[^\s]+)*)\s*\]', ventit)
            if ambiguous_brackets:
                match = ambiguous_brackets.group(1)
                open_bracket = ventit.find(match) - 1
                close_bracket = ventit.find(match) + len(match)
                ventit = ventit[:open_bracket] + '(' + ventit[open_bracket + 1:]
                ventit = ventit[:close_bracket] + ')' + ventit[close_bracket + 1:]
            cont_title = ventit

        if container.get("identifierType") == "ISBN":
            if row['type'] in {'book chapter', 'book part', 'book section', 'book track', 'reference entry'}:
                try:
                    self.id_worker(container.get("identifier"), venids_list, self.isbn_worker)
                except ValueError:
                    print(f'''{container.get("identifier")} raised a value error''')

        if container.get("identifierType") == "ISSN":
            if row['type'] in {'book', 'data file', 'dataset', 'edited book', 'journal article', 'journal volume',
                               'journal issue', 'monograph', 'proceedings', 'peer review', 'reference book',
                               'reference entry', 'report'}:
                try:
                    self.id_worker(container.get("identifier"), venids_list, self.issn_worker)
                except ValueError:
                    print(f'''{container.get("identifier")} raised a value error''')
            elif row['type'] == 'report series':
                # FIX: the original tested container.get("title") twice in a
                # row (identical nested ifs); a single check is sufficient.
                if container.get("title"):
                    try:
                        self.id_worker(container.get("identifier"), venids_list, self.issn_worker)
                    except ValueError:
                        print(f'''{container.get("identifier")} raised a value error''')

    if not venids_list:
        # Fall back to "IsPartOf" related identifiers when the container
        # carries no usable identifier.
        relatedIdentifiers = item.get("relatedIdentifiers")
        if relatedIdentifiers:
            for related in relatedIdentifiers:
                if related.get("relationType"):
                    if related.get("relationType").lower() == "ispartof":
                        if related.get("relatedIdentifierType") == "ISBN":
                            if row['type'] in {'book chapter', 'book part', 'book section', 'book track',
                                               'reference entry'}:
                                self.id_worker(related.get("relatedIdentifier"), venids_list, self.isbn_worker)
                        if related.get("relatedIdentifierType") == "ISSN":
                            if row['type'] in {'book', 'data file', 'dataset', 'edited book', 'journal article',
                                               'journal volume',
                                               'journal issue', 'monograph', 'proceedings', 'peer review',
                                               'reference book',
                                               'reference entry', 'report'}:
                                self.id_worker(related.get("relatedIdentifier"), venids_list, self.issn_worker)
                            elif row['type'] == 'report series':
                                # FIX: duplicated related.get("title") check collapsed to one.
                                if related.get("title"):
                                    self.id_worker(related.get("relatedIdentifier"), venids_list, self.issn_worker)

    if venids_list:
        name_and_id = cont_title + ' [' + ' '.join(venids_list) + ']' if cont_title else '[' + ' '.join(venids_list) + ']'
    else:
        name_and_id = cont_title
    return name_and_id

821 

822 #added the call to find_datacite_orcid 

def add_editors_to_agent_list(self, item: dict, ag_list: list, doi: str) -> list:
    """Append one agent dict per "Editor" contributor of *item* to *ag_list*.

    Each agent carries role/name/family/given keys (missing name parts are
    filled with "") and, when an ORCID name identifier can be validated via
    find_datacite_orcid, an "orcid" key.

    :param item: the DataCite metadata record.
    :param ag_list: the agent list to extend (returned for convenience).
    :param doi: the DOI of the resource, used for ORCID validation.
    :return: the extended agent list.
    """
    agent_list = ag_list
    contributors = item.get("contributors")
    if not contributors:
        return agent_list

    for contributor in contributors:
        if contributor.get("contributorType") != "Editor":
            continue
        agent = {"role": "editor"}
        if contributor.get("name"):
            agent["name"] = contributor.get("name")
        # Treat the contributor as a person if explicitly marked so, or if
        # any personal-name component is present.
        if contributor.get("nameType") == "Personal" or ("familyName" in contributor or "givenName" in contributor):
            agent["family"] = contributor.get("familyName")
            agent["given"] = contributor.get("givenName")
        name_identifiers = contributor.get("nameIdentifiers")
        if name_identifiers:
            orcid_ids = [ni.get("nameIdentifier") for ni in name_identifiers
                         if ni.get("nameIdentifierScheme") == "ORCID"]
            if orcid_ids:
                orcid_id = self.find_datacite_orcid(orcid_ids, doi)
                if orcid_id:
                    agent["orcid"] = orcid_id
        # Guarantee the presence of all name keys.
        for key in ("family", "given", "name"):
            agent.setdefault(key, "")
        agent_list.append(agent)
    return agent_list

850 

851 # added the call to find_datacite_orcid 

def add_authors_to_agent_list(self, item: dict, ag_list: list, doi: str) -> list:
    """Append one agent dict per creator (author) of *item* to *ag_list*.

    Each agent carries role/name/family/given keys (missing name parts are
    filled with "") and, when an ORCID name identifier can be validated via
    find_datacite_orcid, an "orcid" key.

    :param item: the DataCite metadata record.
    :param ag_list: the agent list to extend (returned for convenience).
    :param doi: the DOI of the resource, used for ORCID validation.
    :return: the extended agent list.
    """
    agent_list = ag_list
    creators = item.get("creators")
    if not creators:
        return agent_list

    for creator in creators:
        agent = {"role": "author"}
        if creator.get("name"):
            agent["name"] = creator.get("name")
        # A creator is a person when explicitly flagged, or when any
        # personal-name component is available.
        if creator.get("nameType") == "Personal" or ("familyName" in creator or "givenName" in creator):
            agent["family"] = creator.get("familyName")
            agent["given"] = creator.get("givenName")
        name_identifiers = creator.get("nameIdentifiers")
        if name_identifiers:
            orcid_ids = [ni.get("nameIdentifier") for ni in name_identifiers
                         if ni.get("nameIdentifierScheme") == "ORCID"]
            if orcid_ids:
                orcid_id = self.find_datacite_orcid(orcid_ids, doi)
                if orcid_id:
                    agent["orcid"] = orcid_id
        # Guarantee the presence of all name keys.
        for key in ("family", "given", "name"):
            agent.setdefault(key, "")
        agent_list.append(agent)
    return agent_list

876 

877 #added 

def find_datacite_orcid(self, all_author_ids, doi=None):
    """Return the first validated ORCID (with "orcid:" prefix) among
    *all_author_ids* for the work identified by *doi*, or "" if none
    can be validated.

    Validation is attempted, in order, against: the local validity cache,
    the DOI->ORCID index, the Redis store (offline mode only) and finally
    the ORCID API (online mode only).
    """
    if not all_author_ids:
        return ""

    # Normalise the DOI so index lookups use a canonical form.
    norm_doi = self.doi_m.normalise(doi, include_prefix=True) if doi else None
    found_orcids = set()

    if norm_doi:
        # Query the DOI->ORCID index under both the "doi:"-prefixed and the
        # bare form of the identifier.
        if norm_doi.startswith("doi:"):
            alt_doi = norm_doi.replace("doi:", "")
        else:
            alt_doi = f"doi:{norm_doi}"
        raw = self.orcid_finder(norm_doi) or self.orcid_finder(alt_doi)
        orcid_pattern = r"(\d{4}-\d{4}-\d{4}-\d{3,4}[0-9X])"
        if isinstance(raw, dict):
            found_orcids = {key.replace("orcid:", "").strip() for key in raw}
        elif isinstance(raw, (set, list)):
            for value in list(raw):
                found_orcids.update(re.findall(orcid_pattern, str(value)))
        elif isinstance(raw, str):
            found_orcids.update(re.findall(orcid_pattern, raw))

    for candidate in all_author_ids:
        norm_orcid = self.orcid_m.normalise(candidate, include_prefix=True)
        if not norm_orcid:
            continue

        # 1) Validity already known (True/False) in the cache.
        cached = self.validated_as({"identifier": norm_orcid, "schema": "orcid"})
        if cached is True:
            return norm_orcid
        if cached is False:
            continue

        # 2) Confirmed by the DOI->ORCID index: record and accept.
        if norm_orcid.split(":", 1)[1] in found_orcids:
            self.tmp_orcid_m.storage_manager.set_value(norm_orcid, True)
            return norm_orcid

        # 3) Offline mode: Redis is the last resort; no API calls are made.
        if not self.use_orcid_api:
            if norm_orcid in self._redis_values_ra:
                self.tmp_orcid_m.storage_manager.set_value(norm_orcid, True)
                return norm_orcid
            return ""

        # 4) Online mode: fall back to API-based validation.
        if norm_orcid in self.to_validated_id_list({"id": norm_orcid, "schema": "orcid"}):
            return norm_orcid

    # Every candidate failed validation.
    return ""

930 

931 # added 

def memory_to_storage(self):
    """Flush every (id, validity) pair cached by the temporary in-memory
    manager into the persistent storage manager, then clear the cache."""
    pending = self.temporary_manager.get_validity_list_of_tuples()
    if pending:
        self.storage_manager.set_multi_value(pending)
    self.temporary_manager.delete_storage()

937 

938 # added (division in first and second iteration) 

def extract_all_ids(self, citation, is_citing: bool):
    """Extract and normalise the identifiers found in a DataCite record.

    First iteration (is_citing=True): collect responsible-agent identifiers
    (creator/editor ORCIDs and the publisher identifier).
    Second iteration (is_citing=False): collect the DOIs of related items
    (citing and/or cited) whose relation type is in self.filter.

    :param citation: the DataCite record (expects an "attributes" mapping).
    :param is_citing: selects which of the two extraction passes to run.
    :return: (all_br, all_ra) — lists of normalised bibliographic-resource
        and responsible-agent identifiers, with None entries removed.
    """
    all_br = set()
    all_ra = set()
    # FIX: normalise once; also guards the second pass against an explicit
    # "attributes": None (the original .get("attributes", {}) would not).
    attributes = citation.get("attributes") or {}

    if is_citing:
        creators = attributes.get("creators")
        if creators:
            for creator in creators:
                # FIX: fetch nameIdentifiers once (the original read it twice).
                name_ids = creator.get("nameIdentifiers")
                if name_ids:
                    all_ra.update(
                        self.orcid_m.normalise(x.get("nameIdentifier"), include_prefix=True)
                        for x in name_ids if x.get("nameIdentifierScheme") == "ORCID")

        contributors = attributes.get("contributors")
        if contributors:
            for contributor in contributors:
                if contributor.get("contributorType") != "Editor":
                    continue
                name_ids = contributor.get("nameIdentifiers")
                if name_ids:
                    all_ra.update(
                        self.orcid_m.normalise(x.get("nameIdentifier"), include_prefix=True)
                        for x in name_ids if x.get("nameIdentifierScheme") == "ORCID")

        publisher = attributes.get("publisher")
        if publisher and publisher.get('publisherIdentifierScheme') and publisher.get('publisherIdentifier'):
            scheme = publisher['publisherIdentifierScheme'].lower().strip().replace(" ", "")
            # FIX: renamed from `id`, which shadowed the builtin.
            publisher_id = publisher['publisherIdentifier']
            if scheme in self.accepted_identifiers_ra:
                # FIX: scheme->manager dispatch table instead of an if/elif
                # chain (and of the dict-initialised-then-set `norm_publisher`).
                managers = {
                    'orcid': self.orcid_m,
                    'ror': self.ror_m,
                    'wikidata': self.wikidata_m,
                    'viaf': self.viaf_m,
                }
                manager = managers.get(scheme)
                if manager is not None:
                    all_ra.add(manager.normalise(publisher_id, include_prefix=True))
    else:
        rel_ids = attributes.get("relatedIdentifiers")
        if rel_ids:
            for ref in rel_ids:
                # Only references carrying every required field are usable.
                if all(field in ref for field in self.needed_info):
                    related_type = str(ref["relatedIdentifierType"]).lower()
                    relation = str(ref["relationType"]).lower()
                    if related_type == "doi" and relation in self.filter:
                        norm_doi = self.doi_m.normalise(ref["relatedIdentifier"], include_prefix=True)
                        if norm_doi:
                            all_br.add(norm_doi)

    # Normalisation may yield None for malformed ids; drop those.
    return [x for x in all_br if x is not None], [y for y in all_ra if y is not None]

1010 

1011 #added 

def get_reids_validity_list(self, id_list, redis_db):
    """Query Redis for a batch of identifiers and return, in input order,
    only those stored as valid.

    :param id_list: iterable of identifiers to check.
    :param redis_db: "ra" (responsible agents) or "br" (bibliographic
        resources); selects which Redis store is queried.
    :return: the identifiers whose existence flag is truthy.
    :raises ValueError: if redis_db is neither "ra" nor "br".
    """
    # FIX: select the client first, so the identical filtering logic is
    # written once instead of duplicated across both branches.
    if redis_db == "ra":
        redis_client = self.RA_redis
    elif redis_db == "br":
        redis_client = self.BR_redis
    else:
        raise ValueError("redis_db must be either 'ra' or 'br'")

    ids = list(id_list)  # freeze a deterministic order so flags align with ids
    validity = redis_client.mexists_as_set(ids)
    return [ids[i] for i, is_valid in enumerate(validity) if is_valid]