Coverage for oc_ds_converter / datacite / datacite_processing.py: 81%
654 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
1# SPDX-FileCopyrightText: 2019-2020 Fabio Mariani <fabio.mariani555@gmail.com>
2# SPDX-FileCopyrightText: 2021-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
3# SPDX-FileCopyrightText: 2023-2025 Arianna Moretti <arianna.moretti4@unibo.it>
4# SPDX-FileCopyrightText: 2023-2026 Marta Soricetti <marta.soricetti@unibo.it>
5#
6# SPDX-License-Identifier: ISC
8import html
9import re
10import warnings
11import os
12from xml.etree.ElementInclude import include
14import fakeredis
15import csv
16import json
18from bs4 import BeautifulSoup
19from pandas.core.apply import include_axis
20from soupsieve.util import lower
22from oc_ds_converter.oc_idmanager import WikidataManager
23from oc_ds_converter.oc_idmanager.doi import DOIManager
24from oc_ds_converter.oc_idmanager.orcid import ORCIDManager
25from oc_ds_converter.lib.master_of_regex import *
26from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
27from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager
28from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import SqliteStorageManager
29from oc_ds_converter.oc_idmanager.issn import ISSNManager
30from oc_ds_converter.oc_idmanager.isbn import ISBNManager
32#for publishers
33from oc_ds_converter.oc_idmanager.arxiv import ArXivManager
34from oc_ds_converter.oc_idmanager.ror import RORManager
35from oc_ds_converter.oc_idmanager.viaf import ViafManager
36from oc_ds_converter.oc_idmanager.crossref import CrossrefManager
38from oc_ds_converter.datasource.redis import RedisDataSource
39from oc_ds_converter.ra_processor import RaProcessor
40from typing import Dict, List, Tuple, Optional, Type, Callable
41from pathlib import Path
42from typing import List, Tuple
44from bs4 import BeautifulSoup
46from oc_ds_converter.datasource.redis import FakeRedisWrapper, RedisDataSource
47from oc_ds_converter.lib.cleaner import Cleaner
48from oc_ds_converter.oc_idmanager.doi import DOIManager
49from oc_ds_converter.oc_idmanager.isbn import ISBNManager
50from oc_ds_converter.oc_idmanager.issn import ISSNManager
51from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager
52from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
53from oc_ds_converter.oc_idmanager.oc_data_storage.batch_manager import BatchManager
54from oc_ds_converter.oc_idmanager.orcid import ORCIDManager
55from oc_ds_converter.ra_processor import RaProcessor
# Suppress UserWarnings emitted by BeautifulSoup (bs4), which is used below to
# strip HTML from titles and venue names (e.g. "markup resembles a URL" notices).
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
60class DataciteProcessing(RaProcessor):
61 def __init__(self, orcid_index: str = None, doi_csv: str = None, publishers_filepath_dc: str = None,
62 testing: bool = True, storage_manager: Optional[StorageManager] = None,
63 use_orcid_api: bool = True, use_ror_api: bool = True, use_viaf_api:bool = True, use_wikidata_api:bool = True,
64 exclude_existing: bool = False):
65 super(DataciteProcessing, self).__init__(orcid_index, doi_csv)
66 # self.preprocessor = DatacitePreProcessing(inp_dir, out_dir, interval, filter)
67 if storage_manager is None:
68 self.storage_manager = RedisStorageManager(testing=testing)
69 else:
70 self.storage_manager = storage_manager
72 self.exclude_existing = exclude_existing
73 self.temporary_manager = BatchManager()
75 self.needed_info = ["relationType", "relatedIdentifierType", "relatedIdentifier"]
76 self.filter = ["references", "isreferencedby", "cites", "iscitedby"]
78 self.accepted_identifiers_ra = ['ror', 'viaf', 'orcid', 'wikidata']
80 self.RIS_types_map = {'abst': 'abstract',
81 'news': 'newspaper article',
82 'slide': 'presentation',
83 'book': 'book',
84 'data': 'dataset',
85 'thes': 'dissertation',
86 'jour': 'journal article',
87 'mgzn': 'journal article',
88 'gen': 'other',
89 'advs': 'other',
90 'video': 'other',
91 'unpb': 'other',
92 'ctlg': 'other',
93 'art': 'other',
94 'case': 'other',
95 'icomm': 'other',
96 'inpr': 'other',
97 'map': 'other',
98 'mpct': 'other',
99 'music': 'other',
100 'pamp': 'other',
101 'pat': 'other',
102 'pcomm': 'other',
103 'catalog': 'other',
104 'elec': 'other',
105 'hear': 'other',
106 'stat': 'other',
107 'bill': 'other',
108 'unbill': 'other',
109 'cpaper': 'proceedings article',
110 'rprt': 'report',
111 'chap': 'book chapter',
112 'ser': 'book series',
113 'jfull': 'journal',
114 'conf': 'proceedings',
115 'comp': 'computer program',
116 'sound': 'audio document'}
117 self.BIBTEX_types_map = {'book': 'book',
118 'mastersthesis': 'dissertation',
119 'phdthesis': 'dissertation',
120 'article': 'journal article',
121 'misc': 'other',
122 'unpublished': 'other',
123 'manual': 'other',
124 'booklet': 'other',
125 'inproceedings': 'proceedings article',
126 'techreport': 'report',
127 'inbook': 'book chapter',
128 'incollection': 'book part',
129 'proceedings': 'proceedings'}
130 self.CITEPROC_types_map = {'book': 'book',
131 'dataset': 'dataset',
132 'thesis': 'dissertation',
133 'article-journal': 'journal article',
134 'article': 'other',
135 'graphic': 'other',
136 'post-weblog': 'web content',
137 'paper-conference': 'proceedings article',
138 'report': 'report',
139 'chapter': 'book chapter',
140 'song': 'audio document'}
141 self.SCHEMAORG_types_map = {'book': 'book',
142 'dataset': 'dataset',
143 'thesis': 'dissertation',
144 'scholarlyarticle': 'journal article',
145 'article': 'journal article',
146 'creativework': 'other',
147 'event': 'other',
148 'service': 'other',
149 'mediaobject': 'other',
150 'review': 'other',
151 'collection': 'other',
152 'imageobject': 'other',
153 'blogposting': 'web content',
154 'report': 'report',
155 'chapter': 'book chapter',
156 'periodical': 'journal',
157 'publicationissue': 'journal issue',
158 'publicationvolume': 'journal volume',
159 'softwaresourcecode': 'computer program',
160 'audioobject': 'audio document'}
161 self.RESOURCETYPEGENERAL_types_map = {'book': 'book',
162 'dataset': 'dataset',
163 'dissertation': 'dissertation',
164 'journalarticle': 'journal article',
165 'text': 'other',
166 'other': 'other',
167 'datapaper': 'other',
168 'audiovisual': 'other',
169 'interactiveresource': 'other',
170 'physicalobject': 'other',
171 'event': 'other',
172 'service': 'other',
173 'collection': 'other',
174 'image': 'other',
175 'model': 'other',
176 'peerreview': 'peer review',
177 'conferencepaper': 'proceedings article',
178 'report': 'report',
179 'bookchapter': 'book chapter',
180 'journal': 'journal',
181 'conferenceproceeding': 'proceedings',
182 'standard': 'standard',
183 'outputmanagementplan': 'data management plan',
184 'preprint': 'preprint',
185 'software': 'computer program',
186 'sound': 'audio document',
187 'workflow': 'workflow'}
189 # def input_preprocessing(self):
190 # self.preprocessor.split_input()
192 self.doi_m = DOIManager(storage_manager=self.storage_manager, testing=testing)
193 self.orcid_m = ORCIDManager(storage_manager=self.storage_manager, use_api_service=use_orcid_api, testing=testing)
194 self.issn_m = ISSNManager()
195 self.isbn_m = ISBNManager()
196 self.ror_m = RORManager(use_api_service=use_ror_api, storage_manager=self.storage_manager)
197 self.viaf_m = ViafManager(use_api_service=use_viaf_api, storage_manager=self.storage_manager)
198 self.wikidata_m = WikidataManager(use_api_service=use_wikidata_api, storage_manager=self.storage_manager)
199 self.use_orcid_api = use_orcid_api
200 self.use_ror_api = use_ror_api
201 self.use_viaf_api = use_viaf_api
202 self.use_wikidata_api = use_wikidata_api
203 self.ra_man_dict = {"orcid": self.orcid_m, "viaf": self.viaf_m, "wikidata": self.wikidata_m, "ror": self.ror_m}
204 self.venue_id_man_dict = {"issn": self.issn_m, "isbn": self.isbn_m}
205 # Temporary storage managers : all data must be stored in tmp storage manager and passed all together to the
206 # main storage_manager only once the full file is processed.
207 self.tmp_doi_m = DOIManager(storage_manager=self.temporary_manager, testing=testing)
208 self.tmp_orcid_m = ORCIDManager(storage_manager=self.temporary_manager, use_api_service=use_orcid_api, testing=testing)
209 self.venue_tmp_id_man_dict = {"issn": self.issn_m, "isbn": self.isbn_m}
210 self.tmp_ror_m = RORManager(use_api_service=use_ror_api, storage_manager=self.temporary_manager)
211 self.tmp_viaf_m = ViafManager(use_api_service=use_viaf_api, storage_manager=self.temporary_manager)
212 self.tmp_wikidata_m = WikidataManager(use_api_service=use_wikidata_api, storage_manager=self.temporary_manager)
213 self.ra_tmp_man_dict = {"orcid": self.tmp_orcid_m, "viaf": self.tmp_viaf_m, "wikidata": self.tmp_wikidata_m, "ror": self.tmp_ror_m}
215 if testing:
216 self.BR_redis = FakeRedisWrapper()
217 self.RA_redis = FakeRedisWrapper()
218 else:
219 self.BR_redis = RedisDataSource("DB-META-BR")
220 self.RA_redis = RedisDataSource("DB-META-RA")
222 self._redis_values_ra = []
223 self._redis_values_br = []
225 if not publishers_filepath_dc:
226 self.publishers_filepath = None
227 else:
228 self.publishers_filepath = publishers_filepath_dc
230 if os.path.exists(self.publishers_filepath):
231 pfp = dict()
232 csv_headers = ("id", "name", "prefix")
233 if self.publishers_filepath.endswith(".csv"):
234 with open(self.publishers_filepath, encoding="utf8") as f:
235 csv_reader = csv.DictReader(f, csv_headers)
236 for row in csv_reader:
237 pfp[row["prefix"]] = {"name": row["name"], "datacite_member": row["id"]}
238 self.publishers_filepath = self.publishers_filepath.replace(".csv", ".json")
239 elif self.publishers_filepath.endswith(".json"):
240 with open(self.publishers_filepath, encoding="utf8") as f:
241 pfp = json.load(f)
242 self.publishers_mapping = pfp
    def get_agents_strings_list(self, doi: str, agents_list: List[dict]) -> Tuple[list, list]:
        """Build the 'Family, Given [orcid:…]' display strings for authors and editors.

        Aligned with the Crossref processor:
        - tries to enrich from the DOI->ORCID index when the ORCID is not in nameIdentifiers
        - the DOI is normalised first (input accepted with or without the 'doi:' prefix)

        :param doi: the DOI of the processed record (used for index lookups).
        :param agents_list: agent dicts with keys such as 'role', 'family', 'given', 'name', 'orcid'.
        :return: (authors_strings_list, editors_string_list).
        """
        authors_strings_list = []
        editors_string_list = []
        # If at least one agent does NOT already carry an 'orcid', load the map from the index
        dict_orcid = None
        norm_doi = self.doi_m.normalise(doi, include_prefix=True) if doi else None
        if not all(('orcid' in a or 'ORCID' in a) for a in agents_list):
            dict_orcid = self.orcid_finder(norm_doi) if norm_doi else None  # see notes in find_datacite_orcid
        # Basic cleaning as in Crossref (odd quotation marks, stray spaces, etc.)
        agents_list = [
            {k: Cleaner(v).remove_unwanted_characters() if k in {'family', 'given', 'name'} and v is not None else v
             for k, v in agent_dict.items()}
            for agent_dict in agents_list
        ]
        for agent in agents_list:
            cur_role = agent.get('role', '')
            f_name = None
            g_name = None
            # Build the "Family, Given" display name as in Crossref
            agent_string = None
            if agent.get('family') and agent.get('given'):
                f_name = agent['family']
                g_name = agent['given']
                agent_string = f_name + ', ' + g_name
            elif agent.get('name'):
                # Fall back to the free-form 'name'; split on a comma when one is present
                agent_string = agent['name']
                f_name = agent_string.split(",")[0].strip() if "," in agent_string else None
                g_name = agent_string.split(",")[-1].strip() if "," in agent_string else None
                if f_name and g_name:
                    agent_string = f_name + ', ' + g_name
            if agent_string is None:
                # Only one of family/given is available
                if agent.get('family') and not agent.get('given'):
                    if g_name:
                        agent_string = agent['family'] + ', ' + g_name
                    else:
                        agent_string = agent['family'] + ', '
                elif agent.get('given') and not agent.get('family'):
                    if f_name:
                        agent_string = f_name + ', ' + agent['given']
                    else:
                        agent_string = ', ' + agent['given']
            # Direct ORCID on the agent?
            orcid = None
            if 'orcid' in agent:
                orcid = str(agent['orcid'][0]) if isinstance(agent['orcid'], list) else str(agent['orcid'])
            elif 'ORCID' in agent:
                orcid = str(agent['ORCID'][0]) if isinstance(agent['ORCID'], list) else str(agent['ORCID'])
            # If present, validate/repair it through our flow
            if orcid:
                orcid = self.find_datacite_orcid([orcid], norm_doi)
            # Otherwise: try to derive it from the index by NAME (as Crossref does)
            elif dict_orcid and f_name:
                for ori in dict_orcid:
                    # dict_orcid[ori] is normally "Family, Given"
                    orc_n = dict_orcid[ori].split(', ')
                    orc_f = orc_n[0].lower()
                    orc_g = orc_n[1] if len(orc_n) == 2 else None
                    if f_name.lower() in orc_f.lower() or orc_f.lower() in (f_name or '').lower():
                        if g_name and orc_g:
                            # Homonym disambiguation (same logic as the Crossref processor)
                            if len([p for p in agents_list if p.get('family') and (
                                    p['family'].lower() in orc_f or orc_f in p['family'].lower())]) > 1:
                                if len([p for p in agents_list if
                                        p.get('given') and p['given'][0].lower() == orc_g[0].lower()]) > 1:
                                    homonyms_list = [p for p in agents_list if
                                                     p.get('given') and p['given'].lower() == orc_g.lower()]
                                    if len(homonyms_list) > 1:
                                        if [p for p in homonyms_list if p.get('role') != cur_role]:
                                            if orc_g.lower() == g_name.lower():
                                                orcid = ori
                                        else:
                                            if orc_g.lower() == g_name.lower():
                                                orcid = ori
                                    elif orc_g[0].lower() == g_name[0].lower():
                                        orcid = ori
                                elif any([p for p in agents_list if
                                          p.get('given') and p['given'].lower() == f_name.lower()]):
                                    if orc_g.lower() == g_name.lower():
                                        orcid = ori
                                else:
                                    orcid = ori
                            else:
                                orcid = ori
            # Normalise a possible index entry that lacks the prefix
            if orcid and not str(orcid).startswith("orcid:"):
                orcid = f"orcid:{orcid}"
            # Append "[orcid:…]" when found
            if agent_string and orcid:
                agent_string += f" [{orcid}]"
            if agent_string:
                if cur_role == 'author':
                    authors_strings_list.append(agent_string)
                elif cur_role == 'editor':
                    editors_string_list.append(agent_string)
        return authors_strings_list, editors_string_list
357 def _normalize_ra(self, r):
358 """Metodo di supporto per normalizzare un Responsible Agent (per update_redis_values)."""
359 r_schema = r.split(":")[0]
360 id_manager = self.get_id_manager(r_schema, self.ra_man_dict)
361 return id_manager.normalise(r, include_prefix=True) if id_manager else None
363 def update_redis_values(self, br, ra):
364 self._redis_values_br = [
365 x for x in (self.doi_m.normalise(b, include_prefix=True) for b in (br or [])) if x
366 ]
367 self._redis_values_ra = [
368 x for x in (self._normalize_ra(r) for r in (ra or [])) if x
369 ]
371 def validated_as(self, id_dict):
372 """ Controllo nello storage temporaneo: Prima verifica se l'id è già stato validato nella memoria/coda temporanea (tmp_doi_m.validated_as_id).
373 Controllo nello storage principale: Se non lo trova (is None), passa al manager principale (doi_m.validated_as_id).
374 Validated_as_id chiede al database locale (lo storage_manager) se ha già una risposta chiara (Vero/Falso) sulla validità di una stringa."""
375 schema = id_dict["schema"].strip().lower()
376 identifier = id_dict["identifier"]
378 if schema == 'doi':
379 validity_value = self.tmp_doi_m.validated_as_id(identifier)
380 if validity_value is None:
381 validity_value = self.doi_m.validated_as_id(identifier)
382 return validity_value
383 else:
384 if schema in self.ra_tmp_man_dict.keys():
385 tmp_id_man = self.get_id_manager(schema, self.ra_tmp_man_dict)
386 id_man = self.get_id_manager(schema, self.ra_man_dict)
387 validity_value = tmp_id_man.validated_as_id(identifier)
388 if validity_value is None:
389 validity_value = id_man.validated_as_id(identifier)
390 return validity_value
393 def get_id_manager(self, schema_or_id, id_man_dict):
394 if ":" in schema_or_id:
395 split_id_prefix = schema_or_id.split(":")
396 schema = split_id_prefix[0]
397 else:
398 schema = schema_or_id
399 id_man = id_man_dict.get(schema)
400 return id_man
402 def normalise_any_id(self, id_with_prefix):
403 id_man = self.get_id_manager(id_with_prefix, self.ra_man_dict)
404 id_no_pref = ":".join(id_with_prefix.split(":")[1:])
405 norm_id_w_pref = id_man.normalise(id_no_pref, include_prefix=True)
406 return norm_id_w_pref
408 def dict_to_cache(self, dict_to_be_saved, path):
409 path = Path(path)
410 parent_dir_path = path.parent.absolute()
411 if not os.path.exists(parent_dir_path):
412 Path(parent_dir_path).mkdir(parents=True, exist_ok=True)
413 with open(path, "w", encoding="utf-8") as fd:
414 json.dump(dict_to_be_saved, fd, ensure_ascii=False, indent=4)
416 def csv_creator_objects(self, doi_object: str):
417 row = dict()
418 keys = ['id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type', 'publisher', 'editor']
419 for k in keys:
420 row[k] = ''
421 row['id'] = doi_object
422 try:
423 return self.normalise_unicode(row)
424 except TypeError:
425 print(row)
426 raise (TypeError)
    def csv_creator(self, item: dict) -> dict:
        """Convert one DataCite record into an OpenCitations Meta CSV row.

        Fills, in order: 'type' (from the first matching type vocabulary), 'id'
        (normalised DOI plus accepted ISBN/ISSN), 'title', 'author', 'pub_date',
        'venue', 'volume'/'issue', 'page', 'publisher' and 'editor'.

        :param item: a DataCite record with 'id' (DOI) and 'attributes'.
        :return: the unicode-normalised row, or {} when the DOI string is empty.
        """
        row = dict()
        doi = str(item['id'])
        if doi:
            norm_id = self.doi_m.normalise(doi, include_prefix=True)
            keys = ['id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type',
                    'publisher', 'editor']
            for k in keys:
                row[k] = ''
            attributes = item['attributes']
            # row['type'] — first vocabulary that yields a mapped value wins (loop breaks)
            if attributes.get('types') is not None:
                types_dict = attributes['types']
                for k, v in types_dict.items():
                    if k.lower() == 'ris':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.RIS_types_map.keys():
                                row['type'] = self.RIS_types_map[norm_v]
                                break
                    if k.lower() == 'bibtex':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.BIBTEX_types_map.keys():
                                row['type'] = self.BIBTEX_types_map[norm_v]
                                break
                    if k.lower() == 'schemaorg':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.SCHEMAORG_types_map.keys():
                                row['type'] = self.SCHEMAORG_types_map[norm_v]
                                break
                    if k.lower() == 'citeproc':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.CITEPROC_types_map.keys():
                                row['type'] = self.CITEPROC_types_map[norm_v]
                                break
                    if k.lower() == 'resourcetypegeneral':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.RESOURCETYPEGENERAL_types_map.keys():
                                row['type'] = self.RESOURCETYPEGENERAL_types_map[norm_v]
                                break
            # row['id'] — the normalised DOI plus any accepted ISBN/ISSN alternate identifiers
            ids_list = list()
            ids_list.append(norm_id)
            if attributes.get('identifiers'):
                for other_id in attributes.get('identifiers'):
                    if other_id.get('identifier') and other_id.get('identifierType'):
                        o_id_type = other_id.get('identifierType')
                        o_id = other_id.get('identifier')
                        # ISBN/ISSN are only meaningful for compatible publication types
                        if o_id_type == 'ISBN':
                            if row['type'] in {'book', 'dissertation', 'edited book', 'monograph', 'reference book', 'report',
                                               'standard'}:
                                self.id_worker(o_id, ids_list, self.isbn_worker)
                        elif o_id_type == 'ISSN':
                            if row['type'] in {'book series', 'book set', 'journal', 'proceedings series', 'series',
                                               'standard series', 'report series'}:
                                self.id_worker(o_id, ids_list, self.issn_worker)
            row['id'] = ' '.join(ids_list)
            # row['title'] — strip HTML markup and collapse whitespace; the LAST non-empty title wins
            pub_title = ""
            if attributes.get("titles"):
                for title in attributes.get("titles"):
                    if title.get("title"):
                        p_title = title.get("title")
                        soup = BeautifulSoup(p_title, 'html.parser')
                        title_soup = soup.get_text().replace('\n', '')
                        title_soup_space_replaced = ' '.join(title_soup.split())
                        title_soup_strip = title_soup_space_replaced.strip()
                        clean_tit = html.unescape(title_soup_strip)
                        pub_title = clean_tit if clean_tit else p_title
            row['title'] = pub_title
            agent_list_authors_only = self.add_authors_to_agent_list(attributes, [], doi)
            agents_list = self.add_editors_to_agent_list(attributes, agent_list_authors_only, doi)
            authors_strings_list, editors_string_list = self.get_agents_strings_list(doi, agents_list)
            # row['author']
            if 'creators' in attributes:
                row['author'] = '; '.join(authors_strings_list)
            # row['pub_date'] — prefer the "Issued" date, fall back to publicationYear
            cur_date = ""
            dates = attributes.get("dates")
            if dates:
                for date in dates:
                    if date.get("dateType") == "Issued":
                        cur_date = date.get("date")
                        break
            if cur_date == "":
                if attributes.get("publicationYear"):
                    cur_date = str(attributes.get("publicationYear"))
            row['pub_date'] = cur_date
            # row['venue']
            row['venue'] = self.get_venue_name(attributes, row)
            issue = ""
            volume = ""
            if attributes.get("container"):
                container = attributes["container"]
                if container and (container.get("identifierType") in ("ISSN", "ISBN")):  # parenthesised membership test fixes an earlier and/or precedence bug
                    if container.get("issue"):
                        issue = container.get("issue")
                    if container.get("volume"):
                        volume = container.get("volume")
            # Fall back to "IsPartOf" related identifiers for whichever of issue/volume is still missing
            if not issue or not volume:
                relatedIdentifiers = attributes.get("relatedIdentifiers")
                if relatedIdentifiers:
                    for related in relatedIdentifiers:
                        if related.get("relationType"):
                            if related.get("relationType").lower() == "ispartof":
                                if related.get("relatedIdentifierType") == "ISSN" or related.get("relatedIdentifierType") == "ISBN":
                                    if not issue and related.get("issue"):
                                        issue = related.get("issue")
                                    if not volume and related.get("volume"):
                                        volume = related.get("volume")
            row['volume'] = volume
            row['issue'] = issue
            row['page'] = self.get_datacite_pages(attributes)
            publisher_dict = attributes.get('publisher')
            if publisher_dict:
                row['publisher'] = self.get_publisher(doi, publisher_dict)
            else:
                row['publisher'] = ''
            # row['editor'] — only filled when at least one "Editor" contributor exists
            if attributes.get("contributors"):
                editors = [contributor for contributor in attributes.get("contributors") if
                           contributor.get("contributorType") == "Editor"]
                if editors:
                    row['editor'] = '; '.join(editors_string_list)
            try:
                return self.normalise_unicode(row)
            except TypeError:
                # NOTE(review): `raise(TypeError)` raises a fresh, message-less TypeError and
                # discards the original traceback; a bare `raise` would be preferable.
                print(row)
                raise(TypeError)
        return {}
581 def to_validated_id_list(self, norm_id_dict):
582 """Questo metodo verifica la validità di un identificatore in base al suo schema (es. 'doi').
583 Ottimizza il processo interrogando prima una cache locale pre-caricata (`_redis_values_br`).
584 Se l'ID viene trovato in questa cache, viene immediatamente contrassegnato come valido nel
585 gestore di archiviazione (storage manager) temporaneo. In caso contrario, avvia un processo
586 di validazione completo tramite il metodo `is_valid` del manager temporaneo.
588 Argomenti:
589 norm_id_dict (dict): Un dizionario contenente l'identificatore normalizzato
590 e il suo schema. Le chiavi attese sono "id" (str) e "schema" (str).
592 Restituisce:
593 list: Una lista contenente l'identificatore valido se la validazione ha successo,
594 oppure una lista vuota se l'identificatore risulta non valido."""
596 valid_id_list = []
597 norm_id = norm_id_dict.get("id")
598 schema = norm_id_dict.get("schema")
599 if schema == "doi":
600 if norm_id in self._redis_values_br:
601 self.tmp_doi_m.storage_manager.set_value(norm_id, True)
602 valid_id_list.append(norm_id)
603 elif self.tmp_doi_m.is_valid(norm_id):
604 valid_id_list.append(norm_id)
607 elif schema in self.accepted_identifiers_ra:
608 tmp_id_man = self.get_id_manager(schema, self.ra_tmp_man_dict)
609 use_schema_api = getattr(self, f"use_{schema}_api", False)
610 if norm_id in self._redis_values_ra:
611 tmp_id_man.storage_manager.set_value(norm_id, True)
612 valid_id_list.append(norm_id)
613 elif not use_schema_api:
614 pass
615 elif tmp_id_man.is_valid(norm_id):
616 valid_id_list.append(norm_id)
618 else:
619 print("Schema not accepted:", norm_id_dict.get("schema"), "in ", norm_id_dict)
620 return valid_id_list
622 #no modified
623 def get_datacite_pages(self, item: dict) -> str:
624 container_pages_list = list()
625 related_pages_list = list()
626 container = item.get("container")
627 if container:
628 if container.get("identifierType") == "ISSN" or container.get("identifierType") == "ISBN":
629 if container.get("firstPage"):
630 container_pages_list.append(container.get("firstPage"))
631 if container.get("lastPage"):
632 container_pages_list.append(container.get("lastPage"))
634 relatedIdentifiers = item.get("relatedIdentifiers")
635 if relatedIdentifiers:
636 for related in relatedIdentifiers:
637 if related.get("relationType"):
638 if related.get("relationType").lower() == "ispartof":
639 if related.get("relatedIdentifierType") == "ISSN" or related.get("relatedIdentifierType") == "ISBN":
640 if related.get("firstPage"):
641 related_pages_list.append(related.get("firstPage"))
642 if related.get("lastPage"):
643 related_pages_list.append(related.get("lastPage"))
645 page_list = related_pages_list if len(related_pages_list)> len(container_pages_list) else container_pages_list
646 return self.get_pages(page_list)
648 #modified
    def get_publisher(self, doi: str, publisher_item: dict) -> str:
        """Build the Meta 'publisher' cell from a DataCite publisher object.

        https://datacite-metadata-schema.readthedocs.io/en/4.6/properties/publisher/
        "publisher":
        {
        "name": "Dryad",
        "schemeUri": "https://ror.org",
        "publisherIdentifier": "https://ror.org/00x6h5n95",
        "publisherIdentifierScheme": "ROR",
        "lang": "en"
        }
        Accepted identifiers for publishers in Meta: https://opencitations.github.io/oc_meta/reference/csv_format/#responsible-agents
        [orcid, viaf, crossref, wikidata, ror]
        """
        publisher_name = publisher_item.get("name") or ""
        # 1. Normalisation and exclusion of invalid publisher names via regex
        #    (placeholders such as "(:unav)", "[unknown]", "s.n.", "not specified", ...)
        if publisher_name:
            txt = publisher_name.lower().strip()
            txt_no_spaces = txt.replace(' ', '')
            # NOTE(review): in the "not specified|not provided" pattern the `|` binds the
            # whole left alternative, which lacks the trailing `$` — so "not specified xyz"
            # also matches. Confirm whether the prefix match is intended.
            if (re.match(r"\(?:unav\)?", txt) or
                    re.match(r"\(?:unkn\)?", txt) or
                    re.match(r".*publ?isher not identified.*", txt) or
                    re.match(r"^\[?unknown]?(:*\[?unknown]?)*$", txt_no_spaces) or
                    re.match(r"^not yet(?: published)?$", txt) or
                    re.match(r"[\[({]*s\.*[ln]\.*[)}\]]*([,:][\[({]*s\.*n\.*[)}\]]*)*", txt_no_spaces) or
                    re.match(r"^(publisher )*not(?: specified\.*)|^(publisher )*not(?: provided\.*)$", txt) or
                    re.match(r"^not known$", txt) or
                    re.match(r"^(information )?not available.*", txt)):
                publisher_name = ""
        # 2. Identifier extraction and base publisher-string composition
        publisher_id = self.get_publisher_id(publisher_item)
        publisher_id = f"[{publisher_id}]" if publisher_id else ""
        publisher = f"{publisher_name} {publisher_id}".strip()
        # 3. Override with the DataCite prefix mapping (flat structure, no deep nesting)
        prefix = doi.split('/')[0] if doi else ""
        if prefix and self.publishers_mapping and prefix in self.publishers_mapping:
            mapped_data = self.publishers_mapping[prefix]
            name = mapped_data.get("name", "")
            member = mapped_data.get("datacite_member")
            return f"{name} [datacite:{member}]" if member else name
        # Fallback when there is no mapping entry for this prefix
        return publisher
699 def get_publisher_id(self, publisher_item: dict) -> str:
700 publisher_id = publisher_item.get("publisherIdentifier")
701 if not publisher_id:
702 return ""
704 raw_scheme = publisher_item.get('publisherIdentifierScheme')
705 if not raw_scheme:
706 return ""
708 scheme = raw_scheme.lower().strip().replace(" ", "")
710 # Early return se lo schema non è accettato
711 if scheme not in self.accepted_identifiers_ra:
712 return ""
714 id_man = self.get_id_manager(scheme, self.ra_man_dict)
715 tmp_id_man = self.get_id_manager(scheme, self.ra_tmp_man_dict)
717 if not id_man or not tmp_id_man:
718 return ""
720 norm_id = id_man.normalise(publisher_id, include_prefix=True)
721 if not norm_id:
722 return ""
724 # --- Inizio fase di validazione ---
725 #controllo sia nello storage temporaneo che in quello generale
726 validity = self.validated_as({"identifier": norm_id, "schema": scheme})
727 if validity is True:
728 return norm_id
729 elif validity is False:
730 return ""
732 # Recupero dinamico dell'impostazione API
733 use_schema_api = getattr(self, f"use_{scheme}_api", False)
735 # Modalità Offline
736 if not use_schema_api:
737 if norm_id in self._redis_values_ra:
738 tmp_id_man.storage_manager.set_value(norm_id, True)
739 return norm_id
740 return "" # Stop qui: se l'API è spenta e non è in Redis, è invalido
742 # Modalità Online (API)
743 norm_id_dict = {"id": norm_id, "schema": scheme}
744 if norm_id in self.to_validated_id_list(norm_id_dict):
745 return norm_id
747 #Se arriva fin qui, la validazione API è fallita.
748 return ""
750 #no modified
    def get_venue_name(self, item: dict, row: dict) -> str:
        """Build the venue cell: cleaned container title plus validated ISSN/ISBN ids in brackets.

        :param item: the DataCite 'attributes' dict.
        :param row: the row built so far (row['type'] gates which ids are accepted).
        :return: "title [ids]", "[ids]", the bare title, or "".
        """
        cont_title = ""
        venids_list = list()
        container = item.get("container")
        if container:
            if container.get("title"):
                cont_title = (container["title"].lower()).replace('\n', '')
                ven_soup = BeautifulSoup(cont_title, 'html.parser')
                ventit = html.unescape(ven_soup.get_text())
                # Rewrite a bracketed "schema:id ..." span as parentheses so it is not
                # mistaken for the identifier suffix appended below.
                ambiguous_brackets = re.search(r'\[\s*((?:[^\s]+:[^\s]+)?(?:\s+[^\s]+:[^\s]+)*)\s*\]', ventit)
                if ambiguous_brackets:
                    match = ambiguous_brackets.group(1)
                    open_bracket = ventit.find(match) - 1
                    close_bracket = ventit.find(match) + len(match)
                    ventit = ventit[:open_bracket] + '(' + ventit[open_bracket + 1:]
                    ventit = ventit[:close_bracket] + ')' + ventit[close_bracket + 1:]
                cont_title = ventit
            # Container-level identifiers, accepted only for compatible publication types
            if container.get("identifierType") == "ISBN":
                if row['type'] in {'book chapter', 'book part', 'book section', 'book track', 'reference entry'}:
                    try:
                        self.id_worker(container.get("identifier"), venids_list, self.isbn_worker)
                    except ValueError:
                        print(f'''{container.get("identifier")} raised a value error''')
            if container.get("identifierType") == "ISSN":
                if row['type'] in {'book', 'data file', 'dataset', 'edited book', 'journal article', 'journal volume',
                                   'journal issue', 'monograph', 'proceedings', 'peer review', 'reference book',
                                   'reference entry', 'report'}:
                    try:
                        self.id_worker(container.get("identifier"), venids_list, self.issn_worker)
                    except ValueError:
                        print(f'''{container.get("identifier")} raised a value error''')
                elif row['type'] == 'report series':
                    # NOTE(review): the duplicated title check looks accidental — confirm intent
                    if container.get("title"):
                        if container.get("title"):
                            try:
                                self.id_worker(container.get("identifier"), venids_list, self.issn_worker)
                            except ValueError:
                                print(f'''{container.get("identifier")} raised a value error''')
        # Fallback: mine "IsPartOf" related identifiers when the container yielded none
        if not venids_list:
            relatedIdentifiers = item.get("relatedIdentifiers")
            if relatedIdentifiers:
                for related in relatedIdentifiers:
                    if related.get("relationType"):
                        if related.get("relationType").lower() == "ispartof":
                            if related.get("relatedIdentifierType") == "ISBN":
                                if row['type'] in {'book chapter', 'book part', 'book section', 'book track',
                                                   'reference entry'}:
                                    self.id_worker(related.get("relatedIdentifier"), venids_list, self.isbn_worker)
                            if related.get("relatedIdentifierType") == "ISSN":
                                if row['type'] in {'book', 'data file', 'dataset', 'edited book', 'journal article',
                                                   'journal volume',
                                                   'journal issue', 'monograph', 'proceedings', 'peer review',
                                                   'reference book',
                                                   'reference entry', 'report'}:
                                    self.id_worker(related.get("relatedIdentifier"), venids_list, self.issn_worker)
                                elif row['type'] == 'report series':
                                    # NOTE(review): duplicated title check, as above — confirm intent
                                    if related.get("title"):
                                        if related.get("title"):
                                            self.id_worker(related.get("relatedIdentifier"), venids_list, self.issn_worker)
        if venids_list:
            name_and_id = cont_title + ' [' + ' '.join(venids_list) + ']' if cont_title else '[' + ' '.join(venids_list) + ']'
        else:
            name_and_id = cont_title
        return name_and_id
822 #added the call to find_datacite_orcid
823 def add_editors_to_agent_list(self, item: dict, ag_list: list, doi: str) -> list:
824 agent_list = ag_list
825 contributors = item.get("contributors")
826 if contributors:
827 editors = [contributor for contributor in contributors if
828 contributor.get("contributorType") == "Editor"]
829 for ed in editors:
830 agent = {}
831 agent["role"] = "editor"
832 if ed.get('name'):
833 agent["name"] = ed.get("name")
834 if ed.get("nameType") == "Personal" or ("familyName" in ed or "givenName" in ed):
835 agent["family"] = ed.get("familyName")
836 agent["given"] = ed.get("givenName")
837 if ed.get("nameIdentifiers"):
838 orcid_ids = [x.get("nameIdentifier") for x in ed.get("nameIdentifiers")
839 if x.get("nameIdentifierScheme") == "ORCID"]
840 if orcid_ids:
841 orcid_id = self.find_datacite_orcid(orcid_ids, doi)
842 if orcid_id:
843 agent["orcid"] = orcid_id
845 missing_names = [x for x in ["family", "given", "name"] if x not in agent]
846 for mn in missing_names:
847 agent[mn] = ""
848 agent_list.append(agent)
849 return agent_list
    # NOTE: ORCID resolution for creators is delegated to find_datacite_orcid
852 def add_authors_to_agent_list(self, item: dict, ag_list: list, doi: str) -> list:
853 agent_list = ag_list
854 creators = item.get("creators")
855 if creators:
856 for c in creators:
857 agent = {}
858 agent["role"] = "author"
859 if c.get("name"):
860 agent["name"] = c.get("name")
861 if c.get("nameType") == "Personal" or ("familyName" in c or "givenName" in c):
862 agent["family"] = c.get("familyName")
863 agent["given"] = c.get("givenName")
864 if c.get("nameIdentifiers"):
865 orcid_ids = [x.get("nameIdentifier") for x in c.get("nameIdentifiers")
866 if x.get("nameIdentifierScheme") == "ORCID"]
867 if orcid_ids:
868 orcid_id = self.find_datacite_orcid(orcid_ids, doi)
869 if orcid_id:
870 agent["orcid"] = orcid_id
871 missing_names = [x for x in ["family", "given", "name"] if x not in agent]
872 for mn in missing_names:
873 agent[mn] = ""
874 agent_list.append(agent)
875 return agent_list
    # Resolve a candidate-ORCID list against the local cache, the DOI->ORCID index,
    # Redis (offline mode) or the ORCID API (online mode).
    def find_datacite_orcid(self, all_author_ids: list, doi: str = None) -> str:
        """Return the first ORCID in *all_author_ids* that can be confirmed as valid.

        Confirmation is attempted, in order, through: the local validity cache
        (``validated_as``), the DOI->ORCID index (``orcid_finder``), a Redis set
        of known-valid responsible-agent ids (offline mode), and finally
        ``to_validated_id_list`` (online mode, may hit the ORCID API).

        :params all_author_ids: candidate ORCID strings for one agent
        :params doi: the DOI of the record the agent belongs to, used to query
            the DOI->ORCID index
        :returns: the normalised ORCID (with ``orcid:`` prefix) or ``""`` if no
            candidate could be validated
        """
        if not all_author_ids:
            return ""

        # normalise the DOI, keeping its "doi:" prefix
        norm_doi = self.doi_m.normalise(doi, include_prefix=True) if doi else None
        found_orcids = set()

        # FIX: also query the index with the opposite prefix form, since entries
        # may be stored either with or without the "doi:" prefix
        if norm_doi:
            alt_doi = norm_doi.replace("doi:", "") if norm_doi.startswith("doi:") else f"doi:{norm_doi}"
            raw = (
                self.orcid_finder(norm_doi)
                or self.orcid_finder(alt_doi)
            )
            # orcid_finder's result shape varies (dict / set / list / str);
            # reduce every shape to a set of bare ORCID values
            if isinstance(raw, dict):
                found_orcids = {k.replace("orcid:", "").strip() for k in raw.keys()}
            elif isinstance(raw, (set, list)):
                for v in list(raw):
                    m = re.findall(r"(\d{4}-\d{4}-\d{4}-\d{3,4}[0-9X])", str(v))
                    found_orcids.update(m)
            elif isinstance(raw, str):
                m = re.findall(r"(\d{4}-\d{4}-\d{4}-\d{3,4}[0-9X])", raw)
                found_orcids.update(m)

        for identifier in all_author_ids:
            norm_orcid = self.orcid_m.normalise(identifier, include_prefix=True)
            if not norm_orcid:
                continue

            # cached verdict: True -> accept, False -> reject, anything else -> unknown
            validity = self.validated_as({"identifier": norm_orcid, "schema": "orcid"})
            if validity is True:
                return norm_orcid
            if validity is False:
                continue

            # a hit in the DOI->ORCID index counts as a positive confirmation
            # and is memoised in the temporary ORCID storage
            bare_orcid = norm_orcid.split(":", 1)[1]
            if bare_orcid in found_orcids:
                self.tmp_orcid_m.storage_manager.set_value(norm_orcid, True)
                return norm_orcid

            if not self.use_orcid_api:
                if norm_orcid in self._redis_values_ra:
                    self.tmp_orcid_m.storage_manager.set_value(norm_orcid, True)
                    return norm_orcid
                # offline: if not in redis, stop here.
                # NOTE(review): this returns "" without trying the remaining
                # candidates — confirm the early exit is intentional
                return ""

            # online path: defer to the validation pipeline (may call the API)
            norm_id_dict = {"id": norm_orcid, "schema": "orcid"}
            if norm_orcid in self.to_validated_id_list(norm_id_dict):
                return norm_orcid

        return ""
    # Flush the in-memory validity cache into persistent storage.
932 def memory_to_storage(self):
933 kv_in_memory = self.temporary_manager.get_validity_list_of_tuples()
934 if kv_in_memory:
935 self.storage_manager.set_multi_value(kv_in_memory)
936 self.temporary_manager.delete_storage()
    # NOTE: the extraction is split in two passes — responsible agents first, related DOIs second.
939 def extract_all_ids(self, citation, is_citing: bool):
941 """Nella prima iterazione estraggo e normalizzo gli identificativi dei RA (authors, editors, publishers)"""
942 if is_citing:
943 all_br = set()
944 all_ra = set()
946 attributes = citation.get("attributes")
947 if attributes:
948 creators = attributes.get("creators")
949 if creators:
950 for c in creators:
951 c_ids = c.get("nameIdentifiers")
952 if c_ids:
953 norm_c_orcids = {self.orcid_m.normalise(x.get("nameIdentifier"), include_prefix=True) for x in c.get("nameIdentifiers") if
954 x.get("nameIdentifierScheme") == "ORCID"}
955 if norm_c_orcids:
956 all_ra.update(norm_c_orcids)
958 if attributes.get("contributors"):
959 editors = [contributor for contributor in attributes.get("contributors") if
960 contributor.get("contributorType") == "Editor"]
961 for ed in editors:
962 if ed.get("nameIdentifiers"):
963 norm_ed_orcids = {self.orcid_m.normalise(x.get("nameIdentifier"), include_prefix=True) for x in ed.get("nameIdentifiers") if
964 x.get("nameIdentifierScheme") == "ORCID"}
965 if norm_ed_orcids:
966 all_ra.update(norm_ed_orcids)
968 publisher = attributes.get("publisher")
969 if publisher:
970 if publisher.get('publisherIdentifierScheme') and publisher.get('publisherIdentifier'):
971 identifier_scheme = publisher['publisherIdentifierScheme']
972 id_scheme_lower = identifier_scheme.lower().strip().replace(" ", "")
973 id = publisher['publisherIdentifier']
974 if id_scheme_lower in self.accepted_identifiers_ra:
975 norm_publisher = {}
976 if id_scheme_lower == 'orcid':
977 norm_publisher = {self.orcid_m.normalise(id, include_prefix=True)}
978 elif id_scheme_lower == 'ror':
979 norm_publisher = {self.ror_m.normalise(id, include_prefix=True)}
980 elif id_scheme_lower == 'wikidata':
981 norm_publisher = {self.wikidata_m.normalise(id, include_prefix=True)}
982 elif id_scheme_lower == 'viaf':
983 norm_publisher = {self.viaf_m.normalise(id, include_prefix=True)}
984 if norm_publisher:
985 all_ra.update(norm_publisher)
987 all_br = [x for x in all_br if x is not None]
988 all_ra = [y for y in all_ra if y is not None]
989 return all_br, all_ra
991 # Nella seconda iterazione normalizzo ed estraggo i doi dei related items (citanti e/o citati)
992 else:
993 all_br = set()
994 all_ra = set()
995 attributes = citation.get("attributes", {}) # evita KeyError
996 rel_ids = attributes.get("relatedIdentifiers")
997 if rel_ids:
998 for ref in rel_ids:
999 if all(elem in ref for elem in self.needed_info):
1000 relatedIdentifierType = (str(ref["relatedIdentifierType"])).lower()
1001 relationType = str(ref["relationType"]).lower()
1002 if relatedIdentifierType == "doi":
1003 if relationType in self.filter:
1004 rel_id = self.doi_m.normalise(ref["relatedIdentifier"], include_prefix=True)
1005 if rel_id:
1006 all_br.add(rel_id)
1007 all_br = [x for x in all_br if x is not None]
1008 all_ra = [y for y in all_ra if y is not None]
1009 return all_br, all_ra
    # Filter a batch of identifiers through Redis, keeping only the ones recorded as valid.
1012 def get_reids_validity_list(self, id_list, redis_db):
1013 """Questo metodo interroga Redis per una lista di identificativi e restituisce solo quelli che risultano salvati come validi"""
1014 ids = list(id_list) # garantisci ordine deterministico
1015 if redis_db == "ra":
1016 validity = self.RA_redis.mexists_as_set(ids)
1017 return [ids[i] for i, v in enumerate(validity) if v]
1018 elif redis_db == "br":
1019 validity = self.BR_redis.mexists_as_set(ids)
1020 return [ids[i] for i, v in enumerate(validity) if v]
1021 else:
1022 raise ValueError("redis_db must be either 'ra' or 'br'")