Coverage for oc_ds_converter / datacite / datacite_processing.py: 81%
654 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
1# SPDX-FileCopyrightText: 2019-2020 Fabio Mariani <fabio.mariani555@gmail.com>
2# SPDX-FileCopyrightText: 2021-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
3# SPDX-FileCopyrightText: 2023-2025 Arianna Moretti <arianna.moretti4@unibo.it>
4# SPDX-FileCopyrightText: 2023-2026 Marta Soricetti <marta.soricetti@unibo.it>
5#
6# SPDX-License-Identifier: ISC
8import html
9import re
10import warnings
11import os
12from xml.etree.ElementInclude import include
14import fakeredis
15import csv
16import json
18from bs4 import BeautifulSoup
19from pandas.core.apply import include_axis
20from soupsieve.util import lower
22from oc_ds_converter.oc_idmanager import WikidataManager
23from oc_ds_converter.oc_idmanager.doi import DOIManager
24from oc_ds_converter.oc_idmanager.orcid import ORCIDManager
25from oc_ds_converter.lib.master_of_regex import *
26from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
27from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager
28from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import SqliteStorageManager
29from oc_ds_converter.oc_idmanager.issn import ISSNManager
30from oc_ds_converter.oc_idmanager.isbn import ISBNManager
32#for publishers
33from oc_ds_converter.oc_idmanager.arxiv import ArXivManager
34from oc_ds_converter.oc_idmanager.ror import RORManager
35from oc_ds_converter.oc_idmanager.viaf import ViafManager
36from oc_ds_converter.oc_idmanager.crossref import CrossrefManager
38from oc_ds_converter.datasource.redis import RedisDataSource
39from oc_ds_converter.ra_processor import RaProcessor
40from typing import Dict, List, Tuple, Optional, Type, Callable
41from pathlib import Path
42from typing import List, Tuple
44from bs4 import BeautifulSoup
46from oc_ds_converter.datasource.redis import FakeRedisWrapper, RedisDataSource
47from oc_ds_converter.lib.cleaner import Cleaner
48from oc_ds_converter.oc_idmanager.doi import DOIManager
49from oc_ds_converter.oc_idmanager.isbn import ISBNManager
50from oc_ds_converter.oc_idmanager.issn import ISSNManager
51from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager
52from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
53from oc_ds_converter.oc_idmanager.oc_data_storage.batch_manager import BatchManager
54from oc_ds_converter.oc_idmanager.orcid import ORCIDManager
55from oc_ds_converter.ra_processor import RaProcessor
# Suppress UserWarnings emitted by BeautifulSoup (bs4), which is used below to
# strip HTML from titles and venue names (e.g. "markup resembles a URL" notices).
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
60class DataciteProcessing(RaProcessor):
61 def __init__(self, orcid_index: str = None, doi_csv: str = None, publishers_filepath_dc: str = None,
62 testing: bool = True, storage_manager: Optional[StorageManager] = None,
63 use_orcid_api: bool = True, use_ror_api: bool = True, use_viaf_api:bool = True, use_wikidata_api:bool = True,
64 exclude_existing: bool = False):
65 super(DataciteProcessing, self).__init__(orcid_index, doi_csv)
66 # self.preprocessor = DatacitePreProcessing(inp_dir, out_dir, interval, filter)
67 if storage_manager is None:
68 self.storage_manager = RedisStorageManager(testing=testing)
69 else:
70 self.storage_manager = storage_manager
72 self.exclude_existing = exclude_existing
73 self.temporary_manager = BatchManager()
75 self.needed_info = ["relationType", "relatedIdentifierType", "relatedIdentifier"]
76 self.filter = ["references", "isreferencedby", "cites", "iscitedby"]
78 self.accepted_identifiers_ra = ['ror', 'viaf', 'orcid', 'wikidata']
80 self.RIS_types_map = {'abst': 'abstract',
81 'news': 'newspaper article',
82 'slide': 'presentation',
83 'book': 'book',
84 'data': 'dataset',
85 'thes': 'dissertation',
86 'jour': 'journal article',
87 'mgzn': 'journal article',
88 'gen': 'other',
89 'advs': 'other',
90 'video': 'other',
91 'unpb': 'other',
92 'ctlg': 'other',
93 'art': 'other',
94 'case': 'other',
95 'icomm': 'other',
96 'inpr': 'other',
97 'map': 'other',
98 'mpct': 'other',
99 'music': 'other',
100 'pamp': 'other',
101 'pat': 'other',
102 'pcomm': 'other',
103 'catalog': 'other',
104 'elec': 'other',
105 'hear': 'other',
106 'stat': 'other',
107 'bill': 'other',
108 'unbill': 'other',
109 'cpaper': 'proceedings article',
110 'rprt': 'report',
111 'chap': 'book chapter',
112 'ser': 'book series',
113 'jfull': 'journal',
114 'conf': 'proceedings',
115 'comp': 'computer program',
116 'sound': 'audio document'}
117 self.BIBTEX_types_map = {'book': 'book',
118 'mastersthesis': 'dissertation',
119 'phdthesis': 'dissertation',
120 'article': 'journal article',
121 'misc': 'other',
122 'unpublished': 'other',
123 'manual': 'other',
124 'booklet': 'other',
125 'inproceedings': 'proceedings article',
126 'techreport': 'report',
127 'inbook': 'book chapter',
128 'incollection': 'book part',
129 'proceedings': 'proceedings'}
130 self.CITEPROC_types_map = {'book': 'book',
131 'dataset': 'dataset',
132 'thesis': 'dissertation',
133 'article-journal': 'journal article',
134 'article': 'other',
135 'graphic': 'other',
136 'post-weblog': 'web content',
137 'paper-conference': 'proceedings article',
138 'report': 'report',
139 'chapter': 'book chapter',
140 'song': 'audio document'}
141 self.SCHEMAORG_types_map = {'book': 'book',
142 'dataset': 'dataset',
143 'thesis': 'dissertation',
144 'scholarlyarticle': 'journal article',
145 'article': 'journal article',
146 'creativework': 'other',
147 'event': 'other',
148 'service': 'other',
149 'mediaobject': 'other',
150 'review': 'other',
151 'collection': 'other',
152 'imageobject': 'other',
153 'blogposting': 'web content',
154 'report': 'report',
155 'chapter': 'book chapter',
156 'periodical': 'journal',
157 'publicationissue': 'journal issue',
158 'publicationvolume': 'journal volume',
159 'softwaresourcecode': 'computer program',
160 'audioobject': 'audio document'}
161 self.RESOURCETYPEGENERAL_types_map = {'book': 'book',
162 'dataset': 'dataset',
163 'dissertation': 'dissertation',
164 'journalarticle': 'journal article',
165 'text': 'other',
166 'other': 'other',
167 'datapaper': 'other',
168 'audiovisual': 'other',
169 'interactiveresource': 'other',
170 'physicalobject': 'other',
171 'event': 'other',
172 'service': 'other',
173 'collection': 'other',
174 'image': 'other',
175 'model': 'other',
176 'peerreview': 'peer review',
177 'conferencepaper': 'proceedings article',
178 'report': 'report',
179 'bookchapter': 'book chapter',
180 'journal': 'journal',
181 'conferenceproceeding': 'proceedings',
182 'standard': 'standard',
183 'outputmanagementplan': 'data management plan',
184 'preprint': 'preprint',
185 'software': 'computer program',
186 'sound': 'audio document',
187 'workflow': 'workflow'}
189 # def input_preprocessing(self):
190 # self.preprocessor.split_input()
192 self.doi_m = DOIManager(storage_manager=self.storage_manager, testing=testing)
193 self.orcid_m = ORCIDManager(storage_manager=self.storage_manager, use_api_service=use_orcid_api, testing=testing)
194 self.issn_m = ISSNManager()
195 self.isbn_m = ISBNManager()
196 self.ror_m = RORManager(use_api_service=use_ror_api, storage_manager=self.storage_manager)
197 self.viaf_m = ViafManager(use_api_service=use_viaf_api, storage_manager=self.storage_manager)
198 self.wikidata_m = WikidataManager(use_api_service=use_wikidata_api, storage_manager=self.storage_manager)
199 self.use_orcid_api = use_orcid_api
200 self.use_ror_api = use_ror_api
201 self.use_viaf_api = use_viaf_api
202 self.use_wikidata_api = use_wikidata_api
203 self.ra_man_dict = {"orcid": self.orcid_m, "viaf": self.viaf_m, "wikidata": self.wikidata_m, "ror": self.ror_m}
204 self.venue_id_man_dict = {"issn": self.issn_m, "isbn": self.isbn_m}
205 # Temporary storage managers : all data must be stored in tmp storage manager and passed all together to the
206 # main storage_manager only once the full file is processed.
207 self.tmp_doi_m = DOIManager(storage_manager=self.temporary_manager, testing=testing)
208 self.tmp_orcid_m = ORCIDManager(storage_manager=self.temporary_manager, use_api_service=use_orcid_api, testing=testing)
209 self.venue_tmp_id_man_dict = {"issn": self.issn_m, "isbn": self.isbn_m}
210 self.tmp_ror_m = RORManager(use_api_service=use_ror_api, storage_manager=self.temporary_manager)
211 self.tmp_viaf_m = ViafManager(use_api_service=use_viaf_api, storage_manager=self.temporary_manager)
212 self.tmp_wikidata_m = WikidataManager(use_api_service=use_wikidata_api, storage_manager=self.temporary_manager)
213 self.ra_tmp_man_dict = {"orcid": self.tmp_orcid_m, "viaf": self.tmp_viaf_m, "wikidata": self.tmp_wikidata_m, "ror": self.tmp_ror_m}
215 if testing:
216 self.BR_redis = FakeRedisWrapper()
217 self.RA_redis = FakeRedisWrapper()
218 else:
219 self.BR_redis = RedisDataSource("DB-META-BR")
220 self.RA_redis = RedisDataSource("DB-META-RA")
222 self._redis_values_ra = []
223 self._redis_values_br = []
225 if not publishers_filepath_dc:
226 self.publishers_filepath = None
227 else:
228 self.publishers_filepath = publishers_filepath_dc
230 if os.path.exists(self.publishers_filepath):
231 pfp = dict()
232 csv_headers = ("id", "name", "prefix")
233 if self.publishers_filepath.endswith(".csv"):
234 with open(self.publishers_filepath, encoding="utf8") as f:
235 csv_reader = csv.DictReader(f, csv_headers)
236 for row in csv_reader:
237 pfp[row["prefix"]] = {"name": row["name"], "datacite_member": row["id"]}
238 self.publishers_filepath = self.publishers_filepath.replace(".csv", ".json")
239 elif self.publishers_filepath.endswith(".json"):
240 with open(self.publishers_filepath, encoding="utf8") as f:
241 pfp = json.load(f)
242 self.publishers_mapping = pfp
    def get_agents_strings_list(self, doi: str, agents_list: List[dict]) -> Tuple[list, list]:
        """Build the 'Family, Given [orcid:…]' display strings for authors and editors.

        Aligned with the Crossref processor:
        - tries to enrich from the DOI->ORCID index when the ORCID is not in nameIdentifiers
        - the DOI is normalised first (input accepted with or without the 'doi:' prefix)

        :param doi: the DOI of the processed record (used for index lookups).
        :param agents_list: agent dicts with keys such as 'role', 'family', 'given', 'name', 'orcid'.
        :return: (authors_strings_list, editors_string_list).
        """
        authors_strings_list = []
        editors_string_list = []
        # If at least one agent does NOT already carry an 'orcid', load the map from the index
        dict_orcid = None
        norm_doi = self.doi_m.normalise(doi, include_prefix=True) if doi else None
        if not all(('orcid' in a or 'ORCID' in a) for a in agents_list):
            dict_orcid = self.orcid_finder(norm_doi) if norm_doi else None  # see notes in find_datacite_orcid
        # Basic cleaning as in Crossref (odd quotation marks, stray spaces, etc.)
        agents_list = [
            {k: Cleaner(v).remove_unwanted_characters() if k in {'family', 'given', 'name'} and v is not None else v
             for k, v in agent_dict.items()}
            for agent_dict in agents_list
        ]
        for agent in agents_list:
            cur_role = agent.get('role', '')
            f_name = None
            g_name = None
            # Build the "Family, Given" display name as in Crossref
            agent_string = None
            if agent.get('family') and agent.get('given'):
                f_name = agent['family']
                g_name = agent['given']
                agent_string = f_name + ', ' + g_name
            elif agent.get('name'):
                # Fall back to the free-form 'name'; split on a comma when one is present
                agent_string = agent['name']
                f_name = agent_string.split(",")[0].strip() if "," in agent_string else None
                g_name = agent_string.split(",")[-1].strip() if "," in agent_string else None
                if f_name and g_name:
                    agent_string = f_name + ', ' + g_name
            if agent_string is None:
                # Only one of family/given is available
                if agent.get('family') and not agent.get('given'):
                    if g_name:
                        agent_string = agent['family'] + ', ' + g_name
                    else:
                        agent_string = agent['family'] + ', '
                elif agent.get('given') and not agent.get('family'):
                    if f_name:
                        agent_string = f_name + ', ' + agent['given']
                    else:
                        agent_string = ', ' + agent['given']
            # Direct ORCID on the agent?
            orcid = None
            if 'orcid' in agent:
                orcid = str(agent['orcid'][0]) if isinstance(agent['orcid'], list) else str(agent['orcid'])
            elif 'ORCID' in agent:
                orcid = str(agent['ORCID'][0]) if isinstance(agent['ORCID'], list) else str(agent['ORCID'])
            # If present, validate/repair it through our flow
            if orcid:
                orcid = self.find_datacite_orcid([orcid], norm_doi)
            # Otherwise: try to derive it from the index by NAME (as Crossref does)
            elif dict_orcid and f_name:
                for ori in dict_orcid:
                    # dict_orcid[ori] is normally "Family, Given"
                    orc_n = dict_orcid[ori].split(', ')
                    orc_f = orc_n[0].lower()
                    orc_g = orc_n[1] if len(orc_n) == 2 else None
                    if f_name.lower() in orc_f.lower() or orc_f.lower() in (f_name or '').lower():
                        if g_name and orc_g:
                            # Homonym disambiguation (same logic as the Crossref processor)
                            if len([p for p in agents_list if p.get('family') and (
                                    p['family'].lower() in orc_f or orc_f in p['family'].lower())]) > 1:
                                if len([p for p in agents_list if
                                        p.get('given') and p['given'][0].lower() == orc_g[0].lower()]) > 1:
                                    homonyms_list = [p for p in agents_list if
                                                     p.get('given') and p['given'].lower() == orc_g.lower()]
                                    if len(homonyms_list) > 1:
                                        if [p for p in homonyms_list if p.get('role') != cur_role]:
                                            if orc_g.lower() == g_name.lower():
                                                orcid = ori
                                        else:
                                            if orc_g.lower() == g_name.lower():
                                                orcid = ori
                                    elif orc_g[0].lower() == g_name[0].lower():
                                        orcid = ori
                                elif any([p for p in agents_list if
                                          p.get('given') and p['given'].lower() == f_name.lower()]):
                                    if orc_g.lower() == g_name.lower():
                                        orcid = ori
                                else:
                                    orcid = ori
                            else:
                                orcid = ori
            # Normalise a possible index entry that lacks the prefix
            if orcid and not str(orcid).startswith("orcid:"):
                orcid = f"orcid:{orcid}"
            # Append "[orcid:…]" when found
            if agent_string and orcid:
                agent_string += f" [{orcid}]"
            if agent_string:
                if cur_role == 'author':
                    authors_strings_list.append(agent_string)
                elif cur_role == 'editor':
                    editors_string_list.append(agent_string)
        return authors_strings_list, editors_string_list
357 def _normalize_ra(self, r):
358 """Metodo di supporto per normalizzare un Responsible Agent (per update_redis_values)."""
359 r_schema = r.split(":")[0]
360 id_manager = self.get_id_manager(r_schema, self.ra_man_dict)
361 return id_manager.normalise(r, include_prefix=True) if id_manager else None
363 def update_redis_values(self, br, ra):
364 self._redis_values_br = [
365 x for x in (self.doi_m.normalise(b, include_prefix=True) for b in (br or [])) if x
366 ]
367 self._redis_values_ra = [
368 x for x in (self._normalize_ra(r) for r in (ra or [])) if x
369 ]
371 def validated_as(self, id_dict):
372 """ Controllo nello storage temporaneo: Prima verifica se l'id è già stato validato nella memoria/coda temporanea (tmp_doi_m.validated_as_id).
373 Controllo nello storage principale: Se non lo trova (is None), passa al manager principale (doi_m.validated_as_id).
374 Validated_as_id chiede al database locale (lo storage_manager) se ha già una risposta chiara (Vero/Falso) sulla validità di una stringa."""
375 schema = id_dict["schema"].strip().lower()
376 identifier = id_dict["identifier"]
378 if schema == 'doi':
379 validity_value = self.tmp_doi_m.validated_as_id(identifier)
380 if validity_value is None:
381 validity_value = self.doi_m.validated_as_id(identifier)
382 return validity_value
383 else:
384 if schema in self.ra_tmp_man_dict.keys():
385 tmp_id_man = self.get_id_manager(schema, self.ra_tmp_man_dict)
386 id_man = self.get_id_manager(schema, self.ra_man_dict)
387 validity_value = tmp_id_man.validated_as_id(identifier)
388 if validity_value is None:
389 validity_value = id_man.validated_as_id(identifier)
390 return validity_value
393 def get_id_manager(self, schema_or_id, id_man_dict):
394 if ":" in schema_or_id:
395 split_id_prefix = schema_or_id.split(":")
396 schema = split_id_prefix[0]
397 else:
398 schema = schema_or_id
399 id_man = id_man_dict.get(schema)
400 return id_man
402 def normalise_any_id(self, id_with_prefix):
403 id_man = self.get_id_manager(id_with_prefix, self.ra_man_dict)
404 id_no_pref = ":".join(id_with_prefix.split(":")[1:])
405 norm_id_w_pref = id_man.normalise(id_no_pref, include_prefix=True)
406 return norm_id_w_pref
408 def dict_to_cache(self, dict_to_be_saved, path):
409 path = Path(path)
410 parent_dir_path = path.parent.absolute()
411 if not os.path.exists(parent_dir_path):
412 Path(parent_dir_path).mkdir(parents=True, exist_ok=True)
413 with open(path, "w", encoding="utf-8") as fd:
414 json.dump(dict_to_be_saved, fd, ensure_ascii=False, indent=4)
416 def csv_creator_objects(self, doi_object: str):
417 row = dict()
418 keys = ['id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type', 'publisher', 'editor']
419 for k in keys:
420 row[k] = ''
421 row['id'] = doi_object
422 try:
423 return self.normalise_unicode(row)
424 except TypeError:
425 print(row)
426 raise (TypeError)
    def csv_creator(self, item: dict) -> dict:
        """Convert one DataCite record into an OpenCitations Meta CSV row.

        Fills, in order: 'type' (from the first matching type vocabulary), 'id'
        (normalised DOI plus accepted ISBN/ISSN), 'title', 'author', 'pub_date',
        'venue', 'volume'/'issue', 'page', 'publisher' and 'editor'.

        :param item: a DataCite record with 'id' (DOI) and 'attributes'.
        :return: the unicode-normalised row, or {} when the DOI string is empty.
        """
        row = dict()
        doi = str(item['id'])
        if doi:
            norm_id = self.doi_m.normalise(doi, include_prefix=True)
            keys = ['id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type',
                    'publisher', 'editor']
            for k in keys:
                row[k] = ''
            attributes = item['attributes']
            # row['type'] — first vocabulary that yields a mapped value wins (loop breaks)
            if attributes.get('types') is not None:
                types_dict = attributes['types']
                for k, v in types_dict.items():
                    if k.lower() == 'ris':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.RIS_types_map.keys():
                                row['type'] = self.RIS_types_map[norm_v]
                                break
                    if k.lower() == 'bibtex':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.BIBTEX_types_map.keys():
                                row['type'] = self.BIBTEX_types_map[norm_v]
                                break
                    if k.lower() == 'schemaorg':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.SCHEMAORG_types_map.keys():
                                row['type'] = self.SCHEMAORG_types_map[norm_v]
                                break
                    if k.lower() == 'citeproc':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.CITEPROC_types_map.keys():
                                row['type'] = self.CITEPROC_types_map[norm_v]
                                break
                    if k.lower() == 'resourcetypegeneral':
                        if type(v) is str:
                            norm_v = v.strip().lower()
                            if norm_v in self.RESOURCETYPEGENERAL_types_map.keys():
                                row['type'] = self.RESOURCETYPEGENERAL_types_map[norm_v]
                                break
            # row['id'] — the normalised DOI plus any accepted ISBN/ISSN alternate identifiers
            ids_list = list()
            ids_list.append(norm_id)
            if attributes.get('identifiers'):
                for other_id in attributes.get('identifiers'):
                    if other_id.get('identifier') and other_id.get('identifierType'):
                        o_id_type = other_id.get('identifierType')
                        o_id = other_id.get('identifier')
                        # ISBN/ISSN are only meaningful for compatible publication types
                        if o_id_type == 'ISBN':
                            if row['type'] in {'book', 'dissertation', 'edited book', 'monograph', 'reference book', 'report',
                                               'standard'}:
                                self.id_worker(o_id, ids_list, self.isbn_worker)
                        elif o_id_type == 'ISSN':
                            if row['type'] in {'book series', 'book set', 'journal', 'proceedings series', 'series',
                                               'standard series', 'report series'}:
                                self.id_worker(o_id, ids_list, self.issn_worker)
            row['id'] = ' '.join(ids_list)
            # row['title'] — strip HTML markup and collapse whitespace; the LAST non-empty title wins
            pub_title = ""
            if attributes.get("titles"):
                for title in attributes.get("titles"):
                    if title.get("title"):
                        p_title = title.get("title")
                        soup = BeautifulSoup(p_title, 'html.parser')
                        title_soup = soup.get_text().replace('\n', '')
                        title_soup_space_replaced = ' '.join(title_soup.split())
                        title_soup_strip = title_soup_space_replaced.strip()
                        clean_tit = html.unescape(title_soup_strip)
                        pub_title = clean_tit if clean_tit else p_title
            row['title'] = pub_title
            agent_list_authors_only = self.add_authors_to_agent_list(attributes, [], doi)
            agents_list = self.add_editors_to_agent_list(attributes, agent_list_authors_only, doi)
            authors_strings_list, editors_string_list = self.get_agents_strings_list(doi, agents_list)
            # row['author']
            if 'creators' in attributes:
                row['author'] = '; '.join(authors_strings_list)
            # row['pub_date'] — prefer the "Issued" date, fall back to publicationYear
            cur_date = ""
            dates = attributes.get("dates")
            if dates:
                for date in dates:
                    if date.get("dateType") == "Issued":
                        cur_date = date.get("date")
                        break
            if cur_date == "":
                if attributes.get("publicationYear"):
                    cur_date = str(attributes.get("publicationYear"))
            row['pub_date'] = cur_date
            # row['venue']
            row['venue'] = self.get_venue_name(attributes, row)
            issue = ""
            volume = ""
            if attributes.get("container"):
                container = attributes["container"]
                if container and (container.get("identifierType") in ("ISSN", "ISBN")):  # parenthesised membership test fixes an earlier and/or precedence bug
                    if container.get("issue"):
                        issue = container.get("issue")
                    if container.get("volume"):
                        volume = container.get("volume")
            # Fall back to "IsPartOf" related identifiers for whichever of issue/volume is still missing
            if not issue or not volume:
                relatedIdentifiers = attributes.get("relatedIdentifiers")
                if relatedIdentifiers:
                    for related in relatedIdentifiers:
                        if related.get("relationType"):
                            if related.get("relationType").lower() == "ispartof":
                                if related.get("relatedIdentifierType") == "ISSN" or related.get("relatedIdentifierType") == "ISBN":
                                    if not issue and related.get("issue"):
                                        issue = related.get("issue")
                                    if not volume and related.get("volume"):
                                        volume = related.get("volume")
            row['volume'] = volume
            row['issue'] = issue
            row['page'] = self.get_datacite_pages(attributes)
            publisher_dict = attributes.get('publisher')
            if publisher_dict:
                row['publisher'] = self.get_publisher(doi, publisher_dict)
            else:
                row['publisher'] = ''
            # row['editor'] — only filled when at least one "Editor" contributor exists
            if attributes.get("contributors"):
                editors = [contributor for contributor in attributes.get("contributors") if
                           contributor.get("contributorType") == "Editor"]
                if editors:
                    row['editor'] = '; '.join(editors_string_list)
            try:
                return self.normalise_unicode(row)
            except TypeError:
                # NOTE(review): `raise(TypeError)` raises a fresh, message-less TypeError and
                # discards the original traceback; a bare `raise` would be preferable.
                print(row)
                raise(TypeError)
        return {}
581 def to_validated_id_list(self, norm_id_dict):
582 """Questo metodo verifica la validità di un identificatore in base al suo schema (es. 'doi').
583 Ottimizza il processo interrogando prima una cache locale pre-caricata (`_redis_values_br`).
584 Se l'ID viene trovato in questa cache, viene immediatamente contrassegnato come valido nel
585 gestore di archiviazione (storage manager) temporaneo. In caso contrario, avvia un processo
586 di validazione completo tramite il metodo `is_valid` del manager temporaneo.
588 Argomenti:
589 norm_id_dict (dict): Un dizionario contenente l'identificatore normalizzato
590 e il suo schema. Le chiavi attese sono "id" (str) e "schema" (str).
592 Restituisce:
593 list: Una lista contenente l'identificatore valido se la validazione ha successo,
594 oppure una lista vuota se l'identificatore risulta non valido."""
596 valid_id_list = []
597 norm_id = norm_id_dict.get("id")
598 schema = norm_id_dict.get("schema")
599 if schema == "doi":
600 if norm_id in self._redis_values_br:
601 self.tmp_doi_m.storage_manager.set_value(norm_id, True)
602 valid_id_list.append(norm_id)
603 elif self.tmp_doi_m.is_valid(norm_id):
604 valid_id_list.append(norm_id)
607 elif schema in self.accepted_identifiers_ra:
608 tmp_id_man = self.get_id_manager(schema, self.ra_tmp_man_dict)
609 use_schema_api = getattr(self, f"use_{schema}_api", False)
610 if norm_id in self._redis_values_ra:
611 tmp_id_man.storage_manager.set_value(norm_id, True)
612 valid_id_list.append(norm_id)
613 elif not use_schema_api:
614 pass
615 elif tmp_id_man.is_valid(norm_id):
616 valid_id_list.append(norm_id)
618 else:
619 print("Schema not accepted:", norm_id_dict.get("schema"), "in ", norm_id_dict)
620 return valid_id_list
622 #no modified
623 def get_datacite_pages(self, item: dict) -> str:
624 container_pages_list = list()
625 related_pages_list = list()
626 container = item.get("container")
627 if container:
628 if container.get("identifierType") == "ISSN" or container.get("identifierType") == "ISBN":
629 if container.get("firstPage"):
630 container_pages_list.append(container.get("firstPage"))
631 if container.get("lastPage"):
632 container_pages_list.append(container.get("lastPage"))
634 relatedIdentifiers = item.get("relatedIdentifiers")
635 if relatedIdentifiers:
636 for related in relatedIdentifiers:
637 if related.get("relationType"):
638 if related.get("relationType").lower() == "ispartof":
639 if related.get("relatedIdentifierType") == "ISSN" or related.get("relatedIdentifierType") == "ISBN":
640 if related.get("firstPage"):
641 related_pages_list.append(related.get("firstPage"))
642 if related.get("lastPage"):
643 related_pages_list.append(related.get("lastPage"))
645 page_list = related_pages_list if len(related_pages_list)> len(container_pages_list) else container_pages_list
646 return self.get_pages(page_list)
648 #modified
    def get_publisher(self, doi: str, publisher_item: dict) -> str:
        """Build the Meta 'publisher' cell from a DataCite publisher object.

        https://datacite-metadata-schema.readthedocs.io/en/4.6/properties/publisher/
        "publisher":
        {
        "name": "Dryad",
        "schemeUri": "https://ror.org",
        "publisherIdentifier": "https://ror.org/00x6h5n95",
        "publisherIdentifierScheme": "ROR",
        "lang": "en"
        }
        Accepted identifiers for publishers in Meta: https://opencitations.github.io/oc_meta/reference/csv_format/#responsible-agents
        [orcid, viaf, crossref, wikidata, ror]
        """
        publisher_name = publisher_item.get("name") or ""
        # 1. Normalisation and exclusion of invalid publisher names via regex
        #    (placeholders such as "(:unav)", "[unknown]", "s.n.", "not specified", ...)
        if publisher_name:
            txt = publisher_name.lower().strip()
            txt_no_spaces = txt.replace(' ', '')
            # NOTE(review): in the "not specified|not provided" pattern the `|` binds the
            # whole left alternative, which lacks the trailing `$` — so "not specified xyz"
            # also matches. Confirm whether the prefix match is intended.
            if (re.match(r"\(?:unav\)?", txt) or
                    re.match(r"\(?:unkn\)?", txt) or
                    re.match(r".*publ?isher not identified.*", txt) or
                    re.match(r"^\[?unknown]?(:*\[?unknown]?)*$", txt_no_spaces) or
                    re.match(r"^not yet(?: published)?$", txt) or
                    re.match(r"[\[({]*s\.*[ln]\.*[)}\]]*([,:][\[({]*s\.*n\.*[)}\]]*)*", txt_no_spaces) or
                    re.match(r"^(publisher )*not(?: specified\.*)|^(publisher )*not(?: provided\.*)$", txt) or
                    re.match(r"^not known$", txt) or
                    re.match(r"^(information )?not available.*", txt)):
                publisher_name = ""
        # 2. Identifier extraction and base publisher-string composition
        publisher_id = self.get_publisher_id(publisher_item)
        publisher_id = f"[{publisher_id}]" if publisher_id else ""
        publisher = f"{publisher_name} {publisher_id}".strip()
        # 3. Override with the DataCite prefix mapping (flat structure, no deep nesting)
        prefix = doi.split('/')[0] if doi else ""
        if prefix and self.publishers_mapping and prefix in self.publishers_mapping:
            mapped_data = self.publishers_mapping[prefix]
            name = mapped_data.get("name", "")
            member = mapped_data.get("datacite_member")
            return f"{name} [datacite:{member}]" if member else name
        # Fallback when there is no mapping entry for this prefix
        return publisher
699 def get_publisher_id(self, publisher_item: dict) -> str:
700 publisher_id = publisher_item.get("publisherIdentifier")
701 if not publisher_id:
702 return ""
704 raw_scheme = publisher_item.get('publisherIdentifierScheme')
705 if not raw_scheme:
706 return ""
708 scheme = raw_scheme.lower().strip().replace(" ", "")
710 # Early return se lo schema non è accettato
711 if scheme not in self.accepted_identifiers_ra:
712 return ""
714 id_man = self.get_id_manager(scheme, self.ra_man_dict)
715 tmp_id_man = self.get_id_manager(scheme, self.ra_tmp_man_dict)
717 if not id_man or not tmp_id_man:
718 return ""
720 norm_id = id_man.normalise(publisher_id, include_prefix=True)
721 if not norm_id:
722 return ""
724 # --- Inizio fase di validazione ---
725 #controllo sia nello storage temporaneo che in quello generale
726 validity = self.validated_as({"identifier": norm_id, "schema": scheme})
727 if validity is True:
728 return norm_id
729 elif validity is False:
730 return ""
732 # Recupero dinamico dell'impostazione API
733 use_schema_api = getattr(self, f"use_{scheme}_api", False)
735 # Modalità Offline
736 if not use_schema_api:
737 if norm_id in self._redis_values_ra:
738 tmp_id_man.storage_manager.set_value(norm_id, True)
739 return norm_id
740 return "" # Stop qui: se l'API è spenta e non è in Redis, è invalido
742 # Modalità Online (API)
743 norm_id_dict = {"id": norm_id, "schema": scheme}
744 if norm_id in self.to_validated_id_list(norm_id_dict):
745 return norm_id
747 #Se arriva fin qui, la validazione API è fallita.
748 return ""
750 #no modified
    def get_venue_name(self, item: dict, row: dict) -> str:
        """Build the venue cell: cleaned container title plus validated ISSN/ISBN ids in brackets.

        :param item: the DataCite 'attributes' dict.
        :param row: the row built so far (row['type'] gates which ids are accepted).
        :return: "title [ids]", "[ids]", the bare title, or "".
        """
        cont_title = ""
        venids_list = list()
        container = item.get("container")
        if container:
            if container.get("title"):
                cont_title = (container["title"].lower()).replace('\n', '')
                ven_soup = BeautifulSoup(cont_title, 'html.parser')
                ventit = html.unescape(ven_soup.get_text())
                # Rewrite a bracketed "schema:id ..." span as parentheses so it is not
                # mistaken for the identifier suffix appended below.
                ambiguous_brackets = re.search(r'\[\s*((?:[^\s]+:[^\s]+)?(?:\s+[^\s]+:[^\s]+)*)\s*\]', ventit)
                if ambiguous_brackets:
                    match = ambiguous_brackets.group(1)
                    open_bracket = ventit.find(match) - 1
                    close_bracket = ventit.find(match) + len(match)
                    ventit = ventit[:open_bracket] + '(' + ventit[open_bracket + 1:]
                    ventit = ventit[:close_bracket] + ')' + ventit[close_bracket + 1:]
                cont_title = ventit
            # Container-level identifiers, accepted only for compatible publication types
            if container.get("identifierType") == "ISBN":
                if row['type'] in {'book chapter', 'book part', 'book section', 'book track', 'reference entry'}:
                    try:
                        self.id_worker(container.get("identifier"), venids_list, self.isbn_worker)
                    except ValueError:
                        print(f'''{container.get("identifier")} raised a value error''')
            if container.get("identifierType") == "ISSN":
                if row['type'] in {'book', 'data file', 'dataset', 'edited book', 'journal article', 'journal volume',
                                   'journal issue', 'monograph', 'proceedings', 'peer review', 'reference book',
                                   'reference entry', 'report'}:
                    try:
                        self.id_worker(container.get("identifier"), venids_list, self.issn_worker)
                    except ValueError:
                        print(f'''{container.get("identifier")} raised a value error''')
                elif row['type'] == 'report series':
                    # NOTE(review): the duplicated title check looks accidental — confirm intent
                    if container.get("title"):
                        if container.get("title"):
                            try:
                                self.id_worker(container.get("identifier"), venids_list, self.issn_worker)
                            except ValueError:
                                print(f'''{container.get("identifier")} raised a value error''')
        # Fallback: mine "IsPartOf" related identifiers when the container yielded none
        if not venids_list:
            relatedIdentifiers = item.get("relatedIdentifiers")
            if relatedIdentifiers:
                for related in relatedIdentifiers:
                    if related.get("relationType"):
                        if related.get("relationType").lower() == "ispartof":
                            if related.get("relatedIdentifierType") == "ISBN":
                                if row['type'] in {'book chapter', 'book part', 'book section', 'book track',
                                                   'reference entry'}:
                                    self.id_worker(related.get("relatedIdentifier"), venids_list, self.isbn_worker)
                            if related.get("relatedIdentifierType") == "ISSN":
                                if row['type'] in {'book', 'data file', 'dataset', 'edited book', 'journal article',
                                                   'journal volume',
                                                   'journal issue', 'monograph', 'proceedings', 'peer review',
                                                   'reference book',
                                                   'reference entry', 'report'}:
                                    self.id_worker(related.get("relatedIdentifier"), venids_list, self.issn_worker)
                                elif row['type'] == 'report series':
                                    # NOTE(review): duplicated title check, as above — confirm intent
                                    if related.get("title"):
                                        if related.get("title"):
                                            self.id_worker(related.get("relatedIdentifier"), venids_list, self.issn_worker)
        if venids_list:
            name_and_id = cont_title + ' [' + ' '.join(venids_list) + ']' if cont_title else '[' + ' '.join(venids_list) + ']'
        else:
            name_and_id = cont_title
        return name_and_id
822 #added the call to find_datacite_orcid
823 def add_editors_to_agent_list(self, item: dict, ag_list: list, doi: str) -> list:
824 agent_list = ag_list
825 contributors = item.get("contributors")
826 if contributors:
827 editors = [contributor for contributor in contributors if
828 contributor.get("contributorType") == "Editor"]
829 for ed in editors:
830 agent = {}
831 agent["role"] = "editor"
832 if ed.get('name'):
833 agent["name"] = ed.get("name")
834 if ed.get("nameType") == "Personal" or ("familyName" in ed or "givenName" in ed):
835 agent["family"] = ed.get("familyName")
836 agent["given"] = ed.get("givenName")
837 if ed.get("nameIdentifiers"):
838 orcid_ids = [x.get("nameIdentifier") for x in ed.get("nameIdentifiers")
839 if x.get("nameIdentifierScheme") == "ORCID"]
840 if orcid_ids:
841 orcid_id = self.find_datacite_orcid(orcid_ids, doi)
842 if orcid_id:
843 agent["orcid"] = orcid_id
845 missing_names = [x for x in ["family", "given", "name"] if x not in agent]
846 for mn in missing_names:
847 agent[mn] = ""
848 agent_list.append(agent)
849 return agent_list
    # NOTE: ORCID resolution for creators is delegated to find_datacite_orcid
852 def add_authors_to_agent_list(self, item: dict, ag_list: list, doi: str) -> list:
853 agent_list = ag_list
854 creators = item.get("creators")
855 if creators:
856 for c in creators:
857 agent = {}
858 agent["role"] = "author"
859 if c.get("name"):
860 agent["name"] = c.get("name")
861 if c.get("nameType") == "Personal" or ("familyName" in c or "givenName" in c):
862 agent["family"] = c.get("familyName")
863 agent["given"] = c.get("givenName")
864 if c.get("nameIdentifiers"):
865 orcid_ids = [x.get("nameIdentifier") for x in c.get("nameIdentifiers")
866 if x.get("nameIdentifierScheme") == "ORCID"]
867 if orcid_ids:
868 orcid_id = self.find_datacite_orcid(orcid_ids, doi)
869 if orcid_id:
870 agent["orcid"] = orcid_id
871 missing_names = [x for x in ["family", "given", "name"] if x not in agent]
872 for mn in missing_names:
873 agent[mn] = ""
874 agent_list.append(agent)
875 return agent_list
    # Resolve a candidate-ORCID list against the local cache, the DOI->ORCID index,
    # Redis (offline mode) or the ORCID API (online mode).
    def find_datacite_orcid(self, all_author_ids: list, doi: str = None) -> str:
        """Return the first ORCID in *all_author_ids* that can be confirmed as valid.

        Confirmation is attempted, in order, through: the local validity cache
        (``validated_as``), the DOI->ORCID index (``orcid_finder``), a Redis set
        of known-valid responsible-agent ids (offline mode), and finally
        ``to_validated_id_list`` (online mode, may hit the ORCID API).

        :params all_author_ids: candidate ORCID strings for one agent
        :params doi: the DOI of the record the agent belongs to, used to query
            the DOI->ORCID index
        :returns: the normalised ORCID (with ``orcid:`` prefix) or ``""`` if no
            candidate could be validated
        """
        if not all_author_ids:
            return ""

        # normalise the DOI, keeping its "doi:" prefix
        norm_doi = self.doi_m.normalise(doi, include_prefix=True) if doi else None
        found_orcids = set()

        # FIX: also query the index with the opposite prefix form, since entries
        # may be stored either with or without the "doi:" prefix
        if norm_doi:
            alt_doi = norm_doi.replace("doi:", "") if norm_doi.startswith("doi:") else f"doi:{norm_doi}"
            raw = (
                self.orcid_finder(norm_doi)
                or self.orcid_finder(alt_doi)
            )
            # orcid_finder's result shape varies (dict / set / list / str);
            # reduce every shape to a set of bare ORCID values
            if isinstance(raw, dict):
                found_orcids = {k.replace("orcid:", "").strip() for k in raw.keys()}
            elif isinstance(raw, (set, list)):
                for v in list(raw):
                    m = re.findall(r"(\d{4}-\d{4}-\d{4}-\d{3,4}[0-9X])", str(v))
                    found_orcids.update(m)
            elif isinstance(raw, str):
                m = re.findall(r"(\d{4}-\d{4}-\d{4}-\d{3,4}[0-9X])", raw)
                found_orcids.update(m)

        for identifier in all_author_ids:
            norm_orcid = self.orcid_m.normalise(identifier, include_prefix=True)
            if not norm_orcid:
                continue

            # cached verdict: True -> accept, False -> reject, anything else -> unknown
            validity = self.validated_as({"identifier": norm_orcid, "schema": "orcid"})
            if validity is True:
                return norm_orcid
            if validity is False:
                continue

            # a hit in the DOI->ORCID index counts as a positive confirmation
            # and is memoised in the temporary ORCID storage
            bare_orcid = norm_orcid.split(":", 1)[1]
            if bare_orcid in found_orcids:
                self.tmp_orcid_m.storage_manager.set_value(norm_orcid, True)
                return norm_orcid

            if not self.use_orcid_api:
                if norm_orcid in self._redis_values_ra:
                    self.tmp_orcid_m.storage_manager.set_value(norm_orcid, True)
                    return norm_orcid
                # offline: if not in redis, stop here.
                # NOTE(review): this returns "" without trying the remaining
                # candidates — confirm the early exit is intentional
                return ""

            # online path: defer to the validation pipeline (may call the API)
            norm_id_dict = {"id": norm_orcid, "schema": "orcid"}
            if norm_orcid in self.to_validated_id_list(norm_id_dict):
                return norm_orcid

        return ""
    # Flush the in-memory validity cache into persistent storage.
932 def memory_to_storage(self):
933 kv_in_memory = self.temporary_manager.get_validity_list_of_tuples()
934 if kv_in_memory:
935 self.storage_manager.set_multi_value(kv_in_memory)
936 self.temporary_manager.delete_storage()
    # NOTE: the extraction is split in two passes — responsible agents first, related DOIs second.
939 def extract_all_ids(self, citation, is_citing: bool):
941 """Nella prima iterazione estraggo e normalizzo gli identificativi dei RA (authors, editors, publishers)"""
942 if is_citing:
943 all_br = set()
944 all_ra = set()
946 attributes = citation.get("attributes")
947 if attributes:
948 creators = attributes.get("creators")
949 if creators:
950 for c in creators:
951 c_ids = c.get("nameIdentifiers")
952 if c_ids:
953 norm_c_orcids = {self.orcid_m.normalise(x.get("nameIdentifier"), include_prefix=True) for x in c.get("nameIdentifiers") if
954 x.get("nameIdentifierScheme") == "ORCID"}
955 if norm_c_orcids:
956 all_ra.update(norm_c_orcids)
958 if attributes.get("contributors"):
959 editors = [contributor for contributor in attributes.get("contributors") if
960 contributor.get("contributorType") == "Editor"]
961 for ed in editors:
962 if ed.get("nameIdentifiers"):
963 norm_ed_orcids = {self.orcid_m.normalise(x.get("nameIdentifier"), include_prefix=True) for x in ed.get("nameIdentifiers") if
964 x.get("nameIdentifierScheme") == "ORCID"}
965 if norm_ed_orcids:
966 all_ra.update(norm_ed_orcids)
968 publisher = attributes.get("publisher")
969 if publisher:
970 if publisher.get('publisherIdentifierScheme') and publisher.get('publisherIdentifier'):
971 identifier_scheme = publisher['publisherIdentifierScheme']
972 id_scheme_lower = identifier_scheme.lower().strip().replace(" ", "")
973 id = publisher['publisherIdentifier']
974 if id_scheme_lower in self.accepted_identifiers_ra:
975 norm_publisher = {}
976 if id_scheme_lower == 'orcid':
977 norm_publisher = {self.orcid_m.normalise(id, include_prefix=True)}
978 elif id_scheme_lower == 'ror':
979 norm_publisher = {self.ror_m.normalise(id, include_prefix=True)}
980 elif id_scheme_lower == 'wikidata':
981 norm_publisher = {self.wikidata_m.normalise(id, include_prefix=True)}
982 elif id_scheme_lower == 'viaf':
983 norm_publisher = {self.viaf_m.normalise(id, include_prefix=True)}
984 if norm_publisher:
985 all_ra.update(norm_publisher)
987 all_br = [x for x in all_br if x is not None]
988 all_ra = [y for y in all_ra if y is not None]
989 return all_br, all_ra
991 # Nella seconda iterazione normalizzo ed estraggo i doi dei related items (citanti e/o citati)
992 else:
993 all_br = set()
994 all_ra = set()
995 attributes = citation.get("attributes", {}) # evita KeyError
996 rel_ids = attributes.get("relatedIdentifiers")
997 if rel_ids:
998 for ref in rel_ids:
999 if all(elem in ref for elem in self.needed_info):
1000 relatedIdentifierType = (str(ref["relatedIdentifierType"])).lower()
1001 relationType = str(ref["relationType"]).lower()
1002 if relatedIdentifierType == "doi":
1003 if relationType in self.filter:
1004 rel_id = self.doi_m.normalise(ref["relatedIdentifier"], include_prefix=True)
1005 if rel_id:
1006 all_br.add(rel_id)
1007 all_br = [x for x in all_br if x is not None]
1008 all_ra = [y for y in all_ra if y is not None]
1009 return all_br, all_ra
    # Filter a batch of identifiers through Redis, keeping only the ones recorded as valid.
1012 def get_reids_validity_list(self, id_list, redis_db):
1013 """Questo metodo interroga Redis per una lista di identificativi e restituisce solo quelli che risultano salvati come validi"""
1014 ids = list(id_list) # garantisci ordine deterministico
1015 if redis_db == "ra":
1016 validity = self.RA_redis.mexists_as_set(ids)
1017 return [ids[i] for i, v in enumerate(validity) if v]
1018 elif redis_db == "br":
1019 validity = self.BR_redis.mexists_as_set(ids)
1020 return [ids[i] for i, v in enumerate(validity) if v]
1021 else:
1022 raise ValueError("redis_db must be either 'ra' or 'br'")