Coverage for oc_meta/plugins/orcid/index_orcid_doi.py: 89%
64 statements
« prev ^ index » next coverage.py v6.5.0, created at 2025-07-14 14:06 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2025-07-14 14:06 +0000
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# Copyright 2019-2020 Fabio Mariani <fabio.mariani555@gmail.com>
4# Copyright 2021-2022 Arcangelo Massari <arcangelo.massari@unibo.it>
5#
6# Permission to use, copy, modify, and/or distribute this software for any purpose
7# with or without fee is hereby granted, provided that the above copyright notice
8# and this permission notice appear in all copies.
9#
10# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
11# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
12# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
13# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
14# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
15# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
16# SOFTWARE.
19import os
21from bs4 import BeautifulSoup
22from oc_ds_converter.oc_idmanager import DOIManager
23from tqdm import tqdm
25from oc_meta.lib.csvmanager import CSVManager
class Index_orcid_doi:
    """Index the DOIs found in ORCID summary XML files.

    Every DOI that an ORCID record declares with relationship ``self`` is
    stored in a ``CSVManager`` under the normalised DOI, with the value
    ``"Family, Given [ORCID]"`` (or ``"Family [ORCID]"`` when no given name
    is present). Files that yield no entry are stored under the key
    ``'None'`` as ``"[ORCID]"`` so that they can be skipped on a later run.
    """

    def __init__(self, output_path:str, threshold:int=10000, low_memory:bool=False, verbose:bool=False):
        """Set up the storage and collect the ORCIDs that are already indexed.

        :param output_path: forwarded to ``CSVManager`` as ``output_path``.
        :param threshold: number of processed files after which the
            collected data is dumped; a falsy value falls back to 10000.
        :param low_memory: forwarded to ``CSVManager`` as ``low_memory``.
        :param verbose: when true, progress information is printed.
        """
        self.file_counter = 0
        self.threshold = 10000 if not threshold else int(threshold)
        self.verbose = verbose
        if self.verbose:
            print("[INFO: CSVManager] Loading existing csv file")
        self.doimanager = DOIManager(use_api_service=False)
        # NOTE(review): CSVManager receives the raw `threshold`, not the
        # normalised self.threshold - confirm this is intended.
        self.csvstorage = CSVManager(output_path=output_path, line_threshold=threshold, low_memory=low_memory)
        # ORCIDs are extracted to skip the corresponding files at the first reading of an existing CSV.
        self.cache = set()
        for values in self.csvstorage.data.values():
            for entry in values:
                orcid = self._orcid_from_entry(entry)
                if orcid is not None:
                    self.cache.add(orcid)

    @staticmethod
    def _orcid_from_entry(entry):
        """Return the text inside the last ``[...]`` of *entry*, or None if there is no ``[``."""
        # The last '[' is used because the name part written before the
        # ORCID may itself contain square brackets.
        _, bracket, tail = entry.rpartition('[')
        if not bracket:
            return None
        return tail.strip().rstrip(']').strip()

    def explorer(self, summaries_path:str) -> None:
        """Process every file under *summaries_path* that is not cached yet.

        The collected data is dumped every ``self.threshold`` processed
        files and once more at the end. Dump files are named
        ``<first>-<last>.csv``, where the numbering continues after the
        files that were already present in the cache.

        ``self.cache`` is deleted during the call, so this method can be
        run only once per instance.

        :param summaries_path: root directory, walked recursively.
        """
        if self.verbose:
            print("[INFO: Index_orcid_doi] Counting files to process")
        files_to_process = [
            os.path.join(fold, file)
            for fold, _, files in os.walk(summaries_path)
            for file in files
            if file.replace('.xml', '') not in self.cache
        ]
        processed_files = len(self.cache)
        # The cache is only needed to build the list above: release it.
        del self.cache
        pbar = tqdm(total=len(files_to_process)) if self.verbose else None
        for file in files_to_process:
            self.finder(file)
            self.file_counter += 1
            if self.file_counter % self.threshold == 0:
                cur_file = self.file_counter + processed_files
                self.csvstorage.dump_data(f'{cur_file-self.threshold+1}-{cur_file}.csv')
            if pbar is not None:
                pbar.update(1)
        cur_file = self.file_counter + processed_files
        # Periodic dumps are aligned to file_counter, so the number of files
        # still pending must be derived from file_counter as well; using
        # cur_file here mislabels the last file whenever processed_files is
        # not a multiple of the threshold.
        pending = self.file_counter % self.threshold
        self.csvstorage.dump_data(f'{cur_file - pending + 1}-{cur_file}.csv')
        if pbar is not None:
            pbar.close()

    def _author_label(self, xml_soup, orcid):
        """Return ``"Family, Given [orcid]"`` for the record, or None without a family name."""
        family = xml_soup.find('personal-details:family-name')
        if family is None:
            return None
        name = family.get_text()
        given = xml_soup.find('personal-details:given-names')
        if given is not None:
            name = name + ', ' + given.get_text()
        return name + ' [' + orcid + ']'

    def finder(self, file:str) -> None:
        """Store the self-declared DOIs found in one summary file.

        The ORCID is taken from the last 19 characters of the path once
        ``.xml`` has been removed. Only paths ending in ``.xml`` are parsed.
        When no DOI entry is stored for the file (including non-XML files
        and records without a family name), the file is recorded under the
        ``'None'`` key instead.

        :param file: path of the file to inspect.
        """
        orcid = file.replace('.xml', '')[-19:]
        valid_doi = False
        if file.endswith('.xml'):
            with open(file, 'r', encoding='utf-8') as xml_file:
                xml_soup = BeautifulSoup(xml_file, 'xml')
            # The author label is the same for every DOI of the record, so
            # it is built once instead of searching the document per DOI.
            author = self._author_label(xml_soup, orcid)
            if author is not None:
                for el in xml_soup.find_all('common:external-id'):
                    id_type = el.find('common:external-id-type')
                    rel = el.find('common:external-id-relationship')
                    value = el.find('common:external-id-value')
                    # Skip incomplete identifiers instead of failing on them.
                    if id_type is None or rel is None or value is None:
                        continue
                    if id_type.get_text().lower() != 'doi' or rel.get_text().lower() != 'self':
                        continue
                    doi = self.doimanager.normalise(value.get_text())
                    if doi:
                        valid_doi = True
                        self.csvstorage.add_value(doi, author)
        if not valid_doi:
            # Save file names where nothing was found, to skip them during the next run
            self.csvstorage.add_value('None', f'[{orcid}]')