Coverage for oc_meta/run/orcid_process.py: 87%
86 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-21 09:24 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-21 09:24 +0000
1# SPDX-FileCopyrightText: 2019-2020 Fabio Mariani <fabio.mariani555@gmail.com>
2# SPDX-FileCopyrightText: 2021-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
3#
4# SPDX-License-Identifier: ISC
6import os
7import re
8from argparse import ArgumentParser
10from bs4 import BeautifulSoup
11from oc_ds_converter.oc_idmanager import DOIManager
12from rich.console import Console
14from oc_meta.lib.console import create_progress
15from oc_meta.lib.csvmanager import CSVManager
16from oc_meta.lib.master_of_regex import orcid_pattern
class IndexOrcidDoi:
    """Index DOIs found in ORCID summary XML files.

    Each processed summary contributes rows mapping a DOI to an author
    string of the form ``Family, Given [ORCID]`` (or just ``[ORCID]`` when
    no name is present). Results are persisted through a ``CSVManager`` in
    chunks of ``threshold`` files, so an interrupted run can resume by
    skipping ORCIDs already present in previously dumped CSVs.
    """

    def __init__(self, output_path: str, threshold: int = 10000):
        # Number of XML files handled so far in this run.
        self.file_counter = 0
        # How many files to process between consecutive CSV dumps.
        self.threshold = threshold
        self.console = Console()
        self.console.print("[cyan][INFO][/cyan] Loading existing CSV files")
        self.orcid_re = re.compile(orcid_pattern)
        # API lookups disabled: only syntactic DOI normalisation is needed.
        self.doimanager = DOIManager(use_api_service=False)
        self.csvstorage = CSVManager(output_path=output_path)
        # ORCIDs already indexed by previous runs; used to skip their files.
        self.cache = self._build_cache()

    def _build_cache(self) -> set[str]:
        """Collect every ORCID embedded in the already-stored CSV values."""
        return {
            found
            for stored_values in self.csvstorage.data.values()
            for stored in stored_values
            if (found := self._extract_orcid(stored))
        }

    def _extract_orcid(self, text: str) -> str | None:
        """Return the first ORCID occurring in *text*, or ``None`` if absent."""
        found = self.orcid_re.search(text)
        if found is None:
            return None
        return found.group(0)

    def explorer(self, summaries_path: str) -> None:
        """Recursively scan *summaries_path* and index every unseen XML summary.

        Files whose name contains an ORCID already in the cache are skipped.
        A CSV chunk named ``<start>-<end>.csv`` is dumped every ``threshold``
        files, plus a final partial chunk for any leftover data.
        """
        self.console.print("[cyan][INFO][/cyan] Counting files to process")
        pending: list[str] = []
        for folder, _, filenames in os.walk(summaries_path):
            for filename in filenames:
                if not filename.endswith('.xml'):
                    continue
                if self._extract_orcid(filename) in self.cache:
                    continue
                pending.append(os.path.join(folder, filename))
        already_done = len(self.cache)
        # The cache may be large; it is no longer needed once filtering is done.
        del self.cache
        progress = create_progress()
        with progress:
            task = progress.add_task("Processing files", total=len(pending))
            for path in pending:
                self._process_file(path)
                self.file_counter += 1
                if self.file_counter % self.threshold == 0:
                    # Chunk boundaries are numbered globally, i.e. offset by
                    # the files already indexed in previous runs.
                    first = already_done + self.file_counter - self.threshold + 1
                    last = already_done + self.file_counter
                    self.csvstorage.dump_data(f'{first}-{last}.csv')
                progress.advance(task)
        if self.csvstorage.data_to_store:
            # Flush the final partial chunk (fewer than `threshold` files).
            first = already_done + self.file_counter - (self.file_counter % self.threshold) + 1
            last = already_done + self.file_counter
            self.csvstorage.dump_data(f'{first}-{last}.csv')

    def _process_file(self, file_path: str) -> None:
        """Parse one summary XML and register its self-relationship DOIs."""
        orcid = self._extract_orcid(file_path)
        if orcid is None:
            return
        with open(file_path, 'r', encoding='utf-8') as xml_file:
            soup = BeautifulSoup(xml_file, 'xml')
        name = self._extract_name(soup)
        if name:
            author = f'{name} [{orcid}]'
        else:
            author = f'[{orcid}]'
        found_any = False
        for external_id in soup.find_all('common:external-id'):
            id_kind = external_id.find('common:external-id-type')
            relation = external_id.find('common:external-id-relationship')
            # NOTE: truthiness (not `is None`) is intentional — an empty bs4
            # tag is falsy and must be skipped just like a missing one.
            if not id_kind or not relation:
                continue
            if id_kind.get_text().lower() != 'doi':
                continue
            if relation.get_text().lower() != 'self':
                continue
            value_el = external_id.find('common:external-id-value')
            if not value_el:
                continue
            doi = self.doimanager.normalise(value_el.get_text())
            if doi:
                found_any = True
                self.csvstorage.add_value(doi, author)
        if not found_any:
            # Keep a record of the ORCID even when no valid DOI was found.
            self.csvstorage.add_value('None', f'[{orcid}]')

    def _extract_name(self, xml_soup: BeautifulSoup) -> str | None:
        """Build 'Family, Given' from the summary; fall back to either part."""
        surname = xml_soup.find('personal-details:family-name')
        given = xml_soup.find('personal-details:given-names')
        if surname and given:
            return f'{surname.get_text()}, {given.get_text()}'
        if surname:
            return surname.get_text()
        return given.get_text() if given else None
if __name__ == '__main__':  # pragma: no cover
    # CLI entry point: wire up the indexer from command-line options.
    parser = ArgumentParser(
        'orcid_process.py',
        description='Build a CSV index of DOIs associated with ORCIDs from XML summary files'
    )
    parser.add_argument(
        '-out', '--output', dest='output_path', required=True,
        help='Output directory for CSV files')
    parser.add_argument(
        '-s', '--summaries', dest='summaries_path', required=True,
        help='Directory containing ORCID XML summaries (scanned recursively)')
    parser.add_argument(
        '-t', '--threshold', dest='threshold', type=int, default=10000,
        help='Number of files to process before saving a CSV chunk (default: 10000)')
    cli = parser.parse_args()
    indexer = IndexOrcidDoi(output_path=cli.output_path, threshold=cli.threshold)
    indexer.explorer(summaries_path=cli.summaries_path)