Coverage for oc_meta / run / orcid_process.py: 87%
86 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-03 17:25 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-03 17:25 +0000
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# Copyright 2021-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
4# Copyright 2019-2020 Fabio Mariani <fabio.mariani555@gmail.com>
5#
6# Permission to use, copy, modify, and/or distribute this software for any purpose
7# with or without fee is hereby granted, provided that the above copyright notice
8# and this permission notice appear in all copies.
9#
10# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
11# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
12# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
13# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
14# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
15# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
16# SOFTWARE.
18import os
19import re
20from argparse import ArgumentParser
22from bs4 import BeautifulSoup
23from oc_ds_converter.oc_idmanager import DOIManager
24from rich.console import Console
25from rich.progress import (BarColumn, MofNCompleteColumn, Progress,
26 SpinnerColumn, TextColumn, TimeElapsedColumn,
27 TimeRemainingColumn)
29from oc_meta.lib.csvmanager import CSVManager
30from oc_meta.lib.master_of_regex import orcid_pattern
33class IndexOrcidDoi:
34 def __init__(self, output_path: str, threshold: int = 10000):
35 self.file_counter = 0
36 self.threshold = threshold
37 self.console = Console()
38 self.console.print("[cyan][INFO][/cyan] Loading existing CSV files")
39 self.orcid_re = re.compile(orcid_pattern)
40 self.doimanager = DOIManager(use_api_service=False)
41 self.csvstorage = CSVManager(output_path=output_path)
42 self.cache = self._build_cache()
44 def _build_cache(self) -> set[str]:
45 cache = set()
46 for values in self.csvstorage.data.values():
47 for value in values:
48 orcid = self._extract_orcid(value)
49 if orcid:
50 cache.add(orcid)
51 return cache
53 def _extract_orcid(self, text: str) -> str | None:
54 match = self.orcid_re.search(text)
55 return match.group(0) if match else None
57 def explorer(self, summaries_path: str) -> None:
58 self.console.print("[cyan][INFO][/cyan] Counting files to process")
59 files_to_process = [
60 os.path.join(fold, filename)
61 for fold, _, files in os.walk(summaries_path)
62 for filename in files
63 if filename.endswith('.xml') and self._extract_orcid(filename) not in self.cache
64 ]
65 processed_files = len(self.cache)
66 del self.cache
67 progress = Progress(
68 SpinnerColumn(),
69 TextColumn("[progress.description]{task.description}"),
70 BarColumn(),
71 MofNCompleteColumn(),
72 TimeElapsedColumn(),
73 TimeRemainingColumn(),
74 console=self.console,
75 )
76 with progress:
77 task = progress.add_task("Processing files", total=len(files_to_process))
78 for file in files_to_process:
79 self._process_file(file)
80 self.file_counter += 1
81 if self.file_counter % self.threshold == 0:
82 start = processed_files + self.file_counter - self.threshold + 1
83 end = processed_files + self.file_counter
84 self.csvstorage.dump_data(f'{start}-{end}.csv')
85 progress.advance(task)
86 if self.csvstorage.data_to_store:
87 start = processed_files + self.file_counter - (self.file_counter % self.threshold) + 1
88 end = processed_files + self.file_counter
89 self.csvstorage.dump_data(f'{start}-{end}.csv')
91 def _process_file(self, file_path: str) -> None:
92 orcid = self._extract_orcid(file_path)
93 if not orcid:
94 return
95 with open(file_path, 'r', encoding='utf-8') as xml_file:
96 xml_soup = BeautifulSoup(xml_file, 'xml')
97 name = self._extract_name(xml_soup)
98 author = f'{name} [{orcid}]' if name else f'[{orcid}]'
99 valid_doi = False
100 for el in xml_soup.find_all('common:external-id'):
101 id_type = el.find('common:external-id-type')
102 rel = el.find('common:external-id-relationship')
103 if not (id_type and rel):
104 continue
105 if id_type.get_text().lower() != 'doi' or rel.get_text().lower() != 'self':
106 continue
107 doi_el = el.find('common:external-id-value')
108 if not doi_el:
109 continue
110 doi = self.doimanager.normalise(doi_el.get_text())
111 if not doi:
112 continue
113 valid_doi = True
114 self.csvstorage.add_value(doi, author)
115 if not valid_doi:
116 self.csvstorage.add_value('None', f'[{orcid}]')
118 def _extract_name(self, xml_soup: BeautifulSoup) -> str | None:
119 family_name_el = xml_soup.find('personal-details:family-name')
120 given_name_el = xml_soup.find('personal-details:given-names')
121 if family_name_el and given_name_el:
122 return f'{family_name_el.get_text()}, {given_name_el.get_text()}'
123 if family_name_el:
124 return family_name_el.get_text()
125 if given_name_el:
126 return given_name_el.get_text()
127 return None
if __name__ == '__main__':  # pragma: no cover
    # Command-line entry point: build the ORCID/DOI CSV index.
    parser = ArgumentParser(
        'orcid_process.py',
        description='Build a CSV index of DOIs associated with ORCIDs from XML summary files'
    )
    parser.add_argument('-out', '--output', dest='output_path', required=True,
                        help='Output directory for CSV files')
    parser.add_argument('-s', '--summaries', dest='summaries_path', required=True,
                        help='Directory containing ORCID XML summaries (scanned recursively)')
    parser.add_argument('-t', '--threshold', dest='threshold', type=int, default=10000,
                        help='Number of files to process before saving a CSV chunk (default: 10000)')
    cli_args = parser.parse_args()
    indexer = IndexOrcidDoi(output_path=cli_args.output_path, threshold=cli_args.threshold)
    indexer.explorer(summaries_path=cli_args.summaries_path)