Coverage for oc_meta/run/orcid_process.py: 87%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-03 17:25 +0000

1#!/usr/bin/python 

2# -*- coding: utf-8 -*- 

3# Copyright 2021-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

4# Copyright 2019-2020 Fabio Mariani <fabio.mariani555@gmail.com> 

5# 

6# Permission to use, copy, modify, and/or distribute this software for any purpose 

7# with or without fee is hereby granted, provided that the above copyright notice 

8# and this permission notice appear in all copies. 

9# 

10# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 

11# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 

12# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, 

13# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 

14# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 

15# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 

16# SOFTWARE. 

17 

18import os 

19import re 

20from argparse import ArgumentParser 

21 

22from bs4 import BeautifulSoup 

23from oc_ds_converter.oc_idmanager import DOIManager 

24from rich.console import Console 

25from rich.progress import (BarColumn, MofNCompleteColumn, Progress, 

26 SpinnerColumn, TextColumn, TimeElapsedColumn, 

27 TimeRemainingColumn) 

28 

29from oc_meta.lib.csvmanager import CSVManager 

30from oc_meta.lib.master_of_regex import orcid_pattern 

31 

32 

33class IndexOrcidDoi: 

34 def __init__(self, output_path: str, threshold: int = 10000): 

35 self.file_counter = 0 

36 self.threshold = threshold 

37 self.console = Console() 

38 self.console.print("[cyan][INFO][/cyan] Loading existing CSV files") 

39 self.orcid_re = re.compile(orcid_pattern) 

40 self.doimanager = DOIManager(use_api_service=False) 

41 self.csvstorage = CSVManager(output_path=output_path) 

42 self.cache = self._build_cache() 

43 

44 def _build_cache(self) -> set[str]: 

45 cache = set() 

46 for values in self.csvstorage.data.values(): 

47 for value in values: 

48 orcid = self._extract_orcid(value) 

49 if orcid: 

50 cache.add(orcid) 

51 return cache 

52 

53 def _extract_orcid(self, text: str) -> str | None: 

54 match = self.orcid_re.search(text) 

55 return match.group(0) if match else None 

56 

57 def explorer(self, summaries_path: str) -> None: 

58 self.console.print("[cyan][INFO][/cyan] Counting files to process") 

59 files_to_process = [ 

60 os.path.join(fold, filename) 

61 for fold, _, files in os.walk(summaries_path) 

62 for filename in files 

63 if filename.endswith('.xml') and self._extract_orcid(filename) not in self.cache 

64 ] 

65 processed_files = len(self.cache) 

66 del self.cache 

67 progress = Progress( 

68 SpinnerColumn(), 

69 TextColumn("[progress.description]{task.description}"), 

70 BarColumn(), 

71 MofNCompleteColumn(), 

72 TimeElapsedColumn(), 

73 TimeRemainingColumn(), 

74 console=self.console, 

75 ) 

76 with progress: 

77 task = progress.add_task("Processing files", total=len(files_to_process)) 

78 for file in files_to_process: 

79 self._process_file(file) 

80 self.file_counter += 1 

81 if self.file_counter % self.threshold == 0: 

82 start = processed_files + self.file_counter - self.threshold + 1 

83 end = processed_files + self.file_counter 

84 self.csvstorage.dump_data(f'{start}-{end}.csv') 

85 progress.advance(task) 

86 if self.csvstorage.data_to_store: 

87 start = processed_files + self.file_counter - (self.file_counter % self.threshold) + 1 

88 end = processed_files + self.file_counter 

89 self.csvstorage.dump_data(f'{start}-{end}.csv') 

90 

91 def _process_file(self, file_path: str) -> None: 

92 orcid = self._extract_orcid(file_path) 

93 if not orcid: 

94 return 

95 with open(file_path, 'r', encoding='utf-8') as xml_file: 

96 xml_soup = BeautifulSoup(xml_file, 'xml') 

97 name = self._extract_name(xml_soup) 

98 author = f'{name} [{orcid}]' if name else f'[{orcid}]' 

99 valid_doi = False 

100 for el in xml_soup.find_all('common:external-id'): 

101 id_type = el.find('common:external-id-type') 

102 rel = el.find('common:external-id-relationship') 

103 if not (id_type and rel): 

104 continue 

105 if id_type.get_text().lower() != 'doi' or rel.get_text().lower() != 'self': 

106 continue 

107 doi_el = el.find('common:external-id-value') 

108 if not doi_el: 

109 continue 

110 doi = self.doimanager.normalise(doi_el.get_text()) 

111 if not doi: 

112 continue 

113 valid_doi = True 

114 self.csvstorage.add_value(doi, author) 

115 if not valid_doi: 

116 self.csvstorage.add_value('None', f'[{orcid}]') 

117 

118 def _extract_name(self, xml_soup: BeautifulSoup) -> str | None: 

119 family_name_el = xml_soup.find('personal-details:family-name') 

120 given_name_el = xml_soup.find('personal-details:given-names') 

121 if family_name_el and given_name_el: 

122 return f'{family_name_el.get_text()}, {given_name_el.get_text()}' 

123 if family_name_el: 

124 return family_name_el.get_text() 

125 if given_name_el: 

126 return given_name_el.get_text() 

127 return None 

128 

129 

if __name__ == '__main__':  # pragma: no cover
    # CLI entry point: build the DOI/ORCID index from a directory of summaries.
    parser = ArgumentParser(
        'orcid_process.py',
        description='Build a CSV index of DOIs associated with ORCIDs from XML summary files'
    )
    parser.add_argument(
        '-out', '--output',
        dest='output_path',
        required=True,
        help='Output directory for CSV files',
    )
    parser.add_argument(
        '-s', '--summaries',
        dest='summaries_path',
        required=True,
        help='Directory containing ORCID XML summaries (scanned recursively)',
    )
    parser.add_argument(
        '-t', '--threshold',
        dest='threshold',
        type=int,
        default=10000,
        help='Number of files to process before saving a CSV chunk (default: 10000)',
    )
    cli_args = parser.parse_args()
    indexer = IndexOrcidDoi(output_path=cli_args.output_path, threshold=cli_args.threshold)
    indexer.explorer(summaries_path=cli_args.summaries_path)