Coverage for oc_meta / run / orcid_process.py: 87%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-21 09:24 +0000

1# SPDX-FileCopyrightText: 2019-2020 Fabio Mariani <fabio.mariani555@gmail.com> 

2# SPDX-FileCopyrightText: 2021-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

3# 

4# SPDX-License-Identifier: ISC 

5 

6import os 

7import re 

8from argparse import ArgumentParser 

9 

10from bs4 import BeautifulSoup 

11from oc_ds_converter.oc_idmanager import DOIManager 

12from rich.console import Console 

13 

14from oc_meta.lib.console import create_progress 

15from oc_meta.lib.csvmanager import CSVManager 

16from oc_meta.lib.master_of_regex import orcid_pattern 

17 

18 

class IndexOrcidDoi:
    """Build a CSV index of DOIs and the ORCID authors associated with them.

    ORCID XML summary files are parsed one by one; every DOI found in a
    summary is stored through ``CSVManager`` together with the author label
    (``"Family, Given [ORCID]"``). Data is flushed to numbered CSV chunks every
    ``threshold`` processed files.
    """

    def __init__(self, output_path: str, threshold: int = 10000):
        """Set up storage and the cache of already indexed ORCIDs.

        Args:
            output_path: Path handed to ``CSVManager`` as ``output_path``.
            threshold: Number of processed files after which a CSV chunk is
                dumped.

        Raises:
            ValueError: If ``threshold`` is smaller than 1.
        """
        # A zero threshold would raise ZeroDivisionError in the middle of
        # explorer(), and a negative one would yield inverted chunk ranges;
        # fail early with a clear message instead.
        if threshold < 1:
            raise ValueError(f'threshold must be a positive integer, got {threshold}')
        self.file_counter = 0
        self.threshold = threshold
        self.console = Console()
        self.console.print("[cyan][INFO][/cyan] Loading existing CSV files")
        self.orcid_re = re.compile(orcid_pattern)
        self.doimanager = DOIManager(use_api_service=False)
        self.csvstorage = CSVManager(output_path=output_path)
        self.cache = self._build_cache()

    def _build_cache(self) -> set[str]:
        """Return the set of ORCIDs found in the values held by the storage.

        Every value in ``self.csvstorage.data`` is scanned with the ORCID
        pattern; values without a match are ignored.
        """
        cache = set()
        for values in self.csvstorage.data.values():
            for value in values:
                orcid = self._extract_orcid(value)
                if orcid:
                    cache.add(orcid)
        return cache

    def _extract_orcid(self, text: str) -> str | None:
        """Return the first ORCID pattern match in ``text``, or ``None``."""
        match = self.orcid_re.search(text)
        return match.group(0) if match else None

    def explorer(self, summaries_path: str) -> None:
        """Walk ``summaries_path`` recursively and index every new XML summary.

        A file is queued when its name ends with ``.xml``, the ORCID found in
        its name is not already in the cache, and its path contains an ORCID
        at all. CSV chunks are named ``{start}-{end}.csv``, where the range is
        offset by the number of ORCIDs that were already cached.

        Note:
            ``self.cache`` is deleted once the file list has been built, so
            this method can only be called once per instance.

        Args:
            summaries_path: Root directory containing the ORCID XML summaries.
        """
        self.console.print("[cyan][INFO][/cyan] Counting files to process")
        files_to_process = []
        for fold, _, files in os.walk(summaries_path):
            for filename in files:
                if not filename.endswith('.xml'):
                    continue
                if self._extract_orcid(filename) in self.cache:
                    continue
                file_path = os.path.join(fold, filename)
                # _process_file() silently ignores paths without an ORCID.
                # Queuing them would inflate the progress total and shift the
                # chunk numbering without producing any row, on every run.
                if not self._extract_orcid(file_path):
                    continue
                files_to_process.append(file_path)
        processed_files = len(self.cache)
        # The cache is only needed to build the list above; drop it before
        # the parsing loop starts.
        del self.cache
        progress = create_progress()
        with progress:
            task = progress.add_task("Processing files", total=len(files_to_process))
            for file in files_to_process:
                self._process_file(file)
                self.file_counter += 1
                if self.file_counter % self.threshold == 0:
                    start = processed_files + self.file_counter - self.threshold + 1
                    end = processed_files + self.file_counter
                    self.csvstorage.dump_data(f'{start}-{end}.csv')
                progress.advance(task)
        # Flush whatever was collected after the last full chunk.
        if self.csvstorage.data_to_store:
            start = processed_files + self.file_counter - (self.file_counter % self.threshold) + 1
            end = processed_files + self.file_counter
            self.csvstorage.dump_data(f'{start}-{end}.csv')

    def _process_file(self, file_path: str) -> None:
        """Parse one ORCID summary and store its DOI/author pairs.

        The ORCID is taken from ``file_path``; paths without one are ignored.
        Only ``common:external-id`` elements whose type is ``doi`` and whose
        relationship is ``self`` (both compared case-insensitively) are
        considered, and only if the DOI survives ``DOIManager.normalise``.
        When no valid DOI is found, the ORCID is stored under the key
        ``'None'`` without the author name.

        Args:
            file_path: Path of the XML summary file, read as UTF-8.
        """
        orcid = self._extract_orcid(file_path)
        if not orcid:
            return
        with open(file_path, 'r', encoding='utf-8') as xml_file:
            xml_soup = BeautifulSoup(xml_file, 'xml')
        name = self._extract_name(xml_soup)
        author = f'{name} [{orcid}]' if name else f'[{orcid}]'
        valid_doi = False
        for el in xml_soup.find_all('common:external-id'):
            id_type = el.find('common:external-id-type')
            rel = el.find('common:external-id-relationship')
            if not (id_type and rel):
                continue
            if id_type.get_text().lower() != 'doi' or rel.get_text().lower() != 'self':
                continue
            doi_el = el.find('common:external-id-value')
            if not doi_el:
                continue
            doi = self.doimanager.normalise(doi_el.get_text())
            if not doi:
                continue
            valid_doi = True
            self.csvstorage.add_value(doi, author)
        if not valid_doi:
            self.csvstorage.add_value('None', f'[{orcid}]')

    def _extract_name(self, xml_soup: BeautifulSoup) -> str | None:
        """Return the author name found in the summary, or ``None``.

        The result is ``"Family, Given"`` when both
        ``personal-details:family-name`` and ``personal-details:given-names``
        are present, the single available part when only one is, and ``None``
        when neither element exists.
        """
        family_name_el = xml_soup.find('personal-details:family-name')
        given_name_el = xml_soup.find('personal-details:given-names')
        if family_name_el and given_name_el:
            return f'{family_name_el.get_text()}, {given_name_el.get_text()}'
        if family_name_el:
            return family_name_el.get_text()
        if given_name_el:
            return given_name_el.get_text()
        return None

106 

107 

if __name__ == '__main__':  # pragma: no cover
    # Command-line entry point: parse the options, then index every summary
    # found under the given directory.
    parser = ArgumentParser(
        prog='orcid_process.py',
        description='Build a CSV index of DOIs associated with ORCIDs from XML summary files',
    )
    parser.add_argument(
        '-out', '--output',
        dest='output_path',
        required=True,
        help='Output directory for CSV files',
    )
    parser.add_argument(
        '-s', '--summaries',
        dest='summaries_path',
        required=True,
        help='Directory containing ORCID XML summaries (scanned recursively)',
    )
    parser.add_argument(
        '-t', '--threshold',
        dest='threshold',
        type=int,
        default=10000,
        help='Number of files to process before saving a CSV chunk (default: 10000)',
    )
    cli_args = parser.parse_args()
    indexer = IndexOrcidDoi(
        output_path=cli_args.output_path,
        threshold=cli_args.threshold,
    )
    indexer.explorer(summaries_path=cli_args.summaries_path)

121 iod.explorer(summaries_path=args.summaries_path)