Coverage for oc_meta/run/orcid_process.py

1# SPDX-FileCopyrightText: 2019-2020 Fabio Mariani <fabio.mariani555@gmail.com>

2# SPDX-FileCopyrightText: 2021-2026 Arcangelo Massari <arcangelo.massari@unibo.it>

4# SPDX-License-Identifier: ISC

6import os

7import re

8from argparse import ArgumentParser

10from bs4 import BeautifulSoup

11from oc_ds_converter.oc_idmanager import DOIManager

12from rich.console import Console

14from oc_meta.lib.console import create_progress

15from oc_meta.lib.csvmanager import CSVManager

16from oc_meta.lib.master_of_regex import orcid_pattern

19class IndexOrcidDoi:

20 def __init__(self, output_path: str, threshold: int = 10000):

21 self.file_counter = 0

22 self.threshold = threshold

23 self.console = Console()

24 self.console.print("[cyan][INFO][/cyan] Loading existing CSV files")

25 self.orcid_re = re.compile(orcid_pattern)

26 self.doimanager = DOIManager(use_api_service=False)

27 self.csvstorage = CSVManager(output_path=output_path)

28 self.cache = self._build_cache()

30 def _build_cache(self) -> set[str]:

31 cache = set()

32 for values in self.csvstorage.data.values():

33 for value in values:

34 orcid = self._extract_orcid(value)

35 if orcid:

36 cache.add(orcid)

37 return cache

39 def _extract_orcid(self, text: str) -> str | None:

40 match = self.orcid_re.search(text)

41 return match.group(0) if match else None

43 def explorer(self, summaries_path: str) -> None:

44 self.console.print("[cyan][INFO][/cyan] Counting files to process")

45 files_to_process = [

46 os.path.join(fold, filename)

47 for fold, _, files in os.walk(summaries_path)

48 for filename in files

49 if filename.endswith('.xml') and self._extract_orcid(filename) not in self.cache

50 ]

51 processed_files = len(self.cache)

52 del self.cache

53 progress = create_progress()

54 with progress:

55 task = progress.add_task("Processing files", total=len(files_to_process))

56 for file in files_to_process:

57 self._process_file(file)

58 self.file_counter += 1

59 if self.file_counter % self.threshold == 0:

60 start = processed_files + self.file_counter - self.threshold + 1

61 end = processed_files + self.file_counter

62 self.csvstorage.dump_data(f'{start}-{end}.csv')

63 progress.advance(task)

64 if self.csvstorage.data_to_store:

65 start = processed_files + self.file_counter - (self.file_counter % self.threshold) + 1

66 end = processed_files + self.file_counter

67 self.csvstorage.dump_data(f'{start}-{end}.csv')

69 def _process_file(self, file_path: str) -> None:

70 orcid = self._extract_orcid(file_path)

71 if not orcid:

72 return

73 with open(file_path, 'r', encoding='utf-8') as xml_file:

74 xml_soup = BeautifulSoup(xml_file, 'xml')

75 name = self._extract_name(xml_soup)

76 author = f'{name} [{orcid}]' if name else f'[{orcid}]'

77 valid_doi = False

78 for el in xml_soup.find_all('common:external-id'):

79 id_type = el.find('common:external-id-type')

80 rel = el.find('common:external-id-relationship')

81 if not (id_type and rel):

82 continue

83 if id_type.get_text().lower() != 'doi' or rel.get_text().lower() != 'self':

84 continue

85 doi_el = el.find('common:external-id-value')

86 if not doi_el:

87 continue

88 doi = self.doimanager.normalise(doi_el.get_text())

89 if not doi:

90 continue

91 valid_doi = True

92 self.csvstorage.add_value(doi, author)

93 if not valid_doi:

94 self.csvstorage.add_value('None', f'[{orcid}]')

96 def _extract_name(self, xml_soup: BeautifulSoup) -> str | None:

97 family_name_el = xml_soup.find('personal-details:family-name')

98 given_name_el = xml_soup.find('personal-details:given-names')

99 if family_name_el and given_name_el:

100 return f'{family_name_el.get_text()}, {given_name_el.get_text()}'

101 if family_name_el:

102 return family_name_el.get_text()

103 if given_name_el:

104 return given_name_el.get_text()

105 return None

106

107

if __name__ == '__main__':  # pragma: no cover
    # Command-line entry point: parse the options, build the indexer and
    # run it over the summaries directory.
    parser = ArgumentParser(
        'orcid_process.py',
        description='Build a CSV index of DOIs associated with ORCIDs from XML summary files',
    )
    parser.add_argument(
        '-out', '--output',
        dest='output_path',
        required=True,
        help='Output directory for CSV files',
    )
    parser.add_argument(
        '-s', '--summaries',
        dest='summaries_path',
        required=True,
        help='Directory containing ORCID XML summaries (scanned recursively)',
    )
    parser.add_argument(
        '-t', '--threshold',
        dest='threshold',
        type=int,
        default=10000,
        help='Number of files to process before saving a CSV chunk (default: 10000)',
    )
    cli_args = parser.parse_args()
    indexer = IndexOrcidDoi(
        output_path=cli_args.output_path,
        threshold=cli_args.threshold,
    )
    indexer.explorer(summaries_path=cli_args.summaries_path)

Coverage for oc_meta / run / orcid_process.py: 87%

86 statements