Coverage for oc_meta/plugins/orcid/index_orcid_doi.py: 89%

64 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-07-14 14:06 +0000

1#!/usr/bin/python 

2# -*- coding: utf-8 -*- 

3# Copyright 2019-2020 Fabio Mariani <fabio.mariani555@gmail.com> 

4# Copyright 2021-2022 Arcangelo Massari <arcangelo.massari@unibo.it> 

5# 

6# Permission to use, copy, modify, and/or distribute this software for any purpose 

7# with or without fee is hereby granted, provided that the above copyright notice 

8# and this permission notice appear in all copies. 

9# 

10# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 

11# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 

12# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, 

13# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 

14# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 

15# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 

16# SOFTWARE. 

17 

18 

19import os 

20 

21from bs4 import BeautifulSoup 

22from oc_ds_converter.oc_idmanager import DOIManager 

23from tqdm import tqdm 

24 

25from oc_meta.lib.csvmanager import CSVManager 

26 

27 

class Index_orcid_doi:
    """Index the DOIs found in ORCID summary XML files by author.

    Every DOI that a summary declares with a ``self`` relationship is stored
    in ``self.csvstorage`` together with a value of the form
    ``Family, Given [ORCID]``. Files from which nothing could be stored are
    recorded under the key ``'None'`` as ``[ORCID]`` so that a later run can
    skip them.
    """

    def __init__(self, output_path: str, threshold: int = 10000, low_memory: bool = False, verbose: bool = False):
        """Load any existing output and remember which ORCIDs it already covers.

        :param output_path: passed to ``CSVManager`` as ``output_path``.
        :param threshold: number of processed files after which a CSV is
            dumped; a falsy value falls back to 10000.
        :param low_memory: passed to ``CSVManager`` as ``low_memory``.
        :param verbose: when true, print progress messages and show a
            progress bar while exploring.
        """
        self.file_counter = 0
        self.threshold = 10000 if not threshold else int(threshold)
        self.verbose = verbose
        if self.verbose:
            print("[INFO: CSVManager] Loading existing csv file")
        self.doimanager = DOIManager(use_api_service=False)
        self.csvstorage = CSVManager(output_path=output_path, line_threshold=threshold, low_memory=low_memory)
        # ORCIDs are extracted to skip the corresponding files at the first reading of an existing CSV.
        self.cache = {
            self._extract_orcid(value)
            for values in self.csvstorage.data.values()
            for value in values
        }

    @staticmethod
    def _extract_orcid(value: str) -> str:
        """Return the identifier enclosed in the trailing ``[...]`` of a stored value."""
        # Split on the LAST '[' so that a bracket occurring inside the author
        # name cannot be mistaken for the ORCID delimiter.
        return value.rsplit("[", 1)[1][:-1].strip()

    @staticmethod
    def _format_author(family_name: str, given_name, orcid: str) -> str:
        """Return ``Family, Given [ORCID]``, or ``Family [ORCID]`` when ``given_name`` is None."""
        name = family_name if given_name is None else family_name + ', ' + given_name
        return name + ' [' + orcid + ']'

    def explorer(self, summaries_path: str) -> None:
        """Process every not-yet-indexed file under ``summaries_path``.

        Files whose name (without ``.xml``) is already in ``self.cache`` are
        skipped. A CSV is dumped every ``self.threshold`` processed files and
        once more at the end; file names have the form ``<first>-<last>.csv``,
        where the numbers count files including those indexed by earlier runs.

        ``self.cache`` is deleted once the file list has been built, so this
        method can be called only once per instance.

        :param summaries_path: root directory, walked recursively.
        """
        if self.verbose:
            print("[INFO: Index_orcid_doi] Counting files to process")
        files_to_process = [
            os.path.join(fold, file)
            for fold, _, files in os.walk(summaries_path)
            for file in files
            if file.replace('.xml', '') not in self.cache
        ]
        processed_files = len(self.cache)
        del self.cache
        pbar = tqdm(total=len(files_to_process)) if self.verbose else None
        for file in files_to_process:
            self.finder(file)
            self.file_counter += 1
            if self.file_counter % self.threshold == 0:
                cur_file = self.file_counter + processed_files
                self.csvstorage.dump_data(f'{cur_file - self.threshold + 1}-{cur_file}.csv')
            if pbar is not None:
                pbar.update(1)
        cur_file = self.file_counter + processed_files
        # Files handled since the last periodic dump. Batches are counted on
        # file_counter, so the start of the final range must be derived from
        # it as well; using cur_file here mislabels the range whenever
        # processed_files is not a multiple of the threshold.
        pending = self.file_counter % self.threshold
        # The final dump is issued even when nothing is pending, as before.
        self.csvstorage.dump_data(f'{cur_file - pending + 1}-{cur_file}.csv')
        if pbar is not None:
            pbar.close()

    def finder(self, file: str) -> None:
        """Store the DOIs declared in one summary file.

        The ORCID is taken as the last 19 characters of ``file`` once
        ``.xml`` has been removed. Only ``common:external-id`` elements whose
        type is ``doi`` and whose relationship is ``self`` (both compared
        case-insensitively) are considered, and only DOIs for which
        ``self.doimanager.normalise`` returns a truthy value are kept.

        If nothing is stored for the file (it is not ``.xml``, has no usable
        DOI, or has no family name) a ``'None'`` row is stored instead.

        :param file: path of the file to inspect.
        """
        orcid = file.replace('.xml', '')[-19:]
        recorded = False
        if file.endswith('.xml'):
            with open(file, 'r', encoding='utf-8') as xml_file:
                xml_soup = BeautifulSoup(xml_file, 'xml')
            dois = []
            for el in xml_soup.find_all('common:external-id'):
                id_type = el.find('common:external-id-type')
                rel = el.find('common:external-id-relationship')
                if id_type is None or rel is None:
                    continue
                if id_type.get_text().lower() != 'doi' or rel.get_text().lower() != 'self':
                    continue
                value = el.find('common:external-id-value')
                if value is None:
                    # Identifier without a value element: nothing to normalise.
                    continue
                doi = self.doimanager.normalise(value.get_text())
                if doi:
                    dois.append(doi)
            if dois:
                # The author name is a property of the whole document, so it
                # is looked up once instead of once per DOI.
                f_name = xml_soup.find('personal-details:family-name')
                if f_name is not None:
                    g_name = xml_soup.find('personal-details:given-names')
                    given = None if g_name is None else g_name.get_text()
                    auto = self._format_author(f_name.get_text(), given, orcid)
                    for doi in dois:
                        self.csvstorage.add_value(doi, auto)
                    recorded = True
        if not recorded:
            # Save file names where nothing was found, to skip them during the next run
            self.csvstorage.add_value('None', f'[{orcid}]')