Coverage for oc_meta/plugins/orcid/index_orcid_doi.py

1#!/usr/bin/python

2# -*- coding: utf-8 -*-

6# Permission to use, copy, modify, and/or distribute this software for any purpose

7# with or without fee is hereby granted, provided that the above copyright notice

8# and this permission notice appear in all copies.

10# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH

11# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND

12# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,

13# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,

14# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

15# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS

16# SOFTWARE.

19import os

21from bs4 import BeautifulSoup

22from oc_ds_converter.oc_idmanager import DOIManager

23from tqdm import tqdm

25from oc_meta.lib.csvmanager import CSVManager

class Index_orcid_doi:
    """Index the DOIs found in ORCID summary files as ``DOI -> "name [ORCID]"``.

    Rows are accumulated in a ``CSVManager`` and written out in batches of
    ``threshold`` processed files. Files in which no usable DOI is found are
    recorded under the key ``'None'`` so that a later run can skip them.
    """

    def __init__(self, output_path:str, threshold:int=10000, low_memory:bool=False, verbose:bool=False):
        """Set up the storage and the cache of already processed ORCIDs.

        :param output_path: directory handed to ``CSVManager`` as ``output_path``.
        :param threshold: number of processed files per dumped CSV; a falsy
            value falls back to 10000, anything else is converted with ``int``.
        :param low_memory: forwarded unchanged to ``CSVManager``.
        :param verbose: when true, print progress messages and show a progress bar.
        """
        self.file_counter = 0
        self.threshold = 10000 if not threshold else int(threshold)
        self.verbose = verbose
        if self.verbose:
            print("[INFO: CSVManager] Loading existing csv file")
        self.doimanager = DOIManager(use_api_service=False)
        # Hand over the normalised threshold so that the storage and this class
        # agree even when the caller passed None, 0 or a numeric string.
        self.csvstorage = CSVManager(output_path=output_path, line_threshold=self.threshold, low_memory=low_memory)
        # ORCIDs are extracted to skip the corresponding files at the first reading of an existing CSV.
        # Each stored value is expected to end with "[<orcid>]" (that is what
        # finder writes); splitting from the right keeps the extraction correct
        # even if the name part itself contains a "[".
        self.cache = {
            el.rsplit("[", 1)[1][:-1].strip()
            for values in self.csvstorage.data.values()
            for el in values
        }

    def explorer(self, summaries_path:str) -> None:
        """Process every file under ``summaries_path`` not already in the cache.

        Each file is passed to :meth:`finder`. After every ``threshold``
        processed files the storage is dumped to ``"<first>-<last>.csv"``,
        where the numbers count files including those already cached from
        previous runs. A final dump is made for the files left over after the
        last full batch; nothing is dumped when no file is pending.

        ``self.cache`` is deleted once the list of files has been built.

        :param summaries_path: root directory walked recursively with ``os.walk``.
        """
        if self.verbose:
            print("[INFO: Index_orcid_doi] Counting files to process")
        files_to_process = [
            os.path.join(fold, file)
            for fold, _, files in os.walk(summaries_path)
            for file in files
            if file.replace('.xml', '') not in self.cache
        ]
        processed_files = len(self.cache)
        # The cache is only needed to build the list above.
        del self.cache
        pbar = tqdm(total=len(files_to_process)) if self.verbose else None
        try:
            for file in files_to_process:
                self.finder(file)
                self.file_counter += 1
                if self.file_counter % self.threshold == 0:
                    cur_file = self.file_counter + processed_files
                    self.csvstorage.dump_data(f'{cur_file-self.threshold+1}-{cur_file}.csv')
                if pbar is not None:
                    pbar.update(1)
            # Files handled since the last in-loop dump. This must be derived
            # from file_counter (which drives the in-loop dumps), not from the
            # absolute position, otherwise the last range overlaps the previous
            # one whenever the number of cached files is not a multiple of the
            # threshold.
            pending = self.file_counter % self.threshold
            if pending:
                cur_file = self.file_counter + processed_files
                self.csvstorage.dump_data(f'{cur_file-pending+1}-{cur_file}.csv')
        finally:
            if pbar is not None:
                pbar.close()

    def finder(self, file:str):
        """Extract the self-asserted DOIs of one summary file into the storage.

        The ORCID is taken from the last 19 characters of ``file`` once
        ``'.xml'`` has been removed. For files ending in ``'.xml'``, every
        ``common:external-id`` element whose type is ``doi`` and whose
        relationship is ``self`` (both compared case-insensitively) has its
        value normalised with the DOI manager; each DOI that survives
        normalisation is stored with the value ``"<family>, <given> [<orcid>]"``
        (or ``"<family> [<orcid>]"`` without given names). Nothing is stored
        for a DOI when the family name is missing.

        If no DOI was stored, ``'None' -> "[<orcid>]"`` is stored instead.

        :param file: path of the file to inspect.
        """
        orcid = file.replace('.xml', '')[-19:]
        valid_doi = False
        if file.endswith('.xml'):
            with open(file, 'r', encoding='utf-8') as xml_file:
                xml_soup = BeautifulSoup(xml_file, 'xml')
            dois = []
            for el in xml_soup.find_all('common:external-id'):
                id_type = el.find('common:external-id-type')
                rel = el.find('common:external-id-relationship')
                value = el.find('common:external-id-value')
                # Skip incomplete records instead of failing on a missing child.
                if id_type is None or rel is None or value is None:
                    continue
                if id_type.get_text().lower() != 'doi' or rel.get_text().lower() != 'self':
                    continue
                doi = self.doimanager.normalise(value.get_text())
                if doi:
                    dois.append(doi)
            if dois:
                # The name belongs to the whole file, so look it up once
                # rather than once per DOI.
                f_name = xml_soup.find('personal-details:family-name')
                if f_name is not None:
                    name = f_name.get_text()
                    g_name = xml_soup.find('personal-details:given-names')
                    if g_name is not None:
                        name = name + ', ' + g_name.get_text()
                    auto = name + ' [' + orcid + ']'
                    for doi in dois:
                        self.csvstorage.add_value(doi, auto)
                    valid_doi = True
        if not valid_doi:
            # Save file names where nothing was found, to skip them during the next run
            self.csvstorage.add_value('None', f'[{orcid}]')

Coverage for oc_meta/plugins/orcid/index_orcid_doi.py: 89%

64 statements