Coverage for oc_ds_converter / pubmed / get_publishers.py: 60%

126 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it> 

3# 

4# SPDX-License-Identifier: ISC 

5 

6import re 

7 

8import requests 

9from lxml import etree 

10 

11 

class ExtractPublisherDOI(object):
    """Resolve DOI prefixes to publisher data, caching results per prefix.

    The cache (``_prefix_to_data_dict``) maps a DOI prefix to a dict holding
    at least ``"name"`` and ``"from"``; entries built from Crossref also
    hold ``"crossref_member"``.
    """

    # A DOI prefix is "10." followed by 4 to 9 digits; the dot is literal.
    _PREFIX_PATTERN = re.compile(r"10\.\d{4,9}")

    def __init__(self, pref_info_dict):
        """Start from *pref_info_dict* when it is non-empty, else from an empty map."""
        self.description = "class aimed at extracting publishers' names exploiting the DOI "
        self.datacite_prefixes = [
            '10.48550', '10.4230', '10.5281', '10.17863', '10.3929',
            '10.6084', '10.5451', '10.5445', '10.17615', '10.17877',
            '10.5167', '10.13140', '10.18154', '10.48350', '10.7892',
            '10.17605', '10.5283', '10.17169', '10.15488', '10.5169',
            '10.1184', '10.3204', '10.6073', '10.14288', '10.5061',
            '10.25384',
        ]
        # A falsy argument (None or an empty dict) yields a fresh private dict.
        self._prefix_to_data_dict = pref_info_dict if pref_info_dict else dict()

    def get_registration_agency(self, prefix):
        """Return the lower-cased registration agency of *prefix*, or ``""``.

        Queries ``https://doi.org/ra/<prefix>``. An empty string is returned
        when the response is not 200 or carries no "RA" value.

        Raises:
            SystemExit: if the connection to doi.org fails.
        """
        req_url = "https://doi.org/ra/" + prefix
        try:
            req = requests.get(url=req_url)
        except requests.ConnectionError:
            print("failed to connect to doi.org for", prefix)
            # Same effect as the interactive-only quit(), without relying on
            # the site module having injected it.
            raise SystemExit
        if req.status_code == 200:
            req_data = req.json()
            # Guard against an empty JSON list before indexing it.
            ra = req_data[0].get("RA") if req_data else None
            if ra:
                return ra.lower().strip()
        return ""

    def get_last_map_ver(self):
        """Return the current prefix-to-publisher map (the live dict, not a copy)."""
        return self._prefix_to_data_dict

    def add_prefix_pub_data(self, prefix):
        """Ensure *prefix* has an entry in the map, asking Crossref if needed.

        A 200 response stores the publisher name, the numeric part of the
        member identifier and ``"from": "Crossref"``; any other status stores
        an "unidentified" / "not found" placeholder. Prefixes already in the
        map trigger no request.

        Returns:
            The (possibly updated) prefix-to-publisher map.

        Raises:
            SystemExit: if the connection to Crossref fails.
        """
        if prefix in self._prefix_to_data_dict:
            return self._prefix_to_data_dict

        req_url = "https://api.crossref.org/prefixes/" + prefix
        try:
            req = requests.get(url=req_url)
        except requests.ConnectionError:
            print("failed to connect to crossref for", prefix)
            raise SystemExit

        if req.status_code == 200:
            message = req.json()["message"]
            pref_to_publisher = {
                "name": message["name"],
                # Keep only the first run of digits of the member identifier.
                "crossref_member": re.findall(r"(\d+)", message["member"])[0],
                "from": "Crossref",
            }
        else:
            pref_to_publisher = {
                "name": "unidentified",
                "crossref_member": "not found",
                "from": "not found",
            }

        self._prefix_to_data_dict[prefix] = pref_to_publisher
        return self._prefix_to_data_dict

    def search_in_datacite(self, doi):
        """Look *doi* up in the DataCite API.

        Returns:
            ``{"name": ..., "prefix": ...}`` on a 200 response, otherwise an
            empty dict (also when the connection fails, which is only logged).
        """
        publisher = dict()
        datacite_req_url = "https://api.datacite.org/dois/" + doi
        try:
            req = requests.get(url=datacite_req_url)
        except requests.ConnectionError:
            print("failed to connect to datacite for", doi)
            return publisher

        if req.status_code == 200:
            req_data = req.json()
            publisher["name"] = req_data["data"]["attributes"]["publisher"]
            publisher["prefix"] = doi.split('/')[0]
        return publisher

    def search_in_medra(self, doi):
        """Look *doi* up in the mEDRA metadata API (ONIX DOI XML).

        Returns:
            ``{"name": ..., "prefix": ...}`` when a ``PublisherName`` element
            is found in a 200 response, otherwise an empty dict (also when
            the connection fails, which is only logged).
        """
        publisher = dict()
        medra_req_url = "https://api.medra.org/metadata/" + doi
        try:
            req = requests.get(url=medra_req_url)
        except requests.ConnectionError:
            print("failed to connect to medra for", doi)
            return publisher

        if req.status_code == 200:
            tree = etree.XML(req.content)
            publisher_names = tree.xpath(
                '//x:PublisherName',
                namespaces={'x': 'http://www.editeur.org/onix/DOIMetadata/2.0'})
            if publisher_names:
                publisher["name"] = publisher_names[0].text
                publisher["prefix"] = doi.split('/')[0]
        return publisher

    def search_for_cnki(self, doi):
        """Check through the doi.org handle API whether *doi* points to CNKI.

        Returns:
            A dict with the fixed name 'CNKI Publisher (unspecified)' and the
            prefix when the first handle value contains 'www.cnki.net',
            otherwise an empty dict (also when the connection fails, which is
            only logged).
        """
        publisher = dict()
        handle_req_url = "https://doi.org/api/handles/" + doi
        try:
            req = requests.get(url=handle_req_url)
        except requests.ConnectionError:
            print("failed to connect to doi for", doi)
            return publisher

        if req.status_code == 200:
            values = req.json().get('values')
            # An empty 'values' list must not be indexed.
            if values and 'data' in values[0]:
                if 'www.cnki.net' in values[0]['data']['value']:
                    publisher["name"] = 'CNKI Publisher (unspecified)'
                    publisher["prefix"] = doi.split('/')[0]
        return publisher

    def add_extra_publisher(self, publisher, agency):
        """Store *publisher* under its prefix, recording *agency* as its source."""
        self._prefix_to_data_dict[publisher["prefix"]] = {
            'name': publisher['name'],
            'from': agency
        }

    def search_for_publisher_in_other_agencies(self, doi):
        """Try DataCite, then mEDRA, then the CNKI check, stopping at the first hit.

        The first service that yields a publisher name has its result written
        to the map via ``add_extra_publisher``.

        Returns:
            The (possibly updated) prefix-to-publisher map.
        """
        lookups = (
            (self.search_in_datacite, 'datacite'),
            (self.search_in_medra, 'medra'),
            (self.search_for_cnki, 'doi'),
        )
        for lookup, agency in lookups:
            publisher = lookup(doi)
            if 'name' in publisher:
                self.add_extra_publisher(publisher, agency)
                break
        return self._prefix_to_data_dict

    def extract_publishers_v(self, doi_or_pref, enable_extraagencies=True, get_all_prefix_data=False, skip_update=False):
        """Return publisher information for a DOI or a bare DOI prefix.

        Unless *skip_update* is set, the prefix is first registered in the
        map through ``add_prefix_pub_data`` (Crossref). When the prefix could
        not be identified in Crossref and *enable_extraagencies* is true,
        ``search_for_publisher_in_other_agencies`` is called to try to
        identify the publisher in other services.

        Args:
            doi_or_pref: a DOI ("10.xxxx/suffix") or just its prefix.
            enable_extraagencies: set to False to rely only on data already
                in the map and on Crossref.
            get_all_prefix_data: set to True to get the whole entry stored
                for the prefix instead of only the publisher name.
            skip_update: set to True to skip the Crossref lookup.

        Returns:
            The publisher name, or the full prefix entry when
            *get_all_prefix_data* is true.

        Raises:
            IndexError: if no DOI prefix can be found in *doi_or_pref*.
            KeyError: if the prefix has no entry in the map (possible with
                *skip_update*).
        """
        # For a full DOI only the part before the first "/" is inspected;
        # split() returns the whole string when there is no "/".
        candidate = doi_or_pref.split('/')[0]
        prefix = self._PREFIX_PATTERN.findall(candidate)[0]

        if not skip_update:
            self._prefix_to_data_dict = self.add_prefix_pub_data(prefix)

        if enable_extraagencies and self._prefix_to_data_dict[prefix]["from"] == "not found":
            self.search_for_publisher_in_other_agencies(doi_or_pref)

        # Read the entry only now: the fallback search may have replaced it.
        prefix_data = self._prefix_to_data_dict[prefix]
        if get_all_prefix_data:
            return prefix_data
        return prefix_data["name"]