Coverage for oc_ds_converter/pubmed/get

1# SPDX-FileCopyrightText: 2023 Arcangelo Massari <arcangelo.massari@unibo.it>

2# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it>

4# SPDX-License-Identifier: ISC

6import re

8import requests

9from lxml import etree

class ExtractPublisherDOI(object):
    """Look up publisher names from DOI prefixes and cache them per prefix.

    The cache is a dict mapping a DOI prefix (e.g. ``"10.1234"``) to a dict
    with at least the keys ``"name"`` and ``"from"``; entries obtained from
    Crossref also carry ``"crossref_member"``. Crossref is queried first;
    when it does not know a prefix, DataCite, mEDRA and the doi.org handle
    API (for CNKI) can be tried as fallbacks.
    """

    # A DOI prefix: the literal "10." followed by 4 to 9 digits. The dot is
    # escaped so that it cannot match an arbitrary character.
    _PREFIX_PATTERN = re.compile(r"10\.\d{4,9}")

    def __init__(self, pref_info_dict):
        """Initialise the extractor.

        :param pref_info_dict: an existing prefix-to-publisher mapping to
            start from. Any falsy value (``None``, ``{}``) makes the
            instance start with a fresh empty dict.
        """
        self.description = "class aimed at extracting publishers' names exploiting the DOI "
        # NOTE(review): presumably prefixes registered with DataCite; the
        # list is not read anywhere inside this class.
        self.datacite_prefixes = [
            '10.48550', '10.4230', '10.5281', '10.17863', '10.3929',
            '10.6084', '10.5451', '10.5445', '10.17615', '10.17877',
            '10.5167', '10.13140', '10.18154', '10.48350', '10.7892',
            '10.17605', '10.5283', '10.17169', '10.15488', '10.5169',
            '10.1184', '10.3204', '10.6073', '10.14288', '10.5061',
            '10.25384',
        ]
        self._prefix_to_data_dict = pref_info_dict if pref_info_dict else dict()

    def get_registration_agency(self, prefix):
        """Return the registration agency reported by doi.org for *prefix*.

        :param prefix: the DOI prefix appended to ``https://doi.org/ra/``.
        :return: the lower-cased, stripped value of the ``"RA"`` field of
            the first item in the response, or ``""`` when the status is
            not 200, the payload is empty or the field is missing/blank.
        :raises SystemExit: when the connection to doi.org fails.
        """
        req_url = "https://doi.org/ra/" + prefix
        try:
            req = requests.get(url=req_url)
        except requests.ConnectionError as err:
            print("failed to connect to doi.org for", prefix)
            # ``quit()`` is an interactive helper injected by ``site``;
            # raising SystemExit directly has the same effect for callers.
            raise SystemExit from err

        if req.status_code != 200:
            return ""
        req_data = req.json()
        if not req_data:
            return ""
        ra = req_data[0].get("RA")
        return ra.lower().strip() if ra else ""

    def get_last_map_ver(self):
        """Return the current prefix-to-publisher mapping (not a copy)."""
        return self._prefix_to_data_dict

    def add_prefix_pub_data(self, prefix):
        """Ensure *prefix* has an entry in the mapping, asking Crossref if needed.

        A prefix already present in the mapping is left untouched and no
        request is made. Otherwise ``https://api.crossref.org/prefixes/``
        is queried: a 200 response stores the publisher name, the numeric
        part of the member identifier and ``"from": "Crossref"``; any other
        status stores a placeholder entry whose ``"from"`` is
        ``"not found"``.

        :param prefix: the DOI prefix to look up.
        :return: the (possibly updated) prefix-to-publisher mapping.
        :raises SystemExit: when the connection to Crossref fails.
        """
        if prefix in self._prefix_to_data_dict:
            return self._prefix_to_data_dict

        req_url = "https://api.crossref.org/prefixes/" + prefix
        try:
            req = requests.get(url=req_url)
        except requests.ConnectionError as err:
            print("failed to connect to crossref for", prefix)
            raise SystemExit from err

        if req.status_code == 200:
            message = req.json()["message"]
            pref_to_publisher = {
                "name": message["name"],
                # Keep only the first run of digits of the member field.
                "crossref_member": re.findall(r"(\d+)", message["member"])[0],
                "from": "Crossref",
            }
        else:
            pref_to_publisher = {
                "name": "unidentified",
                "crossref_member": "not found",
                "from": "not found",
            }

        self._prefix_to_data_dict[prefix] = pref_to_publisher
        return self._prefix_to_data_dict

    def search_in_datacite(self, doi):
        """Ask the DataCite API for the publisher of *doi*.

        :param doi: the DOI appended to ``https://api.datacite.org/dois/``.
        :return: ``{"name": ..., "prefix": ...}`` on a 200 response, where
            the prefix is the part of *doi* before the first ``/``; an
            empty dict on any other status or on a connection failure.
        """
        publisher = dict()
        datacite_req_url = "https://api.datacite.org/dois/" + doi
        try:
            req = requests.get(url=datacite_req_url)
        except requests.ConnectionError:
            # Best effort: report the failure and let the caller fall back.
            print("failed to connect to datacite for", doi)
            return publisher

        if req.status_code == 200:
            req_data = req.json()
            publisher["name"] = req_data["data"]["attributes"]["publisher"]
            publisher["prefix"] = doi.split('/')[0]
        return publisher

    def search_in_medra(self, doi):
        """Ask the mEDRA metadata API for the publisher of *doi*.

        :param doi: the DOI appended to ``https://api.medra.org/metadata/``.
        :return: ``{"name": ..., "prefix": ...}`` when the XML response
            contains a ``PublisherName`` element in the ONIX DOI metadata
            2.0 namespace; an empty dict otherwise (non-200 status, no such
            element, or connection failure).
        """
        publisher = dict()
        medra_req_url = "https://api.medra.org/metadata/" + doi
        try:
            req = requests.get(url=medra_req_url)
        except requests.ConnectionError:
            # Best effort: report the failure and let the caller fall back.
            print("failed to connect to medra for", doi)
            return publisher

        if req.status_code == 200:
            tree = etree.XML(req.content)
            publisher_xpath = tree.xpath(
                '//x:PublisherName',
                namespaces={'x': 'http://www.editeur.org/onix/DOIMetadata/2.0'})
            if not publisher_xpath:
                return publisher
            publisher["name"] = publisher_xpath[0].text
            publisher["prefix"] = doi.split('/')[0]
        return publisher

    def search_for_cnki(self, doi):
        """Check through the doi.org handle API whether *doi* resolves to CNKI.

        :param doi: the DOI appended to ``https://doi.org/api/handles/``.
        :return: a dict with the fixed name ``'CNKI Publisher (unspecified)'``
            and the DOI prefix when the first handle value contains
            ``www.cnki.net``; an empty dict otherwise (non-200 status, no
            handle values, no match, or connection failure).
        """
        publisher = dict()
        handle_req_url = "https://doi.org/api/handles/" + doi
        try:
            req = requests.get(url=handle_req_url)
        except requests.ConnectionError:
            # Best effort: report the failure and let the caller fall back.
            print("failed to connect to doi for", doi)
            return publisher

        if req.status_code == 200:
            # A missing or empty 'values' list simply means "no match".
            values = req.json().get('values') or []
            if values and 'data' in values[0]:
                if 'www.cnki.net' in values[0]['data']['value']:
                    publisher["name"] = 'CNKI Publisher (unspecified)'
                    publisher["prefix"] = doi.split('/')[0]
        return publisher

    def add_extra_publisher(self, publisher, agency):
        """Store a publisher found outside Crossref in the mapping.

        The entry for ``publisher["prefix"]`` is replaced by a dict holding
        only ``'name'`` and ``'from'``.

        :param publisher: dict with the keys ``"prefix"`` and ``"name"``.
        :param agency: label recorded under ``'from'`` for this entry.
        """
        self._prefix_to_data_dict[publisher["prefix"]] = {
            'name': publisher['name'],
            'from': agency
        }

    def search_for_publisher_in_other_agencies(self, doi):
        """Try DataCite, then mEDRA, then CNKI to identify the publisher of *doi*.

        The first service that yields a name wins and its result is stored
        through :meth:`add_extra_publisher`; later services are not queried.

        :param doi: the DOI to look up.
        :return: the (possibly updated) prefix-to-publisher mapping.
        """
        lookups = (
            (self.search_in_datacite, 'datacite'),
            (self.search_in_medra, 'medra'),
            (self.search_for_cnki, 'doi'),
        )
        for lookup, agency in lookups:
            publisher = lookup(doi)
            if 'name' in publisher:
                self.add_extra_publisher(publisher, agency)
                break
        return self._prefix_to_data_dict

    def extract_publishers_v(self, doi_or_pref, enable_extraagencies=True, get_all_prefix_data=False, skip_update=False):
        """Return the publisher information for a DOI or a DOI prefix.

        The prefix is extracted from the part of *doi_or_pref* before the
        first ``/``. Unless *skip_update* is set, Crossref is queried for
        prefixes missing from the mapping. If the entry is marked
        ``"not found"`` and *enable_extraagencies* is true, the other
        agencies are tried as well.

        :param doi_or_pref: a DOI or a bare DOI prefix.
        :param enable_extraagencies: set to ``False`` to rely only on the
            data already in the mapping and on Crossref.
        :param get_all_prefix_data: set to ``True`` to get the whole entry
            stored for the prefix instead of only the publisher name.
        :param skip_update: set to ``True`` to skip the Crossref lookup and
            use the mapping as it is.
        :return: the publisher name, or the entry dict when
            *get_all_prefix_data* is true.
        :raises IndexError: when no DOI prefix can be found in *doi_or_pref*.
        :raises KeyError: when the prefix is not in the mapping (possible
            with *skip_update* set).
        """
        # Without a "/" the split leaves the string unchanged, so a single
        # expression covers both DOIs and bare prefixes.
        candidates = self._PREFIX_PATTERN.findall(doi_or_pref.split('/')[0])
        if not candidates:
            raise IndexError("no DOI prefix found in %r" % (doi_or_pref,))
        prefix = candidates[0]

        if not skip_update:
            self._prefix_to_data_dict = self.add_prefix_pub_data(prefix)

        if enable_extraagencies and self._prefix_to_data_dict[prefix]["from"] == "not found":
            self.search_for_publisher_in_other_agencies(doi_or_pref)

        prefix_data = self._prefix_to_data_dict[prefix]
        return prefix_data if get_all_prefix_data else prefix_data["name"]

Coverage for oc_ds_converter / pubmed / get_publishers.py: 60%

126 statements