Coverage for oc_ds_converter / pubmed / get_publishers.py: 60%
126 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
1# SPDX-FileCopyrightText: 2023 Arcangelo Massari <arcangelo.massari@unibo.it>
2# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it>
3#
4# SPDX-License-Identifier: ISC
6import re
8import requests
9from lxml import etree
class ExtractPublisherDOI(object):
    """Look up publisher information for DOIs / DOI prefixes.

    Every resolved prefix is stored in an in-memory mapping
    (``prefix -> {"name": ..., "from": ..., ...}``) so that a prefix is
    requested from Crossref at most once per instance. Prefixes that Crossref
    does not know can optionally be searched in DataCite, mEDRA and the
    doi.org handle API.
    """

    # A DOI prefix: the literal "10." followed by 4 to 9 digits.
    _PREFIX_PATTERN = re.compile(r"(10\.\d{4,9})")
    # First run of digits found in the Crossref "member" field.
    _MEMBER_CODE_PATTERN = re.compile(r"(\d+)")

    def __init__(self, pref_info_dict):
        """Create the extractor.

        :param pref_info_dict: an already populated ``prefix -> data`` mapping
            to start from. Any falsy value (``None``, ``{}``) makes the
            instance start from a fresh empty dictionary.
        """
        self.description = "class aimed at extracting publishers' names exploiting the DOI "
        # Kept as a public attribute; no method of this class reads it.
        self.datacite_prefixes = [
            '10.48550', '10.4230', '10.5281', '10.17863', '10.3929', '10.6084',
            '10.5451', '10.5445', '10.17615', '10.17877', '10.5167', '10.13140',
            '10.18154', '10.48350', '10.7892', '10.17605', '10.5283', '10.17169',
            '10.15488', '10.5169', '10.1184', '10.3204', '10.6073', '10.14288',
            '10.5061', '10.25384',
        ]
        self._prefix_to_data_dict = pref_info_dict if pref_info_dict else dict()

    def get_registration_agency(self, prefix):
        """Return the lower-cased registration agency of ``prefix``.

        The agency is read from the ``RA`` field of the first item returned by
        ``https://doi.org/ra/<prefix>``.

        :param prefix: the DOI prefix to look up.
        :return: the normalised agency name, or ``""`` when the service does
            not answer with status 200 or reports no agency.
        :raises SystemExit: if the connection to doi.org fails.
        """
        req_url = "https://doi.org/ra/" + prefix
        try:
            req = requests.get(url=req_url)
        except requests.ConnectionError:
            print("failed to connect to doi.org for", prefix)
            # Explicit SystemExit instead of quit(): quit() is injected by the
            # `site` module and is not guaranteed to exist; a non-zero status
            # also lets a calling shell detect the failure.
            raise SystemExit(1)

        if req.status_code != 200:
            return ""
        req_data = req.json()
        ra = req_data[0].get("RA") if req_data else None
        return ra.lower().strip() if ra else ""

    def get_last_map_ver(self):
        """Return the current ``prefix -> data`` mapping (not a copy)."""
        return self._prefix_to_data_dict

    def add_prefix_pub_data(self, prefix):
        """Make sure ``prefix`` has an entry in the mapping, asking Crossref if needed.

        A prefix that is already in the mapping triggers no request. Otherwise
        ``https://api.crossref.org/prefixes/<prefix>`` is queried: on status
        200 the publisher name and numeric member code are stored with
        ``"from": "Crossref"``; on any other status a placeholder entry
        (``"unidentified"`` / ``"not found"``) is stored.

        :param prefix: the DOI prefix to register.
        :return: the (possibly updated) mapping.
        :raises SystemExit: if the connection to Crossref fails.
        """
        if prefix in self._prefix_to_data_dict:
            return self._prefix_to_data_dict

        req_url = "https://api.crossref.org/prefixes/" + prefix
        try:
            req = requests.get(url=req_url)
        except requests.ConnectionError:
            print("failed to connect to crossref for", prefix)
            # See get_registration_agency for why this is not quit().
            raise SystemExit(1)

        if req.status_code == 200:
            message = req.json()["message"]
            # Only the first run of digits of the "member" field is kept
            # (presumably the field is a URL ending with the member id).
            member_code = self._MEMBER_CODE_PATTERN.findall(message["member"])[0]
            pref_to_publisher = {
                "name": message["name"],
                "crossref_member": member_code,
                "from": "Crossref",
            }
        else:
            pref_to_publisher = {
                "name": "unidentified",
                "crossref_member": "not found",
                "from": "not found",
            }

        self._prefix_to_data_dict[prefix] = pref_to_publisher
        return self._prefix_to_data_dict

    def search_in_datacite(self, doi):
        """Ask the DataCite API for the publisher of ``doi``.

        :param doi: the DOI to look up.
        :return: ``{"name": ..., "prefix": ...}`` on status 200, otherwise an
            empty dict (also when the connection fails, which is only logged).
        """
        publisher = dict()
        datacite_req_url = "https://api.datacite.org/dois/" + doi
        try:
            req = requests.get(url=datacite_req_url)
        except requests.ConnectionError:
            print("failed to connect to datacite for", doi)
            return publisher

        if req.status_code == 200:
            req_data = req.json()
            publisher["name"] = req_data["data"]["attributes"]["publisher"]
            publisher["prefix"] = doi.split('/')[0]
        return publisher

    def search_in_medra(self, doi):
        """Ask the mEDRA metadata API for the publisher of ``doi``.

        The publisher is the text of the first ``PublisherName`` element (ONIX
        DOIMetadata 2.0 namespace) of the XML response.

        :param doi: the DOI to look up.
        :return: ``{"name": ..., "prefix": ...}`` when such an element exists,
            otherwise an empty dict (also when the connection fails, which is
            only logged).
        """
        publisher = dict()
        medra_req_url = "https://api.medra.org/metadata/" + doi
        try:
            req = requests.get(url=medra_req_url)
        except requests.ConnectionError:
            print("failed to connect to medra for", doi)
            return publisher

        if req.status_code == 200:
            tree = etree.XML(req.content)
            publisher_xpath = tree.xpath(
                '//x:PublisherName',
                namespaces={'x': 'http://www.editeur.org/onix/DOIMetadata/2.0'})
            if publisher_xpath:
                publisher["name"] = publisher_xpath[0].text
                publisher["prefix"] = doi.split('/')[0]
        return publisher

    def search_for_cnki(self, doi):
        """Check through the doi.org handle API whether ``doi`` points to CNKI.

        Only the first item of the ``values`` list of the response is
        inspected: when its ``data.value`` is a string containing
        ``www.cnki.net`` the DOI is attributed to a generic CNKI publisher.

        :param doi: the DOI to look up.
        :return: ``{"name": 'CNKI Publisher (unspecified)', "prefix": ...}`` on
            a match, otherwise an empty dict (also when the connection fails,
            which is only logged).
        """
        publisher = dict()
        handle_req_url = "https://doi.org/api/handles/" + doi
        try:
            req = requests.get(url=handle_req_url)
        except requests.ConnectionError:
            print("failed to connect to doi for", doi)
            return publisher

        if req.status_code != 200:
            return publisher
        values = req.json().get('values')
        # An absent or empty "values" list simply means "no match".
        if not values or 'data' not in values[0]:
            return publisher
        target = values[0]['data']['value']
        # Non-string values cannot contain the CNKI host name.
        if isinstance(target, str) and 'www.cnki.net' in target:
            publisher["name"] = 'CNKI Publisher (unspecified)'
            publisher["prefix"] = doi.split('/')[0]
        return publisher

    def add_extra_publisher(self, publisher, agency):
        """Store a publisher found outside Crossref.

        :param publisher: dict with the keys ``"prefix"`` and ``"name"``.
        :param agency: label saved in the ``"from"`` field of the new entry.
        """
        self._prefix_to_data_dict[publisher["prefix"]] = {
            'name': publisher['name'],
            'from': agency
        }

    def search_for_publisher_in_other_agencies(self, doi):
        """Try DataCite, then mEDRA, then the doi.org handle API for ``doi``.

        The first service that yields a publisher name wins: its result is
        stored in the mapping and the remaining services are not queried.

        :param doi: the DOI to look up.
        :return: the (possibly updated) mapping.
        """
        lookups = (
            (self.search_in_datacite, 'datacite'),
            (self.search_in_medra, 'medra'),
            (self.search_for_cnki, 'doi'),
        )
        for lookup, agency in lookups:
            publisher = lookup(doi)
            if 'name' in publisher:
                self.add_extra_publisher(publisher, agency)
                break
        return self._prefix_to_data_dict

    def extract_publishers_v(self, doi_or_pref, enable_extraagencies=True, get_all_prefix_data=False, skip_update=False):
        """Return the publisher associated with a DOI or a DOI prefix.

        The prefix is extracted from the part of ``doi_or_pref`` that precedes
        the first ``/``. Unless ``skip_update`` is set, the prefix is first
        registered through :meth:`add_prefix_pub_data`. When the stored entry
        is marked as ``"not found"`` and ``enable_extraagencies`` is true, the
        other agencies are searched as well.

        :param doi_or_pref: a DOI or a bare DOI prefix.
        :param enable_extraagencies: set to ``False`` to rely only on the data
            already in the mapping and on Crossref.
        :param get_all_prefix_data: set to ``True`` to get the whole entry of
            the prefix instead of the publisher name only.
        :param skip_update: set to ``True`` to skip the Crossref registration
            step; the prefix must then already be in the mapping.
        :return: the publisher name, or the whole entry of the prefix when
            ``get_all_prefix_data`` is true.
        :raises IndexError: if no DOI prefix can be found in ``doi_or_pref``.
        :raises KeyError: if the prefix has no entry in the mapping (possible
            with ``skip_update=True``).
        """
        found = self._PREFIX_PATTERN.search(doi_or_pref.split('/')[0])
        if found is None:
            raise IndexError("no DOI prefix found in %r" % (doi_or_pref,))
        prefix = found.group(1)

        if not skip_update:
            self._prefix_to_data_dict = self.add_prefix_pub_data(prefix)

        if enable_extraagencies and self._prefix_to_data_dict[prefix]["from"] == "not found":
            self.search_for_publisher_in_other_agencies(doi_or_pref)

        # Read the entry only now: the search above may have replaced it.
        prefix_data = self._prefix_to_data_dict[prefix]
        return prefix_data if get_all_prefix_data else prefix_data["name"]