Coverage for oc_ds_converter / medra / medra_processing.py: 86%

177 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2022-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5from datetime import datetime 

6from typing import List, Tuple 

7 

8from bs4 import BeautifulSoup 

9from oc_ds_converter.oc_idmanager import DOIManager, ORCIDManager 

10from oc_ds_converter.lib.csvmanager import CSVManager 

11 

12from oc_ds_converter.ra_processor import RaProcessor 

13 

14 

class MedraProcessing(RaProcessor):
    """Convert mEDRA ONIX XML records into OpenCitations metadata dictionaries.

    One ``extract_from_*`` method exists per bibliographic resource type
    (book, book chapter, journal article, series); ``csv_creator`` dispatches
    to the right one based on the record's root element.
    """

    def __init__(self, orcid_index: str | None = None):
        """Initialise the processor.

        :param orcid_index: optional path to a CSV of DOI-ORCID associations;
            forwarded to the base class and wrapped in a CSVManager here.
            NOTE(review): the assignment below overwrites whatever
            ``super().__init__`` stored in ``self.orcid_index`` — confirm
            this shadowing is intended.
        """
        super().__init__(orcid_index)
        self.orcid_index = CSVManager(orcid_index)
        self._om = ORCIDManager()  # shared normaliser for contributor ORCID iDs

20 

21 def csv_creator(self, xml_soup:BeautifulSoup) -> dict: 

22 try: 

23 br_type = self.get_br_type(xml_soup) 

24 except UnboundLocalError: 

25 print(xml_soup) 

26 raise(UnboundLocalError) 

27 metadata: dict = getattr(self, f"extract_from_{br_type.replace(' ', '_')}")(xml_soup) 

28 return self.normalise_unicode(metadata) 

29 

30 def extract_from_book(self, xml_soup:BeautifulSoup) -> dict: 

31 ids = [self.get_id(xml_soup)] 

32 ids.extend(self.get_isbn(xml_soup)) 

33 title = xml_soup.find('Title').find('TitleText').get_text() 

34 authors, editors = self.get_contributors(xml_soup) 

35 return { 

36 'id': ' '.join(ids), 

37 'title': title, 

38 'author': '; '.join(authors), 

39 'issue': '', 

40 'volume': '', 

41 'venue': '', 

42 'pub_date': self.get_pub_date(xml_soup), 

43 'pages': '', 

44 'type': 'book', 

45 'publisher': self.get_publisher(xml_soup), 

46 'editor': '; '.join(editors) 

47 } 

48 

49 def extract_from_book_chapter(self, xml_soup:BeautifulSoup) -> dict: 

50 venue_ids = self.get_isbn(xml_soup) 

51 monographic_work = xml_soup.find('MonographicWork') 

52 content_item = xml_soup.find('ContentItem') 

53 title_chapter = content_item.find('Title').find('TitleText').get_text() 

54 title_book = monographic_work.find('Title').find('TitleText').get_text() 

55 authors, editors = self.get_contributors(content_item) 

56 venue_name = title_book if title_book != title_chapter else '' 

57 return { 

58 'id': self.get_id(xml_soup), 

59 'title': title_chapter, 

60 'author': '; '.join(authors), 

61 'issue': '', 

62 'volume': '', 

63 'venue': self.build_venue_string(venue_name, venue_ids), 

64 'pub_date': self.get_pub_date(xml_soup), 

65 'pages': '', 

66 'type': 'book chapter', 

67 'publisher': self.get_publisher(xml_soup), 

68 'editor': '; '.join(editors) 

69 } 

70 

71 def extract_from_journal_article(self, xml_soup:BeautifulSoup) -> dict: 

72 serial_publication = xml_soup.find('SerialPublication') 

73 serial_work = serial_publication.find('SerialWork') 

74 publisher_name = self.get_publisher(serial_work) 

75 journal_issue = xml_soup.find('JournalIssue') 

76 volume = journal_issue.find('JournalVolumeNumber') 

77 volume = volume.get_text() if volume else '' 

78 issue = journal_issue.find('JournalIssueNumber') 

79 issue = issue.get_text() if issue else '' 

80 content_item = xml_soup.find('ContentItem') 

81 title = content_item.find('Title').find('TitleText').get_text() 

82 authors, editors = self.get_contributors(xml_soup) 

83 venue_name, venue_ids = self.get_venue(xml_soup) 

84 return { 

85 'id': self.get_id(xml_soup), 

86 'title': title, 

87 'author': '; '.join(authors), 

88 'issue': issue, 

89 'volume': volume, 

90 'venue': self.build_venue_string(venue_name, venue_ids), 

91 'pub_date': self.get_pub_date(content_item), 

92 'pages': self.get_pages(content_item), 

93 'type': 'journal article', 

94 'publisher': publisher_name, 

95 'editor': '; '.join(editors) 

96 } 

97 

98 def build_venue_string(self, venue_name, venue_ids): 

99 if venue_name and venue_ids: 

100 venue = f"{venue_name} [{' '.join(venue_ids)}]" 

101 elif venue_name and not venue_ids: 

102 venue = venue_name 

103 elif venue_ids and not venue_name: 

104 venue = f"[{' '.join(venue_ids)}]" 

105 elif not venue_ids and not venue_name: 

106 venue = '' 

107 return venue 

108 

109 def extract_from_series(self, xml_soup:BeautifulSoup) -> dict: 

110 venue_name, venue_ids = self.get_venue(xml_soup) 

111 ids = [self.get_id(xml_soup)] 

112 ids.extend(venue_ids) 

113 return { 

114 'id': ' '.join(ids), 

115 'title': venue_name, 

116 'author': '', 

117 'issue': '', 

118 'volume': '', 

119 'venue': '', 

120 'pub_date': self.get_pub_date(xml_soup), 

121 'pages': '', 

122 'type': 'series', 

123 'publisher': self.get_publisher(xml_soup.find('SerialPublication').find('SerialWork')), 

124 'editor': '' 

125 } 

126 

127 def get_id(self, context:BeautifulSoup) -> str: 

128 doi_manager = DOIManager(use_api_service=False) 

129 return doi_manager.normalise(context.find('DOI').get_text(), include_prefix=True) 

130 

131 def get_isbn(self, context:BeautifulSoup) -> str: 

132 product_identifiers: List[BeautifulSoup] = context.find_all('ProductIdentifier') 

133 isbn_list = list() 

134 if product_identifiers: 

135 for product_identifier in product_identifiers: 

136 if product_identifier.find('ProductIDType').get_text() in {'02', '15'}: 

137 self.isbn_worker(product_identifier.find('IDValue').get_text(), isbn_list) 

138 return isbn_list 

139 

    def get_contributors(self, context:BeautifulSoup) -> Tuple[list, list]:
        """Return the record's (authors, editors) as two lists of name strings.

        Each contributor is rendered in inverted form ("Key, Given"); when a
        <NameIdentifier> is present, its normalised ORCID is appended in
        square brackets. Within each role, names are ordered by
        <SequenceNumber> when present, otherwise by document order.

        :raises ValueError: when a contributor has no usable name element.
        """
        contributors: List[BeautifulSoup] = context.find_all('Contributor')
        authors = list(); editors = list()
        # Role codes starting with 'A' are authorship roles, 'B' editorial.
        # NOTE(review): any other leading letter would raise KeyError on the
        # append below — confirm such roles cannot occur in mEDRA records.
        contributor_roles = {'A': authors, 'B': editors}
        for i, contributor in enumerate(contributors):
            contributor_role = contributor.find('ContributorRole').get_text()[0]
            person_name_inverted = contributor.find('PersonNameInverted')
            corporate_name = contributor.find('CorporateName')
            person_name = contributor.find('PersonName')
            names_before_key = contributor.find('NamesBeforeKey')
            key_names = contributor.find('KeyNames')
            unnamed_persons = contributor.find('UnnamedPersons')
            # Prefer the ready-made inverted form; otherwise rebuild
            # "Key, Given" from its parts; fall back to corporate or plain
            # person names; skip anonymous contributors entirely.
            if person_name_inverted:
                author = person_name_inverted.get_text()
            elif names_before_key and key_names:
                author = f'{key_names.get_text()}, {names_before_key.get_text()}'
            elif key_names and not names_before_key:
                # Trailing comma keeps the format consistent with the
                # inverted-name branches.
                author = f'{key_names.get_text()},'
            elif corporate_name:
                author = corporate_name.get_text()
            elif person_name:
                author = person_name.get_text()
            elif unnamed_persons:
                continue
            else:
                raise(ValueError('No author name'))
            is_there_name_id = contributor.find('NameIdentifier')
            sequence_number = contributor.find('SequenceNumber')
            # Fall back to the enumeration index when no explicit sequence is
            # given. NOTE(review): mixing explicit sequence numbers with
            # positional indices could interleave unexpectedly — verify
            # records are internally consistent.
            sequence_number = int(sequence_number.get_text()) if sequence_number else i
            if is_there_name_id:
                name_id = self._om.normalise(is_there_name_id.find('IDValue').get_text(), include_prefix=True)
                author += f' [{name_id}]'
            contributor_roles[contributor_role].append((sequence_number, author))
        # Sort each role's (sequence, name) pairs, then drop the sequences.
        contributor_roles = {k:[ra[1] for ra in sorted(v, key=lambda x:x[0])] for k,v in contributor_roles.items()}
        return contributor_roles['A'], contributor_roles['B']

175 

176 def get_pub_date(self, context:BeautifulSoup) -> str: 

177 raw_date = context.find('PublicationDate') 

178 if not raw_date: 

179 raw_date = context.find('Date') 

180 if not raw_date: 

181 return '' 

182 raw_date = raw_date.get_text() 

183 try: 

184 clean_date = datetime.strptime(raw_date, '%Y%m%d').strftime('%Y-%m-%d') 

185 except ValueError: 

186 try: 

187 clean_date = datetime.strptime(raw_date, '%Y%m').strftime('%Y-%m') 

188 except ValueError: 

189 clean_date = datetime.strptime(raw_date, '%Y').strftime('%Y') 

190 return clean_date 

191 

192 def get_pages(self, context:BeautifulSoup) -> str: 

193 page_run = context.find('PageRun') 

194 if page_run: 

195 starting_page = page_run.find('FirstPageNumber') 

196 ending_page = page_run.find('LastPageNumber') 

197 starting_page = starting_page.get_text() if starting_page else None 

198 ending_page = ending_page.get_text() if ending_page else None 

199 if starting_page and '-' in starting_page: 

200 starting_page = f'"{starting_page}"' 

201 if ending_page and '-' in ending_page: 

202 ending_page = f'"{ending_page}"' 

203 pages = f'{starting_page}-{ending_page}' if starting_page and ending_page else starting_page if starting_page else '' 

204 else: 

205 pages = '' 

206 return pages 

207 

208 def get_publisher(self, context:BeautifulSoup) -> str: 

209 publisher = context.find('Publisher') 

210 if publisher: 

211 publisher_name = publisher.find('PublisherName') 

212 if publisher: 

213 return publisher_name.get_text() 

214 return '' 

215 

216 def get_venue(self, context:BeautifulSoup) -> str: 

217 serial_publication = context.find('SerialPublication') 

218 serial_work = serial_publication.find('SerialWork') 

219 serial_work_titles: List[BeautifulSoup] = serial_work.find_all('Title') 

220 venue_name = None 

221 for serial_work_title in serial_work_titles: 

222 if serial_work_title.find('TitleType').get_text() == '01': 

223 venue_name = serial_work_title.find('TitleText').get_text() 

224 elif serial_work_title.find('TitleType').get_text() == '05': 

225 venue_name = serial_work_title.find('TitleText').get_text() 

226 serial_versions: List[BeautifulSoup] = serial_publication.find_all('SerialVersion') 

227 venue_ids = list() 

228 for serial_version in serial_versions: 

229 product_id_type = serial_version.find('ProductIDType') 

230 if serial_version.find('ProductForm').get_text() in {'JD', 'JB'} and product_id_type: 

231 if product_id_type.get_text() == '07': 

232 id_value = serial_version.find('IDValue').get_text() 

233 id_value = id_value + 'X' if len(id_value) == 7 else id_value 

234 self.issn_worker(id_value, venue_ids) 

235 return venue_name, venue_ids 

236 

237 @classmethod 

238 def get_br_type(cls, xml_soup:BeautifulSoup) -> str: 

239 if xml_soup.find('DOIMonographicProduct') or xml_soup.find('DOIMonographicWork'): 

240 br_type = 'book' 

241 elif xml_soup.find('DOIMonographChapterWork') or xml_soup.find('DOIMonographChapterVersion'): 

242 br_type = 'book chapter' 

243 elif xml_soup.find('DOISerialArticleWork') or xml_soup.find('DOISerialArticleVersion'): 

244 br_type = 'journal article' 

245 elif xml_soup.find('DOISerialIssueWork'): 

246 br_type = 'journal issue' 

247 elif xml_soup.find('DOISerialTitleWork'): 

248 br_type = 'series' 

249 return br_type