Coverage for oc_ds_converter / medra / medra_processing.py: 86%
177 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
1# SPDX-FileCopyrightText: 2022-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
2#
3# SPDX-License-Identifier: ISC
5from datetime import datetime
6from typing import List, Tuple
8from bs4 import BeautifulSoup
9from oc_ds_converter.oc_idmanager import DOIManager, ORCIDManager
10from oc_ds_converter.lib.csvmanager import CSVManager
12from oc_ds_converter.ra_processor import RaProcessor
class MedraProcessing(RaProcessor):
    """Convert mEDRA ONIX/DOI XML records into OpenCitations-style CSV metadata rows."""

    def __init__(self, orcid_index: str | None = None):
        # orcid_index: path to a CSV index used to look up ORCID iDs; may be None.
        super().__init__(orcid_index)
        # NOTE(review): the same path is both passed to the base class and
        # re-wrapped in a CSVManager here — confirm both are actually needed.
        self.orcid_index = CSVManager(orcid_index)
        # Normaliser for ORCID identifiers found on <NameIdentifier> elements.
        self._om = ORCIDManager()
21 def csv_creator(self, xml_soup:BeautifulSoup) -> dict:
22 try:
23 br_type = self.get_br_type(xml_soup)
24 except UnboundLocalError:
25 print(xml_soup)
26 raise(UnboundLocalError)
27 metadata: dict = getattr(self, f"extract_from_{br_type.replace(' ', '_')}")(xml_soup)
28 return self.normalise_unicode(metadata)
30 def extract_from_book(self, xml_soup:BeautifulSoup) -> dict:
31 ids = [self.get_id(xml_soup)]
32 ids.extend(self.get_isbn(xml_soup))
33 title = xml_soup.find('Title').find('TitleText').get_text()
34 authors, editors = self.get_contributors(xml_soup)
35 return {
36 'id': ' '.join(ids),
37 'title': title,
38 'author': '; '.join(authors),
39 'issue': '',
40 'volume': '',
41 'venue': '',
42 'pub_date': self.get_pub_date(xml_soup),
43 'pages': '',
44 'type': 'book',
45 'publisher': self.get_publisher(xml_soup),
46 'editor': '; '.join(editors)
47 }
49 def extract_from_book_chapter(self, xml_soup:BeautifulSoup) -> dict:
50 venue_ids = self.get_isbn(xml_soup)
51 monographic_work = xml_soup.find('MonographicWork')
52 content_item = xml_soup.find('ContentItem')
53 title_chapter = content_item.find('Title').find('TitleText').get_text()
54 title_book = monographic_work.find('Title').find('TitleText').get_text()
55 authors, editors = self.get_contributors(content_item)
56 venue_name = title_book if title_book != title_chapter else ''
57 return {
58 'id': self.get_id(xml_soup),
59 'title': title_chapter,
60 'author': '; '.join(authors),
61 'issue': '',
62 'volume': '',
63 'venue': self.build_venue_string(venue_name, venue_ids),
64 'pub_date': self.get_pub_date(xml_soup),
65 'pages': '',
66 'type': 'book chapter',
67 'publisher': self.get_publisher(xml_soup),
68 'editor': '; '.join(editors)
69 }
71 def extract_from_journal_article(self, xml_soup:BeautifulSoup) -> dict:
72 serial_publication = xml_soup.find('SerialPublication')
73 serial_work = serial_publication.find('SerialWork')
74 publisher_name = self.get_publisher(serial_work)
75 journal_issue = xml_soup.find('JournalIssue')
76 volume = journal_issue.find('JournalVolumeNumber')
77 volume = volume.get_text() if volume else ''
78 issue = journal_issue.find('JournalIssueNumber')
79 issue = issue.get_text() if issue else ''
80 content_item = xml_soup.find('ContentItem')
81 title = content_item.find('Title').find('TitleText').get_text()
82 authors, editors = self.get_contributors(xml_soup)
83 venue_name, venue_ids = self.get_venue(xml_soup)
84 return {
85 'id': self.get_id(xml_soup),
86 'title': title,
87 'author': '; '.join(authors),
88 'issue': issue,
89 'volume': volume,
90 'venue': self.build_venue_string(venue_name, venue_ids),
91 'pub_date': self.get_pub_date(content_item),
92 'pages': self.get_pages(content_item),
93 'type': 'journal article',
94 'publisher': publisher_name,
95 'editor': '; '.join(editors)
96 }
98 def build_venue_string(self, venue_name, venue_ids):
99 if venue_name and venue_ids:
100 venue = f"{venue_name} [{' '.join(venue_ids)}]"
101 elif venue_name and not venue_ids:
102 venue = venue_name
103 elif venue_ids and not venue_name:
104 venue = f"[{' '.join(venue_ids)}]"
105 elif not venue_ids and not venue_name:
106 venue = ''
107 return venue
109 def extract_from_series(self, xml_soup:BeautifulSoup) -> dict:
110 venue_name, venue_ids = self.get_venue(xml_soup)
111 ids = [self.get_id(xml_soup)]
112 ids.extend(venue_ids)
113 return {
114 'id': ' '.join(ids),
115 'title': venue_name,
116 'author': '',
117 'issue': '',
118 'volume': '',
119 'venue': '',
120 'pub_date': self.get_pub_date(xml_soup),
121 'pages': '',
122 'type': 'series',
123 'publisher': self.get_publisher(xml_soup.find('SerialPublication').find('SerialWork')),
124 'editor': ''
125 }
127 def get_id(self, context:BeautifulSoup) -> str:
128 doi_manager = DOIManager(use_api_service=False)
129 return doi_manager.normalise(context.find('DOI').get_text(), include_prefix=True)
131 def get_isbn(self, context:BeautifulSoup) -> str:
132 product_identifiers: List[BeautifulSoup] = context.find_all('ProductIdentifier')
133 isbn_list = list()
134 if product_identifiers:
135 for product_identifier in product_identifiers:
136 if product_identifier.find('ProductIDType').get_text() in {'02', '15'}:
137 self.isbn_worker(product_identifier.find('IDValue').get_text(), isbn_list)
138 return isbn_list
    def get_contributors(self, context: BeautifulSoup) -> Tuple[list, list]:
        """Extract (authors, editors) display names from <Contributor> elements.

        Names are assembled from whichever name fields are present, optionally
        suffixed with a normalised ORCID in square brackets, and ordered by
        <SequenceNumber> (falling back to document order).

        Raises:
            ValueError: when a contributor has no usable name field at all.
        """
        contributors: List[BeautifulSoup] = context.find_all('Contributor')
        authors = list(); editors = list()
        # First letter of ContributorRole: 'A' = author, 'B' = editor.
        # NOTE(review): any other role letter raises KeyError below — confirm
        # mEDRA records only ever use A*/B* roles here.
        contributor_roles = {'A': authors, 'B': editors}
        for i, contributor in enumerate(contributors):
            contributor_role = contributor.find('ContributorRole').get_text()[0]
            person_name_inverted = contributor.find('PersonNameInverted')
            corporate_name = contributor.find('CorporateName')
            person_name = contributor.find('PersonName')
            names_before_key = contributor.find('NamesBeforeKey')
            key_names = contributor.find('KeyNames')
            unnamed_persons = contributor.find('UnnamedPersons')
            # Preference order: inverted form, key+given names, key names only
            # (trailing comma kept for "surname," style), corporate, plain.
            if person_name_inverted:
                author = person_name_inverted.get_text()
            elif names_before_key and key_names:
                author = f'{key_names.get_text()}, {names_before_key.get_text()}'
            elif key_names and not names_before_key:
                author = f'{key_names.get_text()},'
            elif corporate_name:
                author = corporate_name.get_text()
            elif person_name:
                author = person_name.get_text()
            elif unnamed_persons:
                # Anonymous contributor: skip silently.
                continue
            else:
                raise(ValueError('No author name'))
            is_there_name_id = contributor.find('NameIdentifier')
            sequence_number = contributor.find('SequenceNumber')
            # Fall back to enumeration index when no explicit sequence is given.
            sequence_number = int(sequence_number.get_text()) if sequence_number else i
            if is_there_name_id:
                # Append the normalised ORCID, e.g. "Doe, J. [orcid:...]".
                name_id = self._om.normalise(is_there_name_id.find('IDValue').get_text(), include_prefix=True)
                author += f' [{name_id}]'
            contributor_roles[contributor_role].append((sequence_number, author))
        # Sort each role's (sequence, name) pairs and keep only the names.
        contributor_roles = {k:[ra[1] for ra in sorted(v, key=lambda x:x[0])] for k,v in contributor_roles.items()}
        return contributor_roles['A'], contributor_roles['B']
176 def get_pub_date(self, context:BeautifulSoup) -> str:
177 raw_date = context.find('PublicationDate')
178 if not raw_date:
179 raw_date = context.find('Date')
180 if not raw_date:
181 return ''
182 raw_date = raw_date.get_text()
183 try:
184 clean_date = datetime.strptime(raw_date, '%Y%m%d').strftime('%Y-%m-%d')
185 except ValueError:
186 try:
187 clean_date = datetime.strptime(raw_date, '%Y%m').strftime('%Y-%m')
188 except ValueError:
189 clean_date = datetime.strptime(raw_date, '%Y').strftime('%Y')
190 return clean_date
192 def get_pages(self, context:BeautifulSoup) -> str:
193 page_run = context.find('PageRun')
194 if page_run:
195 starting_page = page_run.find('FirstPageNumber')
196 ending_page = page_run.find('LastPageNumber')
197 starting_page = starting_page.get_text() if starting_page else None
198 ending_page = ending_page.get_text() if ending_page else None
199 if starting_page and '-' in starting_page:
200 starting_page = f'"{starting_page}"'
201 if ending_page and '-' in ending_page:
202 ending_page = f'"{ending_page}"'
203 pages = f'{starting_page}-{ending_page}' if starting_page and ending_page else starting_page if starting_page else ''
204 else:
205 pages = ''
206 return pages
208 def get_publisher(self, context:BeautifulSoup) -> str:
209 publisher = context.find('Publisher')
210 if publisher:
211 publisher_name = publisher.find('PublisherName')
212 if publisher:
213 return publisher_name.get_text()
214 return ''
216 def get_venue(self, context:BeautifulSoup) -> str:
217 serial_publication = context.find('SerialPublication')
218 serial_work = serial_publication.find('SerialWork')
219 serial_work_titles: List[BeautifulSoup] = serial_work.find_all('Title')
220 venue_name = None
221 for serial_work_title in serial_work_titles:
222 if serial_work_title.find('TitleType').get_text() == '01':
223 venue_name = serial_work_title.find('TitleText').get_text()
224 elif serial_work_title.find('TitleType').get_text() == '05':
225 venue_name = serial_work_title.find('TitleText').get_text()
226 serial_versions: List[BeautifulSoup] = serial_publication.find_all('SerialVersion')
227 venue_ids = list()
228 for serial_version in serial_versions:
229 product_id_type = serial_version.find('ProductIDType')
230 if serial_version.find('ProductForm').get_text() in {'JD', 'JB'} and product_id_type:
231 if product_id_type.get_text() == '07':
232 id_value = serial_version.find('IDValue').get_text()
233 id_value = id_value + 'X' if len(id_value) == 7 else id_value
234 self.issn_worker(id_value, venue_ids)
235 return venue_name, venue_ids
237 @classmethod
238 def get_br_type(cls, xml_soup:BeautifulSoup) -> str:
239 if xml_soup.find('DOIMonographicProduct') or xml_soup.find('DOIMonographicWork'):
240 br_type = 'book'
241 elif xml_soup.find('DOIMonographChapterWork') or xml_soup.find('DOIMonographChapterVersion'):
242 br_type = 'book chapter'
243 elif xml_soup.find('DOISerialArticleWork') or xml_soup.find('DOISerialArticleVersion'):
244 br_type = 'journal article'
245 elif xml_soup.find('DOISerialIssueWork'):
246 br_type = 'journal issue'
247 elif xml_soup.find('DOISerialTitleWork'):
248 br_type = 'series'
249 return br_type