Coverage for oc_validator / table_reader.py: 100%
90 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 15:46 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 15:46 +0000
1# ISC License
2#
3# Copyright (c) 2023-2026, Elia Rizzetto, Silvio Peroni
4#
5# Permission to use, copy, modify, and/or distribute this software for any
6# purpose with or without fee is hereby granted, provided that the above
7# copyright notice and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
12# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
14# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15# PERFORMANCE OF THIS SOFTWARE.
17from typing import List, Optional, Dict
18from re import finditer
class AgentItem:
    """
    Represents a single agent (author, editor, or publisher) with a name and
    zero or more responsible-agent identifiers.

    The name is whatever precedes the first ``'['`` in the raw string; the
    identifiers are every ``scheme:value`` token (for the schemes listed in
    :attr:`_ID_PATTERN`) that is immediately followed by whitespace or ``']'``.
    """

    # Scheme-prefixed, whitespace-free token that must be followed by
    # whitespace or a closing bracket. The lookahead keeps the trailing ']'
    # out of the captured identifier.
    _ID_PATTERN = r'((?:crossref|orcid|viaf|wikidata|ror|omid):\S+)(?=\s|\])'

    def __init__(self, raw: str) -> None:
        """
        Parse a raw agent string into name and IDs.

        :param raw: The raw agent string (e.g. ``"Smith, John [orcid:0000-0001]"``).
        :type raw: str
        :rtype: None
        """
        self._raw = raw
        self.name: str = ""
        self.ids: List[str] = []
        self._parse(raw)

    def _parse(self, raw: str) -> None:
        """
        Extract name and IDs from the raw string and store them on the instance.

        :param raw: The raw agent string.
        :type raw: str
        :rtype: None
        """
        # The pattern is searched over the whole raw string, not only the
        # bracketed part.
        self.ids = [match.group() for match in finditer(self._ID_PATTERN, raw)]

        # Everything before the first '[' is the name; without a bracket,
        # partition() returns the whole string as the head.
        self.name = raw.partition('[')[0].strip()

    def to_dict(self) -> Dict:
        """
        Serialize the agent item to a dictionary.

        :return: Dictionary with ``"name"`` and ``"ids"`` keys.
        :rtype: Dict
        """
        return {
            "name": self.name,
            "ids": self.ids
        }

    def __repr__(self) -> str:
        """Return an unambiguous string representation of the agent item."""
        # ``!r`` lets Python pick the quoting, so names containing quotes or
        # backslashes (e.g. "O'Brien") stay unambiguous.
        return f"AgentItem(name={self.name!r}, ids={self.ids!r})"

    def __str__(self) -> str:
        """Return the original raw string."""
        return self._raw
class VenueInfo:
    """
    Represents venue information with a name and zero or more bibliographic-resource
    identifiers.

    The name is whatever precedes the first ``'['`` in the raw string; the
    identifiers are every ``scheme:value`` token (for the schemes listed in
    :attr:`_ID_PATTERN`) that is immediately followed by whitespace or ``']'``.
    """

    # Scheme-prefixed, whitespace-free token that must be followed by
    # whitespace or a closing bracket. The lookahead keeps the trailing ']'
    # out of the captured identifier.
    _ID_PATTERN = r'((?:doi|issn|isbn|url|wikidata|wikipedia|openalex|omid|jid|arxiv|pmid):\S+)(?=\s|\])'

    def __init__(self, raw: str) -> None:
        """
        Parse a raw venue string into name and IDs.

        :param raw: The raw venue string (e.g. ``"Nature [issn:1234-5678]"``).
        :type raw: str
        :rtype: None
        """
        self._raw = raw
        self.name: str = ""
        self.ids: List[str] = []
        self._parse(raw)

    def _parse(self, raw: str) -> None:
        """
        Extract name and IDs from the raw venue string and store them on the instance.

        :param raw: The raw venue string.
        :type raw: str
        :rtype: None
        """
        # The pattern is searched over the whole raw string, not only the
        # bracketed part.
        self.ids = [match.group() for match in finditer(self._ID_PATTERN, raw)]

        # Everything before the first '[' is the name; without a bracket,
        # partition() returns the whole string as the head.
        self.name = raw.partition('[')[0].strip()

    def to_dict(self) -> Dict:
        """
        Serialize the venue info to a dictionary.

        :return: Dictionary with ``"name"`` and ``"ids"`` keys.
        :rtype: Dict
        """
        return {
            "name": self.name,
            "ids": self.ids
        }

    def __repr__(self) -> str:
        """Return an unambiguous string representation of the venue info."""
        # ``!r`` lets Python pick the quoting, so venue names containing
        # quotes or backslashes (e.g. "Children's Literature") stay unambiguous.
        return f"VenueInfo(name={self.name!r}, ids={self.ids!r})"

    def __str__(self) -> str:
        """Return the original raw string."""
        return self._raw
class MetadataRow:
    """
    Structured representation of a metadata (META-CSV) row.

    Each field is parsed into its appropriate type (lists of strings for IDs,
    :class:`AgentItem` lists for author/editor/publisher, etc.).
    """

    def __init__(self, raw_row: Dict[str, str]) -> None:
        """
        Parse a raw CSV row dictionary into a structured MetadataRow.

        Missing columns yield ``None`` (or an empty list for ``id``). A shallow
        copy of the input dictionary is kept in ``_raw``.

        :param raw_row: Dictionary mapping column names to raw string values.
        :type raw_row: Dict[str, str]
        :rtype: None
        """
        self._raw = raw_row.copy()
        self.id: List[str] = self._parse_id_field(raw_row.get('id', ''))
        self.title: Optional[str] = raw_row.get('title')
        self.author: Optional[List[AgentItem]] = self._parse_agent_field(raw_row.get('author'))
        self.pub_date: Optional[str] = raw_row.get('pub_date')
        self.venue: Optional[VenueInfo] = self._parse_venue_field(raw_row.get('venue'))
        self.volume: Optional[str] = raw_row.get('volume')
        self.issue: Optional[str] = raw_row.get('issue')
        self.page: Optional[str] = raw_row.get('page')
        self.type: Optional[str] = raw_row.get('type')
        self.publisher: Optional[List[AgentItem]] = self._parse_agent_field(raw_row.get('publisher'))
        self.editor: Optional[List[AgentItem]] = self._parse_agent_field(raw_row.get('editor'))

    def _parse_id_field(self, value: str) -> List[str]:
        """
        Parse a space-separated ID field into a list of strings.

        The value is split on single space characters, so consecutive spaces
        produce empty-string items in the result.

        :param value: Raw space-separated ID string.
        :type value: str
        :return: List of individual ID strings, or an empty list if blank.
        :rtype: List[str]
        """
        if not value:
            return []
        return value.split(' ')

    def _parse_agent_field(self, value: Optional[str]) -> "Optional[List[AgentItem]]":
        """
        Parse a semicolon-separated agent field into a list of AgentItem objects.

        Items are separated by the exact two-character sequence ``"; "``.

        :param value: Raw agent field string, or ``None`` if empty.
        :type value: Optional[str]
        :return: List of :class:`AgentItem` instances, or ``None`` if blank.
        :rtype: Optional[List[AgentItem]]
        """
        if not value:
            return None
        return [AgentItem(item) for item in value.split('; ')]

    def _parse_venue_field(self, value: Optional[str]) -> "Optional[VenueInfo]":
        """
        Parse the venue field into a VenueInfo object.

        :param value: Raw venue string, or ``None`` if empty.
        :type value: Optional[str]
        :return: :class:`VenueInfo` instance, or ``None`` if blank.
        :rtype: Optional[VenueInfo]
        """
        if not value:
            return None
        return VenueInfo(value)

    @staticmethod
    def _wrap(value: Optional[str]) -> List[str]:
        """Return ``[value]``, or an empty list when *value* is ``None``."""
        return [] if value is None else [value]

    @staticmethod
    def _raw_strings(items) -> List[str]:
        """Return ``str()`` of every item, or an empty list when *items* is ``None``."""
        return [] if items is None else [str(item) for item in items]

    def flat_serialise(self) -> Dict:
        """
        Serialise the row to a flat dictionary where every field value is a list of strings.

        Multi-item fields (IDs, agents) are represented as lists of their raw
        string forms; single-value fields are wrapped in a one-element list.
        Every list in the result is newly created, so mutating the returned
        dictionary does not affect this row.

        :return: Dictionary mapping field names to lists of string items.
        :rtype: Dict
        """
        return {
            # Copy: handing out self.id itself would let callers mutate the row.
            "id": list(self.id),
            "title": self._wrap(self.title),
            "author": self._raw_strings(self.author),
            "pub_date": self._wrap(self.pub_date),
            "venue": [str(self.venue)] if self.venue is not None else [],
            "volume": self._wrap(self.volume),
            "issue": self._wrap(self.issue),
            "page": self._wrap(self.page),
            "type": self._wrap(self.type),
            "publisher": self._raw_strings(self.publisher),
            "editor": self._raw_strings(self.editor)
        }

    def __repr__(self) -> str:
        """Return a short string representation showing the row's IDs and title."""
        return f"MetadataRow(id={self.id}, title={self.title})"
class CitationsRow:
    """
    Structured representation of a citations (CITS-CSV) row.

    Parses citing and cited ID fields and optional publication dates.
    """

    def __init__(self, raw_row: Dict[str, str]) -> None:
        """
        Parse a raw CSV row dictionary into a structured CitationsRow.

        Missing ID columns yield an empty list and missing date columns yield
        ``None``. A shallow copy of the input dictionary is kept in ``_raw``.

        :param raw_row: Dictionary mapping column names to raw string values.
        :type raw_row: Dict[str, str]
        :rtype: None
        """
        self._raw = raw_row.copy()
        self.citing_id: List[str] = self._parse_id_field(raw_row.get('citing_id', ''))
        self.citing_publication_date: Optional[str] = raw_row.get('citing_publication_date')
        self.cited_id: List[str] = self._parse_id_field(raw_row.get('cited_id', ''))
        self.cited_publication_date: Optional[str] = raw_row.get('cited_publication_date')

    def _parse_id_field(self, value: str) -> List[str]:
        """
        Parse a space-separated ID field into a list of strings.

        The value is split on single space characters, so consecutive spaces
        produce empty-string items in the result.

        :param value: Raw space-separated ID string.
        :type value: str
        :return: List of individual ID strings, or an empty list if blank.
        :rtype: List[str]
        """
        if not value:
            return []
        return value.split(' ')

    @staticmethod
    def _wrap(value: Optional[str]) -> List[str]:
        """Return ``[value]``, or an empty list when *value* is ``None``."""
        return [] if value is None else [value]

    def flat_serialise(self) -> Dict:
        """
        Serialise the row to a flat dictionary where every field value is a list of strings.

        Every list in the result is newly created, so mutating the returned
        dictionary does not affect this row.

        :return: Dictionary mapping field names to lists of string items.
        :rtype: Dict
        """
        return {
            # Copies: handing out the row's own lists would let callers mutate it.
            "citing_id": list(self.citing_id),
            "citing_publication_date": self._wrap(self.citing_publication_date),
            "cited_id": list(self.cited_id),
            "cited_publication_date": self._wrap(self.cited_publication_date)
        }

    def __repr__(self) -> str:
        """Return a short string representation showing the citing and cited IDs."""
        return f"CitationsRow(citing_id={self.citing_id}, cited_id={self.cited_id})"
def read_metadata_row(row_dict: Dict[str, str]) -> MetadataRow:
    """
    Build a structured :class:`MetadataRow` from one metadata CSV row.

    :param row_dict: Dictionary representing a single CSV row (from ``csv.DictReader``).
    :type row_dict: Dict[str, str]
    :return: The :class:`MetadataRow` constructed from ``row_dict``.
    :rtype: MetadataRow
    """
    # All parsing happens in the MetadataRow constructor.
    parsed_row = MetadataRow(row_dict)
    return parsed_row
def read_citations_row(row_dict: Dict[str, str]) -> CitationsRow:
    """
    Build a structured :class:`CitationsRow` from one citations CSV row.

    :param row_dict: Dictionary representing a single CSV row (from ``csv.DictReader``).
    :type row_dict: Dict[str, str]
    :return: The :class:`CitationsRow` constructed from ``row_dict``.
    :rtype: CitationsRow
    """
    # All parsing happens in the CitationsRow constructor.
    parsed_row = CitationsRow(row_dict)
    return parsed_row