Coverage for oc_validator / table_reader.py: 100%

90 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-30 15:46 +0000

1# ISC License 

2# 

3# Copyright (c) 2023-2026, Elia Rizzetto, Silvio Peroni 

4# 

5# Permission to use, copy, modify, and/or distribute this software for any 

6# purpose with or without fee is hereby granted, provided that the above 

7# copyright notice and this permission notice appear in all copies. 

8# 

9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 

10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 

11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 

12# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 

13# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 

14# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 

15# PERFORMANCE OF THIS SOFTWARE. 

16 

17from typing import List, Optional, Dict 

18from re import finditer 

19 

20 

class AgentItem:
    """
    Represents a single agent (author, editor, or publisher) with a name and
    zero or more responsible-agent identifiers.

    Recognised identifier schemes are ``crossref``, ``orcid``, ``viaf``,
    ``wikidata``, ``ror`` and ``omid``.
    """

    # ``<scheme>:<value>`` tokens. The trailing lookahead demands whitespace or
    # a closing bracket right after the token, which keeps the final ``]`` out
    # of the last ID. A token that runs to the very end of the string (no
    # closing bracket, no trailing whitespace) is therefore NOT captured.
    _ID_PATTERN = r'((?:crossref|orcid|viaf|wikidata|ror|omid):\S+)(?=\s|\])'

    def __init__(self, raw: str) -> None:
        """
        Parse a raw agent string into name and IDs.

        :param raw: The raw agent string (e.g. ``"Smith, John [orcid:0000-0001]"``).
        :type raw: str
        :rtype: None
        """
        self._raw = raw
        self.name: str = ""
        self.ids: List[str] = []
        self._parse(raw)

    def _parse(self, raw: str) -> None:
        """
        Populate :attr:`name` and :attr:`ids` from the raw string.

        IDs are searched for in the whole string (not only between brackets);
        the name is whatever precedes the first ``[``, stripped of surrounding
        whitespace, or the whole stripped string when there is no bracket.

        :param raw: The raw agent string.
        :type raw: str
        :rtype: None
        """
        self.ids = [match.group() for match in finditer(self._ID_PATTERN, raw)]

        # partition() yields the full string as its head when '[' is absent,
        # so both the bracketed and the bracket-less case are covered.
        name_part = raw.partition('[')[0]
        self.name = name_part.strip()

    def to_dict(self) -> Dict:
        """
        Serialize the agent item to a dictionary.

        :return: Dictionary with ``"name"`` and ``"ids"`` keys.
        :rtype: Dict
        """
        return {"name": self.name, "ids": self.ids}

    def __repr__(self) -> str:
        """Return an unambiguous string representation of the agent item."""
        # ``!r`` escapes/quotes correctly even when the name itself contains
        # quote characters, which hand-written quotes would not.
        return f"AgentItem(name={self.name!r}, ids={self.ids})"

    def __str__(self) -> str:
        """Return the original raw string."""
        return self._raw

76 

77 

class VenueInfo:
    """
    Represents venue information with a name and zero or more bibliographic-resource
    identifiers.

    Recognised identifier schemes are ``doi``, ``issn``, ``isbn``, ``url``,
    ``wikidata``, ``wikipedia``, ``openalex``, ``omid``, ``jid``, ``arxiv``
    and ``pmid``.
    """

    # ``<scheme>:<value>`` tokens for venue ID schemes. The lookahead requires
    # whitespace or ``]`` directly after the token, so the closing bracket is
    # never part of an ID and a token ending exactly at end-of-string is not
    # captured.
    _ID_PATTERN = r'((?:doi|issn|isbn|url|wikidata|wikipedia|openalex|omid|jid|arxiv|pmid):\S+)(?=\s|\])'

    def __init__(self, raw: str) -> None:
        """
        Parse a raw venue string into name and IDs.

        :param raw: The raw venue string (e.g. ``"Nature [issn:1234-5678]"``).
        :type raw: str
        :rtype: None
        """
        self._raw = raw
        self.name: str = ""
        self.ids: List[str] = []
        self._parse(raw)

    def _parse(self, raw: str) -> None:
        """
        Populate :attr:`name` and :attr:`ids` from the raw venue string.

        IDs are searched for in the whole string; the name is the text before
        the first ``[`` (or the whole string if there is none), stripped of
        surrounding whitespace.

        :param raw: The raw venue string.
        :type raw: str
        :rtype: None
        """
        self.ids = [match.group() for match in finditer(self._ID_PATTERN, raw)]

        # With no '[' present, partition() returns the whole string as head.
        name_part = raw.partition('[')[0]
        self.name = name_part.strip()

    def to_dict(self) -> Dict:
        """
        Serialize the venue info to a dictionary.

        :return: Dictionary with ``"name"`` and ``"ids"`` keys.
        :rtype: Dict
        """
        return {"name": self.name, "ids": self.ids}

    def __repr__(self) -> str:
        """Return an unambiguous string representation of the venue info."""
        # ``!r`` keeps the output unambiguous for names containing quotes.
        return f"VenueInfo(name={self.name!r}, ids={self.ids})"

    def __str__(self) -> str:
        """Return the original raw string."""
        return self._raw

133 

134 

class MetadataRow:
    """
    Structured representation of a metadata (META-CSV) row.

    Each field is parsed into its appropriate type (lists of strings for IDs,
    :class:`AgentItem` lists for author/editor/publisher, etc.).
    """

    def __init__(self, raw_row: Dict[str, str]) -> None:
        """
        Parse a raw CSV row dictionary into a structured MetadataRow.

        Plain text fields (``title``, ``pub_date``, ``volume``, ``issue``,
        ``page``, ``type``) are stored exactly as found in ``raw_row`` and are
        ``None`` only when the key is missing. Agent and venue fields become
        ``None`` when missing or empty.

        :param raw_row: Dictionary mapping column names to raw string values.
        :type raw_row: Dict[str, str]
        :rtype: None
        """
        # Keep a private copy so later changes to the caller's dict do not
        # affect this row.
        self._raw = raw_row.copy()
        self.id: List[str] = self._parse_id_field(raw_row.get('id', ''))
        self.title: Optional[str] = raw_row.get('title')
        self.author: Optional[List[AgentItem]] = self._parse_agent_field(raw_row.get('author'))
        self.pub_date: Optional[str] = raw_row.get('pub_date')
        self.venue: Optional[VenueInfo] = self._parse_venue_field(raw_row.get('venue'))
        self.volume: Optional[str] = raw_row.get('volume')
        self.issue: Optional[str] = raw_row.get('issue')
        self.page: Optional[str] = raw_row.get('page')
        self.type: Optional[str] = raw_row.get('type')
        self.publisher: Optional[List[AgentItem]] = self._parse_agent_field(raw_row.get('publisher'))
        self.editor: Optional[List[AgentItem]] = self._parse_agent_field(raw_row.get('editor'))

    def _parse_id_field(self, value: str) -> List[str]:
        """
        Parse a space-separated ID field into a list of strings.

        The split is on a single space character, so consecutive, leading or
        trailing spaces produce empty-string items in the result.

        :param value: Raw space-separated ID string.
        :type value: str
        :return: List of individual ID strings, or an empty list if blank.
        :rtype: List[str]
        """
        return value.split(' ') if value else []

    def _parse_agent_field(self, value: Optional[str]) -> Optional[List[AgentItem]]:
        """
        Parse a semicolon-separated agent field into a list of AgentItem objects.

        Items are separated by the exact two-character sequence ``"; "``.

        :param value: Raw agent field string, or ``None`` if empty.
        :type value: Optional[str]
        :return: List of :class:`AgentItem` instances, or ``None`` if blank.
        :rtype: Optional[List[AgentItem]]
        """
        if not value:
            return None
        return [AgentItem(chunk) for chunk in value.split('; ')]

    def _parse_venue_field(self, value: Optional[str]) -> Optional[VenueInfo]:
        """
        Parse the venue field into a VenueInfo object.

        :param value: Raw venue string, or ``None`` if empty.
        :type value: Optional[str]
        :return: :class:`VenueInfo` instance, or ``None`` if blank.
        :rtype: Optional[VenueInfo]
        """
        return VenueInfo(value) if value else None

    def flat_serialise(self) -> Dict:
        """
        Serialise the row to a flat dictionary where every field value is a list of strings.

        Multi-item fields (IDs, agents) are represented as lists of their raw
        string forms; single-value fields are wrapped in a one-element list.
        Fields whose value is ``None`` are represented by an empty list.
        Every list in the result is freshly created, so mutating the returned
        dictionary never alters this row.

        :return: Dictionary mapping field names to lists of string items.
        :rtype: Dict
        """
        def wrap(single):
            # One-element list for a present value, empty list for None.
            return [] if single is None else [single]

        def raw_agents(agents):
            # str(agent) gives back the agent's original raw string.
            return [] if agents is None else [str(agent) for agent in agents]

        return {
            # Copy: handing out self.id itself would let callers mutate the row.
            "id": list(self.id),
            "title": wrap(self.title),
            "author": raw_agents(self.author),
            "pub_date": wrap(self.pub_date),
            "venue": [] if self.venue is None else [str(self.venue)],
            "volume": wrap(self.volume),
            "issue": wrap(self.issue),
            "page": wrap(self.page),
            "type": wrap(self.type),
            "publisher": raw_agents(self.publisher),
            "editor": raw_agents(self.editor),
        }

    def __repr__(self) -> str:
        """Return an unambiguous string representation of the metadata row."""
        return f"MetadataRow(id={self.id}, title={self.title})"

233 

234 

class CitationsRow:
    """
    Structured representation of a citations (CITS-CSV) row.

    Parses citing and cited ID fields and optional publication dates.
    """

    def __init__(self, raw_row: Dict[str, str]) -> None:
        """
        Parse a raw CSV row dictionary into a structured CitationsRow.

        The two publication-date fields are stored exactly as found in
        ``raw_row`` and are ``None`` only when the key is missing.

        :param raw_row: Dictionary mapping column names to raw string values.
        :type raw_row: Dict[str, str]
        :rtype: None
        """
        # Keep a private copy so later changes to the caller's dict do not
        # affect this row.
        self._raw = raw_row.copy()
        self.citing_id: List[str] = self._parse_id_field(raw_row.get('citing_id', ''))
        self.citing_publication_date: Optional[str] = raw_row.get('citing_publication_date')
        self.cited_id: List[str] = self._parse_id_field(raw_row.get('cited_id', ''))
        self.cited_publication_date: Optional[str] = raw_row.get('cited_publication_date')

    def _parse_id_field(self, value: str) -> List[str]:
        """
        Parse a space-separated ID field into a list of strings.

        The split is on a single space character, so consecutive, leading or
        trailing spaces produce empty-string items in the result.

        :param value: Raw space-separated ID string.
        :type value: str
        :return: List of individual ID strings, or an empty list if blank.
        :rtype: List[str]
        """
        return value.split(' ') if value else []

    def flat_serialise(self) -> Dict:
        """
        Serialise the row to a flat dictionary where every field value is a list of strings.

        Date fields are wrapped in a one-element list, or given as an empty
        list when ``None``. The ID lists are copies, so mutating the returned
        dictionary never alters this row.

        :return: Dictionary mapping field names to lists of string items.
        :rtype: Dict
        """
        def wrap(single):
            # One-element list for a present value, empty list for None.
            return [] if single is None else [single]

        return {
            # Copies: returning the attributes themselves would expose the
            # row's internal state to accidental mutation by callers.
            "citing_id": list(self.citing_id),
            "citing_publication_date": wrap(self.citing_publication_date),
            "cited_id": list(self.cited_id),
            "cited_publication_date": wrap(self.cited_publication_date),
        }

    def __repr__(self) -> str:
        """Return an unambiguous string representation of the citations row."""
        return f"CitationsRow(citing_id={self.citing_id}, cited_id={self.cited_id})"

287 

288 

def read_metadata_row(row_dict: Dict[str, str]) -> MetadataRow:
    """
    Build a :class:`MetadataRow` from one metadata CSV row.

    :param row_dict: Dictionary representing a single CSV row (from ``csv.DictReader``).
    :type row_dict: Dict[str, str]
    :return: The :class:`MetadataRow` constructed from ``row_dict``.
    :rtype: MetadataRow
    """
    # All parsing happens inside the MetadataRow constructor.
    parsed_row = MetadataRow(row_dict)
    return parsed_row

299 

300 

def read_citations_row(row_dict: Dict[str, str]) -> CitationsRow:
    """
    Build a :class:`CitationsRow` from one citations CSV row.

    :param row_dict: Dictionary representing a single CSV row (from ``csv.DictReader``).
    :type row_dict: Dict[str, str]
    :return: The :class:`CitationsRow` constructed from ``row_dict``.
    :rtype: CitationsRow
    """
    # All parsing happens inside the CitationsRow constructor.
    parsed_row = CitationsRow(row_dict)
    return parsed_row