Coverage for oc_ds_converter / jalc / jalc_processing.py: 94%

171 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-12 21:23 +0000

1# SPDX-FileCopyrightText: 2022-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it> 

3# 

4# SPDX-License-Identifier: ISC 

5 

6from __future__ import annotations 

7 

8import html 

9import re 

10 

11from oc_ds_converter.lib.crossref_style_processing import CrossrefStyleProcessing 

12from oc_ds_converter.oc_idmanager.jid import JIDManager 

13from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager 

14 

15 

16# Inline lang prefixes observed in malformed JaLC entries (e.g. 

17# "金大考古\nen: The Archaeological Journal of Kanazawa University"): 

18# the non-prefixed line is JaLC's primary language (ja), prefixed lines 

19# declare their own language. A full scan of the JaLC dump (9.85M records) 

20# shows only 'en:' appears as an inline prefix in packed-multilang entries; 

21# all other languages (de, ru, fr, ...) occur exclusively as explicit 

22# ``lang`` tags on well-formed entries and are handled by ``get_ja``. 

23_INLINE_LANGS = ('ja', 'en') 

24_LANG_PREFIX_RE = re.compile( 

25 rf'^\s*(?P<lang>{"|".join(_INLINE_LANGS)})\s*:\s*(?P<text>.+?)\s*$' 

26) 

27_WHITESPACE_RE = re.compile(r'\s+') 

28 

29 

30def _expand_multilang_entries(entries: list[dict], text_key: str) -> list[dict]: 

31 """Split lang-less entries whose text packs multiple languages separated 

32 by newlines (``"<ja>\\n en: <text>"``) into one entry per language. The 

33 non-prefixed line is tagged as Japanese (JaLC's primary language). 

34 Entries that already declare ``lang``, or that do not match the 

35 packed-lang pattern, are returned unchanged. Sibling keys (such as 

36 ``type``) are preserved on every derived entry. 

37 """ 

38 out: list[dict] = [] 

39 for entry in entries: 

40 if 'lang' in entry or text_key not in entry: 

41 out.append(entry) 

42 continue 

43 raw = entry[text_key] 

44 if not isinstance(raw, str): 

45 out.append(entry) 

46 continue 

47 # Resolve HTML entities up-front: JaLC sources sometimes encode 

48 # newlines as ``&#10;``, which would otherwise hide the packed-lang 

49 # structure from the \n split below. 

50 text = html.unescape(raw) 

51 if '\n' not in text: 

52 out.append(entry) 

53 continue 

54 lines = [stripped for stripped in (s.strip() for s in text.splitlines()) if stripped] 

55 if len(lines) < 2: 

56 out.append(entry) 

57 continue 

58 split: list[dict] = [] 

59 for index, line in enumerate(lines): 

60 match = _LANG_PREFIX_RE.match(line) 

61 if match: 

62 split.append({'lang': match['lang'], 'text': _WHITESPACE_RE.sub(' ', match['text']).strip()}) 

63 elif index == 0: 

64 split.append({'lang': 'ja', 'text': _WHITESPACE_RE.sub(' ', line).strip()}) 

65 else: 

66 split = [] 

67 break 

68 if not split: 

69 out.append(entry) 

70 continue 

71 extras = {k: v for k, v in entry.items() if k != text_key} 

72 for piece in split: 

73 out.append({**extras, text_key: piece['text'], 'lang': piece['lang']}) 

74 return out 

75 

76 

77class JalcProcessing(CrossrefStyleProcessing): 

78 # Publisher prefix mapping is disabled for JALC. Unlike Crossref, which provides 

79 # a /members API endpoint with authoritative publisher names and their DOI prefixes, 

80 # JALC has no equivalent endpoint. The JALC /prefixes API only returns prefix, ra, 

81 # siteId, and updated_date - no publisher names. Since 99.8% of JALC prefixes are 

82 # not in the Crossref mapping anyway (JALC is a separate DOI registration agency), 

83 # we use publisher names directly from the source data's publisher_list field. 

84 

def __init__(
    self,
    orcid_index: str | None = None,
    storage_manager: StorageManager | None = None,
    testing: bool = True,
    citing: bool = True,
    exclude_existing: bool = False,
    use_redis_orcid_index: bool = False,
):
    """Set up the JaLC processor and register its JID managers.

    All arguments except ``exclude_existing`` are forwarded to the base
    class initialiser; ``exclude_existing`` is stored on the instance.
    The base class is always called with ``publishers_filepath=None`` and
    ``use_redis_publishers=False`` (see the class-level comment on why the
    publisher prefix mapping is disabled for JALC).
    """
    base_options = {
        'orcid_index': orcid_index,
        'publishers_filepath': None,
        'storage_manager': storage_manager,
        'testing': testing,
        'citing': citing,
        'use_redis_orcid_index': use_redis_orcid_index,
        'use_redis_publishers': False,
    }
    super().__init__(**base_options)
    self.exclude_existing = exclude_existing

    # One JID manager per storage: the main one and the temporary one.
    self.jid_m = JIDManager(storage_manager=self.storage_manager, testing=testing)
    self.tmp_jid_m = JIDManager(storage_manager=self.temporary_manager, testing=testing)

    # Expose both managers under the "jid" venue-identifier schema.
    for registry, manager in (
        (self.venue_id_man_dict, self.jid_m),
        (self.venue_tmp_id_man_dict, self.tmp_jid_m),
    ):
        registry["jid"] = manager

110 

111 @classmethod 

112 def get_ja(cls, field: list) -> list: 

113 """Select Japanese version of metadata, falling back to English.""" 

114 if all('lang' in item for item in field): 

115 ja = [item for item in field if item['lang'] == 'ja'] 

116 ja = list(filter(lambda x: x['type'] != 'before' if 'type' in x else x, ja)) 

117 if ja: 

118 return ja 

119 en = [item for item in field if item['lang'] == 'en'] 

120 en = list(filter(lambda x: x['type'] != 'before' if 'type' in x else x, en)) 

121 if en: 

122 return en 

123 return field 

124 

125 def _extract_doi(self, item: dict) -> str: 

126 return item.get("doi", "") 

127 

128 def _extract_title(self, item: dict) -> str: 

129 title_list = item.get('title_list') 

130 if title_list: 

131 expanded = _expand_multilang_entries(title_list, 'title') 

132 return self.sanitize_text(self.get_ja(expanded)[0].get('title', '')) 

133 return '' 

134 

135 def _extract_agents(self, item: dict) -> list[dict]: 

136 authors: list[dict[str, str]] = [] 

137 creator_list = item.get("creator_list") 

138 if creator_list: 

139 for creator in creator_list: 

140 agent: dict[str, str] = {"role": "author"} 

141 names = creator.get('names', []) 

142 if names: 

143 ja_name = self.get_ja(names)[0] 

144 last_name = self.sanitize_text(ja_name.get('last_name', '')) 

145 first_name = self.sanitize_text(ja_name.get('first_name', '')) 

146 else: 

147 last_name = '' 

148 first_name = '' 

149 full_name = '' 

150 if last_name: 

151 full_name += last_name 

152 if first_name: 

153 full_name += f', {first_name}' 

154 agent["name"] = full_name 

155 agent["family"] = last_name 

156 agent["given"] = first_name 

157 researcher_id_list = creator.get('researcher_id_list', []) 

158 for researcher_id in researcher_id_list: 

159 if researcher_id.get('type') == 'ORCID' and researcher_id.get('id_code'): 

160 agent['orcid'] = researcher_id['id_code'] 

161 break 

162 authors.append(agent) 

163 return authors 

164 

165 def _extract_venue(self, item: dict) -> str: 

166 venue_name = '' 

167 journal_ids: list[str] = [] 

168 if 'journal_title_name_list' in item: 

169 expanded = _expand_multilang_entries( 

170 item['journal_title_name_list'], 'journal_title_name' 

171 ) 

172 candidate_venues = self.get_ja(expanded) 

173 if candidate_venues: 

174 full_venue = [v for v in candidate_venues if v.get('type') == 'full'] 

175 if full_venue: 

176 venue_name = self.sanitize_text(full_venue[0].get('journal_title_name', '')) 

177 elif candidate_venues: 

178 venue_name = self.sanitize_text(candidate_venues[0].get('journal_title_name', '')) 

179 if 'journal_id_list' in item: 

180 for v in item['journal_id_list']: 

181 if isinstance(v, dict): 

182 journal_id = v.get("journal_id") 

183 id_type = v.get("type") 

184 if journal_id and id_type: 

185 schema = id_type.lower().strip() 

186 if schema in ["issn", "jid"]: 

187 tmp_id_man = self.venue_tmp_id_man_dict.get(schema) 

188 if tmp_id_man and hasattr(tmp_id_man, 'normalise'): 

189 norm_id = getattr(tmp_id_man, 'normalise')(journal_id, include_prefix=True) 

190 if norm_id: 

191 journal_ids.append(norm_id) 

192 return f"{venue_name} [{' '.join(journal_ids)}]" if journal_ids else venue_name 

193 

194 def _extract_pub_date(self, item: dict) -> str: 

195 pub_date_dict = item.get('publication_date') 

196 if not pub_date_dict or not isinstance(pub_date_dict, dict): 

197 return '' 

198 pub_date_list: list[str] = [] 

199 year = pub_date_dict.get('publication_year', '') 

200 if year: 

201 pub_date_list.append(str(year)) 

202 month = pub_date_dict.get('publication_month', '') 

203 if month: 

204 pub_date_list.append(str(month)) 

205 day = pub_date_dict.get('publication_day', '') 

206 if day: 

207 pub_date_list.append(str(day)) 

208 return '-'.join(pub_date_list) 

209 

def _extract_pages(self, item: dict) -> str:
    """Return the page string for the record's first and last page.

    The non-empty values of ``first_page`` and ``last_page`` are collected,
    in that order, and handed to ``self.get_pages``, whose result is
    returned as is.
    """
    page_bounds = (item.get('first_page', ''), item.get('last_page', ''))
    return self.get_pages([page for page in page_bounds if page])

219 

220 def _extract_type(self, item: dict) -> str: 

221 content_type = item.get('content_type') 

222 if not content_type: 

223 return '' 

224 type_map = { 

225 'JA': 'journal article', 

226 'BK': 'book', 

227 'RD': 'dataset', 

228 'EL': 'other', 

229 'GD': 'other', 

230 } 

231 return type_map.get(content_type, '') 

232 

233 def _extract_publisher(self, item: dict) -> str: 

234 if 'publisher_list' in item: 

235 return self.sanitize_text(self.get_ja(item['publisher_list'])[0].get('publisher_name', '')) 

236 return '' 

237 

def extract_all_ids(self, entity_dict: dict, is_citing: bool) -> tuple[list[str], list[str]]:
    """Collect the identifiers found in one entity.

    For a cited-side call (``is_citing`` false), every citation under
    ``entity_dict["data"]["citation_list"]`` that has a ``doi`` is
    normalised with ``self.doi_m.normalise`` and kept when the result is
    truthy. For a citing-side call nothing is collected.

    Returns:
        ``(bibliographic resource ids, responsible agent ids)``; the second
        list is always empty in this implementation.
    """
    br_ids: list[str] = []
    ra_ids: list[str] = []
    if is_citing:
        return br_ids, ra_ids

    for citation in entity_dict.get("data", {}).get("citation_list", []):
        cited_doi = citation.get("doi")
        if not cited_doi:
            continue
        normalised = self.doi_m.normalise(cited_doi, include_prefix=True)
        if normalised:
            br_ids.append(normalised)
    return br_ids, ra_ids