Coverage for oc_ds_converter/jalc/jalc_processing.py

1# SPDX-FileCopyrightText: 2022-2026 Arcangelo Massari <arcangelo.massari@unibo.it>

2# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it>

4# SPDX-License-Identifier: ISC

6from __future__ import annotations

8import html

9import re

11from oc_ds_converter.lib.crossref_style_processing import CrossrefStyleProcessing

12from oc_ds_converter.oc_idmanager.jid import JIDManager

13from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager

16# Inline lang prefixes observed in malformed JaLC entries (e.g.

17# "金大考古\nen: The Archaeological Journal of Kanazawa University"):

18# the non-prefixed line is JaLC's primary language (ja), prefixed lines

19# declare their own language. A full scan of the JaLC dump (9.85M records)

20# shows only 'en:' appears as an inline prefix in packed-multilang entries;

21# all other languages (de, ru, fr, ...) occur exclusively as explicit

22# ``lang`` tags on well-formed entries and are handled by ``get_ja``.

23_INLINE_LANGS = ('ja', 'en')

24_LANG_PREFIX_RE = re.compile(

25 rf'^\s*(?P<lang>{"|".join(_INLINE_LANGS)})\s*:\s*(?P<text>.+?)\s*$'

26)

27_WHITESPACE_RE = re.compile(r'\s+')

30def _expand_multilang_entries(entries: list[dict], text_key: str) -> list[dict]:

31 """Split lang-less entries whose text packs multiple languages separated

32 by newlines (``"<ja>\\n en: <text>"``) into one entry per language. The

33 non-prefixed line is tagged as Japanese (JaLC's primary language).

34 Entries that already declare ``lang``, or that do not match the

35 packed-lang pattern, are returned unchanged. Sibling keys (such as

36 ``type``) are preserved on every derived entry.

37 """

38 out: list[dict] = []

39 for entry in entries:

40 if 'lang' in entry or text_key not in entry:

41 out.append(entry)

42 continue

43 raw = entry[text_key]

44 if not isinstance(raw, str):

45 out.append(entry)

46 continue

47 # Resolve HTML entities up-front: JaLC sources sometimes encode

48 # newlines as ``
``, which would otherwise hide the packed-lang

49 # structure from the \n split below.

50 text = html.unescape(raw)

51 if '\n' not in text:

52 out.append(entry)

53 continue

54 lines = [stripped for stripped in (s.strip() for s in text.splitlines()) if stripped]

55 if len(lines) < 2:

56 out.append(entry)

57 continue

58 split: list[dict] = []

59 for index, line in enumerate(lines):

60 match = _LANG_PREFIX_RE.match(line)

61 if match:

62 split.append({'lang': match['lang'], 'text': _WHITESPACE_RE.sub(' ', match['text']).strip()})

63 elif index == 0:

64 split.append({'lang': 'ja', 'text': _WHITESPACE_RE.sub(' ', line).strip()})

65 else:

66 split = []

67 break

68 if not split:

69 out.append(entry)

70 continue

71 extras = {k: v for k, v in entry.items() if k != text_key}

72 for piece in split:

73 out.append({**extras, text_key: piece['text'], 'lang': piece['lang']})

74 return out

class JalcProcessing(CrossrefStyleProcessing):
    """Crossref-style metadata processor for JaLC records.

    Each ``_extract_*`` method reads one field from a JaLC record (a plain
    ``dict``) and returns a normalised value. Multilingual fields are
    resolved with :meth:`get_ja`. Helpers such as ``sanitize_text``,
    ``get_pages``, ``doi_m``, ``storage_manager``, ``temporary_manager`` and
    the ``venue_*_id_man_dict`` registries are expected to come from the
    base class, which is not part of this file.
    """

    # Publisher prefix mapping is disabled for JALC. Unlike Crossref, which provides
    # a /members API endpoint with authoritative publisher names and their DOI prefixes,
    # JALC has no equivalent endpoint. The JALC /prefixes API only returns prefix, ra,
    # siteId, and updated_date - no publisher names. Since 99.8% of JALC prefixes are
    # not in the Crossref mapping anyway (JALC is a separate DOI registration agency),
    # we use publisher names directly from the source data's publisher_list field.

    # JaLC ``content_type`` codes mapped to the labels returned by
    # ``_extract_type``; codes absent from this table yield ''.
    _CONTENT_TYPE_LABELS = {
        'JA': 'journal article',
        'BK': 'book',
        'RD': 'dataset',
        'EL': 'other',
        'GD': 'other',
    }

    def __init__(
        self,
        orcid_index: str | None = None,
        storage_manager: StorageManager | None = None,
        testing: bool = True,
        citing: bool = True,
        exclude_existing: bool = False,
        use_redis_orcid_index: bool = False,
    ):
        """Initialise the base processor and register the JID managers.

        All arguments except ``exclude_existing`` are forwarded to the base
        initialiser; publisher mapping is always switched off
        (``publishers_filepath=None``, ``use_redis_publishers=False``).
        ``exclude_existing`` is only stored on the instance here.
        """
        super().__init__(
            orcid_index=orcid_index,
            publishers_filepath=None,
            storage_manager=storage_manager,
            testing=testing,
            citing=citing,
            use_redis_orcid_index=use_redis_orcid_index,
            use_redis_publishers=False,
        )
        self.exclude_existing = exclude_existing

        # One JID manager per storage: the main one and the temporary one.
        self.jid_m = JIDManager(storage_manager=self.storage_manager, testing=testing)
        self.tmp_jid_m = JIDManager(storage_manager=self.temporary_manager, testing=testing)

        # Make the "jid" schema resolvable alongside the venue identifier
        # managers already present in these registries.
        self.venue_id_man_dict["jid"] = self.jid_m
        self.venue_tmp_id_man_dict["jid"] = self.tmp_jid_m

    @classmethod
    def get_ja(cls, field: list) -> list:
        """Select Japanese version of metadata, falling back to English.

        When every item declares ``lang``, return the ``'ja'`` items whose
        ``type`` is not ``'before'``; if there are none, return the ``'en'``
        items under the same rule. In every other case (some item lacks
        ``lang``, or neither language survives the filter) ``field`` itself
        is returned unchanged.
        """
        if all('lang' in item for item in field):
            for lang in ('ja', 'en'):
                selected = [
                    item for item in field
                    if item['lang'] == lang and item.get('type') != 'before'
                ]
                if selected:
                    return selected
        return field

    def _extract_doi(self, item: dict) -> str:
        """Return the record's ``doi`` value, or '' when the key is absent."""
        return item.get("doi", "")

    def _extract_title(self, item: dict) -> str:
        """Return the sanitised preferred-language title, or '' if none."""
        title_list = item.get('title_list')
        if not title_list:
            return ''
        # Unpack "ja\nen: ..." entries first so get_ja can see the languages.
        expanded = _expand_multilang_entries(title_list, 'title')
        return self.sanitize_text(self.get_ja(expanded)[0].get('title', ''))

    def _extract_agents(self, item: dict) -> list[dict]:
        """Build one author dict per entry of ``creator_list``.

        Each dict has ``role`` (always ``'author'``), ``name``, ``family``
        and ``given``; ``orcid`` is added from the first researcher id whose
        type is ``'ORCID'`` and whose ``id_code`` is non-empty. ``name`` is
        ``"<family>, <given>"``, just ``"<family>"`` when no given name is
        available, and '' when there is no family name.
        """
        authors: list[dict[str, str]] = []
        for creator in item.get("creator_list") or []:
            agent: dict[str, str] = {"role": "author"}
            names = creator.get('names', [])
            if names:
                preferred = self.get_ja(names)[0]
                last_name = self.sanitize_text(preferred.get('last_name', ''))
                first_name = self.sanitize_text(preferred.get('first_name', ''))
            else:
                last_name = ''
                first_name = ''
            full_name = ''
            if last_name:
                full_name = last_name
                if first_name:
                    full_name += f', {first_name}'
            agent["name"] = full_name
            agent["family"] = last_name
            agent["given"] = first_name
            # ``or []`` tolerates an explicit null in the source record.
            for researcher_id in creator.get('researcher_id_list') or []:
                if researcher_id.get('type') == 'ORCID' and researcher_id.get('id_code'):
                    agent['orcid'] = researcher_id['id_code']
                    break
            authors.append(agent)
        return authors

    def _extract_venue(self, item: dict) -> str:
        """Return ``"<venue name> [<id> <id> ...]"`` for the record's journal.

        The name is the preferred-language ``journal_title_name``, favouring
        an entry of type ``'full'``. Identifiers come from ``journal_id_list``
        entries of type ISSN or JID that the matching temporary id manager
        normalises to a non-empty value. Without identifiers only the name
        (possibly '') is returned.
        """
        venue_name = ''
        title_entries = item.get('journal_title_name_list')
        if title_entries:
            candidate_venues = self.get_ja(
                _expand_multilang_entries(title_entries, 'journal_title_name')
            )
            if candidate_venues:
                full_venue = [v for v in candidate_venues if v.get('type') == 'full']
                chosen = full_venue[0] if full_venue else candidate_venues[0]
                venue_name = self.sanitize_text(chosen.get('journal_title_name', ''))

        journal_ids: list[str] = []
        for v in item.get('journal_id_list') or []:
            if not isinstance(v, dict):
                continue
            journal_id = v.get("journal_id")
            id_type = v.get("type")
            if not (journal_id and id_type):
                continue
            schema = id_type.lower().strip()
            if schema not in ("issn", "jid"):
                continue
            tmp_id_man = self.venue_tmp_id_man_dict.get(schema)
            if tmp_id_man and hasattr(tmp_id_man, 'normalise'):
                norm_id = getattr(tmp_id_man, 'normalise')(journal_id, include_prefix=True)
                if norm_id:
                    journal_ids.append(norm_id)
        return f"{venue_name} [{' '.join(journal_ids)}]" if journal_ids else venue_name

    def _extract_pub_date(self, item: dict) -> str:
        """Return the publication date as ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD``.

        Parts are taken as-is (converted with ``str``) from
        ``publication_date``. A month is only used when a year is present and
        a day only when a month is present, so the result is never a date
        fragment without its leading components. Returns '' when
        ``publication_date`` is missing, empty or not a dict.
        """
        pub_date_dict = item.get('publication_date')
        if not pub_date_dict or not isinstance(pub_date_dict, dict):
            return ''
        pub_date_list: list[str] = []
        year = pub_date_dict.get('publication_year', '')
        if year:
            pub_date_list.append(str(year))
            month = pub_date_dict.get('publication_month', '')
            if month:
                pub_date_list.append(str(month))
                day = pub_date_dict.get('publication_day', '')
                if day:
                    pub_date_list.append(str(day))
        return '-'.join(pub_date_list)

    def _extract_pages(self, item: dict) -> str:
        """Pass the non-empty ``first_page``/``last_page`` values to ``get_pages``."""
        page_list: list[str] = [
            page
            for page in (item.get('first_page', ''), item.get('last_page', ''))
            if page
        ]
        return self.get_pages(page_list)

    def _extract_type(self, item: dict) -> str:
        """Translate ``content_type`` into a type label ('' if unknown/missing)."""
        content_type = item.get('content_type')
        if not content_type:
            return ''
        return self._CONTENT_TYPE_LABELS.get(content_type, '')

    def _extract_publisher(self, item: dict) -> str:
        """Return the sanitised preferred-language publisher name.

        Returns '' when ``publisher_list`` is missing, null or empty, so an
        empty list no longer raises ``IndexError``.
        """
        publisher_list = item.get('publisher_list')
        if not publisher_list:
            return ''
        return self.sanitize_text(self.get_ja(publisher_list)[0].get('publisher_name', ''))

    def extract_all_ids(self, entity_dict: dict, is_citing: bool) -> tuple[list[str], list[str]]:
        """Collect the normalised DOIs cited by a record.

        Args:
            entity_dict: Record whose ``data.citation_list`` holds citations.
            is_citing: When true, nothing is collected.

        Returns:
            ``(all_br, all_ra)``: ``all_br`` lists the cited DOIs that
            ``doi_m.normalise`` accepted; ``all_ra`` is always empty here.
        """
        all_br: list[str] = []
        all_ra: list[str] = []

        if not is_citing:
            # ``or`` fallbacks tolerate explicit nulls in the source record.
            data = entity_dict.get("data") or {}
            for citation in data.get("citation_list") or []:
                doi = citation.get("doi")
                if not doi:
                    continue
                norm_id = self.doi_m.normalise(doi, include_prefix=True)
                if norm_id:
                    all_br.append(norm_id)
        return all_br, all_ra

Coverage for oc_ds_converter / jalc / jalc_processing.py: 94%

171 statements