Coverage for oc_ds_converter / jalc / jalc_processing.py: 94%
171 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-12 21:23 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-06-12 21:23 +0000
1# SPDX-FileCopyrightText: 2022-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
2# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it>
3#
4# SPDX-License-Identifier: ISC
6from __future__ import annotations
8import html
9import re
11from oc_ds_converter.lib.crossref_style_processing import CrossrefStyleProcessing
12from oc_ds_converter.oc_idmanager.jid import JIDManager
13from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
16# Inline lang prefixes observed in malformed JaLC entries (e.g.
17# "金大考古\nen: The Archaeological Journal of Kanazawa University"):
18# the non-prefixed line is JaLC's primary language (ja), prefixed lines
19# declare their own language. A full scan of the JaLC dump (9.85M records)
20# shows only 'en:' appears as an inline prefix in packed-multilang entries;
21# all other languages (de, ru, fr, ...) occur exclusively as explicit
22# ``lang`` tags on well-formed entries and are handled by ``get_ja``.
23_INLINE_LANGS = ('ja', 'en')
24_LANG_PREFIX_RE = re.compile(
25 rf'^\s*(?P<lang>{"|".join(_INLINE_LANGS)})\s*:\s*(?P<text>.+?)\s*$'
26)
27_WHITESPACE_RE = re.compile(r'\s+')
def _split_packed_langs(raw: object) -> list[tuple[str, str]]:
    """Parse a packed multi-language string into ``(lang, text)`` pairs.

    Returns an empty list whenever *raw* does not follow the packed
    pattern: it is not a string, it holds no newline after HTML entities
    are decoded, it has fewer than two non-blank lines, or a line other
    than the first carries no recognised language prefix.
    """
    if not isinstance(raw, str):
        return []
    # Decode HTML entities before looking for line breaks: JaLC sources
    # sometimes encode newlines as entities, which would otherwise hide
    # the packed structure from the newline check below.
    decoded = html.unescape(raw)
    if '\n' not in decoded:
        return []
    rows = [row for row in map(str.strip, decoded.splitlines()) if row]
    if len(rows) < 2:
        return []
    pairs: list[tuple[str, str]] = []
    for position, row in enumerate(rows):
        found = _LANG_PREFIX_RE.match(row)
        if found is not None:
            lang, body = found['lang'], found['text']
        elif position == 0:
            # Only the leading line may omit its prefix; it is taken to be
            # Japanese, JaLC's primary language.
            lang, body = 'ja', row
        else:
            # A later line without a prefix: not a packed entry after all.
            return []
        pairs.append((lang, _WHITESPACE_RE.sub(' ', body).strip()))
    return pairs


def _expand_multilang_entries(entries: list[dict], text_key: str) -> list[dict]:
    """Expand entries whose text packs several languages on separate lines.

    An entry without a ``lang`` key whose ``text_key`` value looks like
    ``"<ja>\\n en: <text>"`` is replaced by one entry per line, each with
    its own ``lang``; the line without a prefix is tagged ``ja``. Every
    other entry (already tagged, missing ``text_key``, or not matching the
    packed pattern) is passed through as the very same object. Sibling
    keys (such as ``type``) are copied onto every derived entry.

    :param entries: metadata dictionaries to inspect.
    :param text_key: name of the key holding the possibly packed text.
    :return: a new list; the input list and its entries are not modified.
    """
    expanded: list[dict] = []
    for record in entries:
        if 'lang' in record or text_key not in record:
            pairs: list[tuple[str, str]] = []
        else:
            pairs = _split_packed_langs(record[text_key])
        if not pairs:
            expanded.append(record)
            continue
        siblings = {key: value for key, value in record.items() if key != text_key}
        expanded.extend(
            {**siblings, text_key: body, 'lang': lang} for lang, body in pairs
        )
    return expanded
class JalcProcessing(CrossrefStyleProcessing):
    """JaLC-specific field extractors built on ``CrossrefStyleProcessing``.

    Each ``_extract_*`` method reads one piece of metadata from a JaLC
    record dictionary. Multilingual fields are resolved through
    :meth:`get_ja`, which prefers Japanese and falls back to English.
    """

    # Publisher prefix mapping is disabled for JALC. Unlike Crossref, which provides
    # a /members API endpoint with authoritative publisher names and their DOI prefixes,
    # JALC has no equivalent endpoint. The JALC /prefixes API only returns prefix, ra,
    # siteId, and updated_date - no publisher names. Since 99.8% of JALC prefixes are
    # not in the Crossref mapping anyway (JALC is a separate DOI registration agency),
    # we use publisher names directly from the source data's publisher_list field.

    # JaLC ``content_type`` codes and the type label returned for each of them.
    _JALC_TYPE_MAP = {
        'JA': 'journal article',
        'BK': 'book',
        'RD': 'dataset',
        'EL': 'other',
        'GD': 'other',
    }

    # Date components in the only order in which they may be joined.
    _PUB_DATE_KEYS = ('publication_year', 'publication_month', 'publication_day')

    def __init__(
        self,
        orcid_index: str | None = None,
        storage_manager: StorageManager | None = None,
        testing: bool = True,
        citing: bool = True,
        exclude_existing: bool = False,
        use_redis_orcid_index: bool = False,
    ):
        """Set up the base processor and register the JID identifier managers.

        The base class is always initialised without a publishers file and
        with ``use_redis_publishers=False`` (see the class-level comment on
        publisher prefix mapping). ``exclude_existing`` is only stored on
        the instance here.
        """
        super().__init__(
            orcid_index=orcid_index,
            publishers_filepath=None,
            storage_manager=storage_manager,
            testing=testing,
            citing=citing,
            use_redis_orcid_index=use_redis_orcid_index,
            use_redis_publishers=False,
        )
        self.exclude_existing = exclude_existing

        # One JID manager per storage backend exposed by the base class.
        self.jid_m = JIDManager(storage_manager=self.storage_manager, testing=testing)
        self.tmp_jid_m = JIDManager(storage_manager=self.temporary_manager, testing=testing)

        # Make "jid" resolvable wherever venue identifiers are looked up by schema.
        self.venue_id_man_dict["jid"] = self.jid_m
        self.venue_tmp_id_man_dict["jid"] = self.tmp_jid_m

    @classmethod
    def get_ja(cls, field: list) -> list:
        """Select Japanese version of metadata, falling back to English.

        When every entry carries a ``lang`` key, the Japanese entries whose
        ``type`` is not ``'before'`` are returned; if there are none, the
        English ones are returned under the same rule. In every other case
        (some entry lacks ``lang``, or neither language yields a result)
        ``field`` itself is returned unchanged.
        """
        if all('lang' in item for item in field):
            for lang in ('ja', 'en'):
                chosen = [
                    item for item in field
                    if item['lang'] == lang and item.get('type') != 'before'
                ]
                if chosen:
                    return chosen
        return field

    def _extract_doi(self, item: dict) -> str:
        """Return the record's ``doi`` value, or ``''`` when the key is absent."""
        return item.get("doi", "")

    def _extract_title(self, item: dict) -> str:
        """Return the sanitised preferred-language title, or ``''`` if none.

        Packed multi-language titles are expanded first so that the
        language selection can tell the Japanese line from the English one.
        """
        title_list = item.get('title_list')
        if not title_list:
            return ''
        expanded = _expand_multilang_entries(title_list, 'title')
        return self.sanitize_text(self.get_ja(expanded)[0].get('title', ''))

    def _extract_agents(self, item: dict) -> list[dict]:
        """Build one ``author`` agent dictionary per entry of ``creator_list``.

        Each agent has ``role``, ``name`` (``"family, given"``, just the
        family name when no given name exists, empty when there is no
        family name), ``family`` and ``given``. An ``orcid`` key is added
        from the first researcher id of type ``ORCID`` with a non-empty
        ``id_code``.
        """
        authors: list[dict[str, str]] = []
        for creator in item.get("creator_list") or []:
            agent: dict[str, str] = {"role": "author"}
            last_name = ''
            first_name = ''
            # ``or []`` also covers keys that are present but set to None.
            names = creator.get('names') or []
            if names:
                ja_name = self.get_ja(names)[0]
                last_name = self.sanitize_text(ja_name.get('last_name', ''))
                first_name = self.sanitize_text(ja_name.get('first_name', ''))
            full_name = ''
            if last_name:
                full_name = f'{last_name}, {first_name}' if first_name else last_name
            agent["name"] = full_name
            agent["family"] = last_name
            agent["given"] = first_name
            for researcher_id in creator.get('researcher_id_list') or []:
                if researcher_id.get('type') == 'ORCID' and researcher_id.get('id_code'):
                    agent['orcid'] = researcher_id['id_code']
                    break
            authors.append(agent)
        return authors

    def _extract_venue(self, item: dict) -> str:
        """Return the venue as ``"<name> [<id> <id> ...]"``.

        The name is taken from the preferred-language journal titles,
        favouring an entry whose ``type`` is ``'full'``. Identifiers come
        from ``journal_id_list`` entries of type ``issn`` or ``jid`` that
        the matching temporary id manager can normalise. Without any
        identifier only the (possibly empty) name is returned.
        """
        venue_name = ''
        title_entries = item.get('journal_title_name_list')
        if title_entries:
            candidates = self.get_ja(
                _expand_multilang_entries(title_entries, 'journal_title_name')
            )
            if candidates:
                full_titles = [v for v in candidates if v.get('type') == 'full']
                preferred = full_titles[0] if full_titles else candidates[0]
                venue_name = self.sanitize_text(preferred.get('journal_title_name', ''))

        journal_ids: list[str] = []
        for entry in item.get('journal_id_list') or []:
            if not isinstance(entry, dict):
                continue
            journal_id = entry.get("journal_id")
            id_type = entry.get("type")
            if not journal_id or not isinstance(id_type, str):
                continue
            schema = id_type.lower().strip()
            if schema not in ("issn", "jid"):
                continue
            manager = self.venue_tmp_id_man_dict.get(schema)
            # Skip schemas with no registered manager or no normaliser.
            normalise = getattr(manager, 'normalise', None) if manager else None
            if normalise is None:
                continue
            norm_id = normalise(journal_id, include_prefix=True)
            if norm_id:
                journal_ids.append(norm_id)
        return f"{venue_name} [{' '.join(journal_ids)}]" if journal_ids else venue_name

    def _extract_pub_date(self, item: dict) -> str:
        """Return the publication date as ``year[-month[-day]]``.

        Components are joined from the most to the least significant and
        the date stops at the first missing one, so a month is never
        emitted without a year nor a day without a month. Returns ``''``
        when ``publication_date`` is missing, not a dict, or has no year.
        """
        pub_date_dict = item.get('publication_date')
        if not pub_date_dict or not isinstance(pub_date_dict, dict):
            return ''
        parts: list[str] = []
        for key in self._PUB_DATE_KEYS:
            value = pub_date_dict.get(key, '')
            if not value:
                # A gap makes every finer-grained component meaningless.
                break
            parts.append(str(value))
        return '-'.join(parts)

    def _extract_pages(self, item: dict) -> str:
        """Pass the non-empty ``first_page`` / ``last_page`` values to ``get_pages``."""
        page_list: list[str] = [
            page
            for page in (item.get('first_page', ''), item.get('last_page', ''))
            if page
        ]
        return self.get_pages(page_list)

    def _extract_type(self, item: dict) -> str:
        """Translate ``content_type`` into a type label; ``''`` when unknown or absent."""
        content_type = item.get('content_type')
        if not content_type:
            return ''
        return self._JALC_TYPE_MAP.get(content_type, '')

    def _extract_publisher(self, item: dict) -> str:
        """Return the sanitised preferred-language publisher name.

        Returns ``''`` when ``publisher_list`` is missing, ``None`` or empty.
        """
        publisher_list = item.get('publisher_list')
        if not publisher_list:
            return ''
        return self.sanitize_text(self.get_ja(publisher_list)[0].get('publisher_name', ''))

    def extract_all_ids(self, entity_dict: dict, is_citing: bool) -> tuple[list[str], list[str]]:
        """Collect the normalised identifiers referenced by an entity.

        When ``is_citing`` is false, every DOI found in
        ``entity_dict["data"]["citation_list"]`` that ``doi_m`` can
        normalise is returned in the first list. When it is true nothing is
        collected. The second list is always empty in this implementation.
        """
        all_br: list[str] = []
        all_ra: list[str] = []

        if not is_citing:
            # ``or`` guards against keys that exist but hold None.
            data = entity_dict.get("data") or {}
            for citation in data.get("citation_list") or []:
                doi = citation.get("doi")
                if doi:
                    norm_id = self.doi_m.normalise(doi, include_prefix=True)
                    if norm_id:
                        all_br.append(norm_id)
        return all_br, all_ra