Coverage for oc_ds_converter / oc_idmanager / jid.py: 54%
128 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
1# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it>
2# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it>
3# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
4# SPDX-FileCopyrightText: 2024 Ivan Heibi <ivan.heibi2@unibo.it>
5#
6# SPDX-License-Identifier: ISC
8import xml.etree.ElementTree as ET
9from re import match, sub
10from time import sleep
11from urllib.parse import quote
13from bs4 import BeautifulSoup
14from oc_ds_converter.oc_idmanager.base import IdentifierManager
15from requests import ReadTimeout, get
16from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager
17from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
class JIDManager(IdentifierManager):
    """Identifier manager for ``jid`` identifiers.

    Validation combines a local syntax check with lookups against the two
    J-STAGE endpoints configured in ``__init__``. Validation results are
    cached through the storage manager.
    """

    def __init__(
        self,
        use_api_service: bool = True,
        storage_manager: StorageManager | None = None,
        testing: bool = True,
    ) -> None:
        """Build a JID manager.

        Args:
            use_api_service: when False, ``exists`` performs no network
                request and reports every identifier as existing.
            storage_manager: cache used for validation results. When None,
                a ``RedisStorageManager(testing=testing)`` is created.
            testing: forwarded to ``RedisStorageManager`` only when no
                storage manager is supplied.
        """
        super().__init__()
        self.use_api_service = use_api_service
        if storage_manager is None:
            self.storage_manager = RedisStorageManager(testing=testing)
        else:
            self.storage_manager = storage_manager

        self._p = "jid:"
        self._api = "https://api.jstage.jst.go.jp/searchapi/"
        self._api2 = "https://www.jstage.jst.go.jp/browse/"
        self._headers = {
            "User-Agent": "Identifier Manager / OpenCitations Indexes "
            "(http://opencitations.net; mailto:contact@opencitations.net)"
        }

    def validated_as_id(self, id_string):
        """Return the cached validity of ``id_string``, if any.

        Args:
            id_string: key looked up in the storage manager as given (it is
                not normalised here).

        Returns:
            The cached boolean, or None when the storage manager holds no
            boolean for this key.
        """
        cached = self.storage_manager.get_value(id_string)
        return cached if isinstance(cached, bool) else None

    def is_valid(self, jid, get_extra_info=False):
        """Check whether a jid is valid, consulting the cache first.

        Args:
            jid: the jid to check, with or without the ``jid:`` prefix.
            get_extra_info: when True, also return a dictionary describing
                the outcome.

        Returns:
            ``bool`` when ``get_extra_info`` is False, otherwise a
            ``(bool, dict)`` pair. When the identifier cannot be normalised
            a bare ``False`` is returned in both modes (historical
            behaviour, kept for callers that rely on it).
        """
        jid = self.normalise(jid, include_prefix=True)
        if jid is None:
            return False

        cached = self.storage_manager.get_value(jid)
        if isinstance(cached, bool):
            if get_extra_info:
                return cached, {"id": jid, "valid": cached}
            return cached

        if get_extra_info:
            # Check the syntax before going to the network, exactly as the
            # plain path below does, so that the returned flag, the returned
            # dictionary and the cached entry can never disagree.
            if not self.syntax_ok(jid):
                info = {"valid": False}
                self.storage_manager.set_full_value(jid, info)
                return False, info
            valid, info = self.exists(jid, get_extra_info=True)
            self.storage_manager.set_full_value(jid, info)
            return valid, info

        validity_check = self.syntax_ok(jid) and self.exists(jid)
        self.storage_manager.set_value(jid, validity_check)
        return validity_check

    def normalise(self, id_string, include_prefix=False):
        """Return the normalised form of a jid.

        The leading ``jid:`` prefix (if present) is dropped, the remainder is
        lower-cased and every character other than ``/``, ``a-z`` and ``0-9``
        is removed.

        Args:
            id_string: the jid to normalise.
            include_prefix: when True, the result is prefixed with ``jid:``.

        Returns:
            The normalised jid, or None when ``id_string`` is not a string.
        """
        try:
            if id_string.startswith(self._p):
                bare = id_string[len(self._p):]
            else:
                bare = id_string
            bare = sub("[^/a-z0-9]", "", bare.lower())
        except (AttributeError, TypeError):
            # Non-string input (None, numbers, bytes, ...) cannot be a jid.
            return None
        return (self._p if include_prefix else "") + bare

    def syntax_ok(self, id_string):
        """Tell whether ``id_string`` has the shape of a jid.

        The ``jid:`` prefix is added when missing. The accepted shape is one
        or more lowercase letters, an optional four-digit group starting with
        1 or 2, then any number of lowercase letters.

        Args:
            id_string: the jid to check, with or without prefix.

        Returns:
            True when the identifier matches the pattern, False otherwise.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return match("^jid:[a-z]+([12][0-9]{3}){0,1}[a-z]*$", id_string) is not None

    def _query_search_api(self, jid):
        """Send one request to the search API and interpret its status.

        Returns:
            ``(verdict, content)`` where ``content`` is the raw response body
            and ``verdict`` is True for status ``"0"``, False for status
            ``"ERR_001"`` and None when the answer is inconclusive (any other
            status, a missing status element, or a body that is not XML).

        Network errors raised by the request are not handled here.
        """
        response = get(
            self._api + "/do?service=2&cdjournal=" + quote(jid),
            headers=self._headers,
            timeout=30,
        )
        try:
            # fromstring() parses the XML body straight into its root element.
            root = ET.fromstring(response.content)
        except ET.ParseError:
            return None, response.content
        status_node = root.find(".//{http://www.w3.org/2005/Atom}status")
        status = None if status_node is None else status_node.text
        if status == "0":
            return True, response.content
        if status == "ERR_001":
            return False, response.content
        return None, response.content

    def exists(self, jid_full, get_extra_info=False, allow_extra_api=None):
        """Check whether a jid is known to the remote services.

        The search API is queried up to three times; timeouts and connection
        errors consume an attempt. The first inconclusive answer grants three
        fresh attempts. If those give no definitive answer either, the browse
        page of the journal is requested once: HTTP 200 means the jid exists,
        anything else means it does not. If every attempt fails at network
        level the jid is reported as not existing.

        Args:
            jid_full: the jid to look up, with or without prefix.
            get_extra_info: when True, return a ``(bool, dict)`` pair instead
                of a bare boolean.
            allow_extra_api: not used by this implementation.

        Returns:
            ``bool``, or ``(bool, dict)`` when ``get_extra_info`` is True.
            When ``use_api_service`` is False no request is made and the
            result is True.
        """
        # requests raises its own ConnectionError, which does not derive from
        # the built-in one; catch both so that the retry logic really runs.
        from requests.exceptions import ConnectionError as RequestsConnectionError

        connection_errors = (RequestsConnectionError, ConnectionError)

        def answer(valid, payload=None):
            # Shape the result according to get_extra_info.
            if not get_extra_info:
                return valid
            if valid:
                return True, self.extra_info(payload)
            return False, {"valid": False}

        if not self.use_api_service:
            if get_extra_info:
                return True, {"valid": True}
            return True

        jid = self.normalise(jid_full)
        if jid is None:
            return answer(False)

        attempts_left = 3
        saw_inconclusive = False
        while attempts_left:
            attempts_left -= 1
            try:
                verdict, content = self._query_search_api(jid)
            except ReadTimeout:
                # Nothing to do, just try again.
                continue
            except connection_errors:
                # Pause before the next attempt; pointless after the last one.
                if attempts_left:
                    sleep(5)
                continue
            if verdict is not None:
                return answer(verdict, content)
            if not saw_inconclusive:
                # First inconclusive answer: restart with a full retry budget.
                saw_inconclusive = True
                attempts_left = 3

        if not saw_inconclusive:
            # The search API never answered.
            return answer(False)

        # The search API answered but never conclusively: ask the browse page.
        try:
            page = get(self._api2 + quote(jid), headers=self._headers, timeout=30)
        except (ReadTimeout, *connection_errors):
            return answer(False)
        if page.status_code == 200:
            page.encoding = "utf-8"
            soup = BeautifulSoup(page.text, features="lxml")
            return answer(True, str(soup.find(id="page-content")))
        return answer(False)

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Build the extra-information dictionary for an existing jid.

        Args:
            api_response: response payload; not inspected by this
                implementation.
            choose_api: not used by this implementation.
            info_dict: not used by this implementation.

        Returns:
            A new dictionary ``{"valid": True}``.
        """
        result = {}
        result["valid"] = True
        return result