Coverage for oc_ds_converter / oc_idmanager / jid.py: 54%

128 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023 Arianna Moretti <arianna.moretti4@unibo.it> 

2# SPDX-FileCopyrightText: 2023 Marta Soricetti <marta.soricetti@unibo.it> 

3# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

4# SPDX-FileCopyrightText: 2024 Ivan Heibi <ivan.heibi2@unibo.it> 

5# 

6# SPDX-License-Identifier: ISC 

7 

8import xml.etree.ElementTree as ET 

9from re import match, sub 

10from time import sleep 

11from urllib.parse import quote 

12 

13from bs4 import BeautifulSoup 

14from oc_ds_converter.oc_idmanager.base import IdentifierManager 

15from requests import ReadTimeout, get 

16from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import RedisStorageManager 

17from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager 

18 

19 

class JIDManager(IdentifierManager):
    """Identifier manager for ``jid`` identifiers.

    Validation combines a syntactic check with an optional existence check
    against the J-STAGE web services (``self._api`` and ``self._api2``).
    Results are cached through the configured storage manager.
    """

    # XPath of the element carrying the outcome code in the search API reply.
    _STATUS_XPATH = ".//{http://www.w3.org/2005/Atom}status"
    # Number of attempts made in each round of calls to the search API.
    _MAX_ATTEMPTS = 3
    # Internal markers returned by ``_attempt_search_api``.
    _RETRY = object()    # the request itself failed (timeout / connection)
    _UNKNOWN = object()  # a reply arrived but its status was not recognised

    def __init__(self, use_api_service: bool = True, storage_manager: StorageManager | None = None, testing: bool = True) -> None:
        """JID manager constructor.

        Args:
            use_api_service (bool, optional): when False, ``exists`` performs
                no network call and reports every identifier as existing.
            storage_manager (StorageManager | None, optional): cache backend;
                when None a ``RedisStorageManager(testing=testing)`` is
                created.
            testing (bool, optional): forwarded to ``RedisStorageManager``
                only when no storage manager is supplied.
        """
        super().__init__()
        self.use_api_service = use_api_service
        if storage_manager is None:
            self.storage_manager = RedisStorageManager(testing=testing)
        else:
            self.storage_manager = storage_manager

        self._p = "jid:"
        self._api = "https://api.jstage.jst.go.jp/searchapi/"
        self._api2 = "https://www.jstage.jst.go.jp/browse/"
        self._headers = {
            "User-Agent": "Identifier Manager / OpenCitations Indexes "
            "(http://opencitations.net; mailto:contact@opencitations.net)"
        }

    def validated_as_id(self, id_string):
        """Return the cached validity of ``id_string``, if there is one.

        Args:
            id_string (str): the key to look up in the storage manager
                (used as given, without normalisation).

        Returns:
            bool | None: the stored boolean, or None when the storage
            manager holds no boolean for this key.
        """
        jid_validation_value = self.storage_manager.get_value(id_string)
        if isinstance(jid_validation_value, bool):
            return jid_validation_value
        return None

    def is_valid(self, jid, get_extra_info=False):
        """Check if a jid is valid.

        The storage manager is consulted first; on a miss the identifier is
        checked with ``syntax_ok`` and ``exists`` and the outcome is stored.

        Args:
            jid (str): the jid to check.
            get_extra_info (bool, optional): when True, also return a
                dictionary of information about the identifier.

        Returns:
            bool | tuple[bool, dict]: the validity, or ``(validity, info)``
            when ``get_extra_info`` is True. If the identifier cannot be
            normalised, a bare ``False`` is returned regardless of
            ``get_extra_info``.
        """
        jid = self.normalise(jid, include_prefix=True)
        if jid is None:
            return False

        cached = self.storage_manager.get_value(jid)
        if isinstance(cached, bool):
            if get_extra_info:
                return cached, {"id": jid, "valid": cached}
            return cached

        if get_extra_info:
            found, info = self.exists(jid, get_extra_info=True)
            self.storage_manager.set_full_value(jid, info)
            return (found and self.syntax_ok(jid)), info

        # ``and`` short-circuits: no network call for malformed identifiers.
        validity_check = self.syntax_ok(jid) and self.exists(jid)
        self.storage_manager.set_value(jid, validity_check)
        return validity_check

    def normalise(self, id_string, include_prefix=False):
        """It returns the jid normalized.

        The ``jid:`` prefix is stripped when present, the remainder is
        lower-cased and every character other than ``/``, ``a-z`` and
        ``0-9`` is removed.

        Args:
            id_string (str): the jid to normalize.
            include_prefix (bool, optional): indicates if include the prefix.
                Defaults to False.

        Returns:
            str | None: the normalized jid, or None when ``id_string``
            cannot be processed (for instance when it is not a string).
        """
        try:
            if id_string.startswith(self._p):
                jid_string = id_string[len(self._p):]
            else:
                jid_string = id_string
            jid_string = sub("[^/a-z0-9]", "", jid_string.lower())
        except Exception:
            # Any error in processing the JID will return None
            return None
        return "%s%s" % (self._p if include_prefix else "", jid_string)

    def syntax_ok(self, id_string):
        """Tell whether ``id_string`` matches the expected jid pattern.

        Args:
            id_string (str): the jid, with or without the ``jid:`` prefix.

        Returns:
            bool: True when the (prefixed) identifier is made of lowercase
            letters, optionally containing one four-digit group starting
            with 1 or 2; False otherwise.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(match("^jid:[a-z]+([12][0-9]{3}){0,1}[a-z]*$", id_string))

    def exists(self, jid_full, get_extra_info=False, allow_extra_api=None):
        """Check whether a jid is known to the J-STAGE services.

        The search API is queried up to three times. Status ``"0"`` is
        treated as "exists" and ``"ERR_001"`` as "does not exist". Any other
        reply triggers a second round of up to three attempts and then a
        single request to the browse page, where HTTP 200 means "exists" and
        anything else "does not exist".

        Args:
            jid_full (str): the jid, with or without prefix.
            get_extra_info (bool, optional): when True, also return an
                information dictionary.
            allow_extra_api: currently unused.

        Returns:
            bool | tuple[bool, dict]: the outcome, or ``(outcome, info)``
            when ``get_extra_info`` is True. When ``use_api_service`` is
            False the outcome is always True.
        """
        if not self.use_api_service:
            return self._outcome(True, get_extra_info)

        jid = self.normalise(jid_full)
        if jid is None:
            return self._outcome(False, get_extra_info)

        for _ in range(self._MAX_ATTEMPTS):
            result = self._attempt_search_api(jid, get_extra_info)
            if result is self._RETRY:
                continue
            if result is not self._UNKNOWN:
                return result

            # Unrecognised reply: give the search API another round of
            # attempts, then fall back to the browse page and stop.
            for _ in range(self._MAX_ATTEMPTS):
                result = self._attempt_search_api(jid, get_extra_info)
                if result is not self._RETRY and result is not self._UNKNOWN:
                    return result
            return self._check_browse_page(jid, get_extra_info)

        # Every attempt failed at the network level.
        return self._outcome(False, get_extra_info)

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Build the information dictionary for an existing jid.

        Args:
            api_response: the service reply; currently not inspected.
            choose_api: currently unused.
            info_dict: currently unused.

        Returns:
            dict: ``{"valid": True}``.
        """
        return {"valid": True}

    def _outcome(self, valid, get_extra_info, api_response=None):
        """Shape a result as ``valid`` or ``(valid, info)`` as requested."""
        if not get_extra_info:
            return valid
        if valid and api_response is not None:
            return True, self.extra_info(api_response)
        return valid, {"valid": valid}

    def _safe_get(self, url, pause_on_connection_error=True):
        """GET ``url``; return the response, or None on a transient failure.

        Read timeouts and connection errors are swallowed so that the caller
        can retry. After a connection error the call sleeps five seconds
        unless ``pause_on_connection_error`` is False.
        """
        # requests' ConnectionError is not a subclass of the builtin one, so
        # it has to be named explicitly to be caught.
        from requests.exceptions import ConnectionError as RequestsConnectionError

        try:
            return get(url, headers=self._headers, timeout=30)
        except ReadTimeout:
            # Do nothing, the caller just tries again
            return None
        except (ConnectionError, RequestsConnectionError):
            if pause_on_connection_error:
                # Sleep 5 seconds before the caller tries again
                sleep(5)
            return None

    def _read_status(self, content):
        """Extract the status code from a search API reply.

        Returns None when ``content`` is not well-formed XML or carries no
        status element.
        """
        try:
            # fromstring() parses XML from a string directly into an Element,
            # which is the root element of the parsed tree
            root = ET.fromstring(content)
        except ET.ParseError:
            return None
        node = root.find(self._STATUS_XPATH)
        return None if node is None else node.text

    def _attempt_search_api(self, jid, get_extra_info):
        """Query the search API once for ``jid``.

        Returns a final result (shaped by ``_outcome``), ``_RETRY`` when the
        request failed, or ``_UNKNOWN`` when the reply was not conclusive.
        """
        response = self._safe_get(self._api + "/do?service=2&cdjournal=" + quote(jid))
        if response is None:
            return self._RETRY

        status = self._read_status(response.content)
        if status == "0":
            return self._outcome(True, get_extra_info, response.content)
        if status == "ERR_001":
            return self._outcome(False, get_extra_info)
        return self._UNKNOWN

    def _check_browse_page(self, jid, get_extra_info):
        """Last-resort check against the browse page (single attempt).

        HTTP 200 is treated as "exists"; any other status code or a request
        failure as "does not exist".
        """
        # No retry follows this call, so pausing after a failure is pointless.
        response = self._safe_get(self._api2 + quote(jid), pause_on_connection_error=False)
        if response is None or response.status_code != 200:
            return self._outcome(False, get_extra_info)

        if not get_extra_info:
            return True

        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, features="lxml")
        txt_obj = str(soup.find(id="page-content"))
        return True, self.extra_info(txt_obj)