Coverage for test/jalc_process_test.py: 89%

245 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-25 18:06 +0000

1# SPDX-FileCopyrightText: 2023-2024 Marta Soricetti <marta.soricetti@unibo.it> 

2# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

3# SPDX-FileCopyrightText: 2024 Arianna Moretti <arianna.moretti4@unibo.it> 

4# 

5# SPDX-License-Identifier: ISC 

6 

7import csv 

8import json 

9import os 

10import os.path 

11import shutil 

12import unittest 

13from os.path import join 

14 

15from oc_ds_converter.run.jalc_process import preprocess 

16 

# Directory layout shared by the JALC process tests.
BASE = join('test', 'jalc_process')
OUTPUT1 = join(BASE, 'meta_input_without_citing')
OUTPUT2 = join(BASE, 'meta_input_with_citing')
MULTIPROCESS_OUTPUT = join(BASE, 'multi_process_test')
CITING_ENTITIES = join(BASE, 'cit_map_dir')
OUTPUT = join(BASE, 'output')
SUPPORT_MATERIAL = join(BASE, 'support_material')
IOD_SUPPORT = join(SUPPORT_MATERIAL, 'iod')
INPUT_SUPPORT = join(SUPPORT_MATERIAL, 'input')

26 

class TestJalcProcess(unittest.TestCase):
    """Integration tests for the JALC ``preprocess`` step.

    Each test drives ``preprocess`` over a small sample dump and checks the
    META CSV tables and the citation (INDEX) tables it produces.
    The previously copy-pasted counting/cleanup code is factored into the
    private helpers below so every test exercises the same logic.
    """

    def setUp(self):
        self.test_dir = join("test", "jalc_process")
        self.sample_dump_dir = join(self.test_dir, "sample_dump")
        self.sample_fake_dump_dir = join(self.test_dir, "sample_fake_dump")
        self.output_dir = join(self.test_dir, "output_dir")
        self.support_mat = join(self.test_dir, "support_mat")
        self.cache_test = join(self.support_mat, "cache_1.json")
        self.any_db = join(self.test_dir, "anydb.db")
        self.any_db1 = join(self.test_dir, "anydb1.db")
        self.orcid_doi = join(self.support_mat, "iod")
        self.sample_dupl = join(self.test_dir, "duplicates_sample")
        self.cache_test1 = join(self.support_mat, "cache_test1.json")

    # ------------------------------------------------------------------
    # Helpers shared by all tests
    # ------------------------------------------------------------------

    def _remove_decompressed_dirs(self, dump_dir):
        """Delete any '*decompr_zip_dir' folder left over by a previous run."""
        for entry in os.listdir(dump_dir):
            if entry.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(dump_dir, entry))

    def _reset_output_dirs(self):
        """Remove the META and citations output dirs; return the citations path."""
        if os.path.exists(self.output_dir):
            shutil.rmtree(self.output_dir)
        citations_output_path = self.output_dir + "_citations"
        if os.path.exists(citations_output_path):
            shutil.rmtree(citations_output_path)
        return citations_output_path

    def _count_citations_and_entities(self, citations_output_path):
        """Scan every citation CSV under *citations_output_path*.

        Returns a ``(citations, unique_entities)`` tuple: ``citations`` is the
        total number of citation rows; ``unique_entities`` counts an entity
        (the space-separated group of ids in a ``citing``/``cited`` cell) only
        when none of its ids has been seen before.
        """
        citations_in_output = 0
        encountered_ids = set()
        unique_entities = 0
        for file in os.listdir(citations_output_path):
            with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f:
                cits_rows = list(csv.DictReader(f))
            citations_in_output += len(cits_rows)
            for row in cits_rows:
                citing_ids = row["citing"].split(" ")
                cited_ids = row["cited"].split(" ")
                # Citing ids are registered before the cited side is checked,
                # so an entity appearing on both sides of one row counts once.
                if all(identifier not in encountered_ids for identifier in citing_ids):
                    unique_entities += 1
                    encountered_ids.update(citing_ids)
                if all(identifier not in encountered_ids for identifier in cited_ids):
                    unique_entities += 1
                    encountered_ids.update(cited_ids)
        return citations_in_output, unique_entities

    def _count_meta_entities(self):
        """Return the total number of rows across all META CSV tables."""
        entities = 0
        for file in os.listdir(self.output_dir):
            with open(os.path.join(self.output_dir, file), 'r', encoding='utf-8') as f:
                entities += len(list(csv.DictReader(f)))
        return entities

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_preprocess_base_decompress_and_read(self):
        """Test base functionalities of the JALC processor for producing META csv tables and INDEX tables:
        1) All the files in the ZIPs in input are correctly processed
        2) The number of files in input corresponds to the number of files in output for citations
        3) The number of files in input are duplicated in the output folder for both citing and cited entities
        """
        self._remove_decompressed_dirs(self.sample_dump_dir)
        citations_output_path = self._reset_output_dirs()

        preprocess(jalc_json_dir=self.sample_dump_dir, orcid_doi_filepath=self.orcid_doi,
                   csv_dir=self.output_dir, cache=self.cache_test)

        citations_in_output, unique_entities = self._count_citations_and_entities(citations_output_path)

        expected_citations_in_output = 8
        # first zip: {"citing":"10.11426/nagare1970.2.3_3", "cited":"10.1017/S0022112062000762"}, {"citing":"10.11426/nagare1970.2.4_1", "cited": "10.1295/kobunshi.16.842"},
        # {"citing":"10.11426/nagare1970.2.4_1", "cited":"10.1295/kobunshi.16.921"}, {"citing": "10.11426/nagare1970.3.3_13","cited": "10.1002/zamm.19210010401"}, {"citing": "10.11426/nagare1970.3.3_13","cited":"10.1002/zamm.19210010402"},
        # second zip: {"citing":"10.14825/kaseki.68.0_14", "cited":"10.1126/science.235.4793.1156"}, {"citing":"10.14825/kaseki.68.0_14", "cited":"10.1098/rstb.1989.0091"}, {"citing":"10.14825/kaseki.68.0_18","cited": "10.5575/geosoc.96.265"}
        expected_entities_in_output = 13

        self.assertEqual(expected_entities_in_output, unique_entities)
        self.assertEqual(expected_citations_in_output, citations_in_output)

        citations_files_n = len(os.listdir(citations_output_path))
        meta_files_n = len(os.listdir(self.output_dir))

        # Make sure that a meta table row was created for each entity
        entities_in_meta_output = self._count_meta_entities()
        self.assertEqual(expected_entities_in_output, entities_in_meta_output)
        self.assertEqual(unique_entities, entities_in_meta_output)

        input_files_n = 0
        for entry in os.listdir(self.sample_dump_dir):
            if entry.endswith("decompr_zip_dir"):
                input_files_n = len(os.listdir(os.path.join(self.sample_dump_dir, entry)))

        # make sure that for each of the input files was created a citation
        # file and two meta input files
        self.assertTrue(meta_files_n == 2 * input_files_n == 4)
        self.assertTrue(citations_files_n == input_files_n)

        # NOTE: output dirs are deliberately left in place here (only the
        # decompressed dirs and the process db are cleaned up).
        self._remove_decompressed_dirs(self.sample_dump_dir)
        if os.path.exists(self.any_db):
            os.remove(self.any_db)

    def test_preprocess_wrong_doi_cited(self):
        """Cited entities whose DOI is not valid/found must not be output."""
        self._remove_decompressed_dirs(self.sample_fake_dump_dir)
        citations_output_path = self._reset_output_dirs()

        preprocess(jalc_json_dir=self.sample_fake_dump_dir,
                   orcid_doi_filepath=self.orcid_doi, csv_dir=self.output_dir, cache=self.cache_test)

        citations_in_output, unique_entities = self._count_citations_and_entities(citations_output_path)

        expected_citations_in_output = 1
        expected_entities_in_output = 2
        # 3 cited:
        # - 10.5100/jje.30.40: doi not found,
        # - 10.5100/jje.33.1: https://www.jstage.jst.go.jp/article/jje1965/33/1/33_1_1/_article/-char/ja/,
        # - 10.1539/joh1959.5.56: doi not found

        self.assertEqual(expected_entities_in_output, unique_entities)
        self.assertEqual(expected_citations_in_output, citations_in_output)

        shutil.rmtree(self.output_dir)
        shutil.rmtree(citations_output_path)
        self._remove_decompressed_dirs(self.sample_fake_dump_dir)

    def test_preprocess_base_decompress_and_read_redis(self):
        """Test base functionalities of the JALC processor for producing META csv tables and INDEX tables:
        1) All the files in the ZIPs in input are correctly processed
        2) The number of files in input corresponds to the number of files in output for citations
        3) The number of files in input are duplicated in the output folder for both citing and cited entities
        """
        self._remove_decompressed_dirs(self.sample_dump_dir)
        citations_output_path = self._reset_output_dirs()

        preprocess(jalc_json_dir=self.sample_dump_dir, orcid_doi_filepath=self.orcid_doi,
                   csv_dir=self.output_dir, cache=self.cache_test)

        citations_in_output, unique_entities = self._count_citations_and_entities(citations_output_path)

        expected_citations_in_output = 8
        # first zip: {"citing":"10.11426/nagare1970.2.3_3", "cited":"10.1017/S0022112062000762"}, {"citing":"10.11426/nagare1970.2.4_1", "cited": "10.1295/kobunshi.16.842"},
        # {"citing":"10.11426/nagare1970.2.4_1", "cited":"10.1295/kobunshi.16.921"}, {"citing": "10.11426/nagare1970.3.3_13","cited": "10.1002/zamm.19210010401"}, {"citing": "10.11426/nagare1970.3.3_13","cited":"10.1002/zamm.19210010402"},
        # second zip: {"citing":"10.14825/kaseki.68.0_14", "cited":"10.1126/science.235.4793.1156"}, {"citing":"10.14825/kaseki.68.0_14", "cited":"10.1098/rstb.1989.0091"}, {"citing":"10.14825/kaseki.68.0_18","cited": "10.5575/geosoc.96.265"}
        expected_entities_in_output = 13

        self.assertEqual(expected_entities_in_output, unique_entities)
        self.assertEqual(expected_citations_in_output, citations_in_output)

        citations_files_n = len(os.listdir(citations_output_path))
        shutil.rmtree(citations_output_path)

        meta_files_n = len(os.listdir(self.output_dir))

        # Make sure that a meta table row was created for each entity
        entities_in_meta_output = self._count_meta_entities()
        self.assertEqual(expected_entities_in_output, entities_in_meta_output)
        self.assertEqual(unique_entities, entities_in_meta_output)

        input_files_n = 0
        for entry in os.listdir(self.sample_dump_dir):
            if entry.endswith("decompr_zip_dir"):
                input_files_n = len(os.listdir(os.path.join(self.sample_dump_dir, entry)))

        # make sure that for each of the input files was created a citation
        # file and two meta input files
        self.assertTrue(meta_files_n == 2 * input_files_n == 4)
        self.assertTrue(citations_files_n == input_files_n)

        shutil.rmtree(self.output_dir)
        self._remove_decompressed_dirs(self.sample_dump_dir)

    def test_cache(self):
        """Nothing should be produced in output, since the cache file reports that all the files in input were completed."""
        self._remove_decompressed_dirs(self.sample_dump_dir)
        citations_output_path = self._reset_output_dirs()

        # Pretend every input ZIP was already processed on both sides.
        with open(self.cache_test1, "w") as write_cache:
            processed_files_dict = {'citing': ['10.11426.zip', '10.14825.zip'],
                                    'cited': ['10.11426.zip', '10.14825.zip']}
            json.dump(processed_files_dict, write_cache)

        preprocess(jalc_json_dir=self.sample_dump_dir, orcid_doi_filepath=self.orcid_doi,
                   csv_dir=self.output_dir, cache=self.cache_test1)

        citations_in_output, unique_entities = self._count_citations_and_entities(citations_output_path)

        expected_citations_in_output = 0
        expected_entities_in_output = 0

        self.assertEqual(expected_entities_in_output, unique_entities)
        self.assertEqual(expected_citations_in_output, citations_in_output)

        shutil.rmtree(citations_output_path)
        shutil.rmtree(self.output_dir)
        self._remove_decompressed_dirs(self.sample_dump_dir)

    def test_preprocess_second_run_produces_same_output(self):
        """Test that running preprocess twice produces the same output.

        This verifies that the PROCESS-DB is properly cleaned up after each run,
        so that a second execution doesn't skip entities that were already processed.
        """
        self._remove_decompressed_dirs(self.sample_dump_dir)
        citations_output_path = self._reset_output_dirs()

        def run_and_count():
            # One full preprocess pass; returns (meta rows, citation rows).
            preprocess(
                jalc_json_dir=self.sample_dump_dir,
                orcid_doi_filepath=self.orcid_doi,
                csv_dir=self.output_dir,
                cache=self.cache_test
            )
            entities = self._count_meta_entities()
            citations = 0
            for file in os.listdir(citations_output_path):
                with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f:
                    citations += len(list(csv.DictReader(f)))
            return entities, citations

        first_run_entities, first_run_citations = run_and_count()

        # Clean output directories between the two runs
        shutil.rmtree(self.output_dir)
        shutil.rmtree(citations_output_path)
        self._remove_decompressed_dirs(self.sample_dump_dir)

        # Second run - should produce the same output
        second_run_entities, second_run_citations = run_and_count()

        # Both runs should produce the same number of entities and citations
        self.assertEqual(first_run_entities, 13)
        self.assertEqual(second_run_entities, 13)
        self.assertEqual(first_run_citations, 8)
        self.assertEqual(second_run_citations, 8)

        # Cleanup
        shutil.rmtree(self.output_dir)
        shutil.rmtree(citations_output_path)
        self._remove_decompressed_dirs(self.sample_dump_dir)

382 

383 

# Allow running this test module directly, outside a test runner.
if __name__ == "__main__":
    unittest.main()