Coverage for test / jalc_process_test.py: 89%
245 statements
coverage.py v7.13.4, created at 2026-03-25 18:06 +0000
# SPDX-FileCopyrightText: 2023-2024 Marta Soricetti <marta.soricetti@unibo.it>
# SPDX-FileCopyrightText: 2023-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
# SPDX-FileCopyrightText: 2024 Arianna Moretti <arianna.moretti4@unibo.it>
#
# SPDX-License-Identifier: ISC
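
"""Tests for oc_ds_converter.run.jalc_process.preprocess: decompression of the
JaLC ZIP dump, production of META CSV tables and citation (INDEX) tables, and
cache-based skipping of already processed files."""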
import csv
import json
import os
import os.path
import shutil
import unittest
from os.path import join

from oc_ds_converter.run.jalc_process import preprocess
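
# Shared paths to the test fixtures under test/jalc_process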
BASE = os.path.join('test', 'jalc_process')
OUTPUT1 = os.path.join(BASE, 'meta_input_without_citing')
OUTPUT2 = os.path.join(BASE, 'meta_input_with_citing')
MULTIPROCESS_OUTPUT = os.path.join(BASE, 'multi_process_test')
CITING_ENTITIES = os.path.join(BASE, 'cit_map_dir')
OUTPUT = os.path.join(BASE, 'output')
SUPPORT_MATERIAL = os.path.join(BASE, 'support_material')
IOD_SUPPORT = os.path.join(SUPPORT_MATERIAL, 'iod')
INPUT_SUPPORT = os.path.join(SUPPORT_MATERIAL, 'input')

class TestJalcProcess(unittest.TestCase):
    def setUp(self):
        self.test_dir = join("test", "jalc_process")
        self.sample_dump_dir = join(self.test_dir, "sample_dump")
        self.sample_fake_dump_dir = join(self.test_dir, "sample_fake_dump")
        self.output_dir = join(self.test_dir, "output_dir")
        self.support_mat = join(self.test_dir, "support_mat")
        self.cache_test = join(self.support_mat, "cache_1.json")
        self.any_db = join(self.test_dir, "anydb.db")
        self.any_db1 = join(self.test_dir, "anydb1.db")
        self.orcid_doi = join(self.support_mat, "iod")
        self.sample_dupl = join(self.test_dir, "duplicates_sample")
        self.cache_test1 = join(self.support_mat, "cache_test1.json")

    def test_preprocess_base_decompress_and_read(self):
        """Test the base functionality of the JALC processor for producing META CSV tables and INDEX tables:
        1) all the files in the input ZIPs are correctly processed;
        2) the number of input files corresponds to the number of citation files in output;
        3) each input file yields two META tables in the output folder, one for citing and one for cited entities.
        """
        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_dump_dir, el))

        if os.path.exists(self.output_dir):
            shutil.rmtree(self.output_dir)

        citations_output_path = self.output_dir + "_citations"
        if os.path.exists(citations_output_path):
            shutil.rmtree(citations_output_path)

        preprocess(jalc_json_dir=self.sample_dump_dir, orcid_doi_filepath=self.orcid_doi, csv_dir=self.output_dir, cache=self.cache_test)

        citations_in_output = 0
        encountered_ids = set()
        unique_entities = 0
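
        # The "citing" and "cited" fields may hold several space-separated identifiers
        # for the same entity, so an entity is counted as new only when none of its
        # identifiers has been encountered before.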
        for file in os.listdir(citations_output_path):
            with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f:
                cits_rows = list(csv.DictReader(f))
                citations_in_output += len(cits_rows)
                for x in cits_rows:
                    citing_ids = x["citing"].split(" ")
                    cited_ids = x["cited"].split(" ")
                    if all(id not in encountered_ids for id in citing_ids):
                        unique_entities += 1
                        encountered_ids.update(citing_ids)
                    if all(id not in encountered_ids for id in cited_ids):
                        unique_entities += 1
                        encountered_ids.update(cited_ids)

        expected_citations_in_output = 8
        # first zip: {"citing": "10.11426/nagare1970.2.3_3", "cited": "10.1017/S0022112062000762"},
        # {"citing": "10.11426/nagare1970.2.4_1", "cited": "10.1295/kobunshi.16.842"},
        # {"citing": "10.11426/nagare1970.2.4_1", "cited": "10.1295/kobunshi.16.921"},
        # {"citing": "10.11426/nagare1970.3.3_13", "cited": "10.1002/zamm.19210010401"},
        # {"citing": "10.11426/nagare1970.3.3_13", "cited": "10.1002/zamm.19210010402"}
        # second zip: {"citing": "10.14825/kaseki.68.0_14", "cited": "10.1126/science.235.4793.1156"},
        # {"citing": "10.14825/kaseki.68.0_14", "cited": "10.1098/rstb.1989.0091"},
        # {"citing": "10.14825/kaseki.68.0_18", "cited": "10.5575/geosoc.96.265"}

        expected_entities_in_output = 13
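        # 13 = 5 distinct citing DOIs + 8 distinct cited DOIs in the pairs listed above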

        self.assertEqual(expected_entities_in_output, unique_entities)
        self.assertEqual(expected_citations_in_output, citations_in_output)

        citations_files_n = len(list(os.listdir(citations_output_path)))

        # shutil.rmtree(citations_output_path)

        meta_files_n = len(list(os.listdir(self.output_dir)))

        # Make sure that a meta table row was created for each entity
        entities_in_meta_output = 0
        for file in os.listdir(self.output_dir):
            with open(os.path.join(self.output_dir, file), 'r', encoding='utf-8') as f:
                entities_in_meta_output += len(list(csv.DictReader(f)))

        self.assertEqual(expected_entities_in_output, entities_in_meta_output)
        self.assertEqual(unique_entities, entities_in_meta_output)

        input_files_n = 0
        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                input_files_n = len(list(os.listdir(os.path.join(self.sample_dump_dir, el))))

        # Make sure that one citation file and two meta input files were created for each input file
        self.assertTrue(meta_files_n == 2 * input_files_n == 4)
        self.assertTrue(citations_files_n == input_files_n)

        # shutil.rmtree(self.output_dir)

        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_dump_dir, el))
        if os.path.exists(self.any_db):
            os.remove(self.any_db)

    def test_preprocess_wrong_doi_cited(self):
        """Citations whose cited DOI cannot be validated must not appear in the output."""
        for el in os.listdir(self.sample_fake_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_fake_dump_dir, el))

        if os.path.exists(self.output_dir):
            shutil.rmtree(self.output_dir)

        citations_output_path = self.output_dir + "_citations"
        if os.path.exists(citations_output_path):
            shutil.rmtree(citations_output_path)

        preprocess(jalc_json_dir=self.sample_fake_dump_dir,
                   orcid_doi_filepath=self.orcid_doi, csv_dir=self.output_dir, cache=self.cache_test)

        citations_in_output = 0
        encountered_ids = set()
        unique_entities = 0

        for file in os.listdir(citations_output_path):
            with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f:
                cits_rows = list(csv.DictReader(f))
                citations_in_output += len(cits_rows)
                for x in cits_rows:
                    citing_ids = x["citing"].split(" ")
                    cited_ids = x["cited"].split(" ")
                    if all(id not in encountered_ids for id in citing_ids):
                        unique_entities += 1
                        encountered_ids.update(citing_ids)
                    if all(id not in encountered_ids for id in cited_ids):
                        unique_entities += 1
                        encountered_ids.update(cited_ids)

        expected_citations_in_output = 1
        expected_entities_in_output = 2
        # 3 cited DOIs in input:
        # - 10.5100/jje.30.40: DOI not found
        # - 10.5100/jje.33.1: valid, https://www.jstage.jst.go.jp/article/jje1965/33/1/33_1_1/_article/-char/ja/
        # - 10.1539/joh1959.5.56: DOI not found
        # Only the citation pointing to the valid DOI is kept, hence 1 citation and 2 entities.

        self.assertEqual(expected_entities_in_output, unique_entities)
        self.assertEqual(expected_citations_in_output, citations_in_output)

        shutil.rmtree(self.output_dir)
        shutil.rmtree(citations_output_path)

        for el in os.listdir(self.sample_fake_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_fake_dump_dir, el))
        # os.remove(self.any_db)

    def test_preprocess_base_decompress_and_read_redis(self):
        """Test the base functionality of the JALC processor for producing META CSV tables and INDEX tables:
        1) all the files in the input ZIPs are correctly processed;
        2) the number of input files corresponds to the number of citation files in output;
        3) each input file yields two META tables in the output folder, one for citing and one for cited entities.
        """
        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_dump_dir, el))

        if os.path.exists(self.output_dir):
            shutil.rmtree(self.output_dir)

        citations_output_path = self.output_dir + "_citations"
        if os.path.exists(citations_output_path):
            shutil.rmtree(citations_output_path)

        preprocess(jalc_json_dir=self.sample_dump_dir, orcid_doi_filepath=self.orcid_doi, csv_dir=self.output_dir, cache=self.cache_test)

        citations_in_output = 0
        encountered_ids = set()
        unique_entities = 0

        for file in os.listdir(citations_output_path):
            with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f:
                cits_rows = list(csv.DictReader(f))
                citations_in_output += len(cits_rows)
                for x in cits_rows:
                    citing_ids = x["citing"].split(" ")
                    cited_ids = x["cited"].split(" ")
                    if all(id not in encountered_ids for id in citing_ids):
                        unique_entities += 1
                        encountered_ids.update(citing_ids)
                    if all(id not in encountered_ids for id in cited_ids):
                        unique_entities += 1
                        encountered_ids.update(cited_ids)

        expected_citations_in_output = 8
        # first zip: {"citing": "10.11426/nagare1970.2.3_3", "cited": "10.1017/S0022112062000762"},
        # {"citing": "10.11426/nagare1970.2.4_1", "cited": "10.1295/kobunshi.16.842"},
        # {"citing": "10.11426/nagare1970.2.4_1", "cited": "10.1295/kobunshi.16.921"},
        # {"citing": "10.11426/nagare1970.3.3_13", "cited": "10.1002/zamm.19210010401"},
        # {"citing": "10.11426/nagare1970.3.3_13", "cited": "10.1002/zamm.19210010402"}
        # second zip: {"citing": "10.14825/kaseki.68.0_14", "cited": "10.1126/science.235.4793.1156"},
        # {"citing": "10.14825/kaseki.68.0_14", "cited": "10.1098/rstb.1989.0091"},
        # {"citing": "10.14825/kaseki.68.0_18", "cited": "10.5575/geosoc.96.265"}

        expected_entities_in_output = 13

        self.assertEqual(expected_entities_in_output, unique_entities)
        self.assertEqual(expected_citations_in_output, citations_in_output)

        citations_files_n = len(list(os.listdir(citations_output_path)))

        shutil.rmtree(citations_output_path)

        meta_files_n = len(list(os.listdir(self.output_dir)))

        # Make sure that a meta table row was created for each entity
        entities_in_meta_output = 0
        for file in os.listdir(self.output_dir):
            with open(os.path.join(self.output_dir, file), 'r', encoding='utf-8') as f:
                entities_in_meta_output += len(list(csv.DictReader(f)))

        self.assertEqual(expected_entities_in_output, entities_in_meta_output)
        self.assertEqual(unique_entities, entities_in_meta_output)

        input_files_n = 0
        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                input_files_n = len(list(os.listdir(os.path.join(self.sample_dump_dir, el))))

        # Make sure that one citation file and two meta input files were created for each input file
        self.assertTrue(meta_files_n == 2 * input_files_n == 4)
        self.assertTrue(citations_files_n == input_files_n)

        shutil.rmtree(self.output_dir)

        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_dump_dir, el))
        # os.remove(self.any_db1)

    def test_cache(self):
        """Nothing should be produced in output, since the cache file reports all the input files as already completed."""
        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_dump_dir, el))

        if os.path.exists(self.output_dir):
            shutil.rmtree(self.output_dir)

        citations_output_path = self.output_dir + "_citations"
        if os.path.exists(citations_output_path):
            shutil.rmtree(citations_output_path)
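
        # Pre-populate the cache file as if both input ZIPs had already been fully
        # processed for both citing and cited entities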
        with open(self.cache_test1, "w") as write_cache:
            processed_files_dict = {'citing': ['10.11426.zip', '10.14825.zip'],
                                    'cited': ['10.11426.zip', '10.14825.zip']}
            json.dump(processed_files_dict, write_cache)

        preprocess(jalc_json_dir=self.sample_dump_dir, orcid_doi_filepath=self.orcid_doi, csv_dir=self.output_dir,
                   cache=self.cache_test1)

        citations_in_output = 0
        encountered_ids = set()
        unique_entities = 0

        for file in os.listdir(citations_output_path):
            with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f:
                cits_rows = list(csv.DictReader(f))
                citations_in_output += len(cits_rows)
                for x in cits_rows:
                    citing_ids = x["citing"].split(" ")
                    cited_ids = x["cited"].split(" ")
                    if all(id not in encountered_ids for id in citing_ids):
                        unique_entities += 1
                        encountered_ids.update(citing_ids)
                    if all(id not in encountered_ids for id in cited_ids):
                        unique_entities += 1
                        encountered_ids.update(cited_ids)

        expected_citations_in_output = 0
        expected_entities_in_output = 0

        self.assertEqual(expected_entities_in_output, unique_entities)
        self.assertEqual(expected_citations_in_output, citations_in_output)

        shutil.rmtree(citations_output_path)
        shutil.rmtree(self.output_dir)

        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_dump_dir, el))

    def test_preprocess_second_run_produces_same_output(self):
        """Test that running preprocess twice produces the same output.

        This verifies that the PROCESS-DB is properly cleaned up after each run,
        so that a second execution doesn't skip entities that were already processed.
        """
        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_dump_dir, el))

        if os.path.exists(self.output_dir):
            shutil.rmtree(self.output_dir)

        citations_output_path = self.output_dir + "_citations"
        if os.path.exists(citations_output_path):
            shutil.rmtree(citations_output_path)

        # First run
        preprocess(
            jalc_json_dir=self.sample_dump_dir,
            orcid_doi_filepath=self.orcid_doi,
            csv_dir=self.output_dir,
            cache=self.cache_test
        )

        # Count entities from first run
        first_run_entities = 0
        for file in os.listdir(self.output_dir):
            with open(os.path.join(self.output_dir, file), 'r', encoding='utf-8') as f:
                first_run_entities += len(list(csv.DictReader(f)))

        first_run_citations = 0
        for file in os.listdir(citations_output_path):
            with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f:
                first_run_citations += len(list(csv.DictReader(f)))

        # Clean output directories
        shutil.rmtree(self.output_dir)
        shutil.rmtree(citations_output_path)

        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_dump_dir, el))

        # Second run - should produce the same output
        preprocess(
            jalc_json_dir=self.sample_dump_dir,
            orcid_doi_filepath=self.orcid_doi,
            csv_dir=self.output_dir,
            cache=self.cache_test
        )

        # Count entities from second run
        second_run_entities = 0
        for file in os.listdir(self.output_dir):
            with open(os.path.join(self.output_dir, file), 'r', encoding='utf-8') as f:
                second_run_entities += len(list(csv.DictReader(f)))

        second_run_citations = 0
        for file in os.listdir(citations_output_path):
            with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f:
                second_run_citations += len(list(csv.DictReader(f)))

        # Both runs should produce the same numbers of entities and citations
        self.assertEqual(first_run_entities, 13)
        self.assertEqual(second_run_entities, 13)
        self.assertEqual(first_run_citations, 8)
        self.assertEqual(second_run_citations, 8)

        # Cleanup
        shutil.rmtree(self.output_dir)
        shutil.rmtree(citations_output_path)

        for el in os.listdir(self.sample_dump_dir):
            if el.endswith("decompr_zip_dir"):
                shutil.rmtree(os.path.join(self.sample_dump_dir, el))


if __name__ == '__main__':
    unittest.main()