Coverage for test/csv_generator_lite_test.py: 99%

455 statements  

coverage.py v6.5.0, created at 2025-12-20 08:55 +0000

1#!/usr/bin/python 

2# -*- coding: utf-8 -*- 

3# Copyright (c) 2024 Arcangelo Massari <arcangelo.massari@unibo.it> 

4# 

5# Permission to use, copy, modify, and/or distribute this software for any purpose 

6# with or without fee is hereby granted, provided that the above copyright notice 

7# and this permission notice appear in all copies. 

8# 

9# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 

10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 

11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, 

12# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 

13# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 

14# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 

15# SOFTWARE. 

16 

17import csv 

18import json 

19import os 

20import unittest 

21from shutil import rmtree 

22from zipfile import ZipFile 

23 

24import redis 

25from oc_meta.lib.file_manager import get_csv_data 

26from oc_meta.plugins.csv_generator_lite.csv_generator_lite import ( 

27 generate_csv, 

28 init_redis_connection, 

29 is_omid_processed, 

30 load_processed_omids_to_redis, 

31) 

32 

33 

34class TestCSVGeneratorLite(unittest.TestCase): 

35 def setUp(self): 

36 self.base_dir = os.path.join("test", "csv_generator_lite") 

37 self.input_dir = os.path.join(self.base_dir, "input") 

38 self.output_dir = os.path.join(self.base_dir, "output") 

39 

40 # Create test directories if they don't exist 

41 os.makedirs(self.input_dir, exist_ok=True) 

42 os.makedirs(self.output_dir, exist_ok=True) 

43 

44 # Create test RDF structure 

45 self.rdf_dir = os.path.join(self.input_dir, "rdf") 

46 self.br_dir = os.path.join(self.rdf_dir, "br") 

47 os.makedirs(self.br_dir, exist_ok=True) 

48 

49 # Initialize Redis connection for tests 

50 self.redis_client = init_redis_connection(port=6381, db=5) 

51 self.redis_client.flushdb() # Clear test database 

52 

53 def tearDown(self): 

54 if os.path.exists(self.base_dir): 

55 rmtree(self.base_dir) 

56 # Clean up Redis test database 

57 self.redis_client.flushdb() 

58 

59 def _write_test_data(self, data): 

60 """Helper method to write test data to the input directory""" 

61 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True) 

62 test_data = [ 

63 { 

64 "@graph": [ 

65 { 

66 "@id": f"https://w3id.org/oc/meta/{item['id'].replace('omid:', '')}", 

67 "@type": [ 

68 "http://purl.org/spar/fabio/Expression", 

69 "http://purl.org/spar/fabio/JournalArticle", 

70 ], 

71 "http://purl.org/dc/terms/title": [{"@value": item["title"]}], 

72 } 

73 for item in data 

74 ] 

75 } 

76 ] 

77 with ZipFile( 

78 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w" 

79 ) as zip_file: 

80 zip_file.writestr("1000.json", json.dumps(test_data)) 

81 

82 def test_redis_connection_and_caching(self): 

83 """Test Redis connection and basic caching operations""" 

84 # Test connection initialization 

85 redis_client = init_redis_connection(port=6381, db=5) 

86 self.assertIsInstance(redis_client, redis.Redis) 

87 

88 # Create a test CSV file with some OMIDs 

89 test_data = [ 

90 {"id": "omid:br/0601", "title": "Test 1"}, 

91 {"id": "omid:br/0602", "title": "Test 2"}, 

92 {"id": "omid:br/0603 issn:456", "title": "Test 3"}, 

93 ] 

94 os.makedirs(self.output_dir, exist_ok=True) 

95 with open( 

96 os.path.join(self.output_dir, "test.csv"), "w", newline="", encoding="utf-8" 

97 ) as f: 

98 writer = csv.DictWriter(f, fieldnames=["id", "title"]) 

99 writer.writeheader() 

100 writer.writerows(test_data) 

101 

102 # Test loading OMIDs into Redis 

103 count = load_processed_omids_to_redis(self.output_dir, redis_client) 

104 self.assertEqual(count, 3) 

105 

106 # Test OMID lookup 

107 self.assertTrue(is_omid_processed("omid:br/0601", redis_client)) 

108 self.assertTrue(is_omid_processed("omid:br/0602", redis_client)) 

109 self.assertTrue(is_omid_processed("omid:br/0603", redis_client)) 

110 self.assertFalse(is_omid_processed("omid:br/0604", redis_client)) 

111 

112 def test_redis_cache_persistence(self): 

113 """Test that Redis is populated from existing CSV files and cleared after completion""" 

114 # Create initial test data 

115 test_data = [ 

116 { 

117 "@graph": [ 

118 { 

119 "@id": "https://w3id.org/oc/meta/br/0601", 

120 "@type": [ 

121 "http://purl.org/spar/fabio/Expression", 

122 "http://purl.org/spar/fabio/JournalArticle", 

123 ], 

124 "http://purl.org/dc/terms/title": [{"@value": "First Run"}], 

125 } 

126 ] 

127 } 

128 ] 

129 

130 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True) 

131 with ZipFile( 

132 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w" 

133 ) as zip_file: 

134 zip_file.writestr("1000.json", json.dumps(test_data)) 

135 

136 # First run - creates initial CSV 

137 generate_csv( 

138 input_dir=self.rdf_dir, 

139 output_dir=self.output_dir, 

140 dir_split_number=10000, 

141 items_per_file=1000, 

142 zip_output_rdf=True, 

143 redis_port=6381, 

144 redis_db=5, 

145 ) 

146 

147 # Verify Redis is empty after first run 

148 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client)) 

149 

150 # Create new test data 

151 test_data_2 = [ 

152 { 

153 "@graph": [ 

154 { 

155 "@id": "https://w3id.org/oc/meta/br/0601", # Same OMID as before 

156 "@type": [ 

157 "http://purl.org/spar/fabio/Expression", 

158 "http://purl.org/spar/fabio/JournalArticle", 

159 ], 

160 "http://purl.org/dc/terms/title": [ 

161 {"@value": "Should Be Skipped"} 

162 ], 

163 }, 

164 { 

165 "@id": "https://w3id.org/oc/meta/br/0602", # New OMID 

166 "@type": [ 

167 "http://purl.org/spar/fabio/Expression", 

168 "http://purl.org/spar/fabio/JournalArticle", 

169 ], 

170 "http://purl.org/dc/terms/title": [ 

171 {"@value": "Should Be Processed"} 

172 ], 

173 }, 

174 ] 

175 } 

176 ] 

177 

178 with ZipFile( 

179 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w" 

180 ) as zip_file: 

181 zip_file.writestr("1000.json", json.dumps(test_data_2)) 

182 

183 # Second run - should load OMIDs from existing CSV and skip already processed resources 

184 generate_csv( 

185 input_dir=self.rdf_dir, 

186 output_dir=self.output_dir, 

187 dir_split_number=10000, 

188 items_per_file=1000, 

189 zip_output_rdf=True, 

190 redis_port=6381, 

191 redis_db=5, 

192 ) 

193 

194 # Check output files 

195 output_data = [] 

196 for filename in os.listdir(self.output_dir): 

197 if filename.endswith(".csv"): 

198 output_data.extend( 

199 get_csv_data(os.path.join(self.output_dir, filename)) 

200 ) 

201 

202 # Verify results 

203 # Should find exactly two entries - one from first run and one new one 

204 self.assertEqual(len(output_data), 2) 

205 

206 # Find entries by title 

207 first_run_entry = next( 

208 item for item in output_data if item["title"] == "First Run" 

209 ) 

210 second_run_entry = next( 

211 item for item in output_data if item["title"] == "Should Be Processed" 

212 ) 

213 

214 # Verify the first entry wasn't overwritten with "Should Be Skipped" 

215 self.assertEqual(first_run_entry["title"], "First Run") 

216 self.assertEqual(first_run_entry["id"], "omid:br/0601") 

217 

218 # Verify the new entry was processed 

219 self.assertEqual(second_run_entry["title"], "Should Be Processed") 

220 self.assertEqual(second_run_entry["id"], "omid:br/0602") 

221 

222 # Verify Redis is empty after completion 

223 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client)) 

224 self.assertFalse(is_omid_processed("omid:br/0602", self.redis_client)) 

225 

226 def test_redis_cache_cleanup(self): 

227 """Test that Redis cache is properly cleaned up in various scenarios""" 

228 # First run - should process successfully and clear Redis 

229 input_data = [{"id": "omid:br/0601", "title": "First Entry"}] 

230 self._write_test_data(input_data) 

231 

232 # Run with valid directory - should process and clear Redis 

233 generate_csv( 

234 input_dir=self.rdf_dir, 

235 output_dir=self.output_dir, 

236 dir_split_number=10000, 

237 items_per_file=1000, 

238 zip_output_rdf=True, 

239 redis_port=6381, 

240 redis_db=5, 

241 ) 

242 

243 # Verify Redis is empty after successful run 

244 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client)) 

245 

246 # Load processed OMIDs into Redis 

247 load_processed_omids_to_redis(self.output_dir, self.redis_client) 

248 

249 # Verify that after loading from CSV, the OMID is in Redis 

250 self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client)) 

251 

252 # Run with non-existent directory - should fail but keep Redis populated 

253 generate_csv( 

254 input_dir="/nonexistent/dir", 

255 output_dir=self.output_dir, 

256 dir_split_number=10000, 

257 items_per_file=1000, 

258 zip_output_rdf=True, 

259 redis_port=6381, 

260 redis_db=5, 

261 ) 

262 

263 # Verify Redis still has the data after failed run 

264 self.assertTrue( 

265 is_omid_processed("omid:br/0601", self.redis_client), 

266 "Redis cache should be retained after a failed run", 

267 ) 

268 

269 def test_redis_error_handling(self): 

270 """Test handling of Redis connection errors""" 

271 # Test with invalid Redis connection 

272 with self.assertRaises(redis.ConnectionError): 

273 init_redis_connection(port=9999) # Invalid port 

274 

275 # Test loading OMIDs with non-existent directory 

276 count = load_processed_omids_to_redis("/nonexistent/dir", self.redis_client) 

277 self.assertEqual(count, 0) 

278 

279 def test_concurrent_processing_with_redis(self): 

280 """Test concurrent processing with Redis caching""" 

281 # Create multiple test files 

282 test_data = [] 

283 for i in range(100): # Create 100 test entries 

284 test_data.append( 

285 { 

286 "@id": f"https://w3id.org/oc/meta/br/06{i:02d}", 

287 "@type": [ 

288 "http://purl.org/spar/fabio/Expression", 

289 "http://purl.org/spar/fabio/JournalArticle", 

290 ], 

291 "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}], 

292 } 

293 ) 

294 

295 # Split into multiple files 

296 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True) 

297 for i in range(0, 100, 10): # Create 10 files with 10 entries each 

298 file_data = [{"@graph": test_data[i : i + 10]}] 

299 with ZipFile( 

300 os.path.join(self.br_dir, "060", "10000", f"{i+1000}.zip"), "w" 

301 ) as zip_file: 

302 zip_file.writestr(f"{i+1000}.json", json.dumps(file_data)) 

303 

304 # First run to create some CSV files 

305 generate_csv( 

306 input_dir=self.rdf_dir, 

307 output_dir=self.output_dir, 

308 dir_split_number=10000, 

309 items_per_file=1000, 

310 zip_output_rdf=True, 

311 redis_port=6381, 

312 redis_db=5, 

313 ) 

314 

315 # Create more test entries 

316 more_test_data = [] 

317 for i in range(100, 200): # Create 100 more test entries 

318 more_test_data.append( 

319 { 

320 "@id": f"https://w3id.org/oc/meta/br/06{i:02d}", 

321 "@type": [ 

322 "http://purl.org/spar/fabio/Expression", 

323 "http://purl.org/spar/fabio/JournalArticle", 

324 ], 

325 "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}], 

326 } 

327 ) 

328 

329 # Add new files 

330 for i in range(0, 100, 10): 

331 file_data = [{"@graph": more_test_data[i : i + 10]}] 

332 with ZipFile( 

333 os.path.join(self.br_dir, "060", "10000", f"{i+2000}.zip"), "w" 

334 ) as zip_file: 

335 zip_file.writestr(f"{i+2000}.json", json.dumps(file_data)) 

336 

337 # Second run with existing cache 

338 generate_csv( 

339 input_dir=self.rdf_dir, 

340 output_dir=self.output_dir, 

341 dir_split_number=10000, 

342 items_per_file=1000, 

343 zip_output_rdf=True, 

344 redis_port=6381, 

345 redis_db=5, 

346 ) 

347 

348 # Verify results 

349 all_output_data = [] 

350 for filename in os.listdir(self.output_dir): 

351 if filename.endswith(".csv"): 

352 all_output_data.extend( 

353 get_csv_data(os.path.join(self.output_dir, filename)) 

354 ) 

355 

356 # Should have processed all 200 entries 

357 self.assertEqual(len(all_output_data), 200) 

358 

359 # Verify no duplicates 

360 processed_ids = {row["id"] for row in all_output_data} 

361 self.assertEqual(len(processed_ids), 200) 

362 

363 def test_basic_br_processing(self): 

364 """Test basic bibliographic resource processing""" 

365 test_data = [ 

366 { 

367 "@graph": [ 

368 { 

369 "@id": "https://w3id.org/oc/meta/br/0601", 

370 "@type": [ 

371 "http://purl.org/spar/fabio/Expression", 

372 "http://purl.org/spar/fabio/JournalArticle", 

373 ], 

374 "http://purl.org/dc/terms/title": [{"@value": "Test Article"}], 

375 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [ 

376 {"@value": "2024-01-01"} 

377 ], 

378 "http://purl.org/spar/datacite/hasIdentifier": [ 

379 {"@id": "https://w3id.org/oc/meta/id/0601"} 

380 ], 

381 } 

382 ], 

383 "@id": "https://w3id.org/oc/meta/br/", 

384 } 

385 ] 

386 

387 # Write test data to file 

388 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True) 

389 with ZipFile( 

390 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w" 

391 ) as zip_file: 

392 zip_file.writestr("1000.json", json.dumps(test_data)) 

393 

394 # Run generator 

395 generate_csv( 

396 input_dir=self.rdf_dir, 

397 output_dir=self.output_dir, 

398 dir_split_number=10000, 

399 items_per_file=1000, 

400 zip_output_rdf=True, 

401 redis_port=6381, 

402 redis_db=5, 

403 ) 

404 

405 # Check output 

406 output_files = os.listdir(self.output_dir) 

407 self.assertEqual(len(output_files), 1) 

408 

409 output_data = get_csv_data(os.path.join(self.output_dir, output_files[0])) 

410 self.assertEqual(len(output_data), 1) 

411 self.assertEqual(output_data[0]["title"], "Test Article") 

412 self.assertEqual(output_data[0]["pub_date"], "2024-01-01") 

413 self.assertEqual(output_data[0]["type"], "journal article") 

414 self.assertEqual(output_data[0]["id"], "omid:br/0601") 

415 

416 def test_complex_br_with_related_entities(self): 

417 """Test processing of BR with authors, venue, and other related entities""" 

418 # Create directory structure for each entity type 

419 supplier_prefix = "060" 

420 for entity_type in ["br", "ra", "ar", "id"]: 

421 os.makedirs( 

422 os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000"), 

423 exist_ok=True, 

424 ) 

425 

426 # BR data including both the article and the venue 

427 br_data = [ 

428 { 

429 "@graph": [ 

430 { 

431 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2", 

432 "@type": [ 

433 "http://purl.org/spar/fabio/Expression", 

434 "http://purl.org/spar/fabio/JournalArticle", 

435 ], 

436 "http://purl.org/dc/terms/title": [ 

437 {"@value": "Complex Article"} 

438 ], 

439 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [ 

440 {"@value": "2024-02-01"} 

441 ], 

442 "http://purl.org/spar/pro/isDocumentContextFor": [ 

443 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"} 

444 ], 

445 "http://purl.org/vocab/frbr/core#partOf": [ 

446 {"@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"} 

447 ], 

448 }, 

449 { 

450 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3", 

451 "@type": [ 

452 "http://purl.org/spar/fabio/Expression", 

453 "http://purl.org/spar/fabio/Journal", 

454 ], 

455 "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}], 

456 }, 

457 ], 

458 "@id": "https://w3id.org/oc/meta/br/", 

459 } 

460 ] 

461 

462 ar_data = [ 

463 { 

464 "@graph": [ 

465 { 

466 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1", 

467 "http://purl.org/spar/pro/withRole": [ 

468 {"@id": "http://purl.org/spar/pro/author"} 

469 ], 

470 "http://purl.org/spar/pro/isHeldBy": [ 

471 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"} 

472 ], 

473 } 

474 ], 

475 "@id": "https://w3id.org/oc/meta/ar/", 

476 } 

477 ] 

478 

479 ra_data = [ 

480 { 

481 "@graph": [ 

482 { 

483 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1", 

484 "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}], 

485 } 

486 ], 

487 "@id": "https://w3id.org/oc/meta/ra/", 

488 } 

489 ] 

490 

491 # Write test data files in correct locations 

492 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data} 

493 

494 for entity_type, data in data_files.items(): 

495 zip_path = os.path.join( 

496 self.rdf_dir, entity_type, supplier_prefix, "10000", "1000.zip" 

497 ) 

498 with ZipFile(zip_path, "w") as zip_file: 

499 zip_file.writestr("1000.json", json.dumps(data)) 

500 

501 # Run generator 

502 generate_csv( 

503 input_dir=self.rdf_dir, 

504 output_dir=self.output_dir, 

505 dir_split_number=10000, 

506 items_per_file=1000, 

507 zip_output_rdf=True, 

508 redis_port=6381, 

509 redis_db=5, 

510 ) 

511 

512 # Check output 

513 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

514 self.assertEqual(len(output_data), 2) # Should have 2 rows: article and journal 

515 

516 # Find article and journal entries 

517 article = next( 

518 (item for item in output_data if item["type"] == "journal article"), None 

519 ) 

520 journal = next( 

521 (item for item in output_data if item["type"] == "journal"), None 

522 ) 

523 

524 # Verify article data 

525 self.assertIsNotNone(article) 

526 self.assertEqual(article["title"], "Complex Article") 

527 self.assertEqual(article["venue"], f"Test Journal [omid:br/{supplier_prefix}3]") 

528 self.assertEqual(article["author"], "Test Author [omid:ra/0601]") 

529 self.assertEqual(article["id"], f"omid:br/{supplier_prefix}2") 

530 

531 # Verify journal data 

532 self.assertIsNotNone(journal) 

533 self.assertEqual(journal["title"], "Test Journal") 

534 self.assertEqual(journal["type"], "journal") 

535 self.assertEqual(journal["id"], f"omid:br/{supplier_prefix}3") 

536 

537 def test_empty_input_directory(self): 

538 """Test behavior with empty input directory""" 

539 generate_csv( 

540 input_dir=self.rdf_dir, 

541 output_dir=self.output_dir, 

542 dir_split_number=10000, 

543 items_per_file=1000, 

544 zip_output_rdf=True, 

545 redis_port=6381, 

546 redis_db=5, 

547 ) 

548 

549 self.assertEqual(len(os.listdir(self.output_dir)), 0) 

550 

551 def test_br_with_multiple_authors_and_editors(self): 

552 """Test processing of BR with multiple authors and editors""" 

553 supplier_prefix = "060" 

554 br_data = [ 

555 { 

556 "@graph": [ 

557 { 

558 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

559 "@type": [ 

560 "http://purl.org/spar/fabio/Expression", 

561 "http://purl.org/spar/fabio/Book", 

562 ], 

563 "http://purl.org/dc/terms/title": [ 

564 {"@value": "Multi-Author Book"} 

565 ], 

566 "http://purl.org/spar/pro/isDocumentContextFor": [ 

567 { 

568 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1" 

569 }, # First author 

570 { 

571 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2" 

572 }, # Second author 

573 { 

574 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3" 

575 }, # First editor 

576 { 

577 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4" 

578 }, # Second editor 

579 ], 

580 } 

581 ] 

582 } 

583 ] 

584 

585 # Setup agent roles for authors and editors with hasNext relations 

586 ar_data = [ 

587 { 

588 "@graph": [ 

589 { 

590 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1", 

591 "http://purl.org/spar/pro/withRole": [ 

592 {"@id": "http://purl.org/spar/pro/author"} 

593 ], 

594 "http://purl.org/spar/pro/isHeldBy": [ 

595 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"} 

596 ], 

597 "https://w3id.org/oc/ontology/hasNext": [ 

598 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"} 

599 ], 

600 }, 

601 { 

602 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2", 

603 "http://purl.org/spar/pro/withRole": [ 

604 {"@id": "http://purl.org/spar/pro/author"} 

605 ], 

606 "http://purl.org/spar/pro/isHeldBy": [ 

607 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"} 

608 ], 

609 "https://w3id.org/oc/ontology/hasNext": [ 

610 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"} 

611 ], 

612 }, 

613 { 

614 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3", 

615 "http://purl.org/spar/pro/withRole": [ 

616 {"@id": "http://purl.org/spar/pro/editor"} 

617 ], 

618 "http://purl.org/spar/pro/isHeldBy": [ 

619 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"} 

620 ], 

621 "https://w3id.org/oc/ontology/hasNext": [ 

622 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"} 

623 ], 

624 }, 

625 { 

626 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4", 

627 "http://purl.org/spar/pro/withRole": [ 

628 {"@id": "http://purl.org/spar/pro/editor"} 

629 ], 

630 "http://purl.org/spar/pro/isHeldBy": [ 

631 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4"} 

632 ], 

633 }, 

634 ] 

635 } 

636 ] 

637 

638 # Setup responsible agents 

639 ra_data = [ 

640 { 

641 "@graph": [ 

642 { 

643 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1", 

644 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}], 

645 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}], 

646 }, 

647 { 

648 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2", 

649 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Doe"}], 

650 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Jane"}], 

651 }, 

652 { 

653 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3", 

654 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Brown"}], 

655 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Bob"}], 

656 }, 

657 { 

658 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4", 

659 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Wilson"}], 

660 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Alice"}], 

661 }, 

662 ] 

663 } 

664 ] 

665 

666 # Write test data files 

667 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data} 

668 

669 for entity_type, data in data_files.items(): 

670 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000") 

671 os.makedirs(dir_path, exist_ok=True) 

672 

673 zip_path = os.path.join(dir_path, "1000.zip") 

674 with ZipFile(zip_path, "w") as zip_file: 

675 zip_file.writestr("1000.json", json.dumps(data)) 

676 

677 # Run generator 

678 generate_csv( 

679 input_dir=self.rdf_dir, 

680 output_dir=self.output_dir, 

681 dir_split_number=10000, 

682 items_per_file=1000, 

683 zip_output_rdf=True, 

684 redis_port=6381, 

685 redis_db=5, 

686 ) 

687 

688 # Check output 

689 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

690 self.assertEqual(len(output_data), 1) 

691 

692 # Verify authors and editors are in the correct order 

693 expected_authors = ( 

694 f"Smith, John [omid:ra/{supplier_prefix}1]; " 

695 f"Doe, Jane [omid:ra/{supplier_prefix}2]" 

696 ) 

697 expected_editors = ( 

698 f"Brown, Bob [omid:ra/{supplier_prefix}3]; " 

699 f"Wilson, Alice [omid:ra/{supplier_prefix}4]" 

700 ) 

701 

702 self.assertEqual(output_data[0]["author"], expected_authors) 

703 self.assertEqual(output_data[0]["editor"], expected_editors) 

704 

705 def test_br_with_identifiers(self): 

706 """Test processing of BR with multiple identifiers""" 

707 supplier_prefix = "060" 

708 br_data = [ 

709 { 

710 "@graph": [ 

711 { 

712 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

713 "@type": [ 

714 "http://purl.org/spar/fabio/Expression", 

715 "http://purl.org/spar/fabio/JournalArticle", 

716 ], 

717 "http://purl.org/dc/terms/title": [ 

718 {"@value": "Article With DOI"} 

719 ], 

720 "http://purl.org/spar/datacite/hasIdentifier": [ 

721 {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1"}, 

722 {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2"}, 

723 ], 

724 } 

725 ] 

726 } 

727 ] 

728 

729 id_data = [ 

730 { 

731 "@graph": [ 

732 { 

733 "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1", 

734 "http://purl.org/spar/datacite/usesIdentifierScheme": [ 

735 {"@id": "http://purl.org/spar/datacite/doi"} 

736 ], 

737 "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [ 

738 {"@value": "10.1234/test.123"} 

739 ], 

740 }, 

741 { 

742 "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2", 

743 "http://purl.org/spar/datacite/usesIdentifierScheme": [ 

744 {"@id": "http://purl.org/spar/datacite/isbn"} 

745 ], 

746 "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [ 

747 {"@value": "978-0-123456-47-2"} 

748 ], 

749 }, 

750 ] 

751 } 

752 ] 

753 

754 # Write test data files in correct locations 

755 data_files = {"br": br_data, "id": id_data} 

756 

757 for entity_type, data in data_files.items(): 

758 # Create all necessary directories 

759 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000") 

760 os.makedirs(dir_path, exist_ok=True) 

761 

762 zip_path = os.path.join(dir_path, "1000.zip") 

763 with ZipFile(zip_path, "w") as zip_file: 

764 zip_file.writestr("1000.json", json.dumps(data)) 

765 

766 # Run generator 

767 generate_csv( 

768 input_dir=self.rdf_dir, 

769 output_dir=self.output_dir, 

770 dir_split_number=10000, 

771 items_per_file=1000, 

772 zip_output_rdf=True, 

773 redis_port=6381, 

774 redis_db=5, 

775 ) 

776 

777 # Check output 

778 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

779 self.assertEqual(len(output_data), 1) 

780 

781 # Verify all identifiers are included 

782 expected_ids = ( 

783 f"omid:br/{supplier_prefix}1 doi:10.1234/test.123 isbn:978-0-123456-47-2" 

784 ) 

785 self.assertEqual(output_data[0]["id"], expected_ids) 

786 

787 def test_br_with_page_numbers(self): 

788 """Test processing of BR with page information""" 

789 supplier_prefix = "060" 

790 br_data = [ 

791 { 

792 "@graph": [ 

793 { 

794 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

795 "@type": [ 

796 "http://purl.org/spar/fabio/Expression", 

797 "http://purl.org/spar/fabio/JournalArticle", 

798 ], 

799 "http://purl.org/dc/terms/title": [{"@value": "Paged Article"}], 

800 "http://purl.org/vocab/frbr/core#embodiment": [ 

801 {"@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1"} 

802 ], 

803 } 

804 ] 

805 } 

806 ] 

807 

808 re_data = [ 

809 { 

810 "@graph": [ 

811 { 

812 "@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1", 

813 "http://prismstandard.org/namespaces/basic/2.0/startingPage": [ 

814 {"@value": "100"} 

815 ], 

816 "http://prismstandard.org/namespaces/basic/2.0/endingPage": [ 

817 {"@value": "120"} 

818 ], 

819 } 

820 ] 

821 } 

822 ] 

823 

824 # Write test data files in correct locations 

825 data_files = {"br": br_data, "re": re_data} 

826 

827 for entity_type, data in data_files.items(): 

828 # Create all necessary directories 

829 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000") 

830 os.makedirs(dir_path, exist_ok=True) 

831 

832 zip_path = os.path.join(dir_path, "1000.zip") 

833 with ZipFile(zip_path, "w") as zip_file: 

834 zip_file.writestr("1000.json", json.dumps(data)) 

835 

836 # Run generator 

837 generate_csv( 

838 input_dir=self.rdf_dir, 

839 output_dir=self.output_dir, 

840 dir_split_number=10000, 

841 items_per_file=1000, 

842 zip_output_rdf=True, 

843 redis_port=6381, 

844 redis_db=5, 

845 ) 

846 

847 # Check output 

848 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

849 self.assertEqual(len(output_data), 1) 

850 self.assertEqual(output_data[0]["page"], "100-120") 

851 

852 def test_malformed_data_handling(self): 

853 """Test handling of malformed or incomplete data""" 

854 supplier_prefix = "060" 

855 br_data = [ 

856 { 

857 "@graph": [ 

858 { 

859 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

860 "@type": [ 

861 "http://purl.org/spar/fabio/Expression", 

862 "http://purl.org/spar/fabio/JournalArticle", 

863 ], 

864 # Missing title 

865 "http://purl.org/spar/pro/isDocumentContextFor": [ 

866 {"@id": "invalid_uri"}, # Invalid URI 

867 ], 

868 "http://purl.org/vocab/frbr/core#partOf": [ 

869 {"@id": "non_existent_venue"} # Non-existent venue 

870 ], 

871 } 

872 ] 

873 } 

874 ] 

875 

876 # Write test data files in correct locations 

877 data_files = {"br": br_data} 

878 

879 for entity_type, data in data_files.items(): 

880 # Create all necessary directories 

881 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000") 

882 os.makedirs(dir_path, exist_ok=True) 

883 

884 zip_path = os.path.join(dir_path, "1000.zip") 

885 with ZipFile(zip_path, "w") as zip_file: 

886 zip_file.writestr("1000.json", json.dumps(data)) 

887 

888 # Run generator 

889 generate_csv( 

890 input_dir=self.rdf_dir, 

891 output_dir=self.output_dir, 

892 dir_split_number=10000, 

893 items_per_file=1000, 

894 zip_output_rdf=True, 

895 redis_port=6381, 

896 redis_db=5, 

897 ) 

898 

899 # Check output 

900 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

901 self.assertEqual(len(output_data), 1) 

902 # Verify graceful handling of missing/invalid data 

903 self.assertEqual(output_data[0]["title"], "") 

904 self.assertEqual(output_data[0]["author"], "") 

905 self.assertEqual(output_data[0]["venue"], "") 

906 

907 def test_br_with_hierarchical_venue_structures(self): 

908 """Test different hierarchical venue structures (issue->volume->journal, issue->journal, volume->journal, direct journal)""" 

909 supplier_prefix = "060" 

910 

911 # Create test data for different hierarchical structures 

912 br_data = [ 

913 { 

914 "@graph": [ 

915 # Article in issue->volume->journal structure 

916 { 

917 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

918 "@type": [ 

919 "http://purl.org/spar/fabio/Expression", 

920 "http://purl.org/spar/fabio/JournalArticle", 

921 ], 

922 "http://purl.org/dc/terms/title": [ 

923 {"@value": "Article in Full Hierarchy"} 

924 ], 

925 "http://purl.org/vocab/frbr/core#partOf": [ 

926 { 

927 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2" 

928 } # Issue 

929 ], 

930 }, 

931 # Article in issue->journal structure (no volume) 

932 { 

933 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}5", 

934 "@type": [ 

935 "http://purl.org/spar/fabio/Expression", 

936 "http://purl.org/spar/fabio/JournalArticle", 

937 ], 

938 "http://purl.org/dc/terms/title": [ 

939 {"@value": "Article in Issue-Journal"} 

940 ], 

941 "http://purl.org/vocab/frbr/core#partOf": [ 

942 { 

943 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6" 

944 } # Issue 

945 ], 

946 }, 

947 # Article in volume->journal structure (no issue) 

948 { 

949 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}9", 

950 "@type": [ 

951 "http://purl.org/spar/fabio/Expression", 

952 "http://purl.org/spar/fabio/JournalArticle", 

953 ], 

954 "http://purl.org/dc/terms/title": [ 

955 {"@value": "Article in Volume-Journal"} 

956 ], 

957 "http://purl.org/vocab/frbr/core#partOf": [ 

958 { 

959 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10" 

960 } # Volume 

961 ], 

962 }, 

963 # Article directly in journal 

964 { 

965 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}13", 

966 "@type": [ 

967 "http://purl.org/spar/fabio/Expression", 

968 "http://purl.org/spar/fabio/JournalArticle", 

969 ], 

970 "http://purl.org/dc/terms/title": [ 

971 {"@value": "Article in Journal"} 

972 ], 

973 "http://purl.org/vocab/frbr/core#partOf": [ 

974 { 

975 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4" 

976 } # Journal 

977 ], 

978 }, 

979 # Issue in full hierarchy 

980 { 

981 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2", 

982 "@type": ["http://purl.org/spar/fabio/JournalIssue"], 

983 "http://purl.org/spar/fabio/hasSequenceIdentifier": [ 

984 {"@value": "2"} 

985 ], 

986 "http://purl.org/vocab/frbr/core#partOf": [ 

987 { 

988 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3" 

989 } # Volume 

990 ], 

991 }, 

992 # Volume in full hierarchy 

993 { 

994 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3", 

995 "@type": ["http://purl.org/spar/fabio/JournalVolume"], 

996 "http://purl.org/spar/fabio/hasSequenceIdentifier": [ 

997 {"@value": "42"} 

998 ], 

999 "http://purl.org/vocab/frbr/core#partOf": [ 

1000 { 

1001 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4" 

1002 } # Journal 

1003 ], 

1004 }, 

1005 # Journal 

1006 { 

1007 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4", 

1008 "@type": ["http://purl.org/spar/fabio/Journal"], 

1009 "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}], 

1010 }, 

1011 # Issue directly in journal 

1012 { 

1013 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6", 

1014 "@type": ["http://purl.org/spar/fabio/JournalIssue"], 

1015 "http://purl.org/spar/fabio/hasSequenceIdentifier": [ 

1016 {"@value": "3"} 

1017 ], 

1018 "http://purl.org/vocab/frbr/core#partOf": [ 

1019 { 

1020 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4" 

1021 } # Journal 

1022 ], 

1023 }, 

1024 # Volume directly in journal 

1025 { 

1026 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10", 

1027 "@type": ["http://purl.org/spar/fabio/JournalVolume"], 

1028 "http://purl.org/spar/fabio/hasSequenceIdentifier": [ 

1029 {"@value": "5"} 

1030 ], 

1031 "http://purl.org/vocab/frbr/core#partOf": [ 

1032 { 

1033 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4" 

1034 } # Journal 

1035 ], 

1036 }, 

1037 ] 

1038 } 

1039 ] 

1040 

1041 # Write test data files 

1042 dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000") 

1043 os.makedirs(dir_path, exist_ok=True) 

1044 

1045 zip_path = os.path.join(dir_path, "1000.zip") 

1046 with ZipFile(zip_path, "w") as zip_file: 

1047 zip_file.writestr("1000.json", json.dumps(br_data)) 

1048 

1049 # Run generator 

1050 generate_csv( 

1051 input_dir=self.rdf_dir, 

1052 output_dir=self.output_dir, 

1053 dir_split_number=10000, 

1054 items_per_file=1000, 

1055 zip_output_rdf=True, 

1056 redis_port=6381, 

1057 redis_db=5, 

1058 ) 

1059 

1060 # Check output 

1061 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

1062 

1063 # Verify we only have the articles and journal in the output 

1064 self.assertEqual(len(output_data), 5) # 4 articles + 1 journal 

1065 

1066 # Verify no JournalVolume or JournalIssue entries exist 

1067 volume_or_issue_entries = [ 

1068 item 

1069 for item in output_data 

1070 if item["type"] in ["journal volume", "journal issue"] 

1071 ] 

1072 self.assertEqual(len(volume_or_issue_entries), 0) 

1073 

1074 # Find each article by title 

1075 full_hierarchy = next( 

1076 item for item in output_data if item["title"] == "Article in Full Hierarchy" 

1077 ) 

1078 issue_journal = next( 

1079 item for item in output_data if item["title"] == "Article in Issue-Journal" 

1080 ) 

1081 volume_journal = next( 

1082 item for item in output_data if item["title"] == "Article in Volume-Journal" 

1083 ) 

1084 direct_journal = next( 

1085 item for item in output_data if item["title"] == "Article in Journal" 

1086 ) 

1087 

1088 # Test full hierarchy (issue->volume->journal) 

1089 self.assertEqual(full_hierarchy["issue"], "2") 

1090 self.assertEqual(full_hierarchy["volume"], "42") 

1091 self.assertEqual( 

1092 full_hierarchy["venue"], f"Test Journal [omid:br/{supplier_prefix}4]" 

1093 ) 

1094 

1095 # Test issue->journal (no volume) 

1096 self.assertEqual(issue_journal["issue"], "3") 

1097 self.assertEqual(issue_journal["volume"], "") 

1098 self.assertEqual( 

1099 issue_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]" 

1100 ) 

1101 

1102 # Test volume->journal (no issue) 

1103 self.assertEqual(volume_journal["issue"], "") 

1104 self.assertEqual(volume_journal["volume"], "5") 

1105 self.assertEqual( 

1106 volume_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]" 

1107 ) 

1108 

1109 # Test direct journal connection 

1110 self.assertEqual(direct_journal["issue"], "") 

1111 self.assertEqual(direct_journal["volume"], "") 

1112 self.assertEqual( 

1113 direct_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]" 

1114 ) 

1115 

1116 def test_book_in_series(self): 

1117 """Test processing of a book that is part of a book series""" 

1118 supplier_prefix = "060" 

1119 

1120 # Create test data for book in series 

1121 br_data = [ 

1122 { 

1123 "@graph": [ 

1124 # Book 

1125 { 

1126 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

1127 "@type": [ 

1128 "http://purl.org/spar/fabio/Expression", 

1129 "http://purl.org/spar/fabio/Book", 

1130 ], 

1131 "http://purl.org/dc/terms/title": [{"@value": "Test Book"}], 

1132 "http://purl.org/vocab/frbr/core#partOf": [ 

1133 { 

1134 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2" 

1135 } # Series 

1136 ], 

1137 }, 

1138 # Book Series 

1139 { 

1140 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2", 

1141 "@type": ["http://purl.org/spar/fabio/BookSeries"], 

1142 "http://purl.org/dc/terms/title": [ 

1143 {"@value": "Test Book Series"} 

1144 ], 

1145 }, 

1146 ] 

1147 } 

1148 ] 

1149 

1150 # Write test data 

1151 dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000") 

1152 os.makedirs(dir_path, exist_ok=True) 

1153 

1154 zip_path = os.path.join(dir_path, "1000.zip") 

1155 with ZipFile(zip_path, "w") as zip_file: 

1156 zip_file.writestr("1000.json", json.dumps(br_data)) 

1157 

1158 # Run generator 

1159 generate_csv( 

1160 input_dir=self.rdf_dir, 

1161 output_dir=self.output_dir, 

1162 dir_split_number=10000, 

1163 items_per_file=1000, 

1164 zip_output_rdf=True, 

1165 redis_port=6381, 

1166 redis_db=5, 

1167 ) 

1168 

1169 # Check output 

1170 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

1171 

1172 # Find book entry 

1173 book = next(item for item in output_data if item["type"] == "book") 

1174 

1175 # Verify book is correctly linked to series 

1176 self.assertEqual(book["title"], "Test Book") 

1177 self.assertEqual( 

1178 book["venue"], f"Test Book Series [omid:br/{supplier_prefix}2]" 

1179 ) 

1180 self.assertEqual(book["volume"], "") # Should not have volume 

1181 self.assertEqual(book["issue"], "") # Should not have issue 

1182 

1183 def test_br_with_multiple_roles(self): 

1184 """Test processing of BR with authors, editors and publishers""" 

1185 supplier_prefix = "060" 

1186 br_data = [ 

1187 { 

1188 "@graph": [ 

1189 { 

1190 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

1191 "@type": [ 

1192 "http://purl.org/spar/fabio/Expression", 

1193 "http://purl.org/spar/fabio/Book", 

1194 ], 

1195 "http://purl.org/dc/terms/title": [ 

1196 {"@value": "Multi-Role Book"} 

1197 ], 

1198 "http://purl.org/spar/pro/isDocumentContextFor": [ 

1199 { 

1200 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1" 

1201 }, # Author 

1202 { 

1203 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2" 

1204 }, # Editor 

1205 { 

1206 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3" 

1207 }, # Publisher 

1208 ], 

1209 } 

1210 ] 

1211 } 

1212 ] 

1213 

1214 # Setup agent roles for authors, editors and publishers 

1215 ar_data = [ 

1216 { 

1217 "@graph": [ 

1218 { 

1219 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1", 

1220 "http://purl.org/spar/pro/withRole": [ 

1221 {"@id": "http://purl.org/spar/pro/author"} 

1222 ], 

1223 "http://purl.org/spar/pro/isHeldBy": [ 

1224 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"} 

1225 ], 

1226 "https://w3id.org/oc/ontology/hasNext": [ 

1227 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"} 

1228 ], 

1229 }, 

1230 { 

1231 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2", 

1232 "http://purl.org/spar/pro/withRole": [ 

1233 {"@id": "http://purl.org/spar/pro/editor"} 

1234 ], 

1235 "http://purl.org/spar/pro/isHeldBy": [ 

1236 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"} 

1237 ], 

1238 "https://w3id.org/oc/ontology/hasNext": [ 

1239 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"} 

1240 ], 

1241 }, 

1242 { 

1243 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3", 

1244 "http://purl.org/spar/pro/withRole": [ 

1245 {"@id": "http://purl.org/spar/pro/publisher"} 

1246 ], 

1247 "http://purl.org/spar/pro/isHeldBy": [ 

1248 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"} 

1249 ], 

1250 }, 

1251 ] 

1252 } 

1253 ] 

1254 

1255 # Setup responsible agents with different name formats 

1256 ra_data = [ 

1257 { 

1258 "@graph": [ 

1259 { 

1260 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1", 

1261 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}], 

1262 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}], 

1263 }, 

1264 { 

1265 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2", 

1266 "http://xmlns.com/foaf/0.1/name": [{"@value": "Editor Name"}], 

1267 }, 

1268 { 

1269 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3", 

1270 "http://xmlns.com/foaf/0.1/name": [ 

1271 {"@value": "Publisher House"} 

1272 ], 

1273 }, 

1274 ] 

1275 } 

1276 ] 

1277 

1278 # Write test data files 

1279 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data} 

1280 

1281 for entity_type, data in data_files.items(): 

1282 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000") 

1283 os.makedirs(dir_path, exist_ok=True) 

1284 

1285 zip_path = os.path.join(dir_path, "1000.zip") 

1286 with ZipFile(zip_path, "w") as zip_file: 

1287 zip_file.writestr("1000.json", json.dumps(data)) 

1288 

1289 # Run generator 

1290 generate_csv( 

1291 input_dir=self.rdf_dir, 

1292 output_dir=self.output_dir, 

1293 dir_split_number=10000, 

1294 items_per_file=1000, 

1295 zip_output_rdf=True, 

1296 redis_port=6381, 

1297 redis_db=5, 

1298 ) 

1299 

1300 # Check output 

1301 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

1302 self.assertEqual(len(output_data), 1) 

1303 

1304 # Verify all roles are correctly processed 

1305 book = output_data[0] 

1306 self.assertEqual(book["title"], "Multi-Role Book") 

1307 self.assertEqual(book["author"], f"Smith, John [omid:ra/{supplier_prefix}1]") 

1308 self.assertEqual(book["editor"], f"Editor Name [omid:ra/{supplier_prefix}2]") 

1309 self.assertEqual( 

1310 book["publisher"], f"Publisher House [omid:ra/{supplier_prefix}3]" 

1311 ) 

1312 

1313 def test_ordered_authors(self): 

1314 """Test that authors are ordered according to hasNext relations""" 

1315 supplier_prefix = "060" 

1316 br_data = [ 

1317 { 

1318 "@graph": [ 

1319 { 

1320 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

1321 "@type": [ 

1322 "http://purl.org/spar/fabio/Expression", 

1323 "http://purl.org/spar/fabio/JournalArticle", 

1324 ], 

1325 "http://purl.org/dc/terms/title": [ 

1326 {"@value": "Ordered Authors Article"} 

1327 ], 

1328 "http://purl.org/spar/pro/isDocumentContextFor": [ 

1329 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}, 

1330 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}, 

1331 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}, 

1332 ], 

1333 } 

1334 ] 

1335 } 

1336 ] 

1337 

1338 # Setup agent roles with hasNext relations 

1339 ar_data = [ 

1340 { 

1341 "@graph": [ 

1342 { 

1343 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1", 

1344 "http://purl.org/spar/pro/withRole": [ 

1345 {"@id": "http://purl.org/spar/pro/author"} 

1346 ], 

1347 "http://purl.org/spar/pro/isHeldBy": [ 

1348 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"} 

1349 ], 

1350 "https://w3id.org/oc/ontology/hasNext": [ 

1351 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"} 

1352 ], 

1353 }, 

1354 { 

1355 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2", 

1356 "http://purl.org/spar/pro/withRole": [ 

1357 {"@id": "http://purl.org/spar/pro/author"} 

1358 ], 

1359 "http://purl.org/spar/pro/isHeldBy": [ 

1360 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"} 

1361 ], 

1362 "https://w3id.org/oc/ontology/hasNext": [ 

1363 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"} 

1364 ], 

1365 }, 

1366 { 

1367 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3", 

1368 "http://purl.org/spar/pro/withRole": [ 

1369 {"@id": "http://purl.org/spar/pro/author"} 

1370 ], 

1371 "http://purl.org/spar/pro/isHeldBy": [ 

1372 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"} 

1373 ], 

1374 }, 

1375 ] 

1376 } 

1377 ] 

1378 

1379 # Setup responsible agents with different names 

1380 ra_data = [ 

1381 { 

1382 "@graph": [ 

1383 { 

1384 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1", 

1385 "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}], 

1386 }, 

1387 { 

1388 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2", 

1389 "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}], 

1390 }, 

1391 { 

1392 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3", 

1393 "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}], 

1394 }, 

1395 ] 

1396 } 

1397 ] 

1398 

1399 # Write test data files 

1400 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data} 

1401 

1402 for entity_type, data in data_files.items(): 

1403 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000") 

1404 os.makedirs(dir_path, exist_ok=True) 

1405 

1406 zip_path = os.path.join(dir_path, "1000.zip") 

1407 with ZipFile(zip_path, "w") as zip_file: 

1408 zip_file.writestr("1000.json", json.dumps(data)) 

1409 

1410 # Run generator 

1411 generate_csv( 

1412 input_dir=self.rdf_dir, 

1413 output_dir=self.output_dir, 

1414 dir_split_number=10000, 

1415 items_per_file=1000, 

1416 zip_output_rdf=True, 

1417 redis_port=6381, 

1418 redis_db=5, 

1419 ) 

1420 

1421 # Check output 

1422 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

1423 self.assertEqual(len(output_data), 1) 

1424 

1425 # Verify authors are in the correct order 

1426 expected_authors = ( 

1427 f"First Author [omid:ra/{supplier_prefix}1]; " 

1428 f"Second Author [omid:ra/{supplier_prefix}2]; " 

1429 f"Third Author [omid:ra/{supplier_prefix}3]" 

1430 ) 

1431 self.assertEqual(output_data[0]["author"], expected_authors) 

1432 

1433 def test_cyclic_hasNext_relations(self): 

1434 """Test handling of cyclic hasNext relations between agent roles""" 

1435 supplier_prefix = "060" 

1436 br_data = [ 

1437 { 

1438 "@graph": [ 

1439 { 

1440 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

1441 "@type": [ 

1442 "http://purl.org/spar/fabio/Expression", 

1443 "http://purl.org/spar/fabio/JournalArticle", 

1444 ], 

1445 "http://purl.org/dc/terms/title": [ 

1446 {"@value": "Cyclic Authors Article"} 

1447 ], 

1448 "http://purl.org/spar/pro/isDocumentContextFor": [ 

1449 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}, 

1450 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}, 

1451 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}, 

1452 ], 

1453 } 

1454 ] 

1455 } 

1456 ] 

1457 

1458 # Setup agent roles with cyclic hasNext relations 

1459 ar_data = [ 

1460 { 

1461 "@graph": [ 

1462 { 

1463 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1", 

1464 "http://purl.org/spar/pro/withRole": [ 

1465 {"@id": "http://purl.org/spar/pro/author"} 

1466 ], 

1467 "http://purl.org/spar/pro/isHeldBy": [ 

1468 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"} 

1469 ], 

1470 "https://w3id.org/oc/ontology/hasNext": [ 

1471 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"} 

1472 ], 

1473 }, 

1474 { 

1475 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2", 

1476 "http://purl.org/spar/pro/withRole": [ 

1477 {"@id": "http://purl.org/spar/pro/author"} 

1478 ], 

1479 "http://purl.org/spar/pro/isHeldBy": [ 

1480 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"} 

1481 ], 

1482 # Creates a cycle: 1 -> 2 -> 3 -> 1 

1483 "https://w3id.org/oc/ontology/hasNext": [ 

1484 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"} 

1485 ], 

1486 }, 

1487 { 

1488 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3", 

1489 "http://purl.org/spar/pro/withRole": [ 

1490 {"@id": "http://purl.org/spar/pro/author"} 

1491 ], 

1492 "http://purl.org/spar/pro/isHeldBy": [ 

1493 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"} 

1494 ], 

1495 # Cycle completion 

1496 "https://w3id.org/oc/ontology/hasNext": [ 

1497 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"} 

1498 ], 

1499 }, 

1500 ] 

1501 } 

1502 ] 

1503 

1504 # Setup responsible agents 

1505 ra_data = [ 

1506 { 

1507 "@graph": [ 

1508 { 

1509 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1", 

1510 "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}], 

1511 }, 

1512 { 

1513 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2", 

1514 "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}], 

1515 }, 

1516 { 

1517 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3", 

1518 "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}], 

1519 }, 

1520 ] 

1521 } 

1522 ] 

1523 

1524 # Write test data files 

1525 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data} 

1526 

1527 for entity_type, data in data_files.items(): 

1528 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000") 

1529 os.makedirs(dir_path, exist_ok=True) 

1530 

1531 zip_path = os.path.join(dir_path, "1000.zip") 

1532 with ZipFile(zip_path, "w") as zip_file: 

1533 zip_file.writestr("1000.json", json.dumps(data)) 

1534 

1535 # Run generator 

1536 generate_csv( 

1537 input_dir=self.rdf_dir, 

1538 output_dir=self.output_dir, 

1539 dir_split_number=10000, 

1540 items_per_file=1000, 

1541 zip_output_rdf=True, 

1542 redis_port=6381, 

1543 redis_db=5, 

1544 ) 

1545 

1546 # Check output 

1547 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

1548 self.assertEqual(len(output_data), 1) 

1549 

1550 # Verify that we get at least some authors before the cycle is detected 

1551 # The order should be maintained until the cycle is detected 

1552 authors = output_data[0]["author"].split("; ") 

1553 self.assertGreater(len(authors), 0) 

1554 

1555 # Verify the presence and order of authors 

1556 self.assertTrue( 

1557 any( 

1558 f"First Author [omid:ra/{supplier_prefix}1]" in author 

1559 for author in authors 

1560 ) 

1561 ) 

1562 self.assertTrue( 

1563 any( 

1564 f"Second Author [omid:ra/{supplier_prefix}2]" in author 

1565 for author in authors 

1566 ) 

1567 ) 

1568 

1569 # Verify no duplicates in the output 

1570 author_set = set(authors) 

1571 self.assertEqual( 

1572 len(authors), 

1573 len(author_set), 

1574 "Found duplicate authors in output: each author should appear exactly once", 

1575 ) 

1576 

1577 # Verify the exact order and number of authors 

1578 expected_authors = [ 

1579 f"First Author [omid:ra/{supplier_prefix}1]", 

1580 f"Second Author [omid:ra/{supplier_prefix}2]", 

1581 f"Third Author [omid:ra/{supplier_prefix}3]", 

1582 ] 

1583 self.assertEqual( 

1584 authors, 

1585 expected_authors, 

1586 "Authors should be in correct order and each should appear exactly once", 

1587 ) 

1588 

1589 def test_multiple_input_files(self): 

1590 """Test processing of multiple input files with sequential entity IDs""" 

1591 supplier_prefix = "060" 

1592 

1593 # Create test data spanning multiple files 

1594 # First file (entities 1-1000) 

1595 br_data_1 = [ 

1596 { 

1597 "@graph": [ 

1598 { 

1599 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

1600 "@type": [ 

1601 "http://purl.org/spar/fabio/Expression", 

1602 "http://purl.org/spar/fabio/JournalArticle", 

1603 ], 

1604 "http://purl.org/dc/terms/title": [{"@value": "Article 1"}], 

1605 }, 

1606 { 

1607 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1000", 

1608 "@type": [ 

1609 "http://purl.org/spar/fabio/Expression", 

1610 "http://purl.org/spar/fabio/JournalArticle", 

1611 ], 

1612 "http://purl.org/dc/terms/title": [{"@value": "Article 1000"}], 

1613 }, 

1614 ] 

1615 } 

1616 ] 

1617 

1618 # Second file (entities 1001-2000) 

1619 br_data_2 = [ 

1620 { 

1621 "@graph": [ 

1622 { 

1623 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1001", 

1624 "@type": [ 

1625 "http://purl.org/spar/fabio/Expression", 

1626 "http://purl.org/spar/fabio/JournalArticle", 

1627 ], 

1628 "http://purl.org/dc/terms/title": [{"@value": "Article 1001"}], 

1629 }, 

1630 { 

1631 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2000", 

1632 "@type": [ 

1633 "http://purl.org/spar/fabio/Expression", 

1634 "http://purl.org/spar/fabio/JournalArticle", 

1635 ], 

1636 "http://purl.org/dc/terms/title": [{"@value": "Article 2000"}], 

1637 }, 

1638 ] 

1639 } 

1640 ] 

1641 

1642 # Third file (entities 2001-3000) 

1643 br_data_3 = [ 

1644 { 

1645 "@graph": [ 

1646 { 

1647 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2001", 

1648 "@type": [ 

1649 "http://purl.org/spar/fabio/Expression", 

1650 "http://purl.org/spar/fabio/JournalArticle", 

1651 ], 

1652 "http://purl.org/dc/terms/title": [{"@value": "Article 2001"}], 

1653 "http://purl.org/spar/pro/isDocumentContextFor": [ 

1654 { 

1655 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001" 

1656 } 

1657 ], 

1658 } 

1659 ] 

1660 } 

1661 ] 

1662 

1663 # Create agent role data in a different file 

1664 ar_data = [ 

1665 { 

1666 "@graph": [ 

1667 { 

1668 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001", 

1669 "http://purl.org/spar/pro/withRole": [ 

1670 {"@id": "http://purl.org/spar/pro/author"} 

1671 ], 

1672 "http://purl.org/spar/pro/isHeldBy": [ 

1673 { 

1674 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001" 

1675 } 

1676 ], 

1677 } 

1678 ] 

1679 } 

1680 ] 

1681 

1682 # Create responsible agent data in a different file 

1683 ra_data = [ 

1684 { 

1685 "@graph": [ 

1686 { 

1687 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001", 

1688 "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}], 

1689 } 

1690 ] 

1691 } 

1692 ] 

1693 

1694 # Write test data to appropriate locations based on ID ranges 

1695 os.makedirs(os.path.join(self.br_dir, supplier_prefix, "10000"), exist_ok=True) 

1696 os.makedirs( 

1697 os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000"), exist_ok=True 

1698 ) 

1699 os.makedirs( 

1700 os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000"), exist_ok=True 

1701 ) 

1702 

1703 # Write BR files 

1704 with ZipFile( 

1705 os.path.join(self.br_dir, supplier_prefix, "10000", "1000.zip"), "w" 

1706 ) as zip_file: 

1707 zip_file.writestr("1000.json", json.dumps(br_data_1)) 

1708 with ZipFile( 

1709 os.path.join(self.br_dir, supplier_prefix, "10000", "2000.zip"), "w" 

1710 ) as zip_file: 

1711 zip_file.writestr("2000.json", json.dumps(br_data_2)) 

1712 with ZipFile( 

1713 os.path.join(self.br_dir, supplier_prefix, "10000", "3000.zip"), "w" 

1714 ) as zip_file: 

1715 zip_file.writestr("3000.json", json.dumps(br_data_3)) 

1716 

1717 # Write AR and RA files 

1718 with ZipFile( 

1719 os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000", "3000.zip"), "w" 

1720 ) as zip_file: 

1721 zip_file.writestr("3000.json", json.dumps(ar_data)) 

1722 with ZipFile( 

1723 os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000", "3000.zip"), "w" 

1724 ) as zip_file: 

1725 zip_file.writestr("3000.json", json.dumps(ra_data)) 

1726 

1727 # Run generator 

1728 generate_csv( 

1729 input_dir=self.rdf_dir, 

1730 output_dir=self.output_dir, 

1731 dir_split_number=10000, 

1732 items_per_file=1000, 

1733 zip_output_rdf=True, 

1734 redis_port=6381, 

1735 redis_db=5, 

1736 ) 

1737 

1738 # Check output 

1739 output_files = sorted(os.listdir(self.output_dir)) 

1740 self.assertGreater(len(output_files), 0) 

1741 

1742 # Collect all output data 

1743 all_output_data = [] 

1744 for output_file in output_files: 

1745 all_output_data.extend( 

1746 get_csv_data(os.path.join(self.output_dir, output_file)) 

1747 ) 

1748 

1749 # Verify we have all expected entries 

1750 self.assertEqual(len(all_output_data), 5) # Should have 5 articles total 

1751 

1752 # Verify specific entries 

1753 article_1 = next( 

1754 item 

1755 for item in all_output_data 

1756 if item["id"] == f"omid:br/{supplier_prefix}1" 

1757 ) 

1758 article_1000 = next( 

1759 item 

1760 for item in all_output_data 

1761 if item["id"] == f"omid:br/{supplier_prefix}1000" 

1762 ) 

1763 article_1001 = next( 

1764 item 

1765 for item in all_output_data 

1766 if item["id"] == f"omid:br/{supplier_prefix}1001" 

1767 ) 

1768 article_2000 = next( 

1769 item 

1770 for item in all_output_data 

1771 if item["id"] == f"omid:br/{supplier_prefix}2000" 

1772 ) 

1773 article_2001 = next( 

1774 item 

1775 for item in all_output_data 

1776 if item["id"] == f"omid:br/{supplier_prefix}2001" 

1777 ) 

1778 

1779 # Check titles 

1780 self.assertEqual(article_1["title"], "Article 1") 

1781 self.assertEqual(article_1000["title"], "Article 1000") 

1782 self.assertEqual(article_1001["title"], "Article 1001") 

1783 self.assertEqual(article_2000["title"], "Article 2000") 

1784 self.assertEqual(article_2001["title"], "Article 2001") 

1785 

1786 # Check author for article 2001 (which has related entities) 

1787 self.assertEqual( 

1788 article_2001["author"], f"Test Author [omid:ra/{supplier_prefix}2001]" 

1789 ) 

1790 

1791 def test_max_rows_per_file_and_data_integrity(self): 

1792 """Test that output files respect max rows limit and no data is lost in multiprocessing""" 

1793 supplier_prefix = "060" 

1794 

1795 # Create test data with more than 3000 entries 

1796 br_data = [ 

1797 { 

1798 "@graph": [ 

1799 # Generate 3500 test entries 

1800 *[ 

1801 { 

1802 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}{i}", 

1803 "@type": [ 

1804 "http://purl.org/spar/fabio/Expression", 

1805 "http://purl.org/spar/fabio/JournalArticle", 

1806 ], 

1807 "http://purl.org/dc/terms/title": [ 

1808 {"@value": f"Article {i}"} 

1809 ], 

1810 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [ 

1811 {"@value": "2024-01-01"} 

1812 ], 

1813 } 

1814 for i in range(1, 3501) 

1815 ] # This will create 3500 entries 

1816 ] 

1817 } 

1818 ] 

1819 

1820 # Split data into multiple files to test multiprocessing 

1821 entries_per_file = 1000 

1822 for i in range(0, 3500, entries_per_file): 

1823 file_data = [{"@graph": br_data[0]["@graph"][i : i + entries_per_file]}] 

1824 

1825 # Create directory structure for the file 

1826 file_number = i + entries_per_file 

1827 dir_path = os.path.join(self.br_dir, supplier_prefix, "10000") 

1828 os.makedirs(dir_path, exist_ok=True) 

1829 

1830 # Write the file 

1831 with ZipFile(os.path.join(dir_path, f"{file_number}.zip"), "w") as zip_file: 

1832 zip_file.writestr(f"{file_number}.json", json.dumps(file_data)) 

1833 

1834 # Run generator 

1835 generate_csv( 

1836 input_dir=self.rdf_dir, 

1837 output_dir=self.output_dir, 

1838 dir_split_number=10000, 

1839 items_per_file=1000, 

1840 zip_output_rdf=True, 

1841 redis_port=6381, 

1842 redis_db=5, 

1843 ) 

1844 

1845 # Check output files 

1846 output_files = sorted(os.listdir(self.output_dir)) 

1847 

1848 # Verify number of output files 

1849 # We expect at least 2 files: 3500 entries should create 2 files (3000 + 500) 

1850 self.assertGreaterEqual( 

1851 len(output_files), 2, "Should have at least 2 output files for 3500 entries" 

1852 ) 

1853 

1854 # Collect all entries from all output files 

1855 all_entries = [] 

1856 for output_file in output_files: 

1857 entries = get_csv_data(os.path.join(self.output_dir, output_file)) 

1858 

1859 # Verify each file has at most 3000 rows 

1860 self.assertLessEqual( 

1861 len(entries), 

1862 3000, 

1863 f"File {output_file} has more than 3000 rows: {len(entries)}", 

1864 ) 

1865 

1866 all_entries.extend(entries) 

1867 

1868 # Verify total number of entries 

1869 self.assertEqual( 

1870 len(all_entries), 

1871 3500, 

1872 f"Expected 3500 total entries, got {len(all_entries)}", 

1873 ) 

1874 

1875 # Verify no duplicate entries 

1876 unique_ids = {entry["id"] for entry in all_entries} 

1877 self.assertEqual( 

1878 len(unique_ids), 

1879 3500, 

1880 f"Expected 3500 unique entries, got {len(unique_ids)}", 

1881 ) 

1882 

1883 # Verify all entries are present (no missing entries) 

1884 expected_ids = {f"omid:br/{supplier_prefix}{i}" for i in range(1, 3501)} 

1885 self.assertEqual( 

1886 unique_ids, 

1887 expected_ids, 

1888 "Some entries are missing or unexpected entries are present", 

1889 ) 

1890 

1891 # Verify data integrity 

1892 for i in range(1, 3501): 

1893 entry = next( 

1894 e for e in all_entries if e["id"] == f"omid:br/{supplier_prefix}{i}" 

1895 ) 

1896 self.assertEqual(entry["title"], f"Article {i}") 

1897 self.assertEqual(entry["pub_date"], "2024-01-01") 

1898 self.assertEqual(entry["type"], "journal article") 

1899 

1900 def test_csv_field_limit_handling(self): 

1901 """Test handling of CSV files with large fields that exceed the default limit""" 

1902 # Create a test CSV with a very large field 

1903 large_field = "omid:br/0601 " + " ".join( 

1904 [f"id:{i}" for i in range(20000)] 

1905 ) # This will create a field > 131072 chars 
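# Rough size check: 20,000 identifiers of ~7-8 characters each plus separators come to
# roughly 170,000 characters, which exceeds the csv module's default field_size_limit of 131,072.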

1906 test_data = {"id": large_field, "title": "Test Large Field"} 

1907 

1908 os.makedirs(self.output_dir, exist_ok=True) 

1909 with open( 

1910 os.path.join(self.output_dir, "large_field.csv"), 

1911 "w", 

1912 newline="", 

1913 encoding="utf-8", 

1914 ) as f: 

1915 writer = csv.DictWriter(f, fieldnames=["id", "title"]) 

1916 writer.writeheader() 

1917 writer.writerow(test_data) 

1918 

1919 # Try loading the data - this should trigger the field limit increase 

1920 count = load_processed_omids_to_redis(self.output_dir, self.redis_client) 

1921 

1922 # Verify the OMID was loaded despite the large field 

1923 self.assertEqual(count, 1) 

1924 self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client)) 

1925 

1926 def test_complex_br_with_missing_authors(self): 

1927 """Test processing of a complex BR with multiple related entities where authors might be missing""" 

1928 supplier_prefix = "06250" 

1929 br_data = [ 

1930 { 

1931 "@graph": [ 

1932 { 

1933 "@id": "https://w3id.org/oc/meta/br/062501777134", 

1934 "@type": [ 

1935 "http://purl.org/spar/fabio/JournalArticle", 

1936 "http://purl.org/spar/fabio/Expression", 

1937 ], 

1938 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [ 

1939 { 

1940 "@type": "http://www.w3.org/2001/XMLSchema#gYearMonth", 

1941 "@value": "2020-02", 

1942 } 

1943 ], 

1944 "http://purl.org/dc/terms/title": [ 

1945 { 

1946 "@value": "OpenCitations, An Infrastructure Organization For Open Scholarship" 

1947 } 

1948 ], 

1949 "http://purl.org/spar/datacite/hasIdentifier": [ 

1950 {"@id": "https://w3id.org/oc/meta/id/062501806985"}, 

1951 {"@id": "https://w3id.org/oc/meta/id/06850624745"}, 

1952 ], 

1953 "http://purl.org/spar/pro/isDocumentContextFor": [ 

1954 {"@id": "https://w3id.org/oc/meta/ar/062507977761"}, 

1955 {"@id": "https://w3id.org/oc/meta/ar/062507977760"}, 

1956 {"@id": "https://w3id.org/oc/meta/ar/062507977759"}, 

1957 ], 

1958 "http://purl.org/vocab/frbr/core#embodiment": [ 

1959 {"@id": "https://w3id.org/oc/meta/re/062501477439"} 

1960 ], 

1961 "http://purl.org/vocab/frbr/core#partOf": [ 

1962 {"@id": "https://w3id.org/oc/meta/br/062501778111"} 

1963 ], 

1964 } 

1965 ] 

1966 } 

1967 ] 

1968 

1969 ar_data = [ 

1970 { 

1971 "@graph": [ 

1972 { 

1973 "@id": "https://w3id.org/oc/meta/ar/062507977761", 

1974 "@type": ["http://purl.org/spar/pro/RoleInTime"], 

1975 "http://purl.org/spar/pro/isHeldBy": [ 

1976 {"@id": "https://w3id.org/oc/meta/ra/0610116105"} 

1977 ], 

1978 "http://purl.org/spar/pro/withRole": [ 

1979 {"@id": "http://purl.org/spar/pro/publisher"} 

1980 ], 

1981 }, 

1982 { 

1983 "@id": "https://w3id.org/oc/meta/ar/062507977760", 

1984 "@type": ["http://purl.org/spar/pro/RoleInTime"], 

1985 "http://purl.org/spar/pro/isHeldBy": [ 

1986 {"@id": "https://w3id.org/oc/meta/ra/0621010775619"} 

1987 ], 

1988 "http://purl.org/spar/pro/withRole": [ 

1989 {"@id": "http://purl.org/spar/pro/author"} 

1990 ], 

1991 }, 

1992 { 

1993 "@id": "https://w3id.org/oc/meta/ar/062507977759", 

1994 "@type": ["http://purl.org/spar/pro/RoleInTime"], 

1995 "http://purl.org/spar/pro/isHeldBy": [ 

1996 {"@id": "https://w3id.org/oc/meta/ra/0614010840729"} 

1997 ], 

1998 "http://purl.org/spar/pro/withRole": [ 

1999 {"@id": "http://purl.org/spar/pro/author"} 

2000 ], 

2001 "https://w3id.org/oc/ontology/hasNext": [ 

2002 {"@id": "https://w3id.org/oc/meta/ar/062507977760"} 

2003 ], 

2004 }, 

2005 ] 

2006 } 

2007 ] 

2008 

2009 ra_data_peroni = [ 

2010 { 

2011 "@graph": [ 

2012 { 

2013 "@id": "https://w3id.org/oc/meta/ra/0614010840729", 

2014 "@type": ["http://xmlns.com/foaf/0.1/Agent"], 

2015 "http://purl.org/spar/datacite/hasIdentifier": [ 

2016 {"@id": "https://w3id.org/oc/meta/id/06304949238"} 

2017 ], 

2018 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Peroni"}], 

2019 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Silvio"}], 

2020 "http://xmlns.com/foaf/0.1/name": [{"@value": "Peroni Silvio"}], 

2021 } 

2022 ] 

2023 } 

2024 ] 

2025 

2026 ra_data_shotton = [ 

2027 { 

2028 "@graph": [ 

2029 { 

2030 "@id": "https://w3id.org/oc/meta/ra/0621010775619", 

2031 "@type": ["http://xmlns.com/foaf/0.1/Agent"], 

2032 "http://purl.org/spar/datacite/hasIdentifier": [ 

2033 {"@id": "https://w3id.org/oc/meta/id/062404672414"} 

2034 ], 

2035 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Shotton"}], 

2036 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "D M"}], 

2037 "http://xmlns.com/foaf/0.1/name": [{"@value": "Shotton David"}], 

2038 } 

2039 ] 

2040 } 

2041 ] 

2042 
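# The hard-coded directory and zip names below are consistent with the split parameters passed to
# generate_csv (dir_split_number=10000, items_per_file=1000): an entity's sequential number is
# rounded up to the nearest multiple of 10,000 for the directory and of 1,000 for the zip/json file,
# e.g. br/062501777134 (supplier prefix 06250, number 1777134) lives in br/06250/1780000/1778000.zip.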

2043 # Create directory structure for BR data 

2044 br_dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "1780000") 

2045 os.makedirs(br_dir_path, exist_ok=True) 

2046 

2047 # Create directory structure for AR data 

2048 ar_dir_path = os.path.join(self.rdf_dir, "ar", supplier_prefix, "7980000") 

2049 os.makedirs(ar_dir_path, exist_ok=True) 

2050 

2051 # Create directory structure for RA data (Peroni) 

2052 ra_peroni_dir_path = os.path.join(self.rdf_dir, "ra", "06140", "10850000") 

2053 os.makedirs(ra_peroni_dir_path, exist_ok=True) 

2054 

2055 # Create directory structure for RA data (Shotton) 

2056 ra_shotton_dir_path = os.path.join(self.rdf_dir, "ra", "06210", "10780000") 

2057 os.makedirs(ra_shotton_dir_path, exist_ok=True) 

2058 

2059 # Write BR data 

2060 with ZipFile(os.path.join(br_dir_path, "1778000.zip"), "w") as zip_file: 

2061 zip_file.writestr("1778000.json", json.dumps(br_data)) 

2062 

2063 # Write AR data 

2064 with ZipFile(os.path.join(ar_dir_path, "7978000.zip"), "w") as zip_file: 

2065 zip_file.writestr("7978000.json", json.dumps(ar_data)) 

2066 

2067 # Write RA data (Peroni) 

2068 with ZipFile(os.path.join(ra_peroni_dir_path, "10841000.zip"), "w") as zip_file: 

2069 zip_file.writestr("10841000.json", json.dumps(ra_data_peroni)) 

2070 

2071 # Write RA data (Shotton) 

2072 with ZipFile( 

2073 os.path.join(ra_shotton_dir_path, "10776000.zip"), "w" 

2074 ) as zip_file: 

2075 zip_file.writestr("10776000.json", json.dumps(ra_data_shotton)) 

2076 

2077 # Run generator 

2078 generate_csv( 

2079 input_dir=self.rdf_dir, 

2080 output_dir=self.output_dir, 

2081 dir_split_number=10000, 

2082 items_per_file=1000, 

2083 zip_output_rdf=True, 

2084 redis_port=6381, 

2085 redis_db=5, 

2086 ) 

2087 

2088 # Check output 

2089 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

2090 self.assertEqual(len(output_data), 1) 

2091 # Verify basic metadata 

2092 article = output_data[0] 

2093 self.assertEqual( 

2094 article["title"], 

2095 "OpenCitations, An Infrastructure Organization For Open Scholarship", 

2096 ) 

2097 self.assertEqual(article["pub_date"], "2020-02") 

2098 self.assertEqual(article["type"], "journal article") 

2099 self.assertEqual(article["id"], "omid:br/062501777134") 

2100 

2101 # Both authors should be present, in hasNext chain order: Peroni first, then Shotton 

2102 expected_authors = ( 

2103 "Peroni, Silvio [omid:ra/0614010840729]; " 

2104 "Shotton, D M [omid:ra/0621010775619]" 

2105 ) 

2106 self.assertEqual(article["author"], expected_authors) 

2107 

2108 # Publisher field should still be empty since we haven't added the publisher RA data 

2109 self.assertEqual(article["publisher"], "") 

2110 

2111 def test_multiple_first_ars(self): 

2112 """Test behavior when there are multiple first ARs in the same chain (no hasNext pointing to them). 

2113 The current behavior is to process only one of the first ARs and its hasNext chain. 

2114 """ 

2115 supplier_prefix = "060" 

2116 br_data = [ 

2117 { 

2118 "@graph": [ 

2119 { 

2120 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

2121 "@type": [ 

2122 "http://purl.org/spar/fabio/Expression", 

2123 "http://purl.org/spar/fabio/JournalArticle", 

2124 ], 

2125 "http://purl.org/dc/terms/title": [ 

2126 {"@value": "Article With Multiple First Authors"} 

2127 ], 

2128 "http://purl.org/spar/pro/isDocumentContextFor": [ 

2129 { 

2130 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1" 

2131 }, # First potential author (will be processed) 

2132 { 

2133 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2" 

2134 }, # Second potential author (will be ignored) 

2135 { 

2136 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3" 

2137 }, # Connected to author 1 (will be processed) 

2138 ], 

2139 } 

2140 ] 

2141 } 

2142 ] 

2143 

2144 # Setup agent roles with two potential "first" authors (no hasNext pointing to them) 

2145 # and one author connected to the first one 

2146 ar_data = [ 

2147 { 

2148 "@graph": [ 

2149 { 

2150 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1", 

2151 "@type": ["http://purl.org/spar/pro/RoleInTime"], 

2152 "http://purl.org/spar/pro/withRole": [ 

2153 {"@id": "http://purl.org/spar/pro/author"} 

2154 ], 

2155 "http://purl.org/spar/pro/isHeldBy": [ 

2156 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"} 

2157 ], 

2158 "https://w3id.org/oc/ontology/hasNext": [ 

2159 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"} 

2160 ], 

2161 }, 

2162 { 

2163 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2", 

2164 "@type": ["http://purl.org/spar/pro/RoleInTime"], 

2165 "http://purl.org/spar/pro/withRole": [ 

2166 {"@id": "http://purl.org/spar/pro/author"} 

2167 ], 

2168 "http://purl.org/spar/pro/isHeldBy": [ 

2169 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"} 

2170 ], 

2171 # This is also a potential first author but will be ignored 

2172 }, 

2173 { 

2174 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3", 

2175 "@type": ["http://purl.org/spar/pro/RoleInTime"], 

2176 "http://purl.org/spar/pro/withRole": [ 

2177 {"@id": "http://purl.org/spar/pro/author"} 

2178 ], 

2179 "http://purl.org/spar/pro/isHeldBy": [ 

2180 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"} 

2181 ], 

2182 # This one is connected to author 1 via hasNext and will be processed 

2183 }, 

2184 ] 

2185 } 

2186 ] 

2187 

2188 # Setup responsible agents 

2189 ra_data = [ 

2190 { 

2191 "@graph": [ 

2192 { 

2193 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1", 

2194 "http://xmlns.com/foaf/0.1/name": [ 

2195 {"@value": "First Potential Author"} 

2196 ], 

2197 }, 

2198 { 

2199 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2", 

2200 "http://xmlns.com/foaf/0.1/name": [ 

2201 {"@value": "Second Potential Author"} 

2202 ], 

2203 }, 

2204 { 

2205 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3", 

2206 "http://xmlns.com/foaf/0.1/name": [ 

2207 {"@value": "Connected Author"} 

2208 ], 

2209 }, 

2210 ] 

2211 } 

2212 ] 

2213 

2214 # Write test data files 

2215 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data} 

2216 

2217 for entity_type, data in data_files.items(): 

2218 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000") 

2219 os.makedirs(dir_path, exist_ok=True) 

2220 

2221 zip_path = os.path.join(dir_path, "1000.zip") 

2222 with ZipFile(zip_path, "w") as zip_file: 

2223 zip_file.writestr("1000.json", json.dumps(data)) 

2224 

2225 # Run generator 

2226 generate_csv( 

2227 input_dir=self.rdf_dir, 

2228 output_dir=self.output_dir, 

2229 dir_split_number=10000, 

2230 items_per_file=1000, 

2231 zip_output_rdf=True, 

2232 redis_port=6381, 

2233 redis_db=5, 

2234 ) 

2235 

2236 # Check output 

2237 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

2238 self.assertEqual(len(output_data), 1) 

2239 

2240 article = output_data[0] 

2241 authors = article["author"].split("; ") 

2242 

2243 # Verify we have exactly two authors (the first one found and its connected author) 

2244 self.assertEqual( 

2245 len(authors), 

2246 2, 

2247 "Should have exactly two authors (first author and connected one)", 

2248 ) 

2249 

2250 # Verify the specific authors we expect 

2251 expected_authors = [ 

2252 f"First Potential Author [omid:ra/{supplier_prefix}1]", 

2253 f"Connected Author [omid:ra/{supplier_prefix}3]", 

2254 ] 

2255 self.assertEqual( 

2256 authors, 

2257 expected_authors, 

2258 "Should have first author and connected author in correct order", 

2259 ) 

2260 

2261 # Verify the second potential author is NOT in the output 

2262 self.assertNotIn( 

2263 f"Second Potential Author [omid:ra/{supplier_prefix}2]", 

2264 article["author"], 

2265 "Second potential author should not be in the output", 

2266 ) 

2267 

2268 

2269if __name__ == "__main__": 

2270 unittest.main()
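# Note: these tests assume a Redis server is reachable on port 6381 (test database 5),
# matching the parameters used in setUp() and passed to generate_csv() throughout this module.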