Coverage for test/csv_generator_lite_test.py: 99%

455 statements  

coverage.py v6.5.0, created at 2025-07-14 14:06 +0000

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2024 Arcangelo Massari <arcangelo.massari@unibo.it>
#
# Permission to use, copy, modify, and/or distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright notice
# and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

import csv
import json
import os
import unittest
from shutil import rmtree
from zipfile import ZipFile

import redis
from oc_meta.lib.file_manager import get_csv_data
from oc_meta.plugins.csv_generator_lite.csv_generator_lite import (
    generate_csv,
    init_redis_connection,
    is_omid_processed,
    load_processed_omids_to_redis,
)


class TestCSVGeneratorLite(unittest.TestCase):
    def setUp(self):
        self.base_dir = os.path.join("test", "csv_generator_lite")
        self.input_dir = os.path.join(self.base_dir, "input")
        self.output_dir = os.path.join(self.base_dir, "output")

        # Create test directories if they don't exist
        os.makedirs(self.input_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)

        # Create test RDF structure
        self.rdf_dir = os.path.join(self.input_dir, "rdf")
        self.br_dir = os.path.join(self.rdf_dir, "br")
        os.makedirs(self.br_dir, exist_ok=True)

        # Initialize Redis connection for tests
        self.redis_client = init_redis_connection(db=5)  # Use DB 5 for testing
        self.redis_client.flushdb()  # Clear test database

    def tearDown(self):
        if os.path.exists(self.base_dir):
            rmtree(self.base_dir)
        # Clean up Redis test database
        self.redis_client.flushdb()

    def _write_test_data(self, data):
        """Helper method to write test data to the input directory"""
        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        test_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/{item['id'].replace('omid:', '')}",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": item["title"]}],
                    }
                    for item in data
                ]
            }
        ]
        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data))

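    # Illustrative sketch (a hypothetical helper, not part of the original
    # suite): the tests assume OpenCitations Meta's bucketed storage layout,
    # where entity number N for supplier prefix "060" is stored under
    # rdf/br/060/<dir bucket>/<file bucket>.zip. With dir_split_number=10000
    # and items_per_file=1000, the mapping below reproduces the
    # "060/10000/1000.zip" path hard-coded in _write_test_data and the
    # 2000.zip/3000.zip paths used later in test_multiple_input_files.
    def _zip_path_for(self, entity_number):
        dir_bucket = ((entity_number - 1) // 10000 + 1) * 10000
        file_bucket = ((entity_number - 1) // 1000 + 1) * 1000
        return os.path.join(self.br_dir, "060", str(dir_bucket), f"{file_bucket}.zip")
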

    def test_redis_connection_and_caching(self):
        """Test Redis connection and basic caching operations"""
        # Test connection initialization
        redis_client = init_redis_connection(db=5)
        self.assertIsInstance(redis_client, redis.Redis)

        # Create a test CSV file with some OMIDs
        test_data = [
            {"id": "omid:br/0601", "title": "Test 1"},
            {"id": "omid:br/0602", "title": "Test 2"},
            {"id": "omid:br/0603 issn:456", "title": "Test 3"},
        ]
        os.makedirs(self.output_dir, exist_ok=True)
        with open(
            os.path.join(self.output_dir, "test.csv"), "w", newline="", encoding="utf-8"
        ) as f:
            writer = csv.DictWriter(f, fieldnames=["id", "title"])
            writer.writeheader()
            writer.writerows(test_data)

        # Test loading OMIDs into Redis
        count = load_processed_omids_to_redis(self.output_dir, redis_client)
        self.assertEqual(count, 3)

        # Test OMID lookup
        self.assertTrue(is_omid_processed("omid:br/0601", redis_client))
        self.assertTrue(is_omid_processed("omid:br/0602", redis_client))
        self.assertTrue(is_omid_processed("omid:br/0603", redis_client))
        self.assertFalse(is_omid_processed("omid:br/0604", redis_client))

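    # Illustrative sketch (an assumption about csv_generator_lite's internals,
    # not code from the plugin): the assertions above imply that
    # load_processed_omids_to_redis splits each CSV "id" cell on whitespace and
    # caches only the omid: token, which is why "omid:br/0603 issn:456" makes
    # "omid:br/0603" — but not "issn:456" — count as processed.
    @staticmethod
    def _omids_in_id_cell(id_cell):
        return [token for token in id_cell.split() if token.startswith("omid:")]
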

    def test_redis_cache_persistence(self):
        """Test that Redis is populated from existing CSV files and cleared after completion"""
        # Create initial test data
        test_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/0601",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "First Run"}],
                    }
                ]
            }
        ]

        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data))

        # First run - creates initial CSV
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
            redis_db=5,
        )

        # Verify Redis is empty after first run
        self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))

        # Create new test data
        test_data_2 = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/0601",  # Same OMID as before
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Should Be Skipped"}
                        ],
                    },
                    {
                        "@id": "https://w3id.org/oc/meta/br/0602",  # New OMID
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Should Be Processed"}
                        ],
                    },
                ]
            }
        ]

        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data_2))

        # Second run - should load OMIDs from existing CSV and skip already processed resources
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
            redis_db=5,
        )

        # Check output files
        output_data = []
        for filename in os.listdir(self.output_dir):
            if filename.endswith(".csv"):
                output_data.extend(
                    get_csv_data(os.path.join(self.output_dir, filename))
                )

        # Verify results
        # Should find exactly two entries - one from first run and one new one
        self.assertEqual(len(output_data), 2)

        # Find entries by title
        first_run_entry = next(
            item for item in output_data if item["title"] == "First Run"
        )
        second_run_entry = next(
            item for item in output_data if item["title"] == "Should Be Processed"
        )

        # Verify the first entry wasn't overwritten with "Should Be Skipped"
        self.assertEqual(first_run_entry["title"], "First Run")
        self.assertEqual(first_run_entry["id"], "omid:br/0601")

        # Verify the new entry was processed
        self.assertEqual(second_run_entry["title"], "Should Be Processed")
        self.assertEqual(second_run_entry["id"], "omid:br/0602")

        # Verify Redis is empty after completion
        self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
        self.assertFalse(is_omid_processed("omid:br/0602", self.redis_client))

    def test_redis_cache_cleanup(self):
        """Test that Redis cache is properly cleaned up in various scenarios"""
        # First run - should process successfully and clear Redis
        input_data = [{"id": "omid:br/0601", "title": "First Entry"}]
        self._write_test_data(input_data)

        # Run with valid directory - should process and clear Redis
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
            redis_db=5,
        )

        # Verify Redis is empty after successful run
        self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))

        # Load processed OMIDs into Redis
        load_processed_omids_to_redis(self.output_dir, self.redis_client)

        # Verify that after loading from CSV, the OMID is in Redis
        self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))

        # Run with non-existent directory - should fail but keep Redis populated
        generate_csv(
            input_dir="/nonexistent/dir",
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
            redis_db=5,
        )

        # Verify Redis still has the data after failed run
        self.assertTrue(
            is_omid_processed("omid:br/0601", self.redis_client),
            "Redis cache should be retained after a failed run",
        )

    def test_redis_error_handling(self):
        """Test handling of Redis connection errors"""
        # Test with invalid Redis connection
        with self.assertRaises(redis.ConnectionError):
            init_redis_connection(port=12345)  # Invalid port

        # Test loading OMIDs with non-existent directory
        count = load_processed_omids_to_redis("/nonexistent/dir", self.redis_client)
        self.assertEqual(count, 0)

    def test_concurrent_processing_with_redis(self):
        """Test concurrent processing with Redis caching"""
        # Create multiple test files
        test_data = []
        for i in range(100):  # Create 100 test entries
            test_data.append(
                {
                    "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
                    "@type": [
                        "http://purl.org/spar/fabio/Expression",
                        "http://purl.org/spar/fabio/JournalArticle",
                    ],
                    "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
                }
            )

        # Split into multiple files
        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        for i in range(0, 100, 10):  # Create 10 files with 10 entries each
            file_data = [{"@graph": test_data[i : i + 10]}]
            with ZipFile(
                os.path.join(self.br_dir, "060", "10000", f"{i+1000}.zip"), "w"
            ) as zip_file:
                zip_file.writestr(f"{i+1000}.json", json.dumps(file_data))

        # First run to create some CSV files
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
            redis_db=5,
        )

        # Create more test entries
        more_test_data = []
        for i in range(100, 200):  # Create 100 more test entries
            more_test_data.append(
                {
                    "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
                    "@type": [
                        "http://purl.org/spar/fabio/Expression",
                        "http://purl.org/spar/fabio/JournalArticle",
                    ],
                    "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
                }
            )

        # Add new files
        for i in range(0, 100, 10):
            file_data = [{"@graph": more_test_data[i : i + 10]}]
            with ZipFile(
                os.path.join(self.br_dir, "060", "10000", f"{i+2000}.zip"), "w"
            ) as zip_file:
                zip_file.writestr(f"{i+2000}.json", json.dumps(file_data))

        # Second run with existing cache
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
            redis_db=5,
        )

        # Verify results
        all_output_data = []
        for filename in os.listdir(self.output_dir):
            if filename.endswith(".csv"):
                all_output_data.extend(
                    get_csv_data(os.path.join(self.output_dir, filename))
                )

        # Should have processed all 200 entries
        self.assertEqual(len(all_output_data), 200)

        # Verify no duplicates
        processed_ids = {row["id"] for row in all_output_data}
        self.assertEqual(len(processed_ids), 200)


    def test_basic_br_processing(self):
        """Test basic bibliographic resource processing"""
        test_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/0601",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Article"}],
                        "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                            {"@value": "2024-01-01"}
                        ],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/0601"}
                        ],
                    }
                ],
                "@id": "https://w3id.org/oc/meta/br/",
            }
        ]

        # Write test data to file
        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_files = os.listdir(self.output_dir)
        self.assertEqual(len(output_files), 1)

        output_data = get_csv_data(os.path.join(self.output_dir, output_files[0]))
        self.assertEqual(len(output_data), 1)
        self.assertEqual(output_data[0]["title"], "Test Article")
        self.assertEqual(output_data[0]["pub_date"], "2024-01-01")
        self.assertEqual(output_data[0]["type"], "journal article")
        self.assertEqual(output_data[0]["id"], "omid:br/0601")

    def test_complex_br_with_related_entities(self):
        """Test processing of BR with authors, venue, and other related entities"""
        # Create directory structure for each entity type
        supplier_prefix = "060"
        for entity_type in ["br", "ra", "ar", "id"]:
            os.makedirs(
                os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000"),
                exist_ok=True,
            )

        # BR data including both the article and the venue
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Complex Article"}
                        ],
                        "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                            {"@value": "2024-02-01"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {"@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Journal",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
                    },
                ],
                "@id": "https://w3id.org/oc/meta/br/",
            }
        ]

        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                    }
                ],
                "@id": "https://w3id.org/oc/meta/ar/",
            }
        ]

        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
                    }
                ],
                "@id": "https://w3id.org/oc/meta/ra/",
            }
        ]

        # Write test data files in correct locations
        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            zip_path = os.path.join(
                self.rdf_dir, entity_type, supplier_prefix, "10000", "1000.zip"
            )
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 2)  # Should have 2 rows: article and journal

        # Find article and journal entries
        article = next(
            (item for item in output_data if item["type"] == "journal article"), None
        )
        journal = next(
            (item for item in output_data if item["type"] == "journal"), None
        )

        # Verify article data
        self.assertIsNotNone(article)
        self.assertEqual(article["title"], "Complex Article")
        self.assertEqual(article["venue"], f"Test Journal [omid:br/{supplier_prefix}3]")
        self.assertEqual(article["author"], "Test Author [omid:ra/0601]")
        self.assertEqual(article["id"], f"omid:br/{supplier_prefix}2")

        # Verify journal data
        self.assertIsNotNone(journal)
        self.assertEqual(journal["title"], "Test Journal")
        self.assertEqual(journal["type"], "journal")
        self.assertEqual(journal["id"], f"omid:br/{supplier_prefix}3")

    def test_empty_input_directory(self):
        """Test behavior with empty input directory"""
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        self.assertEqual(len(os.listdir(self.output_dir)), 0)

    def test_br_with_multiple_authors_and_editors(self):
        """Test processing of BR with multiple authors and editors"""
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Book",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Multi-Author Book"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
                            },  # First author
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
                            },  # Second author
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
                            },  # First editor
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"
                            },  # Second editor
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles for authors and editors with hasNext relations
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/editor"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/editor"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Doe"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Jane"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Brown"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Bob"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Wilson"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Alice"}],
                    },
                ]
            }
        ]

        # Write test data files
        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        # Verify authors and editors are in the correct order
        expected_authors = (
            f"Smith, John [omid:ra/{supplier_prefix}1]; "
            f"Doe, Jane [omid:ra/{supplier_prefix}2]"
        )
        expected_editors = (
            f"Brown, Bob [omid:ra/{supplier_prefix}3]; "
            f"Wilson, Alice [omid:ra/{supplier_prefix}4]"
        )

        self.assertEqual(output_data[0]["author"], expected_authors)
        self.assertEqual(output_data[0]["editor"], expected_editors)


    def test_br_with_identifiers(self):
        """Test processing of BR with multiple identifiers"""
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article With DOI"}
                        ],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1"},
                            {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2"},
                        ],
                    }
                ]
            }
        ]

        id_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1",
                        "http://purl.org/spar/datacite/usesIdentifierScheme": [
                            {"@id": "http://purl.org/spar/datacite/doi"}
                        ],
                        "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                            {"@value": "10.1234/test.123"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2",
                        "http://purl.org/spar/datacite/usesIdentifierScheme": [
                            {"@id": "http://purl.org/spar/datacite/isbn"}
                        ],
                        "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                            {"@value": "978-0-123456-47-2"}
                        ],
                    },
                ]
            }
        ]

        # Write test data files in correct locations
        data_files = {"br": br_data, "id": id_data}

        for entity_type, data in data_files.items():
            # Create all necessary directories
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        # Verify all identifiers are included
        expected_ids = (
            f"omid:br/{supplier_prefix}1 doi:10.1234/test.123 isbn:978-0-123456-47-2"
        )
        self.assertEqual(output_data[0]["id"], expected_ids)

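    # Illustrative sketch (an assumption, not the plugin's actual code): the
    # expected value above implies that the "id" column is assembled with the
    # OMID first, followed by space-separated scheme:value pairs for the
    # resource's external identifiers.
    @staticmethod
    def _build_id_cell(omid, identifiers):
        # identifiers is e.g. [("doi", "10.1234/test.123"), ("isbn", "978-0-123456-47-2")]
        return " ".join([omid] + [f"{scheme}:{value}" for scheme, value in identifiers])
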

    def test_br_with_page_numbers(self):
        """Test processing of BR with page information"""
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Paged Article"}],
                        "http://purl.org/vocab/frbr/core#embodiment": [
                            {"@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1"}
                        ],
                    }
                ]
            }
        ]

        re_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1",
                        "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
                            {"@value": "100"}
                        ],
                        "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
                            {"@value": "120"}
                        ],
                    }
                ]
            }
        ]

        # Write test data files in correct locations
        data_files = {"br": br_data, "re": re_data}

        for entity_type, data in data_files.items():
            # Create all necessary directories
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)
        self.assertEqual(output_data[0]["page"], "100-120")

    def test_malformed_data_handling(self):
        """Test handling of malformed or incomplete data"""
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        # Missing title
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": "invalid_uri"},  # Invalid URI
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {"@id": "non_existent_venue"}  # Non-existent venue
                        ],
                    }
                ]
            }
        ]

        # Write test data files in correct locations
        data_files = {"br": br_data}

        for entity_type, data in data_files.items():
            # Create all necessary directories
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)
        # Verify graceful handling of missing/invalid data
        self.assertEqual(output_data[0]["title"], "")
        self.assertEqual(output_data[0]["author"], "")
        self.assertEqual(output_data[0]["venue"], "")


    def test_br_with_hierarchical_venue_structures(self):
        """Test different hierarchical venue structures (issue->volume->journal, issue->journal, volume->journal, direct journal)"""
        supplier_prefix = "060"

        # Create test data for different hierarchical structures
        br_data = [
            {
                "@graph": [
                    # Article in issue->volume->journal structure
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Full Hierarchy"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
                            }  # Issue
                        ],
                    },
                    # Article in issue->journal structure (no volume)
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}5",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Issue-Journal"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6"
                            }  # Issue
                        ],
                    },
                    # Article in volume->journal structure (no issue)
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}9",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Volume-Journal"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10"
                            }  # Volume
                        ],
                    },
                    # Article directly in journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}13",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Journal"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                    # Issue in full hierarchy
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
                        "@type": ["http://purl.org/spar/fabio/JournalIssue"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "2"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"
                            }  # Volume
                        ],
                    },
                    # Volume in full hierarchy
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
                        "@type": ["http://purl.org/spar/fabio/JournalVolume"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "42"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                    # Journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4",
                        "@type": ["http://purl.org/spar/fabio/Journal"],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
                    },
                    # Issue directly in journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6",
                        "@type": ["http://purl.org/spar/fabio/JournalIssue"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "3"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                    # Volume directly in journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10",
                        "@type": ["http://purl.org/spar/fabio/JournalVolume"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "5"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                ]
            }
        ]

        # Write test data files
        dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
        os.makedirs(dir_path, exist_ok=True)

        zip_path = os.path.join(dir_path, "1000.zip")
        with ZipFile(zip_path, "w") as zip_file:
            zip_file.writestr("1000.json", json.dumps(br_data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))

        # Verify we only have the articles and journal in the output
        self.assertEqual(len(output_data), 5)  # 4 articles + 1 journal

        # Verify no JournalVolume or JournalIssue entries exist
        volume_or_issue_entries = [
            item
            for item in output_data
            if item["type"] in ["journal volume", "journal issue"]
        ]
        self.assertEqual(len(volume_or_issue_entries), 0)

        # Find each article by title
        full_hierarchy = next(
            item for item in output_data if item["title"] == "Article in Full Hierarchy"
        )
        issue_journal = next(
            item for item in output_data if item["title"] == "Article in Issue-Journal"
        )
        volume_journal = next(
            item for item in output_data if item["title"] == "Article in Volume-Journal"
        )
        direct_journal = next(
            item for item in output_data if item["title"] == "Article in Journal"
        )

        # Test full hierarchy (issue->volume->journal)
        self.assertEqual(full_hierarchy["issue"], "2")
        self.assertEqual(full_hierarchy["volume"], "42")
        self.assertEqual(
            full_hierarchy["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

        # Test issue->journal (no volume)
        self.assertEqual(issue_journal["issue"], "3")
        self.assertEqual(issue_journal["volume"], "")
        self.assertEqual(
            issue_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

        # Test volume->journal (no issue)
        self.assertEqual(volume_journal["issue"], "")
        self.assertEqual(volume_journal["volume"], "5")
        self.assertEqual(
            volume_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

        # Test direct journal connection
        self.assertEqual(direct_journal["issue"], "")
        self.assertEqual(direct_journal["volume"], "")
        self.assertEqual(
            direct_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

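    # Illustrative sketch (assumed logic, inferred from the assertions above,
    # not the plugin's actual implementation): to fill the issue/volume/venue
    # columns the generator presumably walks the frbr:partOf chain upwards,
    # recording sequence identifiers for JournalIssue/JournalVolume nodes and
    # stopping at the first container that is neither, which becomes the venue.
    @staticmethod
    def _resolve_hierarchy(entity_id, index):
        # index maps an OMID to a dict with keys "types" (set of class names),
        # "sequence" (str), "title" (str) and "partOf" (parent OMID or None)
        issue = volume = venue = ""
        current = index.get(entity_id, {}).get("partOf")
        while current is not None:
            node = index[current]
            if "JournalIssue" in node["types"]:
                issue = node.get("sequence", "")
            elif "JournalVolume" in node["types"]:
                volume = node.get("sequence", "")
            else:
                venue = f"{node.get('title', '')} [{current}]"
                break
            current = node.get("partOf")
        return issue, volume, venue
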

    def test_book_in_series(self):
        """Test processing of a book that is part of a book series"""
        supplier_prefix = "060"

        # Create test data for book in series
        br_data = [
            {
                "@graph": [
                    # Book
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Book",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Book"}],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
                            }  # Series
                        ],
                    },
                    # Book Series
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
                        "@type": ["http://purl.org/spar/fabio/BookSeries"],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Test Book Series"}
                        ],
                    },
                ]
            }
        ]

        # Write test data
        dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
        os.makedirs(dir_path, exist_ok=True)

        zip_path = os.path.join(dir_path, "1000.zip")
        with ZipFile(zip_path, "w") as zip_file:
            zip_file.writestr("1000.json", json.dumps(br_data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))

        # Find book entry
        book = next(item for item in output_data if item["type"] == "book")

        # Verify book is correctly linked to series
        self.assertEqual(book["title"], "Test Book")
        self.assertEqual(
            book["venue"], f"Test Book Series [omid:br/{supplier_prefix}2]"
        )
        self.assertEqual(book["volume"], "")  # Should not have volume
        self.assertEqual(book["issue"], "")  # Should not have issue

    def test_br_with_multiple_roles(self):
        """Test processing of BR with authors, editors and publishers"""
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Book",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Multi-Role Book"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
                            },  # Author
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
                            },  # Editor
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
                            },  # Publisher
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles for authors, editors and publishers
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/editor"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/publisher"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents with different name formats
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Editor Name"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [
                            {"@value": "Publisher House"}
                        ],
                    },
                ]
            }
        ]

        # Write test data files
        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        # Verify all roles are correctly processed
        book = output_data[0]
        self.assertEqual(book["title"], "Multi-Role Book")
        self.assertEqual(book["author"], f"Smith, John [omid:ra/{supplier_prefix}1]")
        self.assertEqual(book["editor"], f"Editor Name [omid:ra/{supplier_prefix}2]")
        self.assertEqual(
            book["publisher"], f"Publisher House [omid:ra/{supplier_prefix}3]"
        )

    def test_ordered_authors(self):
        """Test that authors are ordered according to hasNext relations"""
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Ordered Authors Article"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles with hasNext relations
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents with different names
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
                    },
                ]
            }
        ]

        # Write test data files
        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        # Verify authors are in the correct order
        expected_authors = (
            f"First Author [omid:ra/{supplier_prefix}1]; "
            f"Second Author [omid:ra/{supplier_prefix}2]; "
            f"Third Author [omid:ra/{supplier_prefix}3]"
        )
        self.assertEqual(output_data[0]["author"], expected_authors)


    def test_cyclic_hasNext_relations(self):
        """Test handling of cyclic hasNext relations between agent roles"""
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Cyclic Authors Article"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles with cyclic hasNext relations
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        # Creates a cycle: 1 -> 2 -> 3 -> 1
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                        # Cycle completion
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
                    },
                ]
            }
        ]

        # Write test data files
        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        # Verify that we get at least some authors before the cycle is detected
        # The order should be maintained until the cycle is detected
        authors = output_data[0]["author"].split("; ")
        self.assertGreater(len(authors), 0)

        # Verify the presence and order of authors
        self.assertTrue(
            any(
                f"First Author [omid:ra/{supplier_prefix}1]" in author
                for author in authors
            )
        )
        self.assertTrue(
            any(
                f"Second Author [omid:ra/{supplier_prefix}2]" in author
                for author in authors
            )
        )

        # Verify no duplicates in the output
        author_set = set(authors)
        self.assertEqual(
            len(authors),
            len(author_set),
            "Found duplicate authors in output: each author should appear exactly once",
        )

        # Verify the exact order and number of authors
        expected_authors = [
            f"First Author [omid:ra/{supplier_prefix}1]",
            f"Second Author [omid:ra/{supplier_prefix}2]",
            f"Third Author [omid:ra/{supplier_prefix}3]",
        ]
        self.assertEqual(
            authors,
            expected_authors,
            "Authors should be in correct order and each should appear exactly once",
        )

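    # Illustrative sketch (an assumed implementation detail): the behaviour
    # tested above — each author emitted once, in chain order, even when
    # hasNext loops back — matches a linked-list traversal guarded by a
    # visited set:
    @staticmethod
    def _ordered_roles(first_ar, has_next):
        # has_next maps an agent-role id to the next one (or None)
        ordered, seen = [], set()
        current = first_ar
        while current is not None and current not in seen:
            seen.add(current)
            ordered.append(current)
            current = has_next.get(current)
        return ordered
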

1559 def test_multiple_input_files(self): 

1560 """Test processing of multiple input files with sequential entity IDs""" 

1561 supplier_prefix = "060" 

1562 

1563 # Create test data spanning multiple files 

1564 # First file (entities 1-1000) 

1565 br_data_1 = [ 

1566 { 

1567 "@graph": [ 

1568 { 

1569 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

1570 "@type": [ 

1571 "http://purl.org/spar/fabio/Expression", 

1572 "http://purl.org/spar/fabio/JournalArticle", 

1573 ], 

1574 "http://purl.org/dc/terms/title": [{"@value": "Article 1"}], 

1575 }, 

1576 { 

1577 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1000", 

1578 "@type": [ 

1579 "http://purl.org/spar/fabio/Expression", 

1580 "http://purl.org/spar/fabio/JournalArticle", 

1581 ], 

1582 "http://purl.org/dc/terms/title": [{"@value": "Article 1000"}], 

1583 }, 

1584 ] 

1585 } 

1586 ] 

1587 

1588 # Second file (entities 1001-2000) 

1589 br_data_2 = [ 

1590 { 

1591 "@graph": [ 

1592 { 

1593 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1001", 

1594 "@type": [ 

1595 "http://purl.org/spar/fabio/Expression", 

1596 "http://purl.org/spar/fabio/JournalArticle", 

1597 ], 

1598 "http://purl.org/dc/terms/title": [{"@value": "Article 1001"}], 

1599 }, 

1600 { 

1601 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2000", 

1602 "@type": [ 

1603 "http://purl.org/spar/fabio/Expression", 

1604 "http://purl.org/spar/fabio/JournalArticle", 

1605 ], 

1606 "http://purl.org/dc/terms/title": [{"@value": "Article 2000"}], 

1607 }, 

1608 ] 

1609 } 

1610 ] 

1611 

1612 # Third file (entities 2001-3000) 

1613 br_data_3 = [ 

1614 { 

1615 "@graph": [ 

1616 { 

1617 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2001", 

1618 "@type": [ 

1619 "http://purl.org/spar/fabio/Expression", 

1620 "http://purl.org/spar/fabio/JournalArticle", 

1621 ], 

1622 "http://purl.org/dc/terms/title": [{"@value": "Article 2001"}], 

1623 "http://purl.org/spar/pro/isDocumentContextFor": [ 

1624 { 

1625 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001" 

1626 } 

1627 ], 

1628 } 

1629 ] 

1630 } 

1631 ] 

1632 

1633 # Create agent role data in a different file 

1634 ar_data = [ 

1635 { 

1636 "@graph": [ 

1637 { 

1638 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001", 

1639 "http://purl.org/spar/pro/withRole": [ 

1640 {"@id": "http://purl.org/spar/pro/author"} 

1641 ], 

1642 "http://purl.org/spar/pro/isHeldBy": [ 

1643 { 

1644 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001" 

1645 } 

1646 ], 

1647 } 

1648 ] 

1649 } 

1650 ] 

1651 

1652 # Create responsible agent data in a different file 

1653 ra_data = [ 

1654 { 

1655 "@graph": [ 

1656 { 

1657 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001", 

1658 "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}], 

1659 } 

1660 ] 

1661 } 

1662 ] 

1663 

1664 # Write test data to appropriate locations based on ID ranges 

1665 os.makedirs(os.path.join(self.br_dir, supplier_prefix, "10000"), exist_ok=True) 

1666 os.makedirs( 

1667 os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000"), exist_ok=True 

1668 ) 

1669 os.makedirs( 

1670 os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000"), exist_ok=True 

1671 ) 

1672 

1673 # Write BR files 

1674 with ZipFile( 

1675 os.path.join(self.br_dir, supplier_prefix, "10000", "1000.zip"), "w" 

1676 ) as zip_file: 

1677 zip_file.writestr("1000.json", json.dumps(br_data_1)) 

1678 with ZipFile( 

1679 os.path.join(self.br_dir, supplier_prefix, "10000", "2000.zip"), "w" 

1680 ) as zip_file: 

1681 zip_file.writestr("2000.json", json.dumps(br_data_2)) 

1682 with ZipFile( 

1683 os.path.join(self.br_dir, supplier_prefix, "10000", "3000.zip"), "w" 

1684 ) as zip_file: 

1685 zip_file.writestr("3000.json", json.dumps(br_data_3)) 

1686 

1687 # Write AR and RA files 

1688 with ZipFile( 

1689 os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000", "3000.zip"), "w" 

1690 ) as zip_file: 

1691 zip_file.writestr("3000.json", json.dumps(ar_data)) 

1692 with ZipFile( 

1693 os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000", "3000.zip"), "w" 

1694 ) as zip_file: 

1695 zip_file.writestr("3000.json", json.dumps(ra_data)) 

1696 

1697 # Run generator 

1698 generate_csv( 

1699 input_dir=self.rdf_dir, 

1700 output_dir=self.output_dir, 

1701 dir_split_number=10000, 

1702 items_per_file=1000, 

1703 zip_output_rdf=True, 

1704 ) 

1705 

1706 # Check output 

1707 output_files = sorted(os.listdir(self.output_dir)) 

1708 self.assertGreater(len(output_files), 0) 

1709 

1710 # Collect all output data 

1711 all_output_data = [] 

1712 for output_file in output_files: 

1713 all_output_data.extend( 

1714 get_csv_data(os.path.join(self.output_dir, output_file)) 

1715 ) 

1716 

        # Verify we have all expected entries
        self.assertEqual(len(all_output_data), 5)  # Should have 5 articles total

        # Verify specific entries (index rows by OMID for direct lookup)
        rows_by_id = {item["id"]: item for item in all_output_data}
        article_1 = rows_by_id[f"omid:br/{supplier_prefix}1"]
        article_1000 = rows_by_id[f"omid:br/{supplier_prefix}1000"]
        article_1001 = rows_by_id[f"omid:br/{supplier_prefix}1001"]
        article_2000 = rows_by_id[f"omid:br/{supplier_prefix}2000"]
        article_2001 = rows_by_id[f"omid:br/{supplier_prefix}2001"]

        # Check titles
        self.assertEqual(article_1["title"], "Article 1")
        self.assertEqual(article_1000["title"], "Article 1000")
        self.assertEqual(article_1001["title"], "Article 1001")
        self.assertEqual(article_2000["title"], "Article 2000")
        self.assertEqual(article_2001["title"], "Article 2001")

        # Check the author for article 2001 (the one with related entities)
        self.assertEqual(
            article_2001["author"], f"Test Author [omid:ra/{supplier_prefix}2001]"
        )

    def test_max_rows_per_file_and_data_integrity(self):
        """Test that output files respect the max rows limit and that no data is lost in multiprocessing"""
        supplier_prefix = "060"

        # Create test data with more than 3000 entries (3500, so that the
        # row limit on output files forces a second file)
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}{i}",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": f"Article {i}"}
                        ],
                        "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                            {"@value": "2024-01-01"}
                        ],
                    }
                    for i in range(1, 3501)
                ]
            }
        ]

        # Split the data across multiple input files to exercise multiprocessing
        entries_per_file = 1000
        for i in range(0, 3500, entries_per_file):
            file_data = [{"@graph": br_data[0]["@graph"][i : i + entries_per_file]}]

            # Create the directory structure for the file
            file_number = i + entries_per_file
            dir_path = os.path.join(self.br_dir, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            # Write the file
            with ZipFile(os.path.join(dir_path, f"{file_number}.zip"), "w") as zip_file:
                zip_file.writestr(f"{file_number}.json", json.dumps(file_data))
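        # (Each input zip can presumably be handed to a different worker, so
        # four input files give the multiprocessing path something to
        # distribute.)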

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output files
        output_files = sorted(os.listdir(self.output_dir))

        # Verify the number of output files: 3500 entries at 3000 rows per
        # file should yield at least 2 files (3000 + 500)
        self.assertGreaterEqual(
            len(output_files), 2, "Should have at least 2 output files for 3500 entries"
        )
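        # (The 3000-row cap is not among the generate_csv arguments above, so
        # it is presumably a constant internal to csv_generator_lite.)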

        # Collect all entries from all output files
        all_entries = []
        for output_file in output_files:
            entries = get_csv_data(os.path.join(self.output_dir, output_file))

            # Verify each file has at most 3000 rows
            self.assertLessEqual(
                len(entries),
                3000,
                f"File {output_file} has more than 3000 rows: {len(entries)}",
            )

            all_entries.extend(entries)

        # Verify the total number of entries
        self.assertEqual(
            len(all_entries),
            3500,
            f"Expected 3500 total entries, got {len(all_entries)}",
        )

        # Verify there are no duplicate entries
        unique_ids = {entry["id"] for entry in all_entries}
        self.assertEqual(
            len(unique_ids),
            3500,
            f"Expected 3500 unique entries, got {len(unique_ids)}",
        )

        # Verify all entries are present (none missing, none unexpected)
        expected_ids = {f"omid:br/{supplier_prefix}{i}" for i in range(1, 3501)}
        self.assertEqual(
            unique_ids,
            expected_ids,
            "Some entries are missing or unexpected entries are present",
        )

        # Verify data integrity for every entry (index once so the loop stays linear)
        entries_by_id = {entry["id"]: entry for entry in all_entries}
        for i in range(1, 3501):
            entry = entries_by_id[f"omid:br/{supplier_prefix}{i}"]
            self.assertEqual(entry["title"], f"Article {i}")
            self.assertEqual(entry["pub_date"], "2024-01-01")
            self.assertEqual(entry["type"], "journal article")

    def test_csv_field_limit_handling(self):
        """Test handling of CSV files with large fields that exceed the default limit"""
        # Create a test CSV with a very large field: 20000 space-separated
        # identifiers add up to roughly 170,000 characters, safely above the
        # csv module's default field size limit of 131072 characters
        large_field = "omid:br/0601 " + " ".join(f"id:{i}" for i in range(20000))
        test_data = {"id": large_field, "title": "Test Large Field"}

        os.makedirs(self.output_dir, exist_ok=True)
        with open(
            os.path.join(self.output_dir, "large_field.csv"),
            "w",
            newline="",
            encoding="utf-8",
        ) as f:
            writer = csv.DictWriter(f, fieldnames=["id", "title"])
            writer.writeheader()
            writer.writerow(test_data)

        # Try loading the data: this should trigger the field limit increase
        count = load_processed_omids_to_redis(self.output_dir, self.redis_client)

        # Verify the OMID was loaded despite the large field
        self.assertEqual(count, 1)
        self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))
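        # (How the limit is raised is internal to load_processed_omids_to_redis;
        # presumably it bumps csv.field_size_limit() and retries when the csv
        # module rejects the oversized field. This test only pins down the
        # observable behaviour.)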

    def test_complex_br_with_missing_authors(self):
        """Test processing of a complex BR with multiple related entities, where
        the publisher's responsible agent is missing from the input"""
        supplier_prefix = "06250"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/062501777134",
                        "@type": [
                            "http://purl.org/spar/fabio/JournalArticle",
                            "http://purl.org/spar/fabio/Expression",
                        ],
                        "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                            {
                                "@type": "http://www.w3.org/2001/XMLSchema#gYearMonth",
                                "@value": "2020-02",
                            }
                        ],
                        "http://purl.org/dc/terms/title": [
                            {
                                "@value": "OpenCitations, An Infrastructure Organization For Open Scholarship"
                            }
                        ],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/062501806985"},
                            {"@id": "https://w3id.org/oc/meta/id/06850624745"},
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": "https://w3id.org/oc/meta/ar/062507977761"},
                            {"@id": "https://w3id.org/oc/meta/ar/062507977760"},
                            {"@id": "https://w3id.org/oc/meta/ar/062507977759"},
                        ],
                        "http://purl.org/vocab/frbr/core#embodiment": [
                            {"@id": "https://w3id.org/oc/meta/re/062501477439"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {"@id": "https://w3id.org/oc/meta/br/062501778111"}
                        ],
                    }
                ]
            }
        ]

        ar_data = [
            {
                "@graph": [
                    # Publisher role: held by ra/0610116105, for which no RA
                    # record is written below, so the publisher stays unresolved
                    {
                        "@id": "https://w3id.org/oc/meta/ar/062507977761",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": "https://w3id.org/oc/meta/ra/0610116105"}
                        ],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/publisher"}
                        ],
                    },
                    # Second author in the chain (Shotton)
                    {
                        "@id": "https://w3id.org/oc/meta/ar/062507977760",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": "https://w3id.org/oc/meta/ra/0621010775619"}
                        ],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                    },
                    # First author in the chain (Peroni): its hasNext points to
                    # the Shotton AR above
                    {
                        "@id": "https://w3id.org/oc/meta/ar/062507977759",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": "https://w3id.org/oc/meta/ra/0614010840729"}
                        ],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": "https://w3id.org/oc/meta/ar/062507977760"}
                        ],
                    },
                ]
            }
        ]

        ra_data_peroni = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/ra/0614010840729",
                        "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/06304949238"}
                        ],
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Peroni"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Silvio"}],
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Peroni Silvio"}],
                    }
                ]
            }
        ]

        ra_data_shotton = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/ra/0621010775619",
                        "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/062404672414"}
                        ],
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Shotton"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "D M"}],
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Shotton David"}],
                    }
                ]
            }
        ]

        # Create directory structure for BR data
        br_dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "1780000")
        os.makedirs(br_dir_path, exist_ok=True)

        # Create directory structure for AR data
        ar_dir_path = os.path.join(self.rdf_dir, "ar", supplier_prefix, "7980000")
        os.makedirs(ar_dir_path, exist_ok=True)

        # Create directory structure for RA data (Peroni)
        ra_peroni_dir_path = os.path.join(self.rdf_dir, "ra", "06140", "10850000")
        os.makedirs(ra_peroni_dir_path, exist_ok=True)

        # Create directory structure for RA data (Shotton)
        ra_shotton_dir_path = os.path.join(self.rdf_dir, "ra", "06210", "10780000")
        os.makedirs(ra_shotton_dir_path, exist_ok=True)
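        # (The bucket numbers above follow the layout used throughout this
        # suite: for br/062501777134 the entity number after the "06250" prefix
        # is 1777134, so its directory is ceil(1777134 / 10000) * 10000 =
        # 1780000 and its file is ceil(1777134 / 1000) * 1000 = 1778000, i.e.
        # the 1778000.zip written below.)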

        # Write BR data
        with ZipFile(os.path.join(br_dir_path, "1778000.zip"), "w") as zip_file:
            zip_file.writestr("1778000.json", json.dumps(br_data))

        # Write AR data
        with ZipFile(os.path.join(ar_dir_path, "7978000.zip"), "w") as zip_file:
            zip_file.writestr("7978000.json", json.dumps(ar_data))

        # Write RA data (Peroni)
        with ZipFile(os.path.join(ra_peroni_dir_path, "10841000.zip"), "w") as zip_file:
            zip_file.writestr("10841000.json", json.dumps(ra_data_peroni))

        # Write RA data (Shotton)
        with ZipFile(
            os.path.join(ra_shotton_dir_path, "10776000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("10776000.json", json.dumps(ra_data_shotton))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)
        # Verify basic metadata
        article = output_data[0]
        self.assertEqual(
            article["title"],
            "OpenCitations, An Infrastructure Organization For Open Scholarship",
        )
        self.assertEqual(article["pub_date"], "2020-02")
        self.assertEqual(article["type"], "journal article")
        self.assertEqual(article["id"], "omid:br/062501777134")
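        # The expected author order is given by the hasNext chain
        # (ar/...977759 to ar/...977760, i.e. Peroni then Shotton), not by the
        # order in which the ARs are listed under isDocumentContextFor.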

        # The authors should be present, in chain order
        expected_authors = (
            "Peroni, Silvio [omid:ra/0614010840729]; "
            "Shotton, D M [omid:ra/0621010775619]"
        )
        self.assertEqual(article["author"], expected_authors)

        # The publisher field should be empty, since no RA data was written for
        # ra/0610116105
        self.assertEqual(article["publisher"], "")

    def test_multiple_first_ars(self):
        """Test behavior when a BR has multiple candidate first ARs, i.e. ARs
        that no hasNext points to.

        The current behavior is to process only one of the first ARs and its
        hasNext chain.
        """
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article With Multiple First Authors"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            # First potential author (will be processed)
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
                            # Second potential author (will be ignored)
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
                            # Connected to author 1 (will be processed)
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
                        ],
                    }
                ]
            }
        ]

        # Set up agent roles with two potential "first" authors (no hasNext
        # pointing to them) and one author connected to the first one
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        # Also a potential first author, but it will be ignored
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                        # Connected to author 1 via hasNext, so it will be processed
                    },
                ]
            }
        ]
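        # (A "first" AR is presumably one that no other AR's hasNext points to;
        # with two such candidates, ar/0601 and ar/0602, the generator walks
        # only one chain, which is the behaviour pinned down below.)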

        # Set up responsible agents
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [
                            {"@value": "First Potential Author"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [
                            {"@value": "Second Potential Author"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [
                            {"@value": "Connected Author"}
                        ],
                    },
                ]
            }
        ]

        # Write test data files
        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        # Run generator
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            zip_output_rdf=True,
        )

        # Check output
        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        article = output_data[0]
        authors = article["author"].split("; ")

        # Verify we have exactly two authors (the first one found and the one
        # connected to it)
        self.assertEqual(
            len(authors),
            2,
            "Should have exactly two authors (first author and connected one)",
        )

        # Verify the specific authors we expect
        expected_authors = [
            f"First Potential Author [omid:ra/{supplier_prefix}1]",
            f"Connected Author [omid:ra/{supplier_prefix}3]",
        ]
        self.assertEqual(
            authors,
            expected_authors,
            "Should have first author and connected author in correct order",
        )

        # Verify the second potential author is NOT in the output
        self.assertNotIn(
            f"Second Potential Author [omid:ra/{supplier_prefix}2]",
            article["author"],
            "Second potential author should not be in the output",
        )


if __name__ == "__main__":
    unittest.main()