Coverage for test/csv_generator_lite_test.py: 99%

455 statements  

coverage.py v6.5.0, created at 2026-01-15 10:29 +0000

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2024 Arcangelo Massari <arcangelo.massari@unibo.it>
#
# Permission to use, copy, modify, and/or distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright notice
# and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

import csv
import json
import os
import unittest
from shutil import rmtree
from zipfile import ZipFile

import redis
from oc_meta.lib.file_manager import get_csv_data
from oc_meta.plugins.csv_generator_lite.csv_generator_lite import (
    generate_csv,
    init_redis_connection,
    is_omid_processed,
    load_processed_omids_to_redis,
)


class TestCSVGeneratorLite(unittest.TestCase):
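    """Tests for the csv_generator_lite plugin: conversion of OpenCitations Meta
    RDF zip dumps to CSV, with Redis used to track already-processed OMIDs."""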

    def setUp(self):
        self.base_dir = os.path.join("test", "csv_generator_lite")
        self.input_dir = os.path.join(self.base_dir, "input")
        self.output_dir = os.path.join(self.base_dir, "output")

        os.makedirs(self.input_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)

        self.rdf_dir = os.path.join(self.input_dir, "rdf")
        self.br_dir = os.path.join(self.rdf_dir, "br")
        os.makedirs(self.br_dir, exist_ok=True)

        self.redis_client = init_redis_connection(port=6381, db=5)
        self.redis_client.flushdb()  # Clear test database

    def tearDown(self):
        if os.path.exists(self.base_dir):
            rmtree(self.base_dir)
        self.redis_client.flushdb()

    def _write_test_data(self, data):
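        """Wrap each item in `data` as a JournalArticle JSON-LD node and zip the
        result under br/060/10000/1000.zip."""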

        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        test_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/{item['id'].replace('omid:', '')}",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": item["title"]}],
                    }
                    for item in data
                ]
            }
        ]
        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data))

    def test_redis_connection_and_caching(self):
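        """OMIDs found in existing output CSVs are loaded into Redis and reported
        as processed by is_omid_processed."""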

        redis_client = init_redis_connection(port=6381, db=5)
        self.assertIsInstance(redis_client, redis.Redis)

        # Create a test CSV file with some OMIDs
        test_data = [
            {"id": "omid:br/0601", "title": "Test 1"},
            {"id": "omid:br/0602", "title": "Test 2"},
            {"id": "omid:br/0603 issn:456", "title": "Test 3"},
        ]
        os.makedirs(self.output_dir, exist_ok=True)
        with open(
            os.path.join(self.output_dir, "test.csv"), "w", newline="", encoding="utf-8"
        ) as f:
            writer = csv.DictWriter(f, fieldnames=["id", "title"])
            writer.writeheader()
            writer.writerows(test_data)

        count = load_processed_omids_to_redis(self.output_dir, redis_client)
        self.assertEqual(count, 3)

        self.assertTrue(is_omid_processed("omid:br/0601", redis_client))
        self.assertTrue(is_omid_processed("omid:br/0602", redis_client))
        self.assertTrue(is_omid_processed("omid:br/0603", redis_client))
        self.assertFalse(is_omid_processed("omid:br/0604", redis_client))

    def test_redis_cache_persistence(self):
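        """OMIDs already present in the output CSVs are skipped on a second run;
        the Redis cache is empty again after each successful run."""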

        # Create initial test data
        test_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/0601",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "First Run"}],
                    }
                ]
            }
        ]

        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))

        # Create new test data
        test_data_2 = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/0601",  # Same OMID as before
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Should Be Skipped"}
                        ],
                    },
                    {
                        "@id": "https://w3id.org/oc/meta/br/0602",  # New OMID
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Should Be Processed"}
                        ],
                    },
                ]
            }
        ]

        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data_2))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = []
        for filename in os.listdir(self.output_dir):
            if filename.endswith(".csv"):
                output_data.extend(
                    get_csv_data(os.path.join(self.output_dir, filename))
                )

        self.assertEqual(len(output_data), 2)

        first_run_entry = next(
            item for item in output_data if item["title"] == "First Run"
        )
        second_run_entry = next(
            item for item in output_data if item["title"] == "Should Be Processed"
        )

        self.assertEqual(first_run_entry["title"], "First Run")
        self.assertEqual(first_run_entry["id"], "omid:br/0601")

        self.assertEqual(second_run_entry["title"], "Should Be Processed")
        self.assertEqual(second_run_entry["id"], "omid:br/0602")

        self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
        self.assertFalse(is_omid_processed("omid:br/0602", self.redis_client))

    def test_redis_cache_cleanup(self):
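        """The OMID cache is cleared after a successful run but retained after a
        failed one."""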

        input_data = [{"id": "omid:br/0601", "title": "First Entry"}]
        self._write_test_data(input_data)

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))

        load_processed_omids_to_redis(self.output_dir, self.redis_client)

        self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))

        generate_csv(
            input_dir="/nonexistent/dir",
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        self.assertTrue(
            is_omid_processed("omid:br/0601", self.redis_client),
            "Redis cache should be retained after a failed run",
        )

    def test_redis_error_handling(self):
        with self.assertRaises(redis.ConnectionError):
            init_redis_connection(port=9999)  # Invalid port

        count = load_processed_omids_to_redis("/nonexistent/dir", self.redis_client)
        self.assertEqual(count, 0)

    def test_concurrent_processing_with_redis(self):
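        """Two successive runs over ten zip files each yield 200 distinct rows
        in total, with no duplicates."""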

        # Create multiple test files
        test_data = []
        for i in range(100):  # Create 100 test entries
            test_data.append(
                {
                    "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
                    "@type": [
                        "http://purl.org/spar/fabio/Expression",
                        "http://purl.org/spar/fabio/JournalArticle",
                    ],
                    "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
                }
            )

        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        for i in range(0, 100, 10):  # Create 10 files with 10 entries each
            file_data = [{"@graph": test_data[i : i + 10]}]
            with ZipFile(
                os.path.join(self.br_dir, "060", "10000", f"{i+1000}.zip"), "w"
            ) as zip_file:
                zip_file.writestr(f"{i+1000}.json", json.dumps(file_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        # Create more test entries
        more_test_data = []
        for i in range(100, 200):  # Create 100 more test entries
            more_test_data.append(
                {
                    "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
                    "@type": [
                        "http://purl.org/spar/fabio/Expression",
                        "http://purl.org/spar/fabio/JournalArticle",
                    ],
                    "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
                }
            )

        for i in range(0, 100, 10):
            file_data = [{"@graph": more_test_data[i : i + 10]}]
            with ZipFile(
                os.path.join(self.br_dir, "060", "10000", f"{i+2000}.zip"), "w"
            ) as zip_file:
                zip_file.writestr(f"{i+2000}.json", json.dumps(file_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        all_output_data = []
        for filename in os.listdir(self.output_dir):
            if filename.endswith(".csv"):
                all_output_data.extend(
                    get_csv_data(os.path.join(self.output_dir, filename))
                )

        self.assertEqual(len(all_output_data), 200)

        processed_ids = {row["id"] for row in all_output_data}
        self.assertEqual(len(processed_ids), 200)

    def test_basic_br_processing(self):
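        """A single journal article is exported with its id, title, pub_date and type."""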

        test_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/0601",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Article"}],
                        "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                            {"@value": "2024-01-01"}
                        ],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/0601"}
                        ],
                    }
                ],
                "@id": "https://w3id.org/oc/meta/br/",
            }
        ]

        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_files = os.listdir(self.output_dir)
        self.assertEqual(len(output_files), 1)

        output_data = get_csv_data(os.path.join(self.output_dir, output_files[0]))
        self.assertEqual(len(output_data), 1)
        self.assertEqual(output_data[0]["title"], "Test Article")
        self.assertEqual(output_data[0]["pub_date"], "2024-01-01")
        self.assertEqual(output_data[0]["type"], "journal article")
        self.assertEqual(output_data[0]["id"], "omid:br/0601")

    def test_complex_br_with_related_entities(self):
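        """The author (via the ar and ra files) and the venue (via partOf) are
        resolved into the article row."""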

        # Create directory structure for each entity type
        supplier_prefix = "060"
        for entity_type in ["br", "ra", "ar", "id"]:
            os.makedirs(
                os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000"),
                exist_ok=True,
            )

        # BR data including both the article and the venue
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Complex Article"}
                        ],
                        "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                            {"@value": "2024-02-01"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {"@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Journal",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
                    },
                ],
                "@id": "https://w3id.org/oc/meta/br/",
            }
        ]

        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                    }
                ],
                "@id": "https://w3id.org/oc/meta/ar/",
            }
        ]

        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
                    }
                ],
                "@id": "https://w3id.org/oc/meta/ra/",
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            zip_path = os.path.join(
                self.rdf_dir, entity_type, supplier_prefix, "10000", "1000.zip"
            )
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 2)  # Should have 2 rows: article and journal

        article = next(
            (item for item in output_data if item["type"] == "journal article"), None
        )
        journal = next(
            (item for item in output_data if item["type"] == "journal"), None
        )

        self.assertIsNotNone(article)
        self.assertEqual(article["title"], "Complex Article")
        self.assertEqual(article["venue"], f"Test Journal [omid:br/{supplier_prefix}3]")
        self.assertEqual(article["author"], "Test Author [omid:ra/0601]")
        self.assertEqual(article["id"], f"omid:br/{supplier_prefix}2")

        self.assertIsNotNone(journal)
        self.assertEqual(journal["title"], "Test Journal")
        self.assertEqual(journal["type"], "journal")
        self.assertEqual(journal["id"], f"omid:br/{supplier_prefix}3")

    def test_empty_input_directory(self):
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        self.assertEqual(len(os.listdir(self.output_dir)), 0)

    def test_br_with_multiple_authors_and_editors(self):
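        """hasNext chains order the agents; authors and editors land in separate
        columns, each formatted as "Family, Given [omid:ra/...]"."""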

        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Book",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Multi-Author Book"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
                            },  # First author
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
                            },  # Second author
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
                            },  # First editor
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"
                            },  # Second editor
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles for authors and editors with hasNext relations
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/editor"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/editor"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Doe"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Jane"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Brown"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Bob"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Wilson"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Alice"}],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        expected_authors = (
            f"Smith, John [omid:ra/{supplier_prefix}1]; "
            f"Doe, Jane [omid:ra/{supplier_prefix}2]"
        )
        expected_editors = (
            f"Brown, Bob [omid:ra/{supplier_prefix}3]; "
            f"Wilson, Alice [omid:ra/{supplier_prefix}4]"
        )

        self.assertEqual(output_data[0]["author"], expected_authors)
        self.assertEqual(output_data[0]["editor"], expected_editors)

    def test_br_with_identifiers(self):
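        """DOI and ISBN identifiers are appended after the OMID in the id column."""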

        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article With DOI"}
                        ],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1"},
                            {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2"},
                        ],
                    }
                ]
            }
        ]

        id_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1",
                        "http://purl.org/spar/datacite/usesIdentifierScheme": [
                            {"@id": "http://purl.org/spar/datacite/doi"}
                        ],
                        "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                            {"@value": "10.1234/test.123"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2",
                        "http://purl.org/spar/datacite/usesIdentifierScheme": [
                            {"@id": "http://purl.org/spar/datacite/isbn"}
                        ],
                        "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                            {"@value": "978-0-123456-47-2"}
                        ],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "id": id_data}

        for entity_type, data in data_files.items():
            # Create all necessary directories
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        expected_ids = (
            f"omid:br/{supplier_prefix}1 doi:10.1234/test.123 isbn:978-0-123456-47-2"
        )
        self.assertEqual(output_data[0]["id"], expected_ids)

    def test_br_with_page_numbers(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Paged Article"}],
                        "http://purl.org/vocab/frbr/core#embodiment": [
                            {"@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1"}
                        ],
                    }
                ]
            }
        ]

        re_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1",
                        "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
                            {"@value": "100"}
                        ],
                        "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
                            {"@value": "120"}
                        ],
                    }
                ]
            }
        ]

        data_files = {"br": br_data, "re": re_data}

        for entity_type, data in data_files.items():
            # Create all necessary directories
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)
        self.assertEqual(output_data[0]["page"], "100-120")

    def test_malformed_data_handling(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        # Missing title
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": "invalid_uri"},  # Invalid URI
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {"@id": "non_existent_venue"}  # Non-existent venue
                        ],
                    }
                ]
            }
        ]

        data_files = {"br": br_data}

        for entity_type, data in data_files.items():
            # Create all necessary directories
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)
        self.assertEqual(output_data[0]["title"], "")
        self.assertEqual(output_data[0]["author"], "")
        self.assertEqual(output_data[0]["venue"], "")

    def test_br_with_hierarchical_venue_structures(self):
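        """Issue and volume containers are folded into the articles' volume/issue/venue
        columns rather than exported as rows of their own."""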

        supplier_prefix = "060"

        br_data = [
            {
                "@graph": [
                    # Article in issue->volume->journal structure
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Full Hierarchy"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
                            }  # Issue
                        ],
                    },
                    # Article in issue->journal structure (no volume)
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}5",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Issue-Journal"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6"
                            }  # Issue
                        ],
                    },
                    # Article in volume->journal structure (no issue)
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}9",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Volume-Journal"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10"
                            }  # Volume
                        ],
                    },
                    # Article directly in journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}13",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Journal"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                    # Issue in full hierarchy
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
                        "@type": ["http://purl.org/spar/fabio/JournalIssue"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "2"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"
                            }  # Volume
                        ],
                    },
                    # Volume in full hierarchy
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
                        "@type": ["http://purl.org/spar/fabio/JournalVolume"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "42"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                    # Journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4",
                        "@type": ["http://purl.org/spar/fabio/Journal"],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
                    },
                    # Issue directly in journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6",
                        "@type": ["http://purl.org/spar/fabio/JournalIssue"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "3"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                    # Volume directly in journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10",
                        "@type": ["http://purl.org/spar/fabio/JournalVolume"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "5"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                ]
            }
        ]

        dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
        os.makedirs(dir_path, exist_ok=True)

        zip_path = os.path.join(dir_path, "1000.zip")
        with ZipFile(zip_path, "w") as zip_file:
            zip_file.writestr("1000.json", json.dumps(br_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))

        self.assertEqual(len(output_data), 5)  # 4 articles + 1 journal

        volume_or_issue_entries = [
            item
            for item in output_data
            if item["type"] in ["journal volume", "journal issue"]
        ]
        self.assertEqual(len(volume_or_issue_entries), 0)

        full_hierarchy = next(
            item for item in output_data if item["title"] == "Article in Full Hierarchy"
        )
        issue_journal = next(
            item for item in output_data if item["title"] == "Article in Issue-Journal"
        )
        volume_journal = next(
            item for item in output_data if item["title"] == "Article in Volume-Journal"
        )
        direct_journal = next(
            item for item in output_data if item["title"] == "Article in Journal"
        )

        self.assertEqual(full_hierarchy["issue"], "2")
        self.assertEqual(full_hierarchy["volume"], "42")
        self.assertEqual(
            full_hierarchy["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

        self.assertEqual(issue_journal["issue"], "3")
        self.assertEqual(issue_journal["volume"], "")
        self.assertEqual(
            issue_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

        self.assertEqual(volume_journal["issue"], "")
        self.assertEqual(volume_journal["volume"], "5")
        self.assertEqual(
            volume_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

        self.assertEqual(direct_journal["issue"], "")
        self.assertEqual(direct_journal["volume"], "")
        self.assertEqual(
            direct_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

    def test_book_in_series(self):
        supplier_prefix = "060"

        br_data = [
            {
                "@graph": [
                    # Book
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Book",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Book"}],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
                            }  # Series
                        ],
                    },
                    # Book Series
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
                        "@type": ["http://purl.org/spar/fabio/BookSeries"],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Test Book Series"}
                        ],
                    },
                ]
            }
        ]

        dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
        os.makedirs(dir_path, exist_ok=True)

        zip_path = os.path.join(dir_path, "1000.zip")
        with ZipFile(zip_path, "w") as zip_file:
            zip_file.writestr("1000.json", json.dumps(br_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))

        book = next(item for item in output_data if item["type"] == "book")

        self.assertEqual(book["title"], "Test Book")
        self.assertEqual(
            book["venue"], f"Test Book Series [omid:br/{supplier_prefix}2]"
        )
        self.assertEqual(book["volume"], "")  # Should not have volume
        self.assertEqual(book["issue"], "")  # Should not have issue

    def test_br_with_multiple_roles(self):
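        """Author, editor and publisher roles on the same entity are mapped to
        their respective columns."""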

        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Book",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Multi-Role Book"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
                            },  # Author
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
                            },  # Editor
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
                            },  # Publisher
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles for authors, editors and publishers
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/editor"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/publisher"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents with different name formats
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Editor Name"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [
                            {"@value": "Publisher House"}
                        ],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        book = output_data[0]
        self.assertEqual(book["title"], "Multi-Role Book")
        self.assertEqual(book["author"], f"Smith, John [omid:ra/{supplier_prefix}1]")
        self.assertEqual(book["editor"], f"Editor Name [omid:ra/{supplier_prefix}2]")
        self.assertEqual(
            book["publisher"], f"Publisher House [omid:ra/{supplier_prefix}3]"
        )

    def test_ordered_authors(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Ordered Authors Article"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles with hasNext relations
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents with different names
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        expected_authors = (
            f"First Author [omid:ra/{supplier_prefix}1]; "
            f"Second Author [omid:ra/{supplier_prefix}2]; "
            f"Third Author [omid:ra/{supplier_prefix}3]"
        )
        self.assertEqual(output_data[0]["author"], expected_authors)

    def test_cyclic_hasNext_relations(self):
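        """A cyclic hasNext chain (1 -> 2 -> 3 -> 1) terminates cleanly: each
        author appears exactly once, in order."""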

        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Cyclic Authors Article"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles with cyclic hasNext relations
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        # Creates a cycle: 1 -> 2 -> 3 -> 1
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                        # Cycle completion
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        # The order should be maintained until the cycle is detected
        authors = output_data[0]["author"].split("; ")
        self.assertGreater(len(authors), 0)

        self.assertTrue(
            any(
                f"First Author [omid:ra/{supplier_prefix}1]" in author
                for author in authors
            )
        )
        self.assertTrue(
            any(
                f"Second Author [omid:ra/{supplier_prefix}2]" in author
                for author in authors
            )
        )

        author_set = set(authors)
        self.assertEqual(
            len(authors),
            len(author_set),
            "Found duplicate authors in output: each author should appear exactly once",
        )

        expected_authors = [
            f"First Author [omid:ra/{supplier_prefix}1]",
            f"Second Author [omid:ra/{supplier_prefix}2]",
            f"Third Author [omid:ra/{supplier_prefix}3]",
        ]
        self.assertEqual(
            authors,
            expected_authors,
            "Authors should be in correct order and each should appear exactly once",
        )

    def test_multiple_input_files(self):
        supplier_prefix = "060"

        # First file (entities 1-1000)
        br_data_1 = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Article 1"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1000",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Article 1000"}],
                    },
                ]
            }
        ]

        # Second file (entities 1001-2000)
        br_data_2 = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1001",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Article 1001"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2000",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Article 2000"}],
                    },
                ]
            }
        ]

        # Third file (entities 2001-3000)
        br_data_3 = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2001",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Article 2001"}],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001"
                            }
                        ],
                    }
                ]
            }
        ]

        # Create agent role data in a different file
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001"
                            }
                        ],
                    }
                ]
            }
        ]

        # Create responsible agent data in a different file
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
                    }
                ]
            }
        ]

        os.makedirs(os.path.join(self.br_dir, supplier_prefix, "10000"), exist_ok=True)
        os.makedirs(
            os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000"), exist_ok=True
        )
        os.makedirs(
            os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000"), exist_ok=True
        )

        with ZipFile(
            os.path.join(self.br_dir, supplier_prefix, "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(br_data_1))
        with ZipFile(
            os.path.join(self.br_dir, supplier_prefix, "10000", "2000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("2000.json", json.dumps(br_data_2))
        with ZipFile(
            os.path.join(self.br_dir, supplier_prefix, "10000", "3000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("3000.json", json.dumps(br_data_3))

        with ZipFile(
            os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000", "3000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("3000.json", json.dumps(ar_data))
        with ZipFile(
            os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000", "3000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("3000.json", json.dumps(ra_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_files = sorted(os.listdir(self.output_dir))
        self.assertGreater(len(output_files), 0)

        # Collect all output data
        all_output_data = []
        for output_file in output_files:
            all_output_data.extend(
                get_csv_data(os.path.join(self.output_dir, output_file))
            )

        self.assertEqual(len(all_output_data), 5)  # Should have 5 articles total

        article_1 = next(
            item
            for item in all_output_data
            if item["id"] == f"omid:br/{supplier_prefix}1"
        )
        article_1000 = next(
            item
            for item in all_output_data
            if item["id"] == f"omid:br/{supplier_prefix}1000"
        )
        article_1001 = next(
            item
            for item in all_output_data
            if item["id"] == f"omid:br/{supplier_prefix}1001"
        )
        article_2000 = next(
            item
            for item in all_output_data
            if item["id"] == f"omid:br/{supplier_prefix}2000"
        )
        article_2001 = next(
            item
            for item in all_output_data
            if item["id"] == f"omid:br/{supplier_prefix}2001"
        )

        self.assertEqual(article_1["title"], "Article 1")
        self.assertEqual(article_1000["title"], "Article 1000")
        self.assertEqual(article_1001["title"], "Article 1001")
        self.assertEqual(article_2000["title"], "Article 2000")
        self.assertEqual(article_2001["title"], "Article 2001")

        self.assertEqual(
            article_2001["author"], f"Test Author [omid:ra/{supplier_prefix}2001]"
        )

    def test_max_rows_per_file_and_data_integrity(self):
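        """3500 input entries are split across output files of at most 3000 rows
        each, with no row lost or duplicated."""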

        supplier_prefix = "060"

        br_data = [
            {
                "@graph": [
                    # Generate 3500 test entries
                    *[
                        {
                            "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}{i}",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalArticle",
                            ],
                            "http://purl.org/dc/terms/title": [
                                {"@value": f"Article {i}"}
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                                {"@value": "2024-01-01"}
                            ],
                        }
                        for i in range(1, 3501)
                    ]  # This will create 3500 entries
                ]
            }
        ]

        entries_per_file = 1000
        for i in range(0, 3500, entries_per_file):
            file_data = [{"@graph": br_data[0]["@graph"][i : i + entries_per_file]}]

            # Create directory structure for the file
            file_number = i + entries_per_file
            dir_path = os.path.join(self.br_dir, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            # Write the file
            with ZipFile(os.path.join(dir_path, f"{file_number}.zip"), "w") as zip_file:
                zip_file.writestr(f"{file_number}.json", json.dumps(file_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_files = sorted(os.listdir(self.output_dir))

        # We expect at least 2 files: 3500 entries should create 2 files (3000 + 500)
        self.assertGreaterEqual(
            len(output_files), 2, "Should have at least 2 output files for 3500 entries"
        )

        # Collect all entries from all output files
        all_entries = []
        for output_file in output_files:
            entries = get_csv_data(os.path.join(self.output_dir, output_file))

            # Verify each file has at most 3000 rows
            self.assertLessEqual(
                len(entries),
                3000,
                f"File {output_file} has more than 3000 rows: {len(entries)}",
            )

            all_entries.extend(entries)

        self.assertEqual(
            len(all_entries),
            3500,
            f"Expected 3500 total entries, got {len(all_entries)}",
        )

        unique_ids = {entry["id"] for entry in all_entries}
        self.assertEqual(
            len(unique_ids),
            3500,
            f"Expected 3500 unique entries, got {len(unique_ids)}",
        )

        expected_ids = {f"omid:br/{supplier_prefix}{i}" for i in range(1, 3501)}
        self.assertEqual(
            unique_ids,
            expected_ids,
            "Some entries are missing or unexpected entries are present",
        )

        for i in range(1, 3501):
            entry = next(
                e for e in all_entries if e["id"] == f"omid:br/{supplier_prefix}{i}"
            )
            self.assertEqual(entry["title"], f"Article {i}")
            self.assertEqual(entry["pub_date"], "2024-01-01")
            self.assertEqual(entry["type"], "journal article")

    def test_csv_field_limit_handling(self):
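        """Loading a CSV with a very large id field triggers the loader's csv
        field-size-limit increase rather than failing."""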

        # Create a test CSV with a very large field
        large_field = "omid:br/0601 " + " ".join(
            [f"id:{i}" for i in range(10000)]
        )  # Builds an id field of roughly 79,000 characters
        test_data = {"id": large_field, "title": "Test Large Field"}

        os.makedirs(self.output_dir, exist_ok=True)
        with open(
            os.path.join(self.output_dir, "large_field.csv"),
            "w",
            newline="",
            encoding="utf-8",
        ) as f:
            writer = csv.DictWriter(f, fieldnames=["id", "title"])
            writer.writeheader()
            writer.writerow(test_data)

        # Try loading the data - this should trigger the field limit increase
        count = load_processed_omids_to_redis(self.output_dir, self.redis_client)

        self.assertEqual(count, 1)
        self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))

    def test_complex_br_with_missing_authors(self):
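        """Article drawn from real OpenCitations Meta data, with a publisher and
        two author roles whose hasNext chain is incomplete."""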

        supplier_prefix = "06250"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/062501777134",
                        "@type": [
                            "http://purl.org/spar/fabio/JournalArticle",
                            "http://purl.org/spar/fabio/Expression",
                        ],
                        "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                            {
                                "@type": "http://www.w3.org/2001/XMLSchema#gYearMonth",
                                "@value": "2020-02",
                            }
                        ],
                        "http://purl.org/dc/terms/title": [
                            {
                                "@value": "OpenCitations, An Infrastructure Organization For Open Scholarship"
                            }
                        ],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/062501806985"},
                            {"@id": "https://w3id.org/oc/meta/id/06850624745"},
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": "https://w3id.org/oc/meta/ar/062507977761"},
                            {"@id": "https://w3id.org/oc/meta/ar/062507977760"},
                            {"@id": "https://w3id.org/oc/meta/ar/062507977759"},
                        ],
                        "http://purl.org/vocab/frbr/core#embodiment": [
                            {"@id": "https://w3id.org/oc/meta/re/062501477439"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {"@id": "https://w3id.org/oc/meta/br/062501778111"}
                        ],
                    }
                ]
            }
        ]

        ar_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/ar/062507977761",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": "https://w3id.org/oc/meta/ra/0610116105"}
                        ],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/publisher"}
                        ],
                    },
                    {
                        "@id": "https://w3id.org/oc/meta/ar/062507977760",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": "https://w3id.org/oc/meta/ra/0621010775619"}
                        ],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                    },
                    {
                        "@id": "https://w3id.org/oc/meta/ar/062507977759",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": "https://w3id.org/oc/meta/ra/0614010840729"}
                        ],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": "https://w3id.org/oc/meta/ar/062507977760"}
                        ],
                    },
                ]
            }
        ]

        ra_data_peroni = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/ra/0614010840729",
                        "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/06304949238"}
                        ],
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Peroni"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Silvio"}],
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Peroni Silvio"}],
                    }
                ]
            }
        ]

        ra_data_shotton = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/ra/0621010775619",
                        "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/062404672414"}
                        ],
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Shotton"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "D M"}],
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Shotton David"}],
                    }
                ]
            }
        ]

        # Create directory structure for BR data
        br_dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "1780000")
        os.makedirs(br_dir_path, exist_ok=True)

        # Create directory structure for AR data
        ar_dir_path = os.path.join(self.rdf_dir, "ar", supplier_prefix, "7980000")

1898 os.makedirs(ar_dir_path, exist_ok=True) 

1899 

1900 # Create directory structure for RA data (Peroni) 

1901 ra_peroni_dir_path = os.path.join(self.rdf_dir, "ra", "06140", "10850000") 

1902 os.makedirs(ra_peroni_dir_path, exist_ok=True) 

1903 

1904 # Create directory structure for RA data (Shotton) 

1905 ra_shotton_dir_path = os.path.join(self.rdf_dir, "ra", "06210", "10780000") 

1906 os.makedirs(ra_shotton_dir_path, exist_ok=True) 

1907 
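# Note (inferred from the values above, not stated elsewhere in the suite):
# the directory and file names appear to follow the split convention where,
# for an entity with sequential number n, dir = ceil(n / dir_split_number) *
# dir_split_number and file = ceil(n / items_per_file) * items_per_file;
# e.g. br/062501777134 has n = 1777134, giving dir "1780000" and file
# "1778000", which matches the paths used here.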

1908 with ZipFile(os.path.join(br_dir_path, "1778000.zip"), "w") as zip_file: 

1909 zip_file.writestr("1778000.json", json.dumps(br_data)) 

1910 

1911 with ZipFile(os.path.join(ar_dir_path, "7978000.zip"), "w") as zip_file: 

1912 zip_file.writestr("7978000.json", json.dumps(ar_data)) 

1913 

1914 with ZipFile(os.path.join(ra_peroni_dir_path, "10841000.zip"), "w") as zip_file: 

1915 zip_file.writestr("10841000.json", json.dumps(ra_data_peroni)) 

1916 

1917 with ZipFile( 

1918 os.path.join(ra_shotton_dir_path, "10776000.zip"), "w" 

1919 ) as zip_file: 

1920 zip_file.writestr("10776000.json", json.dumps(ra_data_shotton)) 

1921 

1922 generate_csv( 

1923 input_dir=self.rdf_dir, 

1924 output_dir=self.output_dir, 

1925 dir_split_number=10000, 

1926 items_per_file=1000, 

1927 redis_port=6381, 

1928 redis_db=5, 

1929 ) 

1930 

1931 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

1932 self.assertEqual(len(output_data), 1) 

1933 article = output_data[0] 

1934 self.assertEqual( 

1935 article["title"], 

1936 "OpenCitations, An Infrastructure Organization For Open Scholarship", 

1937 ) 

1938 self.assertEqual(article["pub_date"], "2020-02") 

1939 self.assertEqual(article["type"], "journal article") 

1940 self.assertEqual(article["id"], "omid:br/062501777134") 

1941 

1942 expected_authors = ( 

1943 "Peroni, Silvio [omid:ra/0614010840729]; " 

1944 "Shotton, D M [omid:ra/0621010775619]" 

1945 ) 

1946 self.assertEqual(article["author"], expected_authors) 

1947 

1948 # Publisher field should still be empty since we haven't added the publisher RA data 

1949 self.assertEqual(article["publisher"], "") 

1950 
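# Illustrative sketch, inferred from the author strings asserted above:
# agents seem to be rendered as "FamilyName, GivenName [omid:ra/...]", with
# the OMID taken from the last two segments of the meta URI. A minimal
# formatter consistent with those expectations (hypothetical, not the
# plugin's actual code):
@staticmethod
def _sketch_format_agent(uri, family, given):
    omid = "omid:" + "/".join(uri.rsplit("/", 2)[-2:])
    return f"{family}, {given} [{omid}]"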

1951 def test_multiple_first_ars(self): 

1952 supplier_prefix = "060" 

1953 br_data = [ 

1954 { 

1955 "@graph": [ 

1956 { 

1957 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1", 

1958 "@type": [ 

1959 "http://purl.org/spar/fabio/Expression", 

1960 "http://purl.org/spar/fabio/JournalArticle", 

1961 ], 

1962 "http://purl.org/dc/terms/title": [ 

1963 {"@value": "Article With Multiple First Authors"} 

1964 ], 

1965 "http://purl.org/spar/pro/isDocumentContextFor": [ 

1966 { 

1967 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1" 

1968 }, # First potential author (will be processed) 

1969 { 

1970 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2" 

1971 }, # Second potential author (will be ignored) 

1972 { 

1973 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3" 

1974 }, # Connected to author 1 (will be processed) 

1975 ], 

1976 } 

1977 ] 

1978 } 

1979 ] 

1980 

1981 # Set up agent roles with two potential "first" authors (no incoming hasNext) 

1982 # and one author linked from the first via hasNext 

1983 ar_data = [ 

1984 { 

1985 "@graph": [ 

1986 { 

1987 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1", 

1988 "@type": ["http://purl.org/spar/pro/RoleInTime"], 

1989 "http://purl.org/spar/pro/withRole": [ 

1990 {"@id": "http://purl.org/spar/pro/author"} 

1991 ], 

1992 "http://purl.org/spar/pro/isHeldBy": [ 

1993 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"} 

1994 ], 

1995 "https://w3id.org/oc/ontology/hasNext": [ 

1996 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"} 

1997 ], 

1998 }, 

1999 { 

2000 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2", 

2001 "@type": ["http://purl.org/spar/pro/RoleInTime"], 

2002 "http://purl.org/spar/pro/withRole": [ 

2003 {"@id": "http://purl.org/spar/pro/author"} 

2004 ], 

2005 "http://purl.org/spar/pro/isHeldBy": [ 

2006 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"} 

2007 ], 

2008 # This is also a potential first author but will be ignored 

2009 }, 

2010 { 

2011 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3", 

2012 "@type": ["http://purl.org/spar/pro/RoleInTime"], 

2013 "http://purl.org/spar/pro/withRole": [ 

2014 {"@id": "http://purl.org/spar/pro/author"} 

2015 ], 

2016 "http://purl.org/spar/pro/isHeldBy": [ 

2017 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"} 

2018 ], 

2019 # This one is connected to author 1 via hasNext and will be processed 

2020 }, 

2021 ] 

2022 } 

2023 ] 

2024 

2025 # Set up the responsible agents 

2026 ra_data = [ 

2027 { 

2028 "@graph": [ 

2029 { 

2030 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1", 

2031 "http://xmlns.com/foaf/0.1/name": [ 

2032 {"@value": "First Potential Author"} 

2033 ], 

2034 }, 

2035 { 

2036 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2", 

2037 "http://xmlns.com/foaf/0.1/name": [ 

2038 {"@value": "Second Potential Author"} 

2039 ], 

2040 }, 

2041 { 

2042 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3", 

2043 "http://xmlns.com/foaf/0.1/name": [ 

2044 {"@value": "Connected Author"} 

2045 ], 

2046 }, 

2047 ] 

2048 } 

2049 ] 

2050 

2051 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data} 

2052 

2053 for entity_type, data in data_files.items(): 

2054 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000") 

2055 os.makedirs(dir_path, exist_ok=True) 

2056 

2057 zip_path = os.path.join(dir_path, "1000.zip") 

2058 with ZipFile(zip_path, "w") as zip_file: 

2059 zip_file.writestr("1000.json", json.dumps(data)) 

2060 

2061 generate_csv( 

2062 input_dir=self.rdf_dir, 

2063 output_dir=self.output_dir, 

2064 dir_split_number=10000, 

2065 items_per_file=1000, 

2066 redis_port=6381, 

2067 redis_db=5, 

2068 ) 

2069 

2070 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv")) 

2071 self.assertEqual(len(output_data), 1) 

2072 

2073 article = output_data[0] 

2074 authors = article["author"].split("; ") 

2075 

2076 self.assertEqual( 

2077 len(authors), 

2078 2, 

2079 "Should have exactly two authors (first author and connected one)", 

2080 ) 

2081 

2082 expected_authors = [ 

2083 f"First Potential Author [omid:ra/{supplier_prefix}1]", 

2084 f"Connected Author [omid:ra/{supplier_prefix}3]", 

2085 ] 

2086 self.assertEqual( 

2087 authors, 

2088 expected_authors, 

2089 "Should have first author and connected author in correct order", 

2090 ) 

2091 

2092 self.assertNotIn( 

2093 f"Second Potential Author [omid:ra/{supplier_prefix}2]", 

2094 article["author"], 

2095 "Second potential author should not be in the output", 

2096 ) 

2097 
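# Illustrative sketch of the "first AR" handling exercised above (an
# assumption mirroring the asserted behaviour, not the plugin's verified
# code): pick a role that no other role points to via hasNext, then walk
# the chain; extra chain heads are dropped, which is why the second
# potential author never reaches the output.
@staticmethod
def _sketch_order_agent_roles(next_of):
    """next_of: dict mapping each AR id to its hasNext target (or None)."""
    targets = {nxt for nxt in next_of.values() if nxt is not None}
    heads = [ar for ar in next_of if ar not in targets]  # no incoming hasNext
    ordered, current = [], heads[0] if heads else None  # keep the first head only
    while current is not None:
        ordered.append(current)
        current = next_of.get(current)
    return ordered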

2098 

2099if __name__ == "__main__": 

2100 unittest.main()