Coverage for test/csv_generator_lite_test.py: 99%
455 statements
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# Copyright (c) 2024 Arcangelo Massari <arcangelo.massari@unibo.it>
4#
5# Permission to use, copy, modify, and/or distribute this software for any purpose
6# with or without fee is hereby granted, provided that the above copyright notice
7# and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
12# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
13# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
14# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
15# SOFTWARE.
17import csv
18import json
19import os
20import unittest
21from shutil import rmtree
22from zipfile import ZipFile
24import redis
25from oc_meta.lib.file_manager import get_csv_data
26from oc_meta.plugins.csv_generator_lite.csv_generator_lite import (
27 generate_csv,
28 init_redis_connection,
29 is_omid_processed,
30 load_processed_omids_to_redis,
31)
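# The tests below build miniature OpenCitations Meta RDF dumps on disk and run the CSV
# generator against them. A few conventions they rely on (inferred from the calls in this
# module, not from external documentation):
#   - a Redis server must be reachable with the library defaults; logical DB 5 is used and
#     flushed so real data is never touched;
#   - input files live under <rdf_dir>/<entity type>/<supplier prefix>/<dir split>/<N>.zip,
#     each zip holding a JSON-LD document whose "@graph" lists the entities, matching
#     generate_csv(dir_split_number=10000, items_per_file=1000, zip_output_rdf=True);
#   - the generated CSVs are expected to expose the columns asserted throughout:
#     id, title, author, editor, publisher, venue, volume, issue, page, pub_date, type.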
34class TestCSVGeneratorLite(unittest.TestCase):
35 def setUp(self):
36 self.base_dir = os.path.join("test", "csv_generator_lite")
37 self.input_dir = os.path.join(self.base_dir, "input")
38 self.output_dir = os.path.join(self.base_dir, "output")
40 # Create test directories if they don't exist
41 os.makedirs(self.input_dir, exist_ok=True)
42 os.makedirs(self.output_dir, exist_ok=True)
44 # Create test RDF structure
45 self.rdf_dir = os.path.join(self.input_dir, "rdf")
46 self.br_dir = os.path.join(self.rdf_dir, "br")
47 os.makedirs(self.br_dir, exist_ok=True)
49 # Initialize Redis connection for tests
50 self.redis_client = init_redis_connection(db=5) # Use DB 5 for testing
51 self.redis_client.flushdb() # Clear test database
53 def tearDown(self):
54 if os.path.exists(self.base_dir):
55 rmtree(self.base_dir)
56 # Clean up Redis test database
57 self.redis_client.flushdb()
59 def _write_test_data(self, data):
60 """Helper method to write test data to the input directory"""
61 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
62 test_data = [
63 {
64 "@graph": [
65 {
66 "@id": f"https://w3id.org/oc/meta/{item['id'].replace('omid:', '')}",
67 "@type": [
68 "http://purl.org/spar/fabio/Expression",
69 "http://purl.org/spar/fabio/JournalArticle",
70 ],
71 "http://purl.org/dc/terms/title": [{"@value": item["title"]}],
72 }
73 for item in data
74 ]
75 }
76 ]
77 with ZipFile(
78 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
79 ) as zip_file:
80 zip_file.writestr("1000.json", json.dumps(test_data))
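# Note: _write_test_data only populates the "br" branch of the dump; tests that need
# related ar/ra/re/id records write those zip files inline instead of using this helper.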
82 def test_redis_connection_and_caching(self):
83 """Test Redis connection and basic caching operations"""
84 # Test connection initialization
85 redis_client = init_redis_connection(db=5)
86 self.assertIsInstance(redis_client, redis.Redis)
88 # Create a test CSV file with some OMIDs
89 test_data = [
90 {"id": "omid:br/0601", "title": "Test 1"},
91 {"id": "omid:br/0602", "title": "Test 2"},
92 {"id": "omid:br/0603 issn:456", "title": "Test 3"},
93 ]
94 os.makedirs(self.output_dir, exist_ok=True)
95 with open(
96 os.path.join(self.output_dir, "test.csv"), "w", newline="", encoding="utf-8"
97 ) as f:
98 writer = csv.DictWriter(f, fieldnames=["id", "title"])
99 writer.writeheader()
100 writer.writerows(test_data)
102 # Test loading OMIDs into Redis
103 count = load_processed_omids_to_redis(self.output_dir, redis_client)
104 self.assertEqual(count, 3)
106 # Test OMID lookup
107 self.assertTrue(is_omid_processed("omid:br/0601", redis_client))
108 self.assertTrue(is_omid_processed("omid:br/0602", redis_client))
109 self.assertTrue(is_omid_processed("omid:br/0603", redis_client))
110 self.assertFalse(is_omid_processed("omid:br/0604", redis_client))
112 def test_redis_cache_persistence(self):
113 """Test that Redis is populated from existing CSV files and cleared after completion"""
114 # Create initial test data
115 test_data = [
116 {
117 "@graph": [
118 {
119 "@id": "https://w3id.org/oc/meta/br/0601",
120 "@type": [
121 "http://purl.org/spar/fabio/Expression",
122 "http://purl.org/spar/fabio/JournalArticle",
123 ],
124 "http://purl.org/dc/terms/title": [{"@value": "First Run"}],
125 }
126 ]
127 }
128 ]
130 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
131 with ZipFile(
132 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
133 ) as zip_file:
134 zip_file.writestr("1000.json", json.dumps(test_data))
136 # First run - creates initial CSV
137 generate_csv(
138 input_dir=self.rdf_dir,
139 output_dir=self.output_dir,
140 dir_split_number=10000,
141 items_per_file=1000,
142 zip_output_rdf=True,
143 redis_db=5,
144 )
146 # Verify Redis is empty after first run
147 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
149 # Create new test data
150 test_data_2 = [
151 {
152 "@graph": [
153 {
154 "@id": "https://w3id.org/oc/meta/br/0601", # Same OMID as before
155 "@type": [
156 "http://purl.org/spar/fabio/Expression",
157 "http://purl.org/spar/fabio/JournalArticle",
158 ],
159 "http://purl.org/dc/terms/title": [
160 {"@value": "Should Be Skipped"}
161 ],
162 },
163 {
164 "@id": "https://w3id.org/oc/meta/br/0602", # New OMID
165 "@type": [
166 "http://purl.org/spar/fabio/Expression",
167 "http://purl.org/spar/fabio/JournalArticle",
168 ],
169 "http://purl.org/dc/terms/title": [
170 {"@value": "Should Be Processed"}
171 ],
172 },
173 ]
174 }
175 ]
177 with ZipFile(
178 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
179 ) as zip_file:
180 zip_file.writestr("1000.json", json.dumps(test_data_2))
182 # Second run - should load OMIDs from existing CSV and skip already processed resources
183 generate_csv(
184 input_dir=self.rdf_dir,
185 output_dir=self.output_dir,
186 dir_split_number=10000,
187 items_per_file=1000,
188 zip_output_rdf=True,
189 redis_db=5,
190 )
192 # Check output files
193 output_data = []
194 for filename in os.listdir(self.output_dir):
195 if filename.endswith(".csv"):
196 output_data.extend(
197 get_csv_data(os.path.join(self.output_dir, filename))
198 )
200 # Verify results
201 # Should find exactly two entries - one from first run and one new one
202 self.assertEqual(len(output_data), 2)
204 # Find entries by title
205 first_run_entry = next(
206 item for item in output_data if item["title"] == "First Run"
207 )
208 second_run_entry = next(
209 item for item in output_data if item["title"] == "Should Be Processed"
210 )
212 # Verify the first entry wasn't overwritten with "Should Be Skipped"
213 self.assertEqual(first_run_entry["title"], "First Run")
214 self.assertEqual(first_run_entry["id"], "omid:br/0601")
216 # Verify the new entry was processed
217 self.assertEqual(second_run_entry["title"], "Should Be Processed")
218 self.assertEqual(second_run_entry["id"], "omid:br/0602")
220 # Verify Redis is empty after completion
221 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
222 self.assertFalse(is_omid_processed("omid:br/0602", self.redis_client))
224 def test_redis_cache_cleanup(self):
225 """Test that Redis cache is properly cleaned up in various scenarios"""
226 # First run - should process successfully and clear Redis
227 input_data = [{"id": "omid:br/0601", "title": "First Entry"}]
228 self._write_test_data(input_data)
230 # Run with valid directory - should process and clear Redis
231 generate_csv(
232 input_dir=self.rdf_dir,
233 output_dir=self.output_dir,
234 dir_split_number=10000,
235 items_per_file=1000,
236 zip_output_rdf=True,
237 redis_db=5,
238 )
240 # Verify Redis is empty after successful run
241 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
243 # Load processed OMIDs into Redis
244 load_processed_omids_to_redis(self.output_dir, self.redis_client)
246 # Verify that after loading from CSV, the OMID is in Redis
247 self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))
249 # Run with non-existent directory - should fail but keep Redis populated
250 generate_csv(
251 input_dir="/nonexistent/dir",
252 output_dir=self.output_dir,
253 dir_split_number=10000,
254 items_per_file=1000,
255 zip_output_rdf=True,
256 redis_db=5,
257 )
259 # Verify Redis still has the data after failed run
260 self.assertTrue(
261 is_omid_processed("omid:br/0601", self.redis_client),
262 "Redis cache should be retained after a failed run",
263 )
265 def test_redis_error_handling(self):
266 """Test handling of Redis connection errors"""
267 # Test with invalid Redis connection
268 with self.assertRaises(redis.ConnectionError):
269 init_redis_connection(port=12345) # Invalid port
271 # Test loading OMIDs with non-existent directory
272 count = load_processed_omids_to_redis("/nonexistent/dir", self.redis_client)
273 self.assertEqual(count, 0)
275 def test_concurrent_processing_with_redis(self):
276 """Test concurrent processing with Redis caching"""
277 # Create multiple test files
278 test_data = []
279 for i in range(100): # Create 100 test entries
280 test_data.append(
281 {
282 "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
283 "@type": [
284 "http://purl.org/spar/fabio/Expression",
285 "http://purl.org/spar/fabio/JournalArticle",
286 ],
287 "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
288 }
289 )
291 # Split into multiple files
292 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
293 for i in range(0, 100, 10): # Create 10 files with 10 entries each
294 file_data = [{"@graph": test_data[i : i + 10]}]
295 with ZipFile(
296 os.path.join(self.br_dir, "060", "10000", f"{i+1000}.zip"), "w"
297 ) as zip_file:
298 zip_file.writestr(f"{i+1000}.json", json.dumps(file_data))
300 # First run to create some CSV files
301 generate_csv(
302 input_dir=self.rdf_dir,
303 output_dir=self.output_dir,
304 dir_split_number=10000,
305 items_per_file=1000,
306 zip_output_rdf=True,
307 redis_db=5,
308 )
310 # Create more test entries
311 more_test_data = []
312 for i in range(100, 200): # Create 100 more test entries
313 more_test_data.append(
314 {
315 "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
316 "@type": [
317 "http://purl.org/spar/fabio/Expression",
318 "http://purl.org/spar/fabio/JournalArticle",
319 ],
320 "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
321 }
322 )
324 # Add new files
325 for i in range(0, 100, 10):
326 file_data = [{"@graph": more_test_data[i : i + 10]}]
327 with ZipFile(
328 os.path.join(self.br_dir, "060", "10000", f"{i+2000}.zip"), "w"
329 ) as zip_file:
330 zip_file.writestr(f"{i+2000}.json", json.dumps(file_data))
332 # Second run with existing cache
333 generate_csv(
334 input_dir=self.rdf_dir,
335 output_dir=self.output_dir,
336 dir_split_number=10000,
337 items_per_file=1000,
338 zip_output_rdf=True,
339 redis_db=5,
340 )
342 # Verify results
343 all_output_data = []
344 for filename in os.listdir(self.output_dir):
345 if filename.endswith(".csv"):
346 all_output_data.extend(
347 get_csv_data(os.path.join(self.output_dir, filename))
348 )
350 # Should have processed all 200 entries
351 self.assertEqual(len(all_output_data), 200)
353 # Verify no duplicates
354 processed_ids = {row["id"] for row in all_output_data}
355 self.assertEqual(len(processed_ids), 200)
357 def test_basic_br_processing(self):
358 """Test basic bibliographic resource processing"""
359 test_data = [
360 {
361 "@graph": [
362 {
363 "@id": "https://w3id.org/oc/meta/br/0601",
364 "@type": [
365 "http://purl.org/spar/fabio/Expression",
366 "http://purl.org/spar/fabio/JournalArticle",
367 ],
368 "http://purl.org/dc/terms/title": [{"@value": "Test Article"}],
369 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
370 {"@value": "2024-01-01"}
371 ],
372 "http://purl.org/spar/datacite/hasIdentifier": [
373 {"@id": "https://w3id.org/oc/meta/id/0601"}
374 ],
375 }
376 ],
377 "@id": "https://w3id.org/oc/meta/br/",
378 }
379 ]
381 # Write test data to file
382 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
383 with ZipFile(
384 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
385 ) as zip_file:
386 zip_file.writestr("1000.json", json.dumps(test_data))
388 # Run generator
389 generate_csv(
390 input_dir=self.rdf_dir,
391 output_dir=self.output_dir,
392 dir_split_number=10000,
393 items_per_file=1000,
394 zip_output_rdf=True,
395 )
397 # Check output
398 output_files = os.listdir(self.output_dir)
399 self.assertEqual(len(output_files), 1)
401 output_data = get_csv_data(os.path.join(self.output_dir, output_files[0]))
402 self.assertEqual(len(output_data), 1)
403 self.assertEqual(output_data[0]["title"], "Test Article")
404 self.assertEqual(output_data[0]["pub_date"], "2024-01-01")
405 self.assertEqual(output_data[0]["type"], "journal article")
406 self.assertEqual(output_data[0]["id"], "omid:br/0601")
408 def test_complex_br_with_related_entities(self):
409 """Test processing of BR with authors, venue, and other related entities"""
410 # Create directory structure for each entity type
411 supplier_prefix = "060"
412 for entity_type in ["br", "ra", "ar", "id"]:
413 os.makedirs(
414 os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000"),
415 exist_ok=True,
416 )
418 # BR data including both the article and the venue
419 br_data = [
420 {
421 "@graph": [
422 {
423 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
424 "@type": [
425 "http://purl.org/spar/fabio/Expression",
426 "http://purl.org/spar/fabio/JournalArticle",
427 ],
428 "http://purl.org/dc/terms/title": [
429 {"@value": "Complex Article"}
430 ],
431 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
432 {"@value": "2024-02-01"}
433 ],
434 "http://purl.org/spar/pro/isDocumentContextFor": [
435 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
436 ],
437 "http://purl.org/vocab/frbr/core#partOf": [
438 {"@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"}
439 ],
440 },
441 {
442 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
443 "@type": [
444 "http://purl.org/spar/fabio/Expression",
445 "http://purl.org/spar/fabio/Journal",
446 ],
447 "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
448 },
449 ],
450 "@id": "https://w3id.org/oc/meta/br/",
451 }
452 ]
454 ar_data = [
455 {
456 "@graph": [
457 {
458 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
459 "http://purl.org/spar/pro/withRole": [
460 {"@id": "http://purl.org/spar/pro/author"}
461 ],
462 "http://purl.org/spar/pro/isHeldBy": [
463 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
464 ],
465 }
466 ],
467 "@id": "https://w3id.org/oc/meta/ar/",
468 }
469 ]
471 ra_data = [
472 {
473 "@graph": [
474 {
475 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
476 "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
477 }
478 ],
479 "@id": "https://w3id.org/oc/meta/ra/",
480 }
481 ]
483 # Write test data files in correct locations
484 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
486 for entity_type, data in data_files.items():
487 zip_path = os.path.join(
488 self.rdf_dir, entity_type, supplier_prefix, "10000", "1000.zip"
489 )
490 with ZipFile(zip_path, "w") as zip_file:
491 zip_file.writestr("1000.json", json.dumps(data))
493 # Run generator
494 generate_csv(
495 input_dir=self.rdf_dir,
496 output_dir=self.output_dir,
497 dir_split_number=10000,
498 items_per_file=1000,
499 zip_output_rdf=True,
500 )
502 # Check output
503 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
504 self.assertEqual(len(output_data), 2) # Should have 2 rows: article and journal
506 # Find article and journal entries
507 article = next(
508 (item for item in output_data if item["type"] == "journal article"), None
509 )
510 journal = next(
511 (item for item in output_data if item["type"] == "journal"), None
512 )
514 # Verify article data
515 self.assertIsNotNone(article)
516 self.assertEqual(article["title"], "Complex Article")
517 self.assertEqual(article["venue"], f"Test Journal [omid:br/{supplier_prefix}3]")
518 self.assertEqual(article["author"], "Test Author [omid:ra/0601]")
519 self.assertEqual(article["id"], f"omid:br/{supplier_prefix}2")
521 # Verify journal data
522 self.assertIsNotNone(journal)
523 self.assertEqual(journal["title"], "Test Journal")
524 self.assertEqual(journal["type"], "journal")
525 self.assertEqual(journal["id"], f"omid:br/{supplier_prefix}3")
527 def test_empty_input_directory(self):
528 """Test behavior with empty input directory"""
529 generate_csv(
530 input_dir=self.rdf_dir,
531 output_dir=self.output_dir,
532 dir_split_number=10000,
533 items_per_file=1000,
534 zip_output_rdf=True,
535 )
537 self.assertEqual(len(os.listdir(self.output_dir)), 0)
539 def test_br_with_multiple_authors_and_editors(self):
540 """Test processing of BR with multiple authors and editors"""
541 supplier_prefix = "060"
542 br_data = [
543 {
544 "@graph": [
545 {
546 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
547 "@type": [
548 "http://purl.org/spar/fabio/Expression",
549 "http://purl.org/spar/fabio/Book",
550 ],
551 "http://purl.org/dc/terms/title": [
552 {"@value": "Multi-Author Book"}
553 ],
554 "http://purl.org/spar/pro/isDocumentContextFor": [
555 {
556 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
557 }, # First author
558 {
559 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
560 }, # Second author
561 {
562 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
563 }, # First editor
564 {
565 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"
566 }, # Second editor
567 ],
568 }
569 ]
570 }
571 ]
573 # Setup agent roles for authors and editors with hasNext relations
574 ar_data = [
575 {
576 "@graph": [
577 {
578 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
579 "http://purl.org/spar/pro/withRole": [
580 {"@id": "http://purl.org/spar/pro/author"}
581 ],
582 "http://purl.org/spar/pro/isHeldBy": [
583 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
584 ],
585 "https://w3id.org/oc/ontology/hasNext": [
586 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
587 ],
588 },
589 {
590 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
591 "http://purl.org/spar/pro/withRole": [
592 {"@id": "http://purl.org/spar/pro/author"}
593 ],
594 "http://purl.org/spar/pro/isHeldBy": [
595 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
596 ],
597 "https://w3id.org/oc/ontology/hasNext": [
598 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
599 ],
600 },
601 {
602 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
603 "http://purl.org/spar/pro/withRole": [
604 {"@id": "http://purl.org/spar/pro/editor"}
605 ],
606 "http://purl.org/spar/pro/isHeldBy": [
607 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
608 ],
609 "https://w3id.org/oc/ontology/hasNext": [
610 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"}
611 ],
612 },
613 {
614 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4",
615 "http://purl.org/spar/pro/withRole": [
616 {"@id": "http://purl.org/spar/pro/editor"}
617 ],
618 "http://purl.org/spar/pro/isHeldBy": [
619 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4"}
620 ],
621 },
622 ]
623 }
624 ]
626 # Setup responsible agents
627 ra_data = [
628 {
629 "@graph": [
630 {
631 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
632 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
633 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
634 },
635 {
636 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
637 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Doe"}],
638 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Jane"}],
639 },
640 {
641 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
642 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Brown"}],
643 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Bob"}],
644 },
645 {
646 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4",
647 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Wilson"}],
648 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Alice"}],
649 },
650 ]
651 }
652 ]
654 # Write test data files
655 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
657 for entity_type, data in data_files.items():
658 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
659 os.makedirs(dir_path, exist_ok=True)
661 zip_path = os.path.join(dir_path, "1000.zip")
662 with ZipFile(zip_path, "w") as zip_file:
663 zip_file.writestr("1000.json", json.dumps(data))
665 # Run generator
666 generate_csv(
667 input_dir=self.rdf_dir,
668 output_dir=self.output_dir,
669 dir_split_number=10000,
670 items_per_file=1000,
671 zip_output_rdf=True,
672 )
674 # Check output
675 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
676 self.assertEqual(len(output_data), 1)
678 # Verify authors and editors are in the correct order
679 expected_authors = (
680 f"Smith, John [omid:ra/{supplier_prefix}1]; "
681 f"Doe, Jane [omid:ra/{supplier_prefix}2]"
682 )
683 expected_editors = (
684 f"Brown, Bob [omid:ra/{supplier_prefix}3]; "
685 f"Wilson, Alice [omid:ra/{supplier_prefix}4]"
686 )
688 self.assertEqual(output_data[0]["author"], expected_authors)
689 self.assertEqual(output_data[0]["editor"], expected_editors)
691 def test_br_with_identifiers(self):
692 """Test processing of BR with multiple identifiers"""
693 supplier_prefix = "060"
694 br_data = [
695 {
696 "@graph": [
697 {
698 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
699 "@type": [
700 "http://purl.org/spar/fabio/Expression",
701 "http://purl.org/spar/fabio/JournalArticle",
702 ],
703 "http://purl.org/dc/terms/title": [
704 {"@value": "Article With DOI"}
705 ],
706 "http://purl.org/spar/datacite/hasIdentifier": [
707 {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1"},
708 {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2"},
709 ],
710 }
711 ]
712 }
713 ]
715 id_data = [
716 {
717 "@graph": [
718 {
719 "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1",
720 "http://purl.org/spar/datacite/usesIdentifierScheme": [
721 {"@id": "http://purl.org/spar/datacite/doi"}
722 ],
723 "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
724 {"@value": "10.1234/test.123"}
725 ],
726 },
727 {
728 "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2",
729 "http://purl.org/spar/datacite/usesIdentifierScheme": [
730 {"@id": "http://purl.org/spar/datacite/isbn"}
731 ],
732 "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
733 {"@value": "978-0-123456-47-2"}
734 ],
735 },
736 ]
737 }
738 ]
740 # Write test data files in correct locations
741 data_files = {"br": br_data, "id": id_data}
743 for entity_type, data in data_files.items():
744 # Create all necessary directories
745 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
746 os.makedirs(dir_path, exist_ok=True)
748 zip_path = os.path.join(dir_path, "1000.zip")
749 with ZipFile(zip_path, "w") as zip_file:
750 zip_file.writestr("1000.json", json.dumps(data))
752 # Run generator
753 generate_csv(
754 input_dir=self.rdf_dir,
755 output_dir=self.output_dir,
756 dir_split_number=10000,
757 items_per_file=1000,
758 zip_output_rdf=True,
759 )
761 # Check output
762 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
763 self.assertEqual(len(output_data), 1)
765 # Verify all identifiers are included
766 expected_ids = (
767 f"omid:br/{supplier_prefix}1 doi:10.1234/test.123 isbn:978-0-123456-47-2"
768 )
769 self.assertEqual(output_data[0]["id"], expected_ids)
771 def test_br_with_page_numbers(self):
772 """Test processing of BR with page information"""
773 supplier_prefix = "060"
774 br_data = [
775 {
776 "@graph": [
777 {
778 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
779 "@type": [
780 "http://purl.org/spar/fabio/Expression",
781 "http://purl.org/spar/fabio/JournalArticle",
782 ],
783 "http://purl.org/dc/terms/title": [{"@value": "Paged Article"}],
784 "http://purl.org/vocab/frbr/core#embodiment": [
785 {"@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1"}
786 ],
787 }
788 ]
789 }
790 ]
792 re_data = [
793 {
794 "@graph": [
795 {
796 "@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1",
797 "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
798 {"@value": "100"}
799 ],
800 "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
801 {"@value": "120"}
802 ],
803 }
804 ]
805 }
806 ]
808 # Write test data files in correct locations
809 data_files = {"br": br_data, "re": re_data}
811 for entity_type, data in data_files.items():
812 # Create all necessary directories
813 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
814 os.makedirs(dir_path, exist_ok=True)
816 zip_path = os.path.join(dir_path, "1000.zip")
817 with ZipFile(zip_path, "w") as zip_file:
818 zip_file.writestr("1000.json", json.dumps(data))
820 # Run generator
821 generate_csv(
822 input_dir=self.rdf_dir,
823 output_dir=self.output_dir,
824 dir_split_number=10000,
825 items_per_file=1000,
826 zip_output_rdf=True,
827 )
829 # Check output
830 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
831 self.assertEqual(len(output_data), 1)
832 self.assertEqual(output_data[0]["page"], "100-120")
834 def test_malformed_data_handling(self):
835 """Test handling of malformed or incomplete data"""
836 supplier_prefix = "060"
837 br_data = [
838 {
839 "@graph": [
840 {
841 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
842 "@type": [
843 "http://purl.org/spar/fabio/Expression",
844 "http://purl.org/spar/fabio/JournalArticle",
845 ],
846 # Missing title
847 "http://purl.org/spar/pro/isDocumentContextFor": [
848 {"@id": "invalid_uri"}, # Invalid URI
849 ],
850 "http://purl.org/vocab/frbr/core#partOf": [
851 {"@id": "non_existent_venue"} # Non-existent venue
852 ],
853 }
854 ]
855 }
856 ]
858 # Write test data files in correct locations
859 data_files = {"br": br_data}
861 for entity_type, data in data_files.items():
862 # Create all necessary directories
863 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
864 os.makedirs(dir_path, exist_ok=True)
866 zip_path = os.path.join(dir_path, "1000.zip")
867 with ZipFile(zip_path, "w") as zip_file:
868 zip_file.writestr("1000.json", json.dumps(data))
870 # Run generator
871 generate_csv(
872 input_dir=self.rdf_dir,
873 output_dir=self.output_dir,
874 dir_split_number=10000,
875 items_per_file=1000,
876 zip_output_rdf=True,
877 )
879 # Check output
880 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
881 self.assertEqual(len(output_data), 1)
882 # Verify graceful handling of missing/invalid data
883 self.assertEqual(output_data[0]["title"], "")
884 self.assertEqual(output_data[0]["author"], "")
885 self.assertEqual(output_data[0]["venue"], "")
887 def test_br_with_hierarchical_venue_structures(self):
888 """Test different hierarchical venue structures (issue->volume->journal, issue->journal, volume->journal, direct journal)"""
889 supplier_prefix = "060"
891 # Create test data for different hierarchical structures
892 br_data = [
893 {
894 "@graph": [
895 # Article in issue->volume->journal structure
896 {
897 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
898 "@type": [
899 "http://purl.org/spar/fabio/Expression",
900 "http://purl.org/spar/fabio/JournalArticle",
901 ],
902 "http://purl.org/dc/terms/title": [
903 {"@value": "Article in Full Hierarchy"}
904 ],
905 "http://purl.org/vocab/frbr/core#partOf": [
906 {
907 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
908 } # Issue
909 ],
910 },
911 # Article in issue->journal structure (no volume)
912 {
913 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}5",
914 "@type": [
915 "http://purl.org/spar/fabio/Expression",
916 "http://purl.org/spar/fabio/JournalArticle",
917 ],
918 "http://purl.org/dc/terms/title": [
919 {"@value": "Article in Issue-Journal"}
920 ],
921 "http://purl.org/vocab/frbr/core#partOf": [
922 {
923 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6"
924 } # Issue
925 ],
926 },
927 # Article in volume->journal structure (no issue)
928 {
929 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}9",
930 "@type": [
931 "http://purl.org/spar/fabio/Expression",
932 "http://purl.org/spar/fabio/JournalArticle",
933 ],
934 "http://purl.org/dc/terms/title": [
935 {"@value": "Article in Volume-Journal"}
936 ],
937 "http://purl.org/vocab/frbr/core#partOf": [
938 {
939 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10"
940 } # Volume
941 ],
942 },
943 # Article directly in journal
944 {
945 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}13",
946 "@type": [
947 "http://purl.org/spar/fabio/Expression",
948 "http://purl.org/spar/fabio/JournalArticle",
949 ],
950 "http://purl.org/dc/terms/title": [
951 {"@value": "Article in Journal"}
952 ],
953 "http://purl.org/vocab/frbr/core#partOf": [
954 {
955 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
956 } # Journal
957 ],
958 },
959 # Issue in full hierarchy
960 {
961 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
962 "@type": ["http://purl.org/spar/fabio/JournalIssue"],
963 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
964 {"@value": "2"}
965 ],
966 "http://purl.org/vocab/frbr/core#partOf": [
967 {
968 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"
969 } # Volume
970 ],
971 },
972 # Volume in full hierarchy
973 {
974 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
975 "@type": ["http://purl.org/spar/fabio/JournalVolume"],
976 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
977 {"@value": "42"}
978 ],
979 "http://purl.org/vocab/frbr/core#partOf": [
980 {
981 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
982 } # Journal
983 ],
984 },
985 # Journal
986 {
987 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4",
988 "@type": ["http://purl.org/spar/fabio/Journal"],
989 "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
990 },
991 # Issue directly in journal
992 {
993 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6",
994 "@type": ["http://purl.org/spar/fabio/JournalIssue"],
995 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
996 {"@value": "3"}
997 ],
998 "http://purl.org/vocab/frbr/core#partOf": [
999 {
1000 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
1001 } # Journal
1002 ],
1003 },
1004 # Volume directly in journal
1005 {
1006 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10",
1007 "@type": ["http://purl.org/spar/fabio/JournalVolume"],
1008 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
1009 {"@value": "5"}
1010 ],
1011 "http://purl.org/vocab/frbr/core#partOf": [
1012 {
1013 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
1014 } # Journal
1015 ],
1016 },
1017 ]
1018 }
1019 ]
1021 # Write test data files
1022 dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
1023 os.makedirs(dir_path, exist_ok=True)
1025 zip_path = os.path.join(dir_path, "1000.zip")
1026 with ZipFile(zip_path, "w") as zip_file:
1027 zip_file.writestr("1000.json", json.dumps(br_data))
1029 # Run generator
1030 generate_csv(
1031 input_dir=self.rdf_dir,
1032 output_dir=self.output_dir,
1033 dir_split_number=10000,
1034 items_per_file=1000,
1035 zip_output_rdf=True,
1036 )
1038 # Check output
1039 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1041 # Verify we only have the articles and journal in the output
1042 self.assertEqual(len(output_data), 5) # 4 articles + 1 journal
1044 # Verify no JournalVolume or JournalIssue entries exist
1045 volume_or_issue_entries = [
1046 item
1047 for item in output_data
1048 if item["type"] in ["journal volume", "journal issue"]
1049 ]
1050 self.assertEqual(len(volume_or_issue_entries), 0)
1052 # Find each article by title
1053 full_hierarchy = next(
1054 item for item in output_data if item["title"] == "Article in Full Hierarchy"
1055 )
1056 issue_journal = next(
1057 item for item in output_data if item["title"] == "Article in Issue-Journal"
1058 )
1059 volume_journal = next(
1060 item for item in output_data if item["title"] == "Article in Volume-Journal"
1061 )
1062 direct_journal = next(
1063 item for item in output_data if item["title"] == "Article in Journal"
1064 )
1066 # Test full hierarchy (issue->volume->journal)
1067 self.assertEqual(full_hierarchy["issue"], "2")
1068 self.assertEqual(full_hierarchy["volume"], "42")
1069 self.assertEqual(
1070 full_hierarchy["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1071 )
1073 # Test issue->journal (no volume)
1074 self.assertEqual(issue_journal["issue"], "3")
1075 self.assertEqual(issue_journal["volume"], "")
1076 self.assertEqual(
1077 issue_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1078 )
1080 # Test volume->journal (no issue)
1081 self.assertEqual(volume_journal["issue"], "")
1082 self.assertEqual(volume_journal["volume"], "5")
1083 self.assertEqual(
1084 volume_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1085 )
1087 # Test direct journal connection
1088 self.assertEqual(direct_journal["issue"], "")
1089 self.assertEqual(direct_journal["volume"], "")
1090 self.assertEqual(
1091 direct_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1092 )
1094 def test_book_in_series(self):
1095 """Test processing of a book that is part of a book series"""
1096 supplier_prefix = "060"
1098 # Create test data for book in series
1099 br_data = [
1100 {
1101 "@graph": [
1102 # Book
1103 {
1104 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1105 "@type": [
1106 "http://purl.org/spar/fabio/Expression",
1107 "http://purl.org/spar/fabio/Book",
1108 ],
1109 "http://purl.org/dc/terms/title": [{"@value": "Test Book"}],
1110 "http://purl.org/vocab/frbr/core#partOf": [
1111 {
1112 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
1113 } # Series
1114 ],
1115 },
1116 # Book Series
1117 {
1118 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
1119 "@type": ["http://purl.org/spar/fabio/BookSeries"],
1120 "http://purl.org/dc/terms/title": [
1121 {"@value": "Test Book Series"}
1122 ],
1123 },
1124 ]
1125 }
1126 ]
1128 # Write test data
1129 dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
1130 os.makedirs(dir_path, exist_ok=True)
1132 zip_path = os.path.join(dir_path, "1000.zip")
1133 with ZipFile(zip_path, "w") as zip_file:
1134 zip_file.writestr("1000.json", json.dumps(br_data))
1136 # Run generator
1137 generate_csv(
1138 input_dir=self.rdf_dir,
1139 output_dir=self.output_dir,
1140 dir_split_number=10000,
1141 items_per_file=1000,
1142 zip_output_rdf=True,
1143 )
1145 # Check output
1146 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1148 # Find book entry
1149 book = next(item for item in output_data if item["type"] == "book")
1151 # Verify book is correctly linked to series
1152 self.assertEqual(book["title"], "Test Book")
1153 self.assertEqual(
1154 book["venue"], f"Test Book Series [omid:br/{supplier_prefix}2]"
1155 )
1156 self.assertEqual(book["volume"], "") # Should not have volume
1157 self.assertEqual(book["issue"], "") # Should not have issue
1159 def test_br_with_multiple_roles(self):
1160 """Test processing of BR with authors, editors and publishers"""
1161 supplier_prefix = "060"
1162 br_data = [
1163 {
1164 "@graph": [
1165 {
1166 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1167 "@type": [
1168 "http://purl.org/spar/fabio/Expression",
1169 "http://purl.org/spar/fabio/Book",
1170 ],
1171 "http://purl.org/dc/terms/title": [
1172 {"@value": "Multi-Role Book"}
1173 ],
1174 "http://purl.org/spar/pro/isDocumentContextFor": [
1175 {
1176 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
1177 }, # Author
1178 {
1179 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
1180 }, # Editor
1181 {
1182 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
1183 }, # Publisher
1184 ],
1185 }
1186 ]
1187 }
1188 ]
1190 # Setup agent roles for authors, editors and publishers
1191 ar_data = [
1192 {
1193 "@graph": [
1194 {
1195 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
1196 "http://purl.org/spar/pro/withRole": [
1197 {"@id": "http://purl.org/spar/pro/author"}
1198 ],
1199 "http://purl.org/spar/pro/isHeldBy": [
1200 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
1201 ],
1202 "https://w3id.org/oc/ontology/hasNext": [
1203 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
1204 ],
1205 },
1206 {
1207 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
1208 "http://purl.org/spar/pro/withRole": [
1209 {"@id": "http://purl.org/spar/pro/editor"}
1210 ],
1211 "http://purl.org/spar/pro/isHeldBy": [
1212 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
1213 ],
1214 "https://w3id.org/oc/ontology/hasNext": [
1215 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
1216 ],
1217 },
1218 {
1219 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
1220 "http://purl.org/spar/pro/withRole": [
1221 {"@id": "http://purl.org/spar/pro/publisher"}
1222 ],
1223 "http://purl.org/spar/pro/isHeldBy": [
1224 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
1225 ],
1226 },
1227 ]
1228 }
1229 ]
1231 # Setup responsible agents with different name formats
1232 ra_data = [
1233 {
1234 "@graph": [
1235 {
1236 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
1237 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
1238 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
1239 },
1240 {
1241 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
1242 "http://xmlns.com/foaf/0.1/name": [{"@value": "Editor Name"}],
1243 },
1244 {
1245 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
1246 "http://xmlns.com/foaf/0.1/name": [
1247 {"@value": "Publisher House"}
1248 ],
1249 },
1250 ]
1251 }
1252 ]
1254 # Write test data files
1255 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
1257 for entity_type, data in data_files.items():
1258 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
1259 os.makedirs(dir_path, exist_ok=True)
1261 zip_path = os.path.join(dir_path, "1000.zip")
1262 with ZipFile(zip_path, "w") as zip_file:
1263 zip_file.writestr("1000.json", json.dumps(data))
1265 # Run generator
1266 generate_csv(
1267 input_dir=self.rdf_dir,
1268 output_dir=self.output_dir,
1269 dir_split_number=10000,
1270 items_per_file=1000,
1271 zip_output_rdf=True,
1272 )
1274 # Check output
1275 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1276 self.assertEqual(len(output_data), 1)
1278 # Verify all roles are correctly processed
1279 book = output_data[0]
1280 self.assertEqual(book["title"], "Multi-Role Book")
1281 self.assertEqual(book["author"], f"Smith, John [omid:ra/{supplier_prefix}1]")
1282 self.assertEqual(book["editor"], f"Editor Name [omid:ra/{supplier_prefix}2]")
1283 self.assertEqual(
1284 book["publisher"], f"Publisher House [omid:ra/{supplier_prefix}3]"
1285 )
1287 def test_ordered_authors(self):
1288 """Test that authors are ordered according to hasNext relations"""
1289 supplier_prefix = "060"
1290 br_data = [
1291 {
1292 "@graph": [
1293 {
1294 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1295 "@type": [
1296 "http://purl.org/spar/fabio/Expression",
1297 "http://purl.org/spar/fabio/JournalArticle",
1298 ],
1299 "http://purl.org/dc/terms/title": [
1300 {"@value": "Ordered Authors Article"}
1301 ],
1302 "http://purl.org/spar/pro/isDocumentContextFor": [
1303 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
1304 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
1305 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
1306 ],
1307 }
1308 ]
1309 }
1310 ]
1312 # Setup agent roles with hasNext relations
1313 ar_data = [
1314 {
1315 "@graph": [
1316 {
1317 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
1318 "http://purl.org/spar/pro/withRole": [
1319 {"@id": "http://purl.org/spar/pro/author"}
1320 ],
1321 "http://purl.org/spar/pro/isHeldBy": [
1322 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
1323 ],
1324 "https://w3id.org/oc/ontology/hasNext": [
1325 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
1326 ],
1327 },
1328 {
1329 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
1330 "http://purl.org/spar/pro/withRole": [
1331 {"@id": "http://purl.org/spar/pro/author"}
1332 ],
1333 "http://purl.org/spar/pro/isHeldBy": [
1334 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
1335 ],
1336 "https://w3id.org/oc/ontology/hasNext": [
1337 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
1338 ],
1339 },
1340 {
1341 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
1342 "http://purl.org/spar/pro/withRole": [
1343 {"@id": "http://purl.org/spar/pro/author"}
1344 ],
1345 "http://purl.org/spar/pro/isHeldBy": [
1346 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
1347 ],
1348 },
1349 ]
1350 }
1351 ]
1353 # Setup responsible agents with different names
1354 ra_data = [
1355 {
1356 "@graph": [
1357 {
1358 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
1359 "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
1360 },
1361 {
1362 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
1363 "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
1364 },
1365 {
1366 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
1367 "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
1368 },
1369 ]
1370 }
1371 ]
1373 # Write test data files
1374 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
1376 for entity_type, data in data_files.items():
1377 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
1378 os.makedirs(dir_path, exist_ok=True)
1380 zip_path = os.path.join(dir_path, "1000.zip")
1381 with ZipFile(zip_path, "w") as zip_file:
1382 zip_file.writestr("1000.json", json.dumps(data))
1384 # Run generator
1385 generate_csv(
1386 input_dir=self.rdf_dir,
1387 output_dir=self.output_dir,
1388 dir_split_number=10000,
1389 items_per_file=1000,
1390 zip_output_rdf=True,
1391 )
1393 # Check output
1394 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1395 self.assertEqual(len(output_data), 1)
1397 # Verify authors are in the correct order
1398 expected_authors = (
1399 f"First Author [omid:ra/{supplier_prefix}1]; "
1400 f"Second Author [omid:ra/{supplier_prefix}2]; "
1401 f"Third Author [omid:ra/{supplier_prefix}3]"
1402 )
1403 self.assertEqual(output_data[0]["author"], expected_authors)
1405 def test_cyclic_hasNext_relations(self):
1406 """Test handling of cyclic hasNext relations between agent roles"""
1407 supplier_prefix = "060"
1408 br_data = [
1409 {
1410 "@graph": [
1411 {
1412 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1413 "@type": [
1414 "http://purl.org/spar/fabio/Expression",
1415 "http://purl.org/spar/fabio/JournalArticle",
1416 ],
1417 "http://purl.org/dc/terms/title": [
1418 {"@value": "Cyclic Authors Article"}
1419 ],
1420 "http://purl.org/spar/pro/isDocumentContextFor": [
1421 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
1422 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
1423 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
1424 ],
1425 }
1426 ]
1427 }
1428 ]
1430 # Setup agent roles with cyclic hasNext relations
1431 ar_data = [
1432 {
1433 "@graph": [
1434 {
1435 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
1436 "http://purl.org/spar/pro/withRole": [
1437 {"@id": "http://purl.org/spar/pro/author"}
1438 ],
1439 "http://purl.org/spar/pro/isHeldBy": [
1440 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
1441 ],
1442 "https://w3id.org/oc/ontology/hasNext": [
1443 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
1444 ],
1445 },
1446 {
1447 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
1448 "http://purl.org/spar/pro/withRole": [
1449 {"@id": "http://purl.org/spar/pro/author"}
1450 ],
1451 "http://purl.org/spar/pro/isHeldBy": [
1452 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
1453 ],
1454 # Creates a cycle: 1 -> 2 -> 3 -> 1
1455 "https://w3id.org/oc/ontology/hasNext": [
1456 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
1457 ],
1458 },
1459 {
1460 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
1461 "http://purl.org/spar/pro/withRole": [
1462 {"@id": "http://purl.org/spar/pro/author"}
1463 ],
1464 "http://purl.org/spar/pro/isHeldBy": [
1465 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
1466 ],
1467 # Cycle completion
1468 "https://w3id.org/oc/ontology/hasNext": [
1469 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
1470 ],
1471 },
1472 ]
1473 }
1474 ]
1476 # Setup responsible agents
1477 ra_data = [
1478 {
1479 "@graph": [
1480 {
1481 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
1482 "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
1483 },
1484 {
1485 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
1486 "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
1487 },
1488 {
1489 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
1490 "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
1491 },
1492 ]
1493 }
1494 ]
1496 # Write test data files
1497 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
1499 for entity_type, data in data_files.items():
1500 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
1501 os.makedirs(dir_path, exist_ok=True)
1503 zip_path = os.path.join(dir_path, "1000.zip")
1504 with ZipFile(zip_path, "w") as zip_file:
1505 zip_file.writestr("1000.json", json.dumps(data))
1507 # Run generator
1508 generate_csv(
1509 input_dir=self.rdf_dir,
1510 output_dir=self.output_dir,
1511 dir_split_number=10000,
1512 items_per_file=1000,
1513 zip_output_rdf=True,
1514 )
1516 # Check output
1517 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1518 self.assertEqual(len(output_data), 1)
1520 # Verify that we get at least some authors before the cycle is detected
1521 # The order should be maintained until the cycle is detected
1522 authors = output_data[0]["author"].split("; ")
1523 self.assertGreater(len(authors), 0)
1525 # Verify the presence and order of authors
1526 self.assertTrue(
1527 any(
1528 f"First Author [omid:ra/{supplier_prefix}1]" in author
1529 for author in authors
1530 )
1531 )
1532 self.assertTrue(
1533 any(
1534 f"Second Author [omid:ra/{supplier_prefix}2]" in author
1535 for author in authors
1536 )
1537 )
1539 # Verify no duplicates in the output
1540 author_set = set(authors)
1541 self.assertEqual(
1542 len(authors),
1543 len(author_set),
1544 "Found duplicate authors in output: each author should appear exactly once",
1545 )
1547 # Verify the exact order and number of authors
1548 expected_authors = [
1549 f"First Author [omid:ra/{supplier_prefix}1]",
1550 f"Second Author [omid:ra/{supplier_prefix}2]",
1551 f"Third Author [omid:ra/{supplier_prefix}3]",
1552 ]
1553 self.assertEqual(
1554 authors,
1555 expected_authors,
1556 "Authors should be in correct order and each should appear exactly once",
1557 )
1559 def test_multiple_input_files(self):
1560 """Test processing of multiple input files with sequential entity IDs"""
1561 supplier_prefix = "060"
1563 # Create test data spanning multiple files
1564 # First file (entities 1-1000)
1565 br_data_1 = [
1566 {
1567 "@graph": [
1568 {
1569 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1570 "@type": [
1571 "http://purl.org/spar/fabio/Expression",
1572 "http://purl.org/spar/fabio/JournalArticle",
1573 ],
1574 "http://purl.org/dc/terms/title": [{"@value": "Article 1"}],
1575 },
1576 {
1577 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1000",
1578 "@type": [
1579 "http://purl.org/spar/fabio/Expression",
1580 "http://purl.org/spar/fabio/JournalArticle",
1581 ],
1582 "http://purl.org/dc/terms/title": [{"@value": "Article 1000"}],
1583 },
1584 ]
1585 }
1586 ]
1588 # Second file (entities 1001-2000)
1589 br_data_2 = [
1590 {
1591 "@graph": [
1592 {
1593 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1001",
1594 "@type": [
1595 "http://purl.org/spar/fabio/Expression",
1596 "http://purl.org/spar/fabio/JournalArticle",
1597 ],
1598 "http://purl.org/dc/terms/title": [{"@value": "Article 1001"}],
1599 },
1600 {
1601 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2000",
1602 "@type": [
1603 "http://purl.org/spar/fabio/Expression",
1604 "http://purl.org/spar/fabio/JournalArticle",
1605 ],
1606 "http://purl.org/dc/terms/title": [{"@value": "Article 2000"}],
1607 },
1608 ]
1609 }
1610 ]
1612 # Third file (entities 2001-3000)
1613 br_data_3 = [
1614 {
1615 "@graph": [
1616 {
1617 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2001",
1618 "@type": [
1619 "http://purl.org/spar/fabio/Expression",
1620 "http://purl.org/spar/fabio/JournalArticle",
1621 ],
1622 "http://purl.org/dc/terms/title": [{"@value": "Article 2001"}],
1623 "http://purl.org/spar/pro/isDocumentContextFor": [
1624 {
1625 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001"
1626 }
1627 ],
1628 }
1629 ]
1630 }
1631 ]
1633 # Create agent role data in a different file
1634 ar_data = [
1635 {
1636 "@graph": [
1637 {
1638 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001",
1639 "http://purl.org/spar/pro/withRole": [
1640 {"@id": "http://purl.org/spar/pro/author"}
1641 ],
1642 "http://purl.org/spar/pro/isHeldBy": [
1643 {
1644 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001"
1645 }
1646 ],
1647 }
1648 ]
1649 }
1650 ]
1652 # Create responsible agent data in a different file
1653 ra_data = [
1654 {
1655 "@graph": [
1656 {
1657 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001",
1658 "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
1659 }
1660 ]
1661 }
1662 ]
1664 # Write test data to appropriate locations based on ID ranges
1665 os.makedirs(os.path.join(self.br_dir, supplier_prefix, "10000"), exist_ok=True)
1666 os.makedirs(
1667 os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000"), exist_ok=True
1668 )
1669 os.makedirs(
1670 os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000"), exist_ok=True
1671 )
1673 # Write BR files
1674 with ZipFile(
1675 os.path.join(self.br_dir, supplier_prefix, "10000", "1000.zip"), "w"
1676 ) as zip_file:
1677 zip_file.writestr("1000.json", json.dumps(br_data_1))
1678 with ZipFile(
1679 os.path.join(self.br_dir, supplier_prefix, "10000", "2000.zip"), "w"
1680 ) as zip_file:
1681 zip_file.writestr("2000.json", json.dumps(br_data_2))
1682 with ZipFile(
1683 os.path.join(self.br_dir, supplier_prefix, "10000", "3000.zip"), "w"
1684 ) as zip_file:
1685 zip_file.writestr("3000.json", json.dumps(br_data_3))
1687 # Write AR and RA files
1688 with ZipFile(
1689 os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000", "3000.zip"), "w"
1690 ) as zip_file:
1691 zip_file.writestr("3000.json", json.dumps(ar_data))
1692 with ZipFile(
1693 os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000", "3000.zip"), "w"
1694 ) as zip_file:
1695 zip_file.writestr("3000.json", json.dumps(ra_data))
1697 # Run generator
1698 generate_csv(
1699 input_dir=self.rdf_dir,
1700 output_dir=self.output_dir,
1701 dir_split_number=10000,
1702 items_per_file=1000,
1703 zip_output_rdf=True,
1704 )
1706 # Check output
1707 output_files = sorted(os.listdir(self.output_dir))
1708 self.assertGreater(len(output_files), 0)
1710 # Collect all output data
1711 all_output_data = []
1712 for output_file in output_files:
1713 all_output_data.extend(
1714 get_csv_data(os.path.join(self.output_dir, output_file))
1715 )
1717 # Verify we have all expected entries
1718 self.assertEqual(len(all_output_data), 5) # Should have 5 articles total
1720 # Verify specific entries
1721 article_1 = next(
1722 item
1723 for item in all_output_data
1724 if item["id"] == f"omid:br/{supplier_prefix}1"
1725 )
1726 article_1000 = next(
1727 item
1728 for item in all_output_data
1729 if item["id"] == f"omid:br/{supplier_prefix}1000"
1730 )
1731 article_1001 = next(
1732 item
1733 for item in all_output_data
1734 if item["id"] == f"omid:br/{supplier_prefix}1001"
1735 )
1736 article_2000 = next(
1737 item
1738 for item in all_output_data
1739 if item["id"] == f"omid:br/{supplier_prefix}2000"
1740 )
1741 article_2001 = next(
1742 item
1743 for item in all_output_data
1744 if item["id"] == f"omid:br/{supplier_prefix}2001"
1745 )
1747 # Check titles
1748 self.assertEqual(article_1["title"], "Article 1")
1749 self.assertEqual(article_1000["title"], "Article 1000")
1750 self.assertEqual(article_1001["title"], "Article 1001")
1751 self.assertEqual(article_2000["title"], "Article 2000")
1752 self.assertEqual(article_2001["title"], "Article 2001")
1754 # Check author for article 2001 (which has related entities)
1755 self.assertEqual(
1756 article_2001["author"], f"Test Author [omid:ra/{supplier_prefix}2001]"
1757 )
1759 def test_max_rows_per_file_and_data_integrity(self):
1760 """Test that output files respect max rows limit and no data is lost in multiprocessing"""
1761 supplier_prefix = "060"
1763 # Create test data with more than 3000 entries
1764 br_data = [
1765 {
1766 "@graph": [
1767 # Generate 3500 test entries
1768 *[
1769 {
1770 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}{i}",
1771 "@type": [
1772 "http://purl.org/spar/fabio/Expression",
1773 "http://purl.org/spar/fabio/JournalArticle",
1774 ],
1775 "http://purl.org/dc/terms/title": [
1776 {"@value": f"Article {i}"}
1777 ],
1778 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
1779 {"@value": "2024-01-01"}
1780 ],
1781 }
1782 for i in range(1, 3501)
1783 ] # This will create 3500 entries
1784 ]
1785 }
1786 ]
1788 # Split data into multiple files to test multiprocessing
1789 entries_per_file = 1000
1790 for i in range(0, 3500, entries_per_file):
1791 file_data = [{"@graph": br_data[0]["@graph"][i : i + entries_per_file]}]
1793 # Create directory structure for the file
1794 file_number = i + entries_per_file
1795 dir_path = os.path.join(self.br_dir, supplier_prefix, "10000")
1796 os.makedirs(dir_path, exist_ok=True)
1798 # Write the file
1799 with ZipFile(os.path.join(dir_path, f"{file_number}.zip"), "w") as zip_file:
1800 zip_file.writestr(f"{file_number}.json", json.dumps(file_data))
1802 # Run generator
1803 generate_csv(
1804 input_dir=self.rdf_dir,
1805 output_dir=self.output_dir,
1806 dir_split_number=10000,
1807 items_per_file=1000,
1808 zip_output_rdf=True,
1809 )
1811 # Check output files
1812 output_files = sorted(os.listdir(self.output_dir))
1814 # Verify number of output files
1815 # We expect at least 2 files: 3500 entries should create 2 files (3000 + 500)
1816 self.assertGreaterEqual(
1817 len(output_files), 2, "Should have at least 2 output files for 3500 entries"
1818 )
1820 # Collect all entries from all output files
1821 all_entries = []
1822 for output_file in output_files:
1823 entries = get_csv_data(os.path.join(self.output_dir, output_file))
1825 # Verify each file has at most 3000 rows
1826 self.assertLessEqual(
1827 len(entries),
1828 3000,
1829 f"File {output_file} has more than 3000 rows: {len(entries)}",
1830 )
1832 all_entries.extend(entries)
1834 # Verify total number of entries
1835 self.assertEqual(
1836 len(all_entries),
1837 3500,
1838 f"Expected 3500 total entries, got {len(all_entries)}",
1839 )
1841 # Verify no duplicate entries
1842 unique_ids = {entry["id"] for entry in all_entries}
1843 self.assertEqual(
1844 len(unique_ids),
1845 3500,
1846 f"Expected 3500 unique entries, got {len(unique_ids)}",
1847 )
1849 # Verify all entries are present (no missing entries)
1850 expected_ids = {f"omid:br/{supplier_prefix}{i}" for i in range(1, 3501)}
1851 self.assertEqual(
1852 unique_ids,
1853 expected_ids,
1854 "Some entries are missing or unexpected entries are present",
1855 )
1857 # Verify data integrity
1858 for i in range(1, 3501):
1859 entry = next(
1860 e for e in all_entries if e["id"] == f"omid:br/{supplier_prefix}{i}"
1861 )
1862 self.assertEqual(entry["title"], f"Article {i}")
1863 self.assertEqual(entry["pub_date"], "2024-01-01")
1864 self.assertEqual(entry["type"], "journal article")
1866 def test_csv_field_limit_handling(self):
1867 """Test handling of CSV files with large fields that exceed the default limit"""
1868 # Create a test CSV with a very large field
1869 large_field = "omid:br/0601 " + " ".join(
1870 [f"id:{i}" for i in range(20000)]
1871 ) # Roughly 169,000 chars, above the csv module's default 131,072-char field limit
1872 test_data = {"id": large_field, "title": "Test Large Field"}
1874 os.makedirs(self.output_dir, exist_ok=True)
1875 with open(
1876 os.path.join(self.output_dir, "large_field.csv"),
1877 "w",
1878 newline="",
1879 encoding="utf-8",
1880 ) as f:
1881 writer = csv.DictWriter(f, fieldnames=["id", "title"])
1882 writer.writeheader()
1883 writer.writerow(test_data)
1885 # Try loading the data - this should trigger the field limit increase
1886 count = load_processed_omids_to_redis(self.output_dir, self.redis_client)
1888 # Verify the OMID was loaded despite the large field
1889 self.assertEqual(count, 1)
1890 self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))
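# A minimal sketch of the recovery pattern this test exercises, assuming
# load_processed_omids_to_redis relies on the standard csv module (the actual
# implementation is not shown here): csv raises csv.Error ("field larger than
# field limit (131072)") for oversized fields, and the usual fix is to raise
# the limit and re-read the file.
#
#     import csv
#
#     def read_rows(path):
#         with open(path, newline="", encoding="utf-8") as handle:
#             try:
#                 return list(csv.DictReader(handle))
#             except csv.Error:
#                 csv.field_size_limit(2**31 - 1)  # large but portable limit
#                 handle.seek(0)
#                 return list(csv.DictReader(handle))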
1892 def test_complex_br_with_missing_authors(self):
1893 """Test processing of a complex BR with multiple related entities where authors might be missing"""
1894 supplier_prefix = "06250"
1895 br_data = [
1896 {
1897 "@graph": [
1898 {
1899 "@id": "https://w3id.org/oc/meta/br/062501777134",
1900 "@type": [
1901 "http://purl.org/spar/fabio/JournalArticle",
1902 "http://purl.org/spar/fabio/Expression",
1903 ],
1904 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
1905 {
1906 "@type": "http://www.w3.org/2001/XMLSchema#gYearMonth",
1907 "@value": "2020-02",
1908 }
1909 ],
1910 "http://purl.org/dc/terms/title": [
1911 {
1912 "@value": "OpenCitations, An Infrastructure Organization For Open Scholarship"
1913 }
1914 ],
1915 "http://purl.org/spar/datacite/hasIdentifier": [
1916 {"@id": "https://w3id.org/oc/meta/id/062501806985"},
1917 {"@id": "https://w3id.org/oc/meta/id/06850624745"},
1918 ],
1919 "http://purl.org/spar/pro/isDocumentContextFor": [
1920 {"@id": "https://w3id.org/oc/meta/ar/062507977761"},
1921 {"@id": "https://w3id.org/oc/meta/ar/062507977760"},
1922 {"@id": "https://w3id.org/oc/meta/ar/062507977759"},
1923 ],
1924 "http://purl.org/vocab/frbr/core#embodiment": [
1925 {"@id": "https://w3id.org/oc/meta/re/062501477439"}
1926 ],
1927 "http://purl.org/vocab/frbr/core#partOf": [
1928 {"@id": "https://w3id.org/oc/meta/br/062501778111"}
1929 ],
1930 }
1931 ]
1932 }
1933 ]
1935 ar_data = [
1936 {
1937 "@graph": [
1938 {
1939 "@id": "https://w3id.org/oc/meta/ar/062507977761",
1940 "@type": ["http://purl.org/spar/pro/RoleInTime"],
1941 "http://purl.org/spar/pro/isHeldBy": [
1942 {"@id": "https://w3id.org/oc/meta/ra/0610116105"}
1943 ],
1944 "http://purl.org/spar/pro/withRole": [
1945 {"@id": "http://purl.org/spar/pro/publisher"}
1946 ],
1947 },
1948 {
1949 "@id": "https://w3id.org/oc/meta/ar/062507977760",
1950 "@type": ["http://purl.org/spar/pro/RoleInTime"],
1951 "http://purl.org/spar/pro/isHeldBy": [
1952 {"@id": "https://w3id.org/oc/meta/ra/0621010775619"}
1953 ],
1954 "http://purl.org/spar/pro/withRole": [
1955 {"@id": "http://purl.org/spar/pro/author"}
1956 ],
1957 },
1958 {
1959 "@id": "https://w3id.org/oc/meta/ar/062507977759",
1960 "@type": ["http://purl.org/spar/pro/RoleInTime"],
1961 "http://purl.org/spar/pro/isHeldBy": [
1962 {"@id": "https://w3id.org/oc/meta/ra/0614010840729"}
1963 ],
1964 "http://purl.org/spar/pro/withRole": [
1965 {"@id": "http://purl.org/spar/pro/author"}
1966 ],
1967 "https://w3id.org/oc/ontology/hasNext": [
1968 {"@id": "https://w3id.org/oc/meta/ar/062507977760"}
1969 ],
1970 },
1971 ]
1972 }
1973 ]
1975 ra_data_peroni = [
1976 {
1977 "@graph": [
1978 {
1979 "@id": "https://w3id.org/oc/meta/ra/0614010840729",
1980 "@type": ["http://xmlns.com/foaf/0.1/Agent"],
1981 "http://purl.org/spar/datacite/hasIdentifier": [
1982 {"@id": "https://w3id.org/oc/meta/id/06304949238"}
1983 ],
1984 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Peroni"}],
1985 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Silvio"}],
1986 "http://xmlns.com/foaf/0.1/name": [{"@value": "Peroni Silvio"}],
1987 }
1988 ]
1989 }
1990 ]
1992 ra_data_shotton = [
1993 {
1994 "@graph": [
1995 {
1996 "@id": "https://w3id.org/oc/meta/ra/0621010775619",
1997 "@type": ["http://xmlns.com/foaf/0.1/Agent"],
1998 "http://purl.org/spar/datacite/hasIdentifier": [
1999 {"@id": "https://w3id.org/oc/meta/id/062404672414"}
2000 ],
2001 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Shotton"}],
2002 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "D M"}],
2003 "http://xmlns.com/foaf/0.1/name": [{"@value": "Shotton David"}],
2004 }
2005 ]
2006 }
2007 ]
2009 # Create directory structure for BR data
2010 br_dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "1780000")
2011 os.makedirs(br_dir_path, exist_ok=True)
2013 # Create directory structure for AR data
2014 ar_dir_path = os.path.join(self.rdf_dir, "ar", supplier_prefix, "7980000")
2015 os.makedirs(ar_dir_path, exist_ok=True)
2017 # Create directory structure for RA data (Peroni)
2018 ra_peroni_dir_path = os.path.join(self.rdf_dir, "ra", "06140", "10850000")
2019 os.makedirs(ra_peroni_dir_path, exist_ok=True)
2021 # Create directory structure for RA data (Shotton)
2022 ra_shotton_dir_path = os.path.join(self.rdf_dir, "ra", "06210", "10780000")
2023 os.makedirs(ra_shotton_dir_path, exist_ok=True)
2025 # Write BR data
2026 with ZipFile(os.path.join(br_dir_path, "1778000.zip"), "w") as zip_file:
2027 zip_file.writestr("1778000.json", json.dumps(br_data))
2029 # Write AR data
2030 with ZipFile(os.path.join(ar_dir_path, "7978000.zip"), "w") as zip_file:
2031 zip_file.writestr("7978000.json", json.dumps(ar_data))
2033 # Write RA data (Peroni)
2034 with ZipFile(os.path.join(ra_peroni_dir_path, "10841000.zip"), "w") as zip_file:
2035 zip_file.writestr("10841000.json", json.dumps(ra_data_peroni))
2037 # Write RA data (Shotton)
2038 with ZipFile(
2039 os.path.join(ra_shotton_dir_path, "10776000.zip"), "w"
2040 ) as zip_file:
2041 zip_file.writestr("10776000.json", json.dumps(ra_data_shotton))
2043 # Run generator
2044 generate_csv(
2045 input_dir=self.rdf_dir,
2046 output_dir=self.output_dir,
2047 dir_split_number=10000,
2048 items_per_file=1000,
2049 zip_output_rdf=True,
2050 )
2052 # Check output
2053 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
2054 self.assertEqual(len(output_data), 1)
2055 # Verify basic metadata
2056 article = output_data[0]
2057 self.assertEqual(
2058 article["title"],
2059 "OpenCitations, An Infrastructure Organization For Open Scholarship",
2060 )
2061 self.assertEqual(article["pub_date"], "2020-02")
2062 self.assertEqual(article["type"], "journal article")
2063 self.assertEqual(article["id"], "omid:br/062501777134")
2065 # The hasNext chain among the author ARs determines the order: Peroni first, then Shotton (see the note after this test)
2066 expected_authors = (
2067 "Peroni, Silvio [omid:ra/0614010840729]; "
2068 "Shotton, D M [omid:ra/0621010775619]"
2069 )
2070 self.assertEqual(article["author"], expected_authors)
2072 # Publisher field should still be empty since we haven't added the publisher RA data
2073 self.assertEqual(article["publisher"], "")
2075 def test_multiple_first_ars(self):
2076 """Test behavior when there are multiple first ARs in the same chain (no hasNext pointing to them).
2077 The current behavior is to process only one of the first ARs and its hasNext chain.
2078 """
2079 supplier_prefix = "060"
2080 br_data = [
2081 {
2082 "@graph": [
2083 {
2084 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
2085 "@type": [
2086 "http://purl.org/spar/fabio/Expression",
2087 "http://purl.org/spar/fabio/JournalArticle",
2088 ],
2089 "http://purl.org/dc/terms/title": [
2090 {"@value": "Article With Multiple First Authors"}
2091 ],
2092 "http://purl.org/spar/pro/isDocumentContextFor": [
2093 {
2094 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
2095 }, # First potential author (will be processed)
2096 {
2097 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
2098 }, # Second potential author (will be ignored)
2099 {
2100 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
2101 }, # Connected to author 1 (will be processed)
2102 ],
2103 }
2104 ]
2105 }
2106 ]
2108 # Setup agent roles with two potential "first" authors (no hasNext pointing to them)
2109 # and one author connected to the first one
2110 ar_data = [
2111 {
2112 "@graph": [
2113 {
2114 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
2115 "@type": ["http://purl.org/spar/pro/RoleInTime"],
2116 "http://purl.org/spar/pro/withRole": [
2117 {"@id": "http://purl.org/spar/pro/author"}
2118 ],
2119 "http://purl.org/spar/pro/isHeldBy": [
2120 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
2121 ],
2122 "https://w3id.org/oc/ontology/hasNext": [
2123 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
2124 ],
2125 },
2126 {
2127 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
2128 "@type": ["http://purl.org/spar/pro/RoleInTime"],
2129 "http://purl.org/spar/pro/withRole": [
2130 {"@id": "http://purl.org/spar/pro/author"}
2131 ],
2132 "http://purl.org/spar/pro/isHeldBy": [
2133 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
2134 ],
2135 # This is also a potential first author but will be ignored
2136 },
2137 {
2138 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
2139 "@type": ["http://purl.org/spar/pro/RoleInTime"],
2140 "http://purl.org/spar/pro/withRole": [
2141 {"@id": "http://purl.org/spar/pro/author"}
2142 ],
2143 "http://purl.org/spar/pro/isHeldBy": [
2144 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
2145 ],
2146 # This one is connected to author 1 via hasNext and will be processed
2147 },
2148 ]
2149 }
2150 ]
2152 # Setup responsible agents
2153 ra_data = [
2154 {
2155 "@graph": [
2156 {
2157 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
2158 "http://xmlns.com/foaf/0.1/name": [
2159 {"@value": "First Potential Author"}
2160 ],
2161 },
2162 {
2163 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
2164 "http://xmlns.com/foaf/0.1/name": [
2165 {"@value": "Second Potential Author"}
2166 ],
2167 },
2168 {
2169 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
2170 "http://xmlns.com/foaf/0.1/name": [
2171 {"@value": "Connected Author"}
2172 ],
2173 },
2174 ]
2175 }
2176 ]
2178 # Write test data files
2179 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
2181 for entity_type, data in data_files.items():
2182 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
2183 os.makedirs(dir_path, exist_ok=True)
2185 zip_path = os.path.join(dir_path, "1000.zip")
2186 with ZipFile(zip_path, "w") as zip_file:
2187 zip_file.writestr("1000.json", json.dumps(data))
2189 # Run generator
2190 generate_csv(
2191 input_dir=self.rdf_dir,
2192 output_dir=self.output_dir,
2193 dir_split_number=10000,
2194 items_per_file=1000,
2195 zip_output_rdf=True,
2196 )
2198 # Check output
2199 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
2200 self.assertEqual(len(output_data), 1)
2202 article = output_data[0]
2203 authors = article["author"].split("; ")
2205 # Verify we have exactly two authors (the first one found and its connected author)
2206 self.assertEqual(
2207 len(authors),
2208 2,
2209 "Should have exactly two authors (first author and connected one)",
2210 )
2212 # Verify the specific authors we expect
2213 expected_authors = [
2214 f"First Potential Author [omid:ra/{supplier_prefix}1]",
2215 f"Connected Author [omid:ra/{supplier_prefix}3]",
2216 ]
2217 self.assertEqual(
2218 authors,
2219 expected_authors,
2220 "Should have first author and connected author in correct order",
2221 )
2223 # Verify the second potential author is NOT in the output
2224 self.assertNotIn(
2225 f"Second Potential Author [omid:ra/{supplier_prefix}2]",
2226 article["author"],
2227 "Second potential author should not be in the output",
2228 )
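# A minimal sketch of the chain walk described in the docstring above, written as an
# assumption about csv_generator_lite's behaviour rather than its actual code: pick an
# AR that no other AR references via hasNext, then follow hasNext links to the end.
# With two candidate heads (ar/0601 and ar/0602) only one chain is walked, which is
# why "Second Potential Author" never reaches the output.
#
#     def walk_agent_roles(ars):
#         """ars maps an AR id to its hasNext target (or None)."""
#         targets = {nxt for nxt in ars.values() if nxt}
#         heads = [ar for ar in ars if ar not in targets]
#         ordered, current = [], (heads[0] if heads else None)
#         while current is not None:
#             ordered.append(current)
#             current = ars.get(current)
#         return ordered
#
#     # walk_agent_roles({"ar1": "ar3", "ar2": None, "ar3": None}) -> ["ar1", "ar3"]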
2231if __name__ == "__main__":
2232 unittest.main()