Coverage for test/csv_generator_lite_test.py: 99%
455 statements
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# Copyright (c) 2024 Arcangelo Massari <arcangelo.massari@unibo.it>
4#
5# Permission to use, copy, modify, and/or distribute this software for any purpose
6# with or without fee is hereby granted, provided that the above copyright notice
7# and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
12# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
13# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
14# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
15# SOFTWARE.
17import csv
18import json
19import os
20import unittest
21from shutil import rmtree
22from zipfile import ZipFile
24import redis
25from oc_meta.lib.file_manager import get_csv_data
26from oc_meta.plugins.csv_generator_lite.csv_generator_lite import (
27 generate_csv,
28 init_redis_connection,
29 is_omid_processed,
30 load_processed_omids_to_redis,
31)
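# The tests below build miniature OpenCitations Meta RDF dumps on disk and run the CSV
# generator against them. A few conventions they rely on (inferred from the calls in this
# module, not from external documentation):
#   - a Redis server must be reachable with the library defaults; logical DB 5 is used and
#     flushed so real data is never touched;
#   - input files live under <rdf_dir>/<entity type>/<supplier prefix>/<dir split>/<N>.zip,
#     each zip holding a JSON-LD document whose "@graph" lists the entities, matching
#     generate_csv(dir_split_number=10000, items_per_file=1000, zip_output_rdf=True);
#   - the generated CSVs are expected to expose the columns asserted throughout:
#     id, title, author, editor, publisher, venue, volume, issue, page, pub_date, type.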
34class TestCSVGeneratorLite(unittest.TestCase):
35 def setUp(self):
36 self.base_dir = os.path.join("test", "csv_generator_lite")
37 self.input_dir = os.path.join(self.base_dir, "input")
38 self.output_dir = os.path.join(self.base_dir, "output")
40 # Create test directories if they don't exist
41 os.makedirs(self.input_dir, exist_ok=True)
42 os.makedirs(self.output_dir, exist_ok=True)
44 # Create test RDF structure
45 self.rdf_dir = os.path.join(self.input_dir, "rdf")
46 self.br_dir = os.path.join(self.rdf_dir, "br")
47 os.makedirs(self.br_dir, exist_ok=True)
49 # Initialize Redis connection for tests
50 self.redis_client = init_redis_connection(db=5) # Use DB 5 for testing
51 self.redis_client.flushdb() # Clear test database
53 def tearDown(self):
54 if os.path.exists(self.base_dir):
55 rmtree(self.base_dir)
56 # Clean up Redis test database
57 self.redis_client.flushdb()
59 def _write_test_data(self, data):
60 """Helper method to write test data to the input directory"""
61 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
62 test_data = [
63 {
64 "@graph": [
65 {
66 "@id": f"https://w3id.org/oc/meta/{item['id'].replace('omid:', '')}",
67 "@type": [
68 "http://purl.org/spar/fabio/Expression",
69 "http://purl.org/spar/fabio/JournalArticle",
70 ],
71 "http://purl.org/dc/terms/title": [{"@value": item["title"]}],
72 }
73 for item in data
74 ]
75 }
76 ]
77 with ZipFile(
78 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
79 ) as zip_file:
80 zip_file.writestr("1000.json", json.dumps(test_data))
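# Note: _write_test_data only populates the "br" branch of the dump; tests that need
# related ar/ra/re/id records write those zip files inline instead of using this helper.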
82 def test_redis_connection_and_caching(self):
83 """Test Redis connection and basic caching operations"""
84 # Test connection initialization
85 redis_client = init_redis_connection(db=5)
86 self.assertIsInstance(redis_client, redis.Redis)
88 # Create a test CSV file with some OMIDs
89 test_data = [
90 {"id": "omid:br/0601", "title": "Test 1"},
91 {"id": "omid:br/0602", "title": "Test 2"},
92 {"id": "omid:br/0603 issn:456", "title": "Test 3"},
93 ]
94 os.makedirs(self.output_dir, exist_ok=True)
95 with open(
96 os.path.join(self.output_dir, "test.csv"), "w", newline="", encoding="utf-8"
97 ) as f:
98 writer = csv.DictWriter(f, fieldnames=["id", "title"])
99 writer.writeheader()
100 writer.writerows(test_data)
102 # Test loading OMIDs into Redis
103 count = load_processed_omids_to_redis(self.output_dir, redis_client)
104 self.assertEqual(count, 3)
106 # Test OMID lookup
107 self.assertTrue(is_omid_processed("omid:br/0601", redis_client))
108 self.assertTrue(is_omid_processed("omid:br/0602", redis_client))
109 self.assertTrue(is_omid_processed("omid:br/0603", redis_client))
110 self.assertFalse(is_omid_processed("omid:br/0604", redis_client))
112 def test_redis_cache_persistence(self):
113 """Test that Redis is populated from existing CSV files and cleared after completion"""
114 # Create initial test data
115 test_data = [
116 {
117 "@graph": [
118 {
119 "@id": "https://w3id.org/oc/meta/br/0601",
120 "@type": [
121 "http://purl.org/spar/fabio/Expression",
122 "http://purl.org/spar/fabio/JournalArticle",
123 ],
124 "http://purl.org/dc/terms/title": [{"@value": "First Run"}],
125 }
126 ]
127 }
128 ]
130 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
131 with ZipFile(
132 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
133 ) as zip_file:
134 zip_file.writestr("1000.json", json.dumps(test_data))
136 # First run - creates initial CSV
137 generate_csv(
138 input_dir=self.rdf_dir,
139 output_dir=self.output_dir,
140 dir_split_number=10000,
141 items_per_file=1000,
142 zip_output_rdf=True,
143 redis_db=5,
144 )
146 # Verify Redis is empty after first run
147 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
149 # Create new test data
150 test_data_2 = [
151 {
152 "@graph": [
153 {
154 "@id": "https://w3id.org/oc/meta/br/0601", # Same OMID as before
155 "@type": [
156 "http://purl.org/spar/fabio/Expression",
157 "http://purl.org/spar/fabio/JournalArticle",
158 ],
159 "http://purl.org/dc/terms/title": [
160 {"@value": "Should Be Skipped"}
161 ],
162 },
163 {
164 "@id": "https://w3id.org/oc/meta/br/0602", # New OMID
165 "@type": [
166 "http://purl.org/spar/fabio/Expression",
167 "http://purl.org/spar/fabio/JournalArticle",
168 ],
169 "http://purl.org/dc/terms/title": [
170 {"@value": "Should Be Processed"}
171 ],
172 },
173 ]
174 }
175 ]
177 with ZipFile(
178 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
179 ) as zip_file:
180 zip_file.writestr("1000.json", json.dumps(test_data_2))
182 # Second run - should load OMIDs from existing CSV and skip already processed resources
183 generate_csv(
184 input_dir=self.rdf_dir,
185 output_dir=self.output_dir,
186 dir_split_number=10000,
187 items_per_file=1000,
188 zip_output_rdf=True,
189 redis_db=5,
190 )
192 # Check output files
193 output_data = []
194 for filename in os.listdir(self.output_dir):
195 if filename.endswith(".csv"):
196 output_data.extend(
197 get_csv_data(os.path.join(self.output_dir, filename))
198 )
200 # Verify results
201 # Should find exactly two entries - one from first run and one new one
202 self.assertEqual(len(output_data), 2)
204 # Find entries by title
205 first_run_entry = next(
206 item for item in output_data if item["title"] == "First Run"
207 )
208 second_run_entry = next(
209 item for item in output_data if item["title"] == "Should Be Processed"
210 )
212 # Verify the first entry wasn't overwritten with "Should Be Skipped"
213 self.assertEqual(first_run_entry["title"], "First Run")
214 self.assertEqual(first_run_entry["id"], "omid:br/0601")
216 # Verify the new entry was processed
217 self.assertEqual(second_run_entry["title"], "Should Be Processed")
218 self.assertEqual(second_run_entry["id"], "omid:br/0602")
220 # Verify Redis is empty after completion
221 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
222 self.assertFalse(is_omid_processed("omid:br/0602", self.redis_client))
224 def test_redis_cache_cleanup(self):
225 """Test that Redis cache is properly cleaned up in various scenarios"""
226 # First run - should process successfully and clear Redis
227 input_data = [{"id": "omid:br/0601", "title": "First Entry"}]
228 self._write_test_data(input_data)
230 # Run with valid directory - should process and clear Redis
231 generate_csv(
232 input_dir=self.rdf_dir,
233 output_dir=self.output_dir,
234 dir_split_number=10000,
235 items_per_file=1000,
236 zip_output_rdf=True,
237 redis_db=5,
238 )
240 # Verify Redis is empty after successful run
241 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
243 # Load processed OMIDs into Redis
244 load_processed_omids_to_redis(self.output_dir, self.redis_client)
246 # Verify that after loading from CSV, the OMID is in Redis
247 self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))
249 # Run with non-existent directory - should fail but keep Redis populated
250 generate_csv(
251 input_dir="/nonexistent/dir",
252 output_dir=self.output_dir,
253 dir_split_number=10000,
254 items_per_file=1000,
255 zip_output_rdf=True,
256 redis_db=5,
257 )
259 # Verify Redis still has the data after failed run
260 self.assertTrue(
261 is_omid_processed("omid:br/0601", self.redis_client),
262 "Redis cache should be retained after a failed run",
263 )
265 def test_redis_error_handling(self):
266 """Test handling of Redis connection errors"""
267 # Test with invalid Redis connection
268 with self.assertRaises(redis.ConnectionError):
269 init_redis_connection(port=12345) # Invalid port
271 # Test loading OMIDs with non-existent directory
272 count = load_processed_omids_to_redis("/nonexistent/dir", self.redis_client)
273 self.assertEqual(count, 0)
275 def test_concurrent_processing_with_redis(self):
276 """Test concurrent processing with Redis caching"""
277 # Create multiple test files
278 test_data = []
279 for i in range(100): # Create 100 test entries
280 test_data.append(
281 {
282 "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
283 "@type": [
284 "http://purl.org/spar/fabio/Expression",
285 "http://purl.org/spar/fabio/JournalArticle",
286 ],
287 "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
288 }
289 )
291 # Split into multiple files
292 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
293 for i in range(0, 100, 10): # Create 10 files with 10 entries each
294 file_data = [{"@graph": test_data[i : i + 10]}]
295 with ZipFile(
296 os.path.join(self.br_dir, "060", "10000", f"{i+1000}.zip"), "w"
297 ) as zip_file:
298 zip_file.writestr(f"{i+1000}.json", json.dumps(file_data))
300 # First run to create some CSV files
301 generate_csv(
302 input_dir=self.rdf_dir,
303 output_dir=self.output_dir,
304 dir_split_number=10000,
305 items_per_file=1000,
306 zip_output_rdf=True,
307 redis_db=5,
308 )
310 # Create more test entries
311 more_test_data = []
312 for i in range(100, 200): # Create 100 more test entries
313 more_test_data.append(
314 {
315 "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
316 "@type": [
317 "http://purl.org/spar/fabio/Expression",
318 "http://purl.org/spar/fabio/JournalArticle",
319 ],
320 "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
321 }
322 )
324 # Add new files
325 for i in range(0, 100, 10):
326 file_data = [{"@graph": more_test_data[i : i + 10]}]
327 with ZipFile(
328 os.path.join(self.br_dir, "060", "10000", f"{i+2000}.zip"), "w"
329 ) as zip_file:
330 zip_file.writestr(f"{i+2000}.json", json.dumps(file_data))
332 # Second run with existing cache
333 generate_csv(
334 input_dir=self.rdf_dir,
335 output_dir=self.output_dir,
336 dir_split_number=10000,
337 items_per_file=1000,
338 zip_output_rdf=True,
339 redis_db=5,
340 )
342 # Verify results
343 all_output_data = []
344 for filename in os.listdir(self.output_dir):
345 if filename.endswith(".csv"):
346 all_output_data.extend(
347 get_csv_data(os.path.join(self.output_dir, filename))
348 )
350 # Should have processed all 200 entries
351 self.assertEqual(len(all_output_data), 200)
353 # Verify no duplicates
354 processed_ids = {row["id"] for row in all_output_data}
355 self.assertEqual(len(processed_ids), 200)
357 def test_basic_br_processing(self):
358 """Test basic bibliographic resource processing"""
359 test_data = [
360 {
361 "@graph": [
362 {
363 "@id": "https://w3id.org/oc/meta/br/0601",
364 "@type": [
365 "http://purl.org/spar/fabio/Expression",
366 "http://purl.org/spar/fabio/JournalArticle",
367 ],
368 "http://purl.org/dc/terms/title": [{"@value": "Test Article"}],
369 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
370 {"@value": "2024-01-01"}
371 ],
372 "http://purl.org/spar/datacite/hasIdentifier": [
373 {"@id": "https://w3id.org/oc/meta/id/0601"}
374 ],
375 }
376 ],
377 "@id": "https://w3id.org/oc/meta/br/",
378 }
379 ]
381 # Write test data to file
382 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
383 with ZipFile(
384 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
385 ) as zip_file:
386 zip_file.writestr("1000.json", json.dumps(test_data))
388 # Run generator
389 generate_csv(
390 input_dir=self.rdf_dir,
391 output_dir=self.output_dir,
392 dir_split_number=10000,
393 items_per_file=1000,
394 zip_output_rdf=True,
395 )
397 # Check output
398 output_files = os.listdir(self.output_dir)
399 self.assertEqual(len(output_files), 1)
401 output_data = get_csv_data(os.path.join(self.output_dir, output_files[0]))
402 self.assertEqual(len(output_data), 1)
403 self.assertEqual(output_data[0]["title"], "Test Article")
404 self.assertEqual(output_data[0]["pub_date"], "2024-01-01")
405 self.assertEqual(output_data[0]["type"], "journal article")
406 self.assertEqual(output_data[0]["id"], "omid:br/0601")
408 def test_complex_br_with_related_entities(self):
409 """Test processing of BR with authors, venue, and other related entities"""
410 # Create directory structure for each entity type
411 supplier_prefix = "060"
412 for entity_type in ["br", "ra", "ar", "id"]:
413 os.makedirs(
414 os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000"),
415 exist_ok=True,
416 )
418 # BR data including both the article and the venue
419 br_data = [
420 {
421 "@graph": [
422 {
423 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
424 "@type": [
425 "http://purl.org/spar/fabio/Expression",
426 "http://purl.org/spar/fabio/JournalArticle",
427 ],
428 "http://purl.org/dc/terms/title": [
429 {"@value": "Complex Article"}
430 ],
431 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
432 {"@value": "2024-02-01"}
433 ],
434 "http://purl.org/spar/pro/isDocumentContextFor": [
435 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
436 ],
437 "http://purl.org/vocab/frbr/core#partOf": [
438 {"@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"}
439 ],
440 },
441 {
442 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
443 "@type": [
444 "http://purl.org/spar/fabio/Expression",
445 "http://purl.org/spar/fabio/Journal",
446 ],
447 "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
448 },
449 ],
450 "@id": "https://w3id.org/oc/meta/br/",
451 }
452 ]
454 ar_data = [
455 {
456 "@graph": [
457 {
458 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
459 "http://purl.org/spar/pro/withRole": [
460 {"@id": "http://purl.org/spar/pro/author"}
461 ],
462 "http://purl.org/spar/pro/isHeldBy": [
463 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
464 ],
465 }
466 ],
467 "@id": "https://w3id.org/oc/meta/ar/",
468 }
469 ]
471 ra_data = [
472 {
473 "@graph": [
474 {
475 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
476 "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
477 }
478 ],
479 "@id": "https://w3id.org/oc/meta/ra/",
480 }
481 ]
483 # Write test data files in correct locations
484 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
486 for entity_type, data in data_files.items():
487 zip_path = os.path.join(
488 self.rdf_dir, entity_type, supplier_prefix, "10000", "1000.zip"
489 )
490 with ZipFile(zip_path, "w") as zip_file:
491 zip_file.writestr("1000.json", json.dumps(data))
493 # Run generator
494 generate_csv(
495 input_dir=self.rdf_dir,
496 output_dir=self.output_dir,
497 dir_split_number=10000,
498 items_per_file=1000,
499 zip_output_rdf=True,
500 )
502 # Check output
503 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
504 self.assertEqual(len(output_data), 2) # Should have 2 rows: article and journal
506 # Find article and journal entries
507 article = next(
508 (item for item in output_data if item["type"] == "journal article"), None
509 )
510 journal = next(
511 (item for item in output_data if item["type"] == "journal"), None
512 )
514 # Verify article data
515 self.assertIsNotNone(article)
516 self.assertEqual(article["title"], "Complex Article")
517 self.assertEqual(article["venue"], f"Test Journal [omid:br/{supplier_prefix}3]")
518 self.assertEqual(article["author"], "Test Author [omid:ra/0601]")
519 self.assertEqual(article["id"], f"omid:br/{supplier_prefix}2")
521 # Verify journal data
522 self.assertIsNotNone(journal)
523 self.assertEqual(journal["title"], "Test Journal")
524 self.assertEqual(journal["type"], "journal")
525 self.assertEqual(journal["id"], f"omid:br/{supplier_prefix}3")
527 def test_empty_input_directory(self):
528 """Test behavior with empty input directory"""
529 generate_csv(
530 input_dir=self.rdf_dir,
531 output_dir=self.output_dir,
532 dir_split_number=10000,
533 items_per_file=1000,
534 zip_output_rdf=True,
535 )
537 self.assertEqual(len(os.listdir(self.output_dir)), 0)
539 def test_br_with_multiple_authors_and_editors(self):
540 """Test processing of BR with multiple authors and editors"""
541 supplier_prefix = "060"
542 br_data = [
543 {
544 "@graph": [
545 {
546 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
547 "@type": [
548 "http://purl.org/spar/fabio/Expression",
549 "http://purl.org/spar/fabio/Book",
550 ],
551 "http://purl.org/dc/terms/title": [
552 {"@value": "Multi-Author Book"}
553 ],
554 "http://purl.org/spar/pro/isDocumentContextFor": [
555 {
556 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
557 }, # First author
558 {
559 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
560 }, # Second author
561 {
562 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
563 }, # First editor
564 {
565 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"
566 }, # Second editor
567 ],
568 }
569 ]
570 }
571 ]
573 # Setup agent roles for authors and editors with hasNext relations
574 ar_data = [
575 {
576 "@graph": [
577 {
578 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
579 "http://purl.org/spar/pro/withRole": [
580 {"@id": "http://purl.org/spar/pro/author"}
581 ],
582 "http://purl.org/spar/pro/isHeldBy": [
583 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
584 ],
585 "https://w3id.org/oc/ontology/hasNext": [
586 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
587 ],
588 },
589 {
590 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
591 "http://purl.org/spar/pro/withRole": [
592 {"@id": "http://purl.org/spar/pro/author"}
593 ],
594 "http://purl.org/spar/pro/isHeldBy": [
595 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
596 ],
597 "https://w3id.org/oc/ontology/hasNext": [
598 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
599 ],
600 },
601 {
602 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
603 "http://purl.org/spar/pro/withRole": [
604 {"@id": "http://purl.org/spar/pro/editor"}
605 ],
606 "http://purl.org/spar/pro/isHeldBy": [
607 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
608 ],
609 "https://w3id.org/oc/ontology/hasNext": [
610 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"}
611 ],
612 },
613 {
614 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4",
615 "http://purl.org/spar/pro/withRole": [
616 {"@id": "http://purl.org/spar/pro/editor"}
617 ],
618 "http://purl.org/spar/pro/isHeldBy": [
619 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4"}
620 ],
621 },
622 ]
623 }
624 ]
626 # Setup responsible agents
627 ra_data = [
628 {
629 "@graph": [
630 {
631 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
632 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
633 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
634 },
635 {
636 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
637 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Doe"}],
638 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Jane"}],
639 },
640 {
641 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
642 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Brown"}],
643 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Bob"}],
644 },
645 {
646 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4",
647 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Wilson"}],
648 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Alice"}],
649 },
650 ]
651 }
652 ]
654 # Write test data files
655 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
657 for entity_type, data in data_files.items():
658 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
659 os.makedirs(dir_path, exist_ok=True)
661 zip_path = os.path.join(dir_path, "1000.zip")
662 with ZipFile(zip_path, "w") as zip_file:
663 zip_file.writestr("1000.json", json.dumps(data))
665 # Run generator
666 generate_csv(
667 input_dir=self.rdf_dir,
668 output_dir=self.output_dir,
669 dir_split_number=10000,
670 items_per_file=1000,
671 zip_output_rdf=True,
672 )
674 # Check output
675 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
676 self.assertEqual(len(output_data), 1)
678 # Verify authors and editors are in the correct order
679 expected_authors = (
680 f"Smith, John [omid:ra/{supplier_prefix}1]; "
681 f"Doe, Jane [omid:ra/{supplier_prefix}2]"
682 )
683 expected_editors = (
684 f"Brown, Bob [omid:ra/{supplier_prefix}3]; "
685 f"Wilson, Alice [omid:ra/{supplier_prefix}4]"
686 )
688 self.assertEqual(output_data[0]["author"], expected_authors)
689 self.assertEqual(output_data[0]["editor"], expected_editors)
691 def test_br_with_identifiers(self):
692 """Test processing of BR with multiple identifiers"""
693 supplier_prefix = "060"
694 br_data = [
695 {
696 "@graph": [
697 {
698 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
699 "@type": [
700 "http://purl.org/spar/fabio/Expression",
701 "http://purl.org/spar/fabio/JournalArticle",
702 ],
703 "http://purl.org/dc/terms/title": [
704 {"@value": "Article With DOI"}
705 ],
706 "http://purl.org/spar/datacite/hasIdentifier": [
707 {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1"},
708 {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2"},
709 ],
710 }
711 ]
712 }
713 ]
715 id_data = [
716 {
717 "@graph": [
718 {
719 "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1",
720 "http://purl.org/spar/datacite/usesIdentifierScheme": [
721 {"@id": "http://purl.org/spar/datacite/doi"}
722 ],
723 "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
724 {"@value": "10.1234/test.123"}
725 ],
726 },
727 {
728 "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2",
729 "http://purl.org/spar/datacite/usesIdentifierScheme": [
730 {"@id": "http://purl.org/spar/datacite/isbn"}
731 ],
732 "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
733 {"@value": "978-0-123456-47-2"}
734 ],
735 },
736 ]
737 }
738 ]
740 # Write test data files in correct locations
741 data_files = {"br": br_data, "id": id_data}
743 for entity_type, data in data_files.items():
744 # Create all necessary directories
745 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
746 os.makedirs(dir_path, exist_ok=True)
748 zip_path = os.path.join(dir_path, "1000.zip")
749 with ZipFile(zip_path, "w") as zip_file:
750 zip_file.writestr("1000.json", json.dumps(data))
752 # Run generator
753 generate_csv(
754 input_dir=self.rdf_dir,
755 output_dir=self.output_dir,
756 dir_split_number=10000,
757 items_per_file=1000,
758 zip_output_rdf=True,
759 )
761 # Check output
762 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
763 self.assertEqual(len(output_data), 1)
765 # Verify all identifiers are included
766 expected_ids = (
767 f"omid:br/{supplier_prefix}1 doi:10.1234/test.123 isbn:978-0-123456-47-2"
768 )
769 self.assertEqual(output_data[0]["id"], expected_ids)
771 def test_br_with_page_numbers(self):
772 """Test processing of BR with page information"""
773 supplier_prefix = "060"
774 br_data = [
775 {
776 "@graph": [
777 {
778 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
779 "@type": [
780 "http://purl.org/spar/fabio/Expression",
781 "http://purl.org/spar/fabio/JournalArticle",
782 ],
783 "http://purl.org/dc/terms/title": [{"@value": "Paged Article"}],
784 "http://purl.org/vocab/frbr/core#embodiment": [
785 {"@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1"}
786 ],
787 }
788 ]
789 }
790 ]
792 re_data = [
793 {
794 "@graph": [
795 {
796 "@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1",
797 "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
798 {"@value": "100"}
799 ],
800 "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
801 {"@value": "120"}
802 ],
803 }
804 ]
805 }
806 ]
808 # Write test data files in correct locations
809 data_files = {"br": br_data, "re": re_data}
811 for entity_type, data in data_files.items():
812 # Create all necessary directories
813 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
814 os.makedirs(dir_path, exist_ok=True)
816 zip_path = os.path.join(dir_path, "1000.zip")
817 with ZipFile(zip_path, "w") as zip_file:
818 zip_file.writestr("1000.json", json.dumps(data))
820 # Run generator
821 generate_csv(
822 input_dir=self.rdf_dir,
823 output_dir=self.output_dir,
824 dir_split_number=10000,
825 items_per_file=1000,
826 zip_output_rdf=True,
827 )
829 # Check output
830 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
831 self.assertEqual(len(output_data), 1)
832 self.assertEqual(output_data[0]["page"], "100-120")
834 def test_malformed_data_handling(self):
835 """Test handling of malformed or incomplete data"""
836 supplier_prefix = "060"
837 br_data = [
838 {
839 "@graph": [
840 {
841 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
842 "@type": [
843 "http://purl.org/spar/fabio/Expression",
844 "http://purl.org/spar/fabio/JournalArticle",
845 ],
846 # Missing title
847 "http://purl.org/spar/pro/isDocumentContextFor": [
848 {"@id": "invalid_uri"}, # Invalid URI
849 ],
850 "http://purl.org/vocab/frbr/core#partOf": [
851 {"@id": "non_existent_venue"} # Non-existent venue
852 ],
853 }
854 ]
855 }
856 ]
858 # Write test data files in correct locations
859 data_files = {"br": br_data}
861 for entity_type, data in data_files.items():
862 # Create all necessary directories
863 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
864 os.makedirs(dir_path, exist_ok=True)
866 zip_path = os.path.join(dir_path, "1000.zip")
867 with ZipFile(zip_path, "w") as zip_file:
868 zip_file.writestr("1000.json", json.dumps(data))
870 # Run generator
871 generate_csv(
872 input_dir=self.rdf_dir,
873 output_dir=self.output_dir,
874 dir_split_number=10000,
875 items_per_file=1000,
876 zip_output_rdf=True,
877 )
879 # Check output
880 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
881 self.assertEqual(len(output_data), 1)
882 # Verify graceful handling of missing/invalid data
883 self.assertEqual(output_data[0]["title"], "")
884 self.assertEqual(output_data[0]["author"], "")
885 self.assertEqual(output_data[0]["venue"], "")
887 def test_br_with_hierarchical_venue_structures(self):
888 """Test different hierarchical venue structures (issue->volume->journal, issue->journal, volume->journal, direct journal)"""
889 supplier_prefix = "060"
891 # Create test data for different hierarchical structures
892 br_data = [
893 {
894 "@graph": [
895 # Article in issue->volume->journal structure
896 {
897 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
898 "@type": [
899 "http://purl.org/spar/fabio/Expression",
900 "http://purl.org/spar/fabio/JournalArticle",
901 ],
902 "http://purl.org/dc/terms/title": [
903 {"@value": "Article in Full Hierarchy"}
904 ],
905 "http://purl.org/vocab/frbr/core#partOf": [
906 {
907 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
908 } # Issue
909 ],
910 },
911 # Article in issue->journal structure (no volume)
912 {
913 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}5",
914 "@type": [
915 "http://purl.org/spar/fabio/Expression",
916 "http://purl.org/spar/fabio/JournalArticle",
917 ],
918 "http://purl.org/dc/terms/title": [
919 {"@value": "Article in Issue-Journal"}
920 ],
921 "http://purl.org/vocab/frbr/core#partOf": [
922 {
923 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6"
924 } # Issue
925 ],
926 },
927 # Article in volume->journal structure (no issue)
928 {
929 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}9",
930 "@type": [
931 "http://purl.org/spar/fabio/Expression",
932 "http://purl.org/spar/fabio/JournalArticle",
933 ],
934 "http://purl.org/dc/terms/title": [
935 {"@value": "Article in Volume-Journal"}
936 ],
937 "http://purl.org/vocab/frbr/core#partOf": [
938 {
939 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10"
940 } # Volume
941 ],
942 },
943 # Article directly in journal
944 {
945 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}13",
946 "@type": [
947 "http://purl.org/spar/fabio/Expression",
948 "http://purl.org/spar/fabio/JournalArticle",
949 ],
950 "http://purl.org/dc/terms/title": [
951 {"@value": "Article in Journal"}
952 ],
953 "http://purl.org/vocab/frbr/core#partOf": [
954 {
955 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
956 } # Journal
957 ],
958 },
959 # Issue in full hierarchy
960 {
961 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
962 "@type": ["http://purl.org/spar/fabio/JournalIssue"],
963 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
964 {"@value": "2"}
965 ],
966 "http://purl.org/vocab/frbr/core#partOf": [
967 {
968 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"
969 } # Volume
970 ],
971 },
972 # Volume in full hierarchy
973 {
974 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
975 "@type": ["http://purl.org/spar/fabio/JournalVolume"],
976 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
977 {"@value": "42"}
978 ],
979 "http://purl.org/vocab/frbr/core#partOf": [
980 {
981 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
982 } # Journal
983 ],
984 },
985 # Journal
986 {
987 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4",
988 "@type": ["http://purl.org/spar/fabio/Journal"],
989 "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
990 },
991 # Issue directly in journal
992 {
993 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6",
994 "@type": ["http://purl.org/spar/fabio/JournalIssue"],
995 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
996 {"@value": "3"}
997 ],
998 "http://purl.org/vocab/frbr/core#partOf": [
999 {
1000 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
1001 } # Journal
1002 ],
1003 },
1004 # Volume directly in journal
1005 {
1006 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10",
1007 "@type": ["http://purl.org/spar/fabio/JournalVolume"],
1008 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
1009 {"@value": "5"}
1010 ],
1011 "http://purl.org/vocab/frbr/core#partOf": [
1012 {
1013 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
1014 } # Journal
1015 ],
1016 },
1017 ]
1018 }
1019 ]
1021 # Write test data files
1022 dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
1023 os.makedirs(dir_path, exist_ok=True)
1025 zip_path = os.path.join(dir_path, "1000.zip")
1026 with ZipFile(zip_path, "w") as zip_file:
1027 zip_file.writestr("1000.json", json.dumps(br_data))
1029 # Run generator
1030 generate_csv(
1031 input_dir=self.rdf_dir,
1032 output_dir=self.output_dir,
1033 dir_split_number=10000,
1034 items_per_file=1000,
1035 zip_output_rdf=True,
1036 )
1038 # Check output
1039 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1041 # Verify we only have the articles and journal in the output
1042 self.assertEqual(len(output_data), 5) # 4 articles + 1 journal
1044 # Verify no JournalVolume or JournalIssue entries exist
1045 volume_or_issue_entries = [
1046 item
1047 for item in output_data
1048 if item["type"] in ["journal volume", "journal issue"]
1049 ]
1050 self.assertEqual(len(volume_or_issue_entries), 0)
1052 # Find each article by title
1053 full_hierarchy = next(
1054 item for item in output_data if item["title"] == "Article in Full Hierarchy"
1055 )
1056 issue_journal = next(
1057 item for item in output_data if item["title"] == "Article in Issue-Journal"
1058 )
1059 volume_journal = next(
1060 item for item in output_data if item["title"] == "Article in Volume-Journal"
1061 )
1062 direct_journal = next(
1063 item for item in output_data if item["title"] == "Article in Journal"
1064 )
1066 # Test full hierarchy (issue->volume->journal)
1067 self.assertEqual(full_hierarchy["issue"], "2")
1068 self.assertEqual(full_hierarchy["volume"], "42")
1069 self.assertEqual(
1070 full_hierarchy["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1071 )
1073 # Test issue->journal (no volume)
1074 self.assertEqual(issue_journal["issue"], "3")
1075 self.assertEqual(issue_journal["volume"], "")
1076 self.assertEqual(
1077 issue_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1078 )
1080 # Test volume->journal (no issue)
1081 self.assertEqual(volume_journal["issue"], "")
1082 self.assertEqual(volume_journal["volume"], "5")
1083 self.assertEqual(
1084 volume_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1085 )
1087 # Test direct journal connection
1088 self.assertEqual(direct_journal["issue"], "")
1089 self.assertEqual(direct_journal["volume"], "")
1090 self.assertEqual(
1091 direct_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1092 )
1094 def test_book_in_series(self):
1095 """Test processing of a book that is part of a book series"""
1096 supplier_prefix = "060"
1098 # Create test data for book in series
1099 br_data = [
1100 {
1101 "@graph": [
1102 # Book
1103 {
1104 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1105 "@type": [
1106 "http://purl.org/spar/fabio/Expression",
1107 "http://purl.org/spar/fabio/Book",
1108 ],
1109 "http://purl.org/dc/terms/title": [{"@value": "Test Book"}],
1110 "http://purl.org/vocab/frbr/core#partOf": [
1111 {
1112 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
1113 } # Series
1114 ],
1115 },
1116 # Book Series
1117 {
1118 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
1119 "@type": ["http://purl.org/spar/fabio/BookSeries"],
1120 "http://purl.org/dc/terms/title": [
1121 {"@value": "Test Book Series"}
1122 ],
1123 },
1124 ]
1125 }
1126 ]
1128 # Write test data
1129 dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
1130 os.makedirs(dir_path, exist_ok=True)
1132 zip_path = os.path.join(dir_path, "1000.zip")
1133 with ZipFile(zip_path, "w") as zip_file:
1134 zip_file.writestr("1000.json", json.dumps(br_data))
1136 # Run generator
1137 generate_csv(
1138 input_dir=self.rdf_dir,
1139 output_dir=self.output_dir,
1140 dir_split_number=10000,
1141 items_per_file=1000,
1142 zip_output_rdf=True,
1143 )
1145 # Check output
1146 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1148 # Find book entry
1149 book = next(item for item in output_data if item["type"] == "book")
1151 # Verify book is correctly linked to series
1152 self.assertEqual(book["title"], "Test Book")
1153 self.assertEqual(
1154 book["venue"], f"Test Book Series [omid:br/{supplier_prefix}2]"
1155 )
1156 self.assertEqual(book["volume"], "") # Should not have volume
1157 self.assertEqual(book["issue"], "") # Should not have issue
1159 def test_br_with_multiple_roles(self):
1160 """Test processing of BR with authors, editors and publishers"""
1161 supplier_prefix = "060"
1162 br_data = [
1163 {
1164 "@graph": [
1165 {
1166 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1167 "@type": [
1168 "http://purl.org/spar/fabio/Expression",
1169 "http://purl.org/spar/fabio/Book",
1170 ],
1171 "http://purl.org/dc/terms/title": [
1172 {"@value": "Multi-Role Book"}
1173 ],
1174 "http://purl.org/spar/pro/isDocumentContextFor": [
1175 {
1176 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
1177 }, # Author
1178 {
1179 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
1180 }, # Editor
1181 {
1182 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
1183 }, # Publisher
1184 ],
1185 }
1186 ]
1187 }
1188 ]
1190 # Setup agent roles for authors, editors and publishers
1191 ar_data = [
1192 {
1193 "@graph": [
1194 {
1195 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
1196 "http://purl.org/spar/pro/withRole": [
1197 {"@id": "http://purl.org/spar/pro/author"}
1198 ],
1199 "http://purl.org/spar/pro/isHeldBy": [
1200 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
1201 ],
1202 "https://w3id.org/oc/ontology/hasNext": [
1203 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
1204 ],
1205 },
1206 {
1207 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
1208 "http://purl.org/spar/pro/withRole": [
1209 {"@id": "http://purl.org/spar/pro/editor"}
1210 ],
1211 "http://purl.org/spar/pro/isHeldBy": [
1212 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
1213 ],
1214 "https://w3id.org/oc/ontology/hasNext": [
1215 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
1216 ],
1217 },
1218 {
1219 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
1220 "http://purl.org/spar/pro/withRole": [
1221 {"@id": "http://purl.org/spar/pro/publisher"}
1222 ],
1223 "http://purl.org/spar/pro/isHeldBy": [
1224 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
1225 ],
1226 },
1227 ]
1228 }
1229 ]
1231 # Setup responsible agents with different name formats
1232 ra_data = [
1233 {
1234 "@graph": [
1235 {
1236 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
1237 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
1238 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
1239 },
1240 {
1241 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
1242 "http://xmlns.com/foaf/0.1/name": [{"@value": "Editor Name"}],
1243 },
1244 {
1245 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
1246 "http://xmlns.com/foaf/0.1/name": [
1247 {"@value": "Publisher House"}
1248 ],
1249 },
1250 ]
1251 }
1252 ]
1254 # Write test data files
1255 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
1257 for entity_type, data in data_files.items():
1258 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
1259 os.makedirs(dir_path, exist_ok=True)
1261 zip_path = os.path.join(dir_path, "1000.zip")
1262 with ZipFile(zip_path, "w") as zip_file:
1263 zip_file.writestr("1000.json", json.dumps(data))
1265 # Run generator
1266 generate_csv(
1267 input_dir=self.rdf_dir,
1268 output_dir=self.output_dir,
1269 dir_split_number=10000,
1270 items_per_file=1000,
1271 zip_output_rdf=True,
1272 )
1274 # Check output
1275 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1276 self.assertEqual(len(output_data), 1)
1278 # Verify all roles are correctly processed
1279 book = output_data[0]
1280 self.assertEqual(book["title"], "Multi-Role Book")
1281 self.assertEqual(book["author"], f"Smith, John [omid:ra/{supplier_prefix}1]")
1282 self.assertEqual(book["editor"], f"Editor Name [omid:ra/{supplier_prefix}2]")
1283 self.assertEqual(
1284 book["publisher"], f"Publisher House [omid:ra/{supplier_prefix}3]"
1285 )
1287 def test_ordered_authors(self):
1288 """Test that authors are ordered according to hasNext relations"""
1289 supplier_prefix = "060"
1290 br_data = [
1291 {
1292 "@graph": [
1293 {
1294 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1295 "@type": [
1296 "http://purl.org/spar/fabio/Expression",
1297 "http://purl.org/spar/fabio/JournalArticle",
1298 ],
1299 "http://purl.org/dc/terms/title": [
1300 {"@value": "Ordered Authors Article"}
1301 ],
1302 "http://purl.org/spar/pro/isDocumentContextFor": [
1303 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
1304 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
1305 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
1306 ],
1307 }
1308 ]
1309 }
1310 ]
1312 # Setup agent roles with hasNext relations
1313 ar_data = [
1314 {
1315 "@graph": [
1316 {
1317 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
1318 "http://purl.org/spar/pro/withRole": [
1319 {"@id": "http://purl.org/spar/pro/author"}
1320 ],
1321 "http://purl.org/spar/pro/isHeldBy": [
1322 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
1323 ],
1324 "https://w3id.org/oc/ontology/hasNext": [
1325 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
1326 ],
1327 },
1328 {
1329 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
1330 "http://purl.org/spar/pro/withRole": [
1331 {"@id": "http://purl.org/spar/pro/author"}
1332 ],
1333 "http://purl.org/spar/pro/isHeldBy": [
1334 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
1335 ],
1336 "https://w3id.org/oc/ontology/hasNext": [
1337 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
1338 ],
1339 },
1340 {
1341 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
1342 "http://purl.org/spar/pro/withRole": [
1343 {"@id": "http://purl.org/spar/pro/author"}
1344 ],
1345 "http://purl.org/spar/pro/isHeldBy": [
1346 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
1347 ],
1348 },
1349 ]
1350 }
1351 ]
1353 # Setup responsible agents with different names
1354 ra_data = [
1355 {
1356 "@graph": [
1357 {
1358 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
1359 "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
1360 },
1361 {
1362 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
1363 "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
1364 },
1365 {
1366 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
1367 "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
1368 },
1369 ]
1370 }
1371 ]
1373 # Write test data files
1374 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
1376 for entity_type, data in data_files.items():
1377 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
1378 os.makedirs(dir_path, exist_ok=True)
1380 zip_path = os.path.join(dir_path, "1000.zip")
1381 with ZipFile(zip_path, "w") as zip_file:
1382 zip_file.writestr("1000.json", json.dumps(data))
1384 # Run generator
1385 generate_csv(
1386 input_dir=self.rdf_dir,
1387 output_dir=self.output_dir,
1388 dir_split_number=10000,
1389 items_per_file=1000,
1390 zip_output_rdf=True,
1391 )
1393 # Check output
1394 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1395 self.assertEqual(len(output_data), 1)
1397 # Verify authors are in the correct order
1398 expected_authors = (
1399 f"First Author [omid:ra/{supplier_prefix}1]; "
1400 f"Second Author [omid:ra/{supplier_prefix}2]; "
1401 f"Third Author [omid:ra/{supplier_prefix}3]"
1402 )
1403 self.assertEqual(output_data[0]["author"], expected_authors)
1405 def test_cyclic_hasNext_relations(self):
1406 """Test handling of cyclic hasNext relations between agent roles"""
1407 supplier_prefix = "060"
1408 br_data = [
1409 {
1410 "@graph": [
1411 {
1412 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1413 "@type": [
1414 "http://purl.org/spar/fabio/Expression",
1415 "http://purl.org/spar/fabio/JournalArticle",
1416 ],
1417 "http://purl.org/dc/terms/title": [
1418 {"@value": "Cyclic Authors Article"}
1419 ],
1420 "http://purl.org/spar/pro/isDocumentContextFor": [
1421 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
1422 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
1423 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
1424 ],
1425 }
1426 ]
1427 }
1428 ]
1430 # Setup agent roles with cyclic hasNext relations
1431 ar_data = [
1432 {
1433 "@graph": [
1434 {
1435 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
1436 "http://purl.org/spar/pro/withRole": [
1437 {"@id": "http://purl.org/spar/pro/author"}
1438 ],
1439 "http://purl.org/spar/pro/isHeldBy": [
1440 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
1441 ],
1442 "https://w3id.org/oc/ontology/hasNext": [
1443 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
1444 ],
1445 },
1446 {
1447 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
1448 "http://purl.org/spar/pro/withRole": [
1449 {"@id": "http://purl.org/spar/pro/author"}
1450 ],
1451 "http://purl.org/spar/pro/isHeldBy": [
1452 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
1453 ],
1454 # Creates a cycle: 1 -> 2 -> 3 -> 1
1455 "https://w3id.org/oc/ontology/hasNext": [
1456 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
1457 ],
1458 },
1459 {
1460 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
1461 "http://purl.org/spar/pro/withRole": [
1462 {"@id": "http://purl.org/spar/pro/author"}
1463 ],
1464 "http://purl.org/spar/pro/isHeldBy": [
1465 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
1466 ],
1467 # Cycle completion
1468 "https://w3id.org/oc/ontology/hasNext": [
1469 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
1470 ],
1471 },
1472 ]
1473 }
1474 ]
1476 # Setup responsible agents
1477 ra_data = [
1478 {
1479 "@graph": [
1480 {
1481 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
1482 "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
1483 },
1484 {
1485 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
1486 "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
1487 },
1488 {
1489 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
1490 "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
1491 },
1492 ]
1493 }
1494 ]
1496 # Write test data files
1497 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
1499 for entity_type, data in data_files.items():
1500 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
1501 os.makedirs(dir_path, exist_ok=True)
1503 zip_path = os.path.join(dir_path, "1000.zip")
1504 with ZipFile(zip_path, "w") as zip_file:
1505 zip_file.writestr("1000.json", json.dumps(data))
1507 # Run generator
1508 generate_csv(
1509 input_dir=self.rdf_dir,
1510 output_dir=self.output_dir,
1511 dir_split_number=10000,
1512 items_per_file=1000,
1513 zip_output_rdf=True,
1514 )
1516 # Check output
1517 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1518 self.assertEqual(len(output_data), 1)
1520 # Verify that we get at least some authors before the cycle is detected
1521 # The order should be maintained until the cycle is detected
1522 authors = output_data[0]["author"].split("; ")
1523 self.assertGreater(len(authors), 0)
1525 # Verify the presence and order of authors
1526 self.assertTrue(
1527 any(
1528 f"First Author [omid:ra/{supplier_prefix}1]" in author
1529 for author in authors
1530 )
1531 )
1532 self.assertTrue(
1533 any(
1534 f"Second Author [omid:ra/{supplier_prefix}2]" in author
1535 for author in authors
1536 )
1537 )
1539 # Verify no duplicates in the output
1540 author_set = set(authors)
1541 self.assertEqual(
1542 len(authors),
1543 len(author_set),
1544 "Found duplicate authors in output: each author should appear exactly once",
1545 )
1547 # Verify the exact order and number of authors
1548 expected_authors = [
1549 f"First Author [omid:ra/{supplier_prefix}1]",
1550 f"Second Author [omid:ra/{supplier_prefix}2]",
1551 f"Third Author [omid:ra/{supplier_prefix}3]",
1552 ]
1553 self.assertEqual(
1554 authors,
1555 expected_authors,
1556 "Authors should be in correct order and each should appear exactly once",
1557 )
1559 def test_multiple_input_files(self):
1560 """Test processing of multiple input files with sequential entity IDs"""
1561 supplier_prefix = "060"
1563 # Create test data spanning multiple files
1564 # First file (entities 1-1000)
1565 br_data_1 = [
1566 {
1567 "@graph": [
1568 {
1569 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1570 "@type": [
1571 "http://purl.org/spar/fabio/Expression",
1572 "http://purl.org/spar/fabio/JournalArticle",
1573 ],
1574 "http://purl.org/dc/terms/title": [{"@value": "Article 1"}],
1575 },
1576 {
1577 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1000",
1578 "@type": [
1579 "http://purl.org/spar/fabio/Expression",
1580 "http://purl.org/spar/fabio/JournalArticle",
1581 ],
1582 "http://purl.org/dc/terms/title": [{"@value": "Article 1000"}],
1583 },
1584 ]
1585 }
1586 ]
1588 # Second file (entities 1001-2000)
1589 br_data_2 = [
1590 {
1591 "@graph": [
1592 {
1593 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1001",
1594 "@type": [
1595 "http://purl.org/spar/fabio/Expression",
1596 "http://purl.org/spar/fabio/JournalArticle",
1597 ],
1598 "http://purl.org/dc/terms/title": [{"@value": "Article 1001"}],
1599 },
1600 {
1601 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2000",
1602 "@type": [
1603 "http://purl.org/spar/fabio/Expression",
1604 "http://purl.org/spar/fabio/JournalArticle",
1605 ],
1606 "http://purl.org/dc/terms/title": [{"@value": "Article 2000"}],
1607 },
1608 ]
1609 }
1610 ]
1612 # Third file (entities 2001-3000)
1613 br_data_3 = [
1614 {
1615 "@graph": [
1616 {
1617 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2001",
1618 "@type": [
1619 "http://purl.org/spar/fabio/Expression",
1620 "http://purl.org/spar/fabio/JournalArticle",
1621 ],
1622 "http://purl.org/dc/terms/title": [{"@value": "Article 2001"}],
1623 "http://purl.org/spar/pro/isDocumentContextFor": [
1624 {
1625 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001"
1626 }
1627 ],
1628 }
1629 ]
1630 }
1631 ]
1633 # Create agent role data in a different file
1634 ar_data = [
1635 {
1636 "@graph": [
1637 {
1638 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001",
1639 "http://purl.org/spar/pro/withRole": [
1640 {"@id": "http://purl.org/spar/pro/author"}
1641 ],
1642 "http://purl.org/spar/pro/isHeldBy": [
1643 {
1644 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001"
1645 }
1646 ],
1647 }
1648 ]
1649 }
1650 ]
1652 # Create responsible agent data in a different file
1653 ra_data = [
1654 {
1655 "@graph": [
1656 {
1657 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001",
1658 "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
1659 }
1660 ]
1661 }
1662 ]
1664 # Write test data to appropriate locations based on ID ranges
1665 os.makedirs(os.path.join(self.br_dir, supplier_prefix, "10000"), exist_ok=True)
1666 os.makedirs(
1667 os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000"), exist_ok=True
1668 )
1669 os.makedirs(
1670 os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000"), exist_ok=True
1671 )
1673 # Write BR files
1674 with ZipFile(
1675 os.path.join(self.br_dir, supplier_prefix, "10000", "1000.zip"), "w"
1676 ) as zip_file:
1677 zip_file.writestr("1000.json", json.dumps(br_data_1))
1678 with ZipFile(
1679 os.path.join(self.br_dir, supplier_prefix, "10000", "2000.zip"), "w"
1680 ) as zip_file:
1681 zip_file.writestr("2000.json", json.dumps(br_data_2))
1682 with ZipFile(
1683 os.path.join(self.br_dir, supplier_prefix, "10000", "3000.zip"), "w"
1684 ) as zip_file:
1685 zip_file.writestr("3000.json", json.dumps(br_data_3))
1687 # Write AR and RA files
1688 with ZipFile(
1689 os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000", "3000.zip"), "w"
1690 ) as zip_file:
1691 zip_file.writestr("3000.json", json.dumps(ar_data))
1692 with ZipFile(
1693 os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000", "3000.zip"), "w"
1694 ) as zip_file:
1695 zip_file.writestr("3000.json", json.dumps(ra_data))
1697 # Run generator
1698 generate_csv(
1699 input_dir=self.rdf_dir,
1700 output_dir=self.output_dir,
1701 dir_split_number=10000,
1702 items_per_file=1000,
1703 zip_output_rdf=True,
1704 )
1706 # Check output
1707 output_files = sorted(os.listdir(self.output_dir))
1708 self.assertGreater(len(output_files), 0)
1710 # Collect all output data
1711 all_output_data = []
1712 for output_file in output_files:
1713 all_output_data.extend(
1714 get_csv_data(os.path.join(self.output_dir, output_file))
1715 )
1717 # Verify we have all expected entries
1718 self.assertEqual(len(all_output_data), 5) # Should have 5 articles total
1720 # Verify specific entries
1721 article_1 = next(
1722 item
1723 for item in all_output_data
1724 if item["id"] == f"omid:br/{supplier_prefix}1"
1725 )
1726 article_1000 = next(
1727 item
1728 for item in all_output_data
1729 if item["id"] == f"omid:br/{supplier_prefix}1000"
1730 )
1731 article_1001 = next(
1732 item
1733 for item in all_output_data
1734 if item["id"] == f"omid:br/{supplier_prefix}1001"
1735 )
1736 article_2000 = next(
1737 item
1738 for item in all_output_data
1739 if item["id"] == f"omid:br/{supplier_prefix}2000"
1740 )
1741 article_2001 = next(
1742 item
1743 for item in all_output_data
1744 if item["id"] == f"omid:br/{supplier_prefix}2001"
1745 )
1747 # Check titles
1748 self.assertEqual(article_1["title"], "Article 1")
1749 self.assertEqual(article_1000["title"], "Article 1000")
1750 self.assertEqual(article_1001["title"], "Article 1001")
1751 self.assertEqual(article_2000["title"], "Article 2000")
1752 self.assertEqual(article_2001["title"], "Article 2001")
1754 # Check author for article 2001 (which has related entities)
1755 self.assertEqual(
1756 article_2001["author"], f"Test Author [omid:ra/{supplier_prefix}2001]"
1757 )
1759 def test_max_rows_per_file_and_data_integrity(self):
1760 """Test that output files respect max rows limit and no data is lost in multiprocessing"""
1761 supplier_prefix = "060"
1763 # Create test data with more than 3000 entries
1764 br_data = [
1765 {
1766 "@graph": [
1767 # Generate 3500 test entries
1768 *[
1769 {
1770 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}{i}",
1771 "@type": [
1772 "http://purl.org/spar/fabio/Expression",
1773 "http://purl.org/spar/fabio/JournalArticle",
1774 ],
1775 "http://purl.org/dc/terms/title": [
1776 {"@value": f"Article {i}"}
1777 ],
1778 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
1779 {"@value": "2024-01-01"}
1780 ],
1781 }
1782 for i in range(1, 3501)
1783 ] # This will create 3500 entries
1784 ]
1785 }
1786 ]
1788 # Split data into multiple files to test multiprocessing
1789 entries_per_file = 1000
1790 for i in range(0, 3500, entries_per_file):
1791 file_data = [{"@graph": br_data[0]["@graph"][i : i + entries_per_file]}]
1793 # Create directory structure for the file
1794 file_number = i + entries_per_file
1795 dir_path = os.path.join(self.br_dir, supplier_prefix, "10000")
1796 os.makedirs(dir_path, exist_ok=True)
1798 # Write the file
1799 with ZipFile(os.path.join(dir_path, f"{file_number}.zip"), "w") as zip_file:
1800 zip_file.writestr(f"{file_number}.json", json.dumps(file_data))
1802 # Run generator
1803 generate_csv(
1804 input_dir=self.rdf_dir,
1805 output_dir=self.output_dir,
1806 dir_split_number=10000,
1807 items_per_file=1000,
1808 zip_output_rdf=True,
1809 )
1811 # Check output files
1812 output_files = sorted(os.listdir(self.output_dir))
1814 # Verify number of output files
1815 # We expect at least 2 files: 3500 entries should create 2 files (3000 + 500)
1816 self.assertGreaterEqual(
1817 len(output_files), 2, "Should have at least 2 output files for 3500 entries"
1818 )
1820 # Collect all entries from all output files
1821 all_entries = []
1822 for output_file in output_files:
1823 entries = get_csv_data(os.path.join(self.output_dir, output_file))
1825 # Verify each file has at most 3000 rows
1826 self.assertLessEqual(
1827 len(entries),
1828 3000,
1829 f"File {output_file} has more than 3000 rows: {len(entries)}",
1830 )
1832 all_entries.extend(entries)
1834 # Verify total number of entries
1835 self.assertEqual(
1836 len(all_entries),
1837 3500,
1838 f"Expected 3500 total entries, got {len(all_entries)}",
1839 )
1841 # Verify no duplicate entries
1842 unique_ids = {entry["id"] for entry in all_entries}
1843 self.assertEqual(
1844 len(unique_ids),
1845 3500,
1846 f"Expected 3500 unique entries, got {len(unique_ids)}",
1847 )
1849 # Verify all entries are present (no missing entries)
1850 expected_ids = {f"omid:br/{supplier_prefix}{i}" for i in range(1, 3501)}
1851 self.assertEqual(
1852 unique_ids,
1853 expected_ids,
1854 "Some entries are missing or unexpected entries are present",
1855 )
1857 # Verify data integrity
1858 for i in range(1, 3501):
1859 entry = next(
1860 e for e in all_entries if e["id"] == f"omid:br/{supplier_prefix}{i}"
1861 )
1862 self.assertEqual(entry["title"], f"Article {i}")
1863 self.assertEqual(entry["pub_date"], "2024-01-01")
1864 self.assertEqual(entry["type"], "journal article")
1866 def test_csv_field_limit_handling(self):
1867 """Test handling of CSV files with large fields that exceed the default limit"""
1868 # Create a test CSV with a very large field
1869 large_field = "omid:br/0601 " + " ".join(
1870 [f"id:{i}" for i in range(20000)]
1871 ) # Roughly 169,000 chars, above the csv module's default 131,072-char field limit
1872 test_data = {"id": large_field, "title": "Test Large Field"}
1874 os.makedirs(self.output_dir, exist_ok=True)
1875 with open(
1876 os.path.join(self.output_dir, "large_field.csv"),
1877 "w",
1878 newline="",
1879 encoding="utf-8",
1880 ) as f:
1881 writer = csv.DictWriter(f, fieldnames=["id", "title"])
1882 writer.writeheader()
1883 writer.writerow(test_data)
1885 # Try loading the data - this should trigger the field limit increase
1886 count = load_processed_omids_to_redis(self.output_dir, self.redis_client)
1888 # Verify the OMID was loaded despite the large field
1889 self.assertEqual(count, 1)
1890 self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))
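# A minimal sketch of the recovery pattern this test exercises, assuming
# load_processed_omids_to_redis relies on the standard csv module (the actual
# implementation is not shown here): csv raises csv.Error ("field larger than
# field limit (131072)") for oversized fields, and the usual fix is to raise
# the limit and re-read the file.
#
#     import csv
#
#     def read_rows(path):
#         with open(path, newline="", encoding="utf-8") as handle:
#             try:
#                 return list(csv.DictReader(handle))
#             except csv.Error:
#                 csv.field_size_limit(2**31 - 1)  # large but portable limit
#                 handle.seek(0)
#                 return list(csv.DictReader(handle))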
1892 def test_complex_br_with_missing_authors(self):
1893 """Test processing of a complex BR with multiple related entities where authors might be missing"""
1894 supplier_prefix = "06250"
1895 br_data = [
1896 {
1897 "@graph": [
1898 {
1899 "@id": "https://w3id.org/oc/meta/br/062501777134",
1900 "@type": [
1901 "http://purl.org/spar/fabio/JournalArticle",
1902 "http://purl.org/spar/fabio/Expression",
1903 ],
1904 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
1905 {
1906 "@type": "http://www.w3.org/2001/XMLSchema#gYearMonth",
1907 "@value": "2020-02",
1908 }
1909 ],
1910 "http://purl.org/dc/terms/title": [
1911 {
1912 "@value": "OpenCitations, An Infrastructure Organization For Open Scholarship"
1913 }
1914 ],
1915 "http://purl.org/spar/datacite/hasIdentifier": [
1916 {"@id": "https://w3id.org/oc/meta/id/062501806985"},
1917 {"@id": "https://w3id.org/oc/meta/id/06850624745"},
1918 ],
1919 "http://purl.org/spar/pro/isDocumentContextFor": [
1920 {"@id": "https://w3id.org/oc/meta/ar/062507977761"},
1921 {"@id": "https://w3id.org/oc/meta/ar/062507977760"},
1922 {"@id": "https://w3id.org/oc/meta/ar/062507977759"},
1923 ],
1924 "http://purl.org/vocab/frbr/core#embodiment": [
1925 {"@id": "https://w3id.org/oc/meta/re/062501477439"}
1926 ],
1927 "http://purl.org/vocab/frbr/core#partOf": [
1928 {"@id": "https://w3id.org/oc/meta/br/062501778111"}
1929 ],
1930 }
1931 ]
1932 }
1933 ]
1935 ar_data = [
1936 {
1937 "@graph": [
1938 {
1939 "@id": "https://w3id.org/oc/meta/ar/062507977761",
1940 "@type": ["http://purl.org/spar/pro/RoleInTime"],
1941 "http://purl.org/spar/pro/isHeldBy": [
1942 {"@id": "https://w3id.org/oc/meta/ra/0610116105"}
1943 ],
1944 "http://purl.org/spar/pro/withRole": [
1945 {"@id": "http://purl.org/spar/pro/publisher"}
1946 ],
1947 },
1948 {
1949 "@id": "https://w3id.org/oc/meta/ar/062507977760",
1950 "@type": ["http://purl.org/spar/pro/RoleInTime"],
1951 "http://purl.org/spar/pro/isHeldBy": [
1952 {"@id": "https://w3id.org/oc/meta/ra/0621010775619"}
1953 ],
1954 "http://purl.org/spar/pro/withRole": [
1955 {"@id": "http://purl.org/spar/pro/author"}
1956 ],
1957 },
1958 {
1959 "@id": "https://w3id.org/oc/meta/ar/062507977759",
1960 "@type": ["http://purl.org/spar/pro/RoleInTime"],
1961 "http://purl.org/spar/pro/isHeldBy": [
1962 {"@id": "https://w3id.org/oc/meta/ra/0614010840729"}
1963 ],
1964 "http://purl.org/spar/pro/withRole": [
1965 {"@id": "http://purl.org/spar/pro/author"}
1966 ],
1967 "https://w3id.org/oc/ontology/hasNext": [
1968 {"@id": "https://w3id.org/oc/meta/ar/062507977760"}
1969 ],
1970 },
1971 ]
1972 }
1973 ]
1975 ra_data_peroni = [
1976 {
1977 "@graph": [
1978 {
1979 "@id": "https://w3id.org/oc/meta/ra/0614010840729",
1980 "@type": ["http://xmlns.com/foaf/0.1/Agent"],
1981 "http://purl.org/spar/datacite/hasIdentifier": [
1982 {"@id": "https://w3id.org/oc/meta/id/06304949238"}
1983 ],
1984 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Peroni"}],
1985 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Silvio"}],
1986 "http://xmlns.com/foaf/0.1/name": [{"@value": "Peroni Silvio"}],
1987 }
1988 ]
1989 }
1990 ]
1992 ra_data_shotton = [
1993 {
1994 "@graph": [
1995 {
1996 "@id": "https://w3id.org/oc/meta/ra/0621010775619",
1997 "@type": ["http://xmlns.com/foaf/0.1/Agent"],
1998 "http://purl.org/spar/datacite/hasIdentifier": [
1999 {"@id": "https://w3id.org/oc/meta/id/062404672414"}
2000 ],
2001 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Shotton"}],
2002 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "D M"}],
2003 "http://xmlns.com/foaf/0.1/name": [{"@value": "Shotton David"}],
2004 }
2005 ]
2006 }
2007 ]
2009 # Create directory structure for BR data
2010 br_dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "1780000")
2011 os.makedirs(br_dir_path, exist_ok=True)
2013 # Create directory structure for AR data
2014 ar_dir_path = os.path.join(self.rdf_dir, "ar", supplier_prefix, "7980000")
2015 os.makedirs(ar_dir_path, exist_ok=True)
2017 # Create directory structure for RA data (Peroni)
2018 ra_peroni_dir_path = os.path.join(self.rdf_dir, "ra", "06140", "10850000")
2019 os.makedirs(ra_peroni_dir_path, exist_ok=True)
2021 # Create directory structure for RA data (Shotton)
2022 ra_shotton_dir_path = os.path.join(self.rdf_dir, "ra", "06210", "10780000")
2023 os.makedirs(ra_shotton_dir_path, exist_ok=True)
2025 # Write BR data
2026 with ZipFile(os.path.join(br_dir_path, "1778000.zip"), "w") as zip_file:
2027 zip_file.writestr("1778000.json", json.dumps(br_data))
2029 # Write AR data
2030 with ZipFile(os.path.join(ar_dir_path, "7978000.zip"), "w") as zip_file:
2031 zip_file.writestr("7978000.json", json.dumps(ar_data))
2033 # Write RA data (Peroni)
2034 with ZipFile(os.path.join(ra_peroni_dir_path, "10841000.zip"), "w") as zip_file:
2035 zip_file.writestr("10841000.json", json.dumps(ra_data_peroni))
2037 # Write RA data (Shotton)
2038 with ZipFile(
2039 os.path.join(ra_shotton_dir_path, "10776000.zip"), "w"
2040 ) as zip_file:
2041 zip_file.writestr("10776000.json", json.dumps(ra_data_shotton))
2043 # Run generator
2044 generate_csv(
2045 input_dir=self.rdf_dir,
2046 output_dir=self.output_dir,
2047 dir_split_number=10000,
2048 items_per_file=1000,
2049 zip_output_rdf=True,
2050 )
2052 # Check output
2053 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
2054 self.assertEqual(len(output_data), 1)
2055 # Verify basic metadata
2056 article = output_data[0]
2057 self.assertEqual(
2058 article["title"],
2059 "OpenCitations, An Infrastructure Organization For Open Scholarship",
2060 )
2061 self.assertEqual(article["pub_date"], "2020-02")
2062 self.assertEqual(article["type"], "journal article")
2063 self.assertEqual(article["id"], "omid:br/062501777134")
2065 # The hasNext chain among the author ARs determines the order: Peroni first, then Shotton (see the note after this test)
2066 expected_authors = (
2067 "Peroni, Silvio [omid:ra/0614010840729]; "
2068 "Shotton, D M [omid:ra/0621010775619]"
2069 )
2070 self.assertEqual(article["author"], expected_authors)
2072 # Publisher field should still be empty since we haven't added the publisher RA data
2073 self.assertEqual(article["publisher"], "")
2075 def test_multiple_first_ars(self):
2076 """Test behavior when there are multiple first ARs in the same chain (no hasNext pointing to them).
2077 The current behavior is to process only one of the first ARs and its hasNext chain.
2078 """
2079 supplier_prefix = "060"
2080 br_data = [
2081 {
2082 "@graph": [
2083 {
2084 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
2085 "@type": [
2086 "http://purl.org/spar/fabio/Expression",
2087 "http://purl.org/spar/fabio/JournalArticle",
2088 ],
2089 "http://purl.org/dc/terms/title": [
2090 {"@value": "Article With Multiple First Authors"}
2091 ],
2092 "http://purl.org/spar/pro/isDocumentContextFor": [
2093 {
2094 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
2095 }, # First potential author (will be processed)
2096 {
2097 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
2098 }, # Second potential author (will be ignored)
2099 {
2100 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
2101 }, # Connected to author 1 (will be processed)
2102 ],
2103 }
2104 ]
2105 }
2106 ]
2108 # Setup agent roles with two potential "first" authors (no hasNext pointing to them)
2109 # and one author connected to the first one
2110 ar_data = [
2111 {
2112 "@graph": [
2113 {
2114 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
2115 "@type": ["http://purl.org/spar/pro/RoleInTime"],
2116 "http://purl.org/spar/pro/withRole": [
2117 {"@id": "http://purl.org/spar/pro/author"}
2118 ],
2119 "http://purl.org/spar/pro/isHeldBy": [
2120 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
2121 ],
2122 "https://w3id.org/oc/ontology/hasNext": [
2123 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
2124 ],
2125 },
2126 {
2127 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
2128 "@type": ["http://purl.org/spar/pro/RoleInTime"],
2129 "http://purl.org/spar/pro/withRole": [
2130 {"@id": "http://purl.org/spar/pro/author"}
2131 ],
2132 "http://purl.org/spar/pro/isHeldBy": [
2133 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
2134 ],
2135 # This is also a potential first author but will be ignored
2136 },
2137 {
2138 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
2139 "@type": ["http://purl.org/spar/pro/RoleInTime"],
2140 "http://purl.org/spar/pro/withRole": [
2141 {"@id": "http://purl.org/spar/pro/author"}
2142 ],
2143 "http://purl.org/spar/pro/isHeldBy": [
2144 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
2145 ],
2146 # This one is connected to author 1 via hasNext and will be processed
2147 },
2148 ]
2149 }
2150 ]
2152 # Setup responsible agents
2153 ra_data = [
2154 {
2155 "@graph": [
2156 {
2157 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
2158 "http://xmlns.com/foaf/0.1/name": [
2159 {"@value": "First Potential Author"}
2160 ],
2161 },
2162 {
2163 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
2164 "http://xmlns.com/foaf/0.1/name": [
2165 {"@value": "Second Potential Author"}
2166 ],
2167 },
2168 {
2169 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
2170 "http://xmlns.com/foaf/0.1/name": [
2171 {"@value": "Connected Author"}
2172 ],
2173 },
2174 ]
2175 }
2176 ]
2178 # Write test data files
2179 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
2181 for entity_type, data in data_files.items():
2182 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
2183 os.makedirs(dir_path, exist_ok=True)
2185 zip_path = os.path.join(dir_path, "1000.zip")
2186 with ZipFile(zip_path, "w") as zip_file:
2187 zip_file.writestr("1000.json", json.dumps(data))
2189 # Run generator
2190 generate_csv(
2191 input_dir=self.rdf_dir,
2192 output_dir=self.output_dir,
2193 dir_split_number=10000,
2194 items_per_file=1000,
2195 zip_output_rdf=True,
2196 )
2198 # Check output
2199 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
2200 self.assertEqual(len(output_data), 1)
2202 article = output_data[0]
2203 authors = article["author"].split("; ")
2205 # Verify we have exactly two authors (the first one found and its connected author)
2206 self.assertEqual(
2207 len(authors),
2208 2,
2209 "Should have exactly two authors (first author and connected one)",
2210 )
2212 # Verify the specific authors we expect
2213 expected_authors = [
2214 f"First Potential Author [omid:ra/{supplier_prefix}1]",
2215 f"Connected Author [omid:ra/{supplier_prefix}3]",
2216 ]
2217 self.assertEqual(
2218 authors,
2219 expected_authors,
2220 "Should have first author and connected author in correct order",
2221 )
2223 # Verify the second potential author is NOT in the output
2224 self.assertNotIn(
2225 f"Second Potential Author [omid:ra/{supplier_prefix}2]",
2226 article["author"],
2227 "Second potential author should not be in the output",
2228 )
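# A minimal sketch of the chain walk described in the docstring above, written as an
# assumption about csv_generator_lite's behaviour rather than its actual code: pick an
# AR that no other AR references via hasNext, then follow hasNext links to the end.
# With two candidate heads (ar/0601 and ar/0602) only one chain is walked, which is
# why "Second Potential Author" never reaches the output.
#
#     def walk_agent_roles(ars):
#         """ars maps an AR id to its hasNext target (or None)."""
#         targets = {nxt for nxt in ars.values() if nxt}
#         heads = [ar for ar in ars if ar not in targets]
#         ordered, current = [], (heads[0] if heads else None)
#         while current is not None:
#             ordered.append(current)
#             current = ars.get(current)
#         return ordered
#
#     # walk_agent_roles({"ar1": "ar3", "ar2": None, "ar3": None}) -> ["ar1", "ar3"]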
2231if __name__ == "__main__":
2232 unittest.main()