Coverage for test/csv_generator_lite_test.py: 99% (455 statements)
coverage.py v6.5.0, created at 2025-12-20 08:55 +0000
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# Copyright (c) 2024 Arcangelo Massari <arcangelo.massari@unibo.it>
4#
5# Permission to use, copy, modify, and/or distribute this software for any purpose
6# with or without fee is hereby granted, provided that the above copyright notice
7# and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
12# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
13# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
14# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
15# SOFTWARE.
17import csv
18import json
19import os
20import unittest
21from shutil import rmtree
22from zipfile import ZipFile
24import redis
25from oc_meta.lib.file_manager import get_csv_data
26from oc_meta.plugins.csv_generator_lite.csv_generator_lite import (
27 generate_csv,
28 init_redis_connection,
29 is_omid_processed,
30 load_processed_omids_to_redis,
31)
34class TestCSVGeneratorLite(unittest.TestCase):
35 def setUp(self):
36 self.base_dir = os.path.join("test", "csv_generator_lite")
37 self.input_dir = os.path.join(self.base_dir, "input")
38 self.output_dir = os.path.join(self.base_dir, "output")
40 # Create test directories if they don't exist
41 os.makedirs(self.input_dir, exist_ok=True)
42 os.makedirs(self.output_dir, exist_ok=True)
44 # Create test RDF structure
45 self.rdf_dir = os.path.join(self.input_dir, "rdf")
46 self.br_dir = os.path.join(self.rdf_dir, "br")
47 os.makedirs(self.br_dir, exist_ok=True)
49 # Initialize Redis connection for tests
50 self.redis_client = init_redis_connection(port=6381, db=5)
51 self.redis_client.flushdb() # Clear test database
53 def tearDown(self):
54 if os.path.exists(self.base_dir):
55 rmtree(self.base_dir)
56 # Clean up Redis test database
57 self.redis_client.flushdb()
59 def _write_test_data(self, data):
60 """Helper method to write test data to the input directory"""
61 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
62 test_data = [
63 {
64 "@graph": [
65 {
66 "@id": f"https://w3id.org/oc/meta/{item['id'].replace('omid:', '')}",
67 "@type": [
68 "http://purl.org/spar/fabio/Expression",
69 "http://purl.org/spar/fabio/JournalArticle",
70 ],
71 "http://purl.org/dc/terms/title": [{"@value": item["title"]}],
72 }
73 for item in data
74 ]
75 }
76 ]
77 with ZipFile(
78 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
79 ) as zip_file:
80 zip_file.writestr("1000.json", json.dumps(test_data))
82 def test_redis_connection_and_caching(self):
83 """Test Redis connection and basic caching operations"""
84 # Test connection initialization
85 redis_client = init_redis_connection(port=6381, db=5)
86 self.assertIsInstance(redis_client, redis.Redis)
88 # Create a test CSV file with some OMIDs
89 test_data = [
90 {"id": "omid:br/0601", "title": "Test 1"},
91 {"id": "omid:br/0602", "title": "Test 2"},
92 {"id": "omid:br/0603 issn:456", "title": "Test 3"},
93 ]
94 os.makedirs(self.output_dir, exist_ok=True)
95 with open(
96 os.path.join(self.output_dir, "test.csv"), "w", newline="", encoding="utf-8"
97 ) as f:
98 writer = csv.DictWriter(f, fieldnames=["id", "title"])
99 writer.writeheader()
100 writer.writerows(test_data)
102 # Test loading OMIDs into Redis
103 count = load_processed_omids_to_redis(self.output_dir, redis_client)
104 self.assertEqual(count, 3)
106 # Test OMID lookup
107 self.assertTrue(is_omid_processed("omid:br/0601", redis_client))
108 self.assertTrue(is_omid_processed("omid:br/0602", redis_client))
109 self.assertTrue(is_omid_processed("omid:br/0603", redis_client))
110 self.assertFalse(is_omid_processed("omid:br/0604", redis_client))
112 def test_redis_cache_persistence(self):
113 """Test that Redis is populated from existing CSV files and cleared after completion"""
114 # Create initial test data
115 test_data = [
116 {
117 "@graph": [
118 {
119 "@id": "https://w3id.org/oc/meta/br/0601",
120 "@type": [
121 "http://purl.org/spar/fabio/Expression",
122 "http://purl.org/spar/fabio/JournalArticle",
123 ],
124 "http://purl.org/dc/terms/title": [{"@value": "First Run"}],
125 }
126 ]
127 }
128 ]
130 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
131 with ZipFile(
132 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
133 ) as zip_file:
134 zip_file.writestr("1000.json", json.dumps(test_data))
136 # First run - creates initial CSV
137 generate_csv(
138 input_dir=self.rdf_dir,
139 output_dir=self.output_dir,
140 dir_split_number=10000,
141 items_per_file=1000,
142 zip_output_rdf=True,
143 redis_port=6381,
144 redis_db=5,
145 )
147 # Verify Redis is empty after first run
148 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
150 # Create new test data
151 test_data_2 = [
152 {
153 "@graph": [
154 {
155 "@id": "https://w3id.org/oc/meta/br/0601", # Same OMID as before
156 "@type": [
157 "http://purl.org/spar/fabio/Expression",
158 "http://purl.org/spar/fabio/JournalArticle",
159 ],
160 "http://purl.org/dc/terms/title": [
161 {"@value": "Should Be Skipped"}
162 ],
163 },
164 {
165 "@id": "https://w3id.org/oc/meta/br/0602", # New OMID
166 "@type": [
167 "http://purl.org/spar/fabio/Expression",
168 "http://purl.org/spar/fabio/JournalArticle",
169 ],
170 "http://purl.org/dc/terms/title": [
171 {"@value": "Should Be Processed"}
172 ],
173 },
174 ]
175 }
176 ]
178 with ZipFile(
179 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
180 ) as zip_file:
181 zip_file.writestr("1000.json", json.dumps(test_data_2))
183 # Second run - should load OMIDs from existing CSV and skip already processed resources
184 generate_csv(
185 input_dir=self.rdf_dir,
186 output_dir=self.output_dir,
187 dir_split_number=10000,
188 items_per_file=1000,
189 zip_output_rdf=True,
190 redis_port=6381,
191 redis_db=5,
192 )
194 # Check output files
195 output_data = []
196 for filename in os.listdir(self.output_dir):
197 if filename.endswith(".csv"):
198 output_data.extend(
199 get_csv_data(os.path.join(self.output_dir, filename))
200 )
202 # Verify results
203 # Should find exactly two entries - one from first run and one new one
204 self.assertEqual(len(output_data), 2)
206 # Find entries by title
207 first_run_entry = next(
208 item for item in output_data if item["title"] == "First Run"
209 )
210 second_run_entry = next(
211 item for item in output_data if item["title"] == "Should Be Processed"
212 )
214 # Verify the first entry wasn't overwritten with "Should Be Skipped"
215 self.assertEqual(first_run_entry["title"], "First Run")
216 self.assertEqual(first_run_entry["id"], "omid:br/0601")
218 # Verify the new entry was processed
219 self.assertEqual(second_run_entry["title"], "Should Be Processed")
220 self.assertEqual(second_run_entry["id"], "omid:br/0602")
222 # Verify Redis is empty after completion
223 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
224 self.assertFalse(is_omid_processed("omid:br/0602", self.redis_client))
226 def test_redis_cache_cleanup(self):
227 """Test that Redis cache is properly cleaned up in various scenarios"""
228 # First run - should process successfully and clear Redis
229 input_data = [{"id": "omid:br/0601", "title": "First Entry"}]
230 self._write_test_data(input_data)
232 # Run with valid directory - should process and clear Redis
233 generate_csv(
234 input_dir=self.rdf_dir,
235 output_dir=self.output_dir,
236 dir_split_number=10000,
237 items_per_file=1000,
238 zip_output_rdf=True,
239 redis_port=6381,
240 redis_db=5,
241 )
243 # Verify Redis is empty after successful run
244 self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
246 # Load processed OMIDs into Redis
247 load_processed_omids_to_redis(self.output_dir, self.redis_client)
249 # Verify that after loading from CSV, the OMID is in Redis
250 self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))
252 # Run with non-existent directory - should fail but keep Redis populated
253 generate_csv(
254 input_dir="/nonexistent/dir",
255 output_dir=self.output_dir,
256 dir_split_number=10000,
257 items_per_file=1000,
258 zip_output_rdf=True,
259 redis_port=6381,
260 redis_db=5,
261 )
263 # Verify Redis still has the data after failed run
264 self.assertTrue(
265 is_omid_processed("omid:br/0601", self.redis_client),
266 "Redis cache should be retained after a failed run",
267 )
269 def test_redis_error_handling(self):
270 """Test handling of Redis connection errors"""
271 # Test with invalid Redis connection
272 with self.assertRaises(redis.ConnectionError):
273 init_redis_connection(port=9999) # Invalid port
275 # Test loading OMIDs with non-existent directory
276 count = load_processed_omids_to_redis("/nonexistent/dir", self.redis_client)
277 self.assertEqual(count, 0)
279 def test_concurrent_processing_with_redis(self):
280 """Test concurrent processing with Redis caching"""
281 # Create multiple test files
282 test_data = []
283 for i in range(100): # Create 100 test entries
284 test_data.append(
285 {
286 "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
287 "@type": [
288 "http://purl.org/spar/fabio/Expression",
289 "http://purl.org/spar/fabio/JournalArticle",
290 ],
291 "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
292 }
293 )
295 # Split into multiple files
296 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
297 for i in range(0, 100, 10): # Create 10 files with 10 entries each
298 file_data = [{"@graph": test_data[i : i + 10]}]
299 with ZipFile(
300 os.path.join(self.br_dir, "060", "10000", f"{i+1000}.zip"), "w"
301 ) as zip_file:
302 zip_file.writestr(f"{i+1000}.json", json.dumps(file_data))
304 # First run to create some CSV files
305 generate_csv(
306 input_dir=self.rdf_dir,
307 output_dir=self.output_dir,
308 dir_split_number=10000,
309 items_per_file=1000,
310 zip_output_rdf=True,
311 redis_port=6381,
312 redis_db=5,
313 )
315 # Create more test entries
316 more_test_data = []
317 for i in range(100, 200): # Create 100 more test entries
318 more_test_data.append(
319 {
320 "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
321 "@type": [
322 "http://purl.org/spar/fabio/Expression",
323 "http://purl.org/spar/fabio/JournalArticle",
324 ],
325 "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
326 }
327 )
329 # Add new files
330 for i in range(0, 100, 10):
331 file_data = [{"@graph": more_test_data[i : i + 10]}]
332 with ZipFile(
333 os.path.join(self.br_dir, "060", "10000", f"{i+2000}.zip"), "w"
334 ) as zip_file:
335 zip_file.writestr(f"{i+2000}.json", json.dumps(file_data))
337 # Second run with existing cache
338 generate_csv(
339 input_dir=self.rdf_dir,
340 output_dir=self.output_dir,
341 dir_split_number=10000,
342 items_per_file=1000,
343 zip_output_rdf=True,
344 redis_port=6381,
345 redis_db=5,
346 )
348 # Verify results
349 all_output_data = []
350 for filename in os.listdir(self.output_dir):
351 if filename.endswith(".csv"):
352 all_output_data.extend(
353 get_csv_data(os.path.join(self.output_dir, filename))
354 )
356 # Should have processed all 200 entries
357 self.assertEqual(len(all_output_data), 200)
359 # Verify no duplicates
360 processed_ids = {row["id"] for row in all_output_data}
361 self.assertEqual(len(processed_ids), 200)
363 def test_basic_br_processing(self):
364 """Test basic bibliographic resource processing"""
365 test_data = [
366 {
367 "@graph": [
368 {
369 "@id": "https://w3id.org/oc/meta/br/0601",
370 "@type": [
371 "http://purl.org/spar/fabio/Expression",
372 "http://purl.org/spar/fabio/JournalArticle",
373 ],
374 "http://purl.org/dc/terms/title": [{"@value": "Test Article"}],
375 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
376 {"@value": "2024-01-01"}
377 ],
378 "http://purl.org/spar/datacite/hasIdentifier": [
379 {"@id": "https://w3id.org/oc/meta/id/0601"}
380 ],
381 }
382 ],
383 "@id": "https://w3id.org/oc/meta/br/",
384 }
385 ]
387 # Write test data to file
388 os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
389 with ZipFile(
390 os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
391 ) as zip_file:
392 zip_file.writestr("1000.json", json.dumps(test_data))
394 # Run generator
395 generate_csv(
396 input_dir=self.rdf_dir,
397 output_dir=self.output_dir,
398 dir_split_number=10000,
399 items_per_file=1000,
400 zip_output_rdf=True,
401 redis_port=6381,
402 redis_db=5,
403 )
405 # Check output
406 output_files = os.listdir(self.output_dir)
407 self.assertEqual(len(output_files), 1)
409 output_data = get_csv_data(os.path.join(self.output_dir, output_files[0]))
410 self.assertEqual(len(output_data), 1)
411 self.assertEqual(output_data[0]["title"], "Test Article")
412 self.assertEqual(output_data[0]["pub_date"], "2024-01-01")
413 self.assertEqual(output_data[0]["type"], "journal article")
414 self.assertEqual(output_data[0]["id"], "omid:br/0601")
416 def test_complex_br_with_related_entities(self):
417 """Test processing of BR with authors, venue, and other related entities"""
418 # Create directory structure for each entity type
419 supplier_prefix = "060"
420 for entity_type in ["br", "ra", "ar", "id"]:
421 os.makedirs(
422 os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000"),
423 exist_ok=True,
424 )
426 # BR data including both the article and the venue
427 br_data = [
428 {
429 "@graph": [
430 {
431 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
432 "@type": [
433 "http://purl.org/spar/fabio/Expression",
434 "http://purl.org/spar/fabio/JournalArticle",
435 ],
436 "http://purl.org/dc/terms/title": [
437 {"@value": "Complex Article"}
438 ],
439 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
440 {"@value": "2024-02-01"}
441 ],
442 "http://purl.org/spar/pro/isDocumentContextFor": [
443 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
444 ],
445 "http://purl.org/vocab/frbr/core#partOf": [
446 {"@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"}
447 ],
448 },
449 {
450 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
451 "@type": [
452 "http://purl.org/spar/fabio/Expression",
453 "http://purl.org/spar/fabio/Journal",
454 ],
455 "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
456 },
457 ],
458 "@id": "https://w3id.org/oc/meta/br/",
459 }
460 ]
462 ar_data = [
463 {
464 "@graph": [
465 {
466 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
467 "http://purl.org/spar/pro/withRole": [
468 {"@id": "http://purl.org/spar/pro/author"}
469 ],
470 "http://purl.org/spar/pro/isHeldBy": [
471 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
472 ],
473 }
474 ],
475 "@id": "https://w3id.org/oc/meta/ar/",
476 }
477 ]
479 ra_data = [
480 {
481 "@graph": [
482 {
483 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
484 "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
485 }
486 ],
487 "@id": "https://w3id.org/oc/meta/ra/",
488 }
489 ]
491 # Write test data files in correct locations
492 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
494 for entity_type, data in data_files.items():
495 zip_path = os.path.join(
496 self.rdf_dir, entity_type, supplier_prefix, "10000", "1000.zip"
497 )
498 with ZipFile(zip_path, "w") as zip_file:
499 zip_file.writestr("1000.json", json.dumps(data))
501 # Run generator
502 generate_csv(
503 input_dir=self.rdf_dir,
504 output_dir=self.output_dir,
505 dir_split_number=10000,
506 items_per_file=1000,
507 zip_output_rdf=True,
508 redis_port=6381,
509 redis_db=5,
510 )
512 # Check output
513 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
514 self.assertEqual(len(output_data), 2) # Should have 2 rows: article and journal
516 # Find article and journal entries
517 article = next(
518 (item for item in output_data if item["type"] == "journal article"), None
519 )
520 journal = next(
521 (item for item in output_data if item["type"] == "journal"), None
522 )
524 # Verify article data
525 self.assertIsNotNone(article)
526 self.assertEqual(article["title"], "Complex Article")
527 self.assertEqual(article["venue"], f"Test Journal [omid:br/{supplier_prefix}3]")
528 self.assertEqual(article["author"], "Test Author [omid:ra/0601]")
529 self.assertEqual(article["id"], f"omid:br/{supplier_prefix}2")
531 # Verify journal data
532 self.assertIsNotNone(journal)
533 self.assertEqual(journal["title"], "Test Journal")
534 self.assertEqual(journal["type"], "journal")
535 self.assertEqual(journal["id"], f"omid:br/{supplier_prefix}3")
537 def test_empty_input_directory(self):
538 """Test behavior with empty input directory"""
539 generate_csv(
540 input_dir=self.rdf_dir,
541 output_dir=self.output_dir,
542 dir_split_number=10000,
543 items_per_file=1000,
544 zip_output_rdf=True,
545 redis_port=6381,
546 redis_db=5,
547 )
549 self.assertEqual(len(os.listdir(self.output_dir)), 0)
551 def test_br_with_multiple_authors_and_editors(self):
552 """Test processing of BR with multiple authors and editors"""
553 supplier_prefix = "060"
554 br_data = [
555 {
556 "@graph": [
557 {
558 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
559 "@type": [
560 "http://purl.org/spar/fabio/Expression",
561 "http://purl.org/spar/fabio/Book",
562 ],
563 "http://purl.org/dc/terms/title": [
564 {"@value": "Multi-Author Book"}
565 ],
566 "http://purl.org/spar/pro/isDocumentContextFor": [
567 {
568 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
569 }, # First author
570 {
571 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
572 }, # Second author
573 {
574 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
575 }, # First editor
576 {
577 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"
578 }, # Second editor
579 ],
580 }
581 ]
582 }
583 ]
585 # Setup agent roles for authors and editors with hasNext relations
586 ar_data = [
587 {
588 "@graph": [
589 {
590 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
591 "http://purl.org/spar/pro/withRole": [
592 {"@id": "http://purl.org/spar/pro/author"}
593 ],
594 "http://purl.org/spar/pro/isHeldBy": [
595 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
596 ],
597 "https://w3id.org/oc/ontology/hasNext": [
598 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
599 ],
600 },
601 {
602 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
603 "http://purl.org/spar/pro/withRole": [
604 {"@id": "http://purl.org/spar/pro/author"}
605 ],
606 "http://purl.org/spar/pro/isHeldBy": [
607 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
608 ],
609 "https://w3id.org/oc/ontology/hasNext": [
610 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
611 ],
612 },
613 {
614 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
615 "http://purl.org/spar/pro/withRole": [
616 {"@id": "http://purl.org/spar/pro/editor"}
617 ],
618 "http://purl.org/spar/pro/isHeldBy": [
619 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
620 ],
621 "https://w3id.org/oc/ontology/hasNext": [
622 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"}
623 ],
624 },
625 {
626 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4",
627 "http://purl.org/spar/pro/withRole": [
628 {"@id": "http://purl.org/spar/pro/editor"}
629 ],
630 "http://purl.org/spar/pro/isHeldBy": [
631 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4"}
632 ],
633 },
634 ]
635 }
636 ]
638 # Setup responsible agents
639 ra_data = [
640 {
641 "@graph": [
642 {
643 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
644 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
645 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
646 },
647 {
648 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
649 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Doe"}],
650 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Jane"}],
651 },
652 {
653 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
654 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Brown"}],
655 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Bob"}],
656 },
657 {
658 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4",
659 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Wilson"}],
660 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Alice"}],
661 },
662 ]
663 }
664 ]
666 # Write test data files
667 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
669 for entity_type, data in data_files.items():
670 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
671 os.makedirs(dir_path, exist_ok=True)
673 zip_path = os.path.join(dir_path, "1000.zip")
674 with ZipFile(zip_path, "w") as zip_file:
675 zip_file.writestr("1000.json", json.dumps(data))
677 # Run generator
678 generate_csv(
679 input_dir=self.rdf_dir,
680 output_dir=self.output_dir,
681 dir_split_number=10000,
682 items_per_file=1000,
683 zip_output_rdf=True,
684 redis_port=6381,
685 redis_db=5,
686 )
688 # Check output
689 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
690 self.assertEqual(len(output_data), 1)
692 # Verify authors and editors are in the correct order
693 expected_authors = (
694 f"Smith, John [omid:ra/{supplier_prefix}1]; "
695 f"Doe, Jane [omid:ra/{supplier_prefix}2]"
696 )
697 expected_editors = (
698 f"Brown, Bob [omid:ra/{supplier_prefix}3]; "
699 f"Wilson, Alice [omid:ra/{supplier_prefix}4]"
700 )
702 self.assertEqual(output_data[0]["author"], expected_authors)
703 self.assertEqual(output_data[0]["editor"], expected_editors)
705 def test_br_with_identifiers(self):
706 """Test processing of BR with multiple identifiers"""
707 supplier_prefix = "060"
708 br_data = [
709 {
710 "@graph": [
711 {
712 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
713 "@type": [
714 "http://purl.org/spar/fabio/Expression",
715 "http://purl.org/spar/fabio/JournalArticle",
716 ],
717 "http://purl.org/dc/terms/title": [
718 {"@value": "Article With DOI"}
719 ],
720 "http://purl.org/spar/datacite/hasIdentifier": [
721 {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1"},
722 {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2"},
723 ],
724 }
725 ]
726 }
727 ]
729 id_data = [
730 {
731 "@graph": [
732 {
733 "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1",
734 "http://purl.org/spar/datacite/usesIdentifierScheme": [
735 {"@id": "http://purl.org/spar/datacite/doi"}
736 ],
737 "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
738 {"@value": "10.1234/test.123"}
739 ],
740 },
741 {
742 "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2",
743 "http://purl.org/spar/datacite/usesIdentifierScheme": [
744 {"@id": "http://purl.org/spar/datacite/isbn"}
745 ],
746 "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
747 {"@value": "978-0-123456-47-2"}
748 ],
749 },
750 ]
751 }
752 ]
754 # Write test data files in correct locations
755 data_files = {"br": br_data, "id": id_data}
757 for entity_type, data in data_files.items():
758 # Create all necessary directories
759 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
760 os.makedirs(dir_path, exist_ok=True)
762 zip_path = os.path.join(dir_path, "1000.zip")
763 with ZipFile(zip_path, "w") as zip_file:
764 zip_file.writestr("1000.json", json.dumps(data))
766 # Run generator
767 generate_csv(
768 input_dir=self.rdf_dir,
769 output_dir=self.output_dir,
770 dir_split_number=10000,
771 items_per_file=1000,
772 zip_output_rdf=True,
773 redis_port=6381,
774 redis_db=5,
775 )
777 # Check output
778 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
779 self.assertEqual(len(output_data), 1)
781 # Verify all identifiers are included
782 expected_ids = (
783 f"omid:br/{supplier_prefix}1 doi:10.1234/test.123 isbn:978-0-123456-47-2"
784 )
785 self.assertEqual(output_data[0]["id"], expected_ids)
787 def test_br_with_page_numbers(self):
788 """Test processing of BR with page information"""
789 supplier_prefix = "060"
790 br_data = [
791 {
792 "@graph": [
793 {
794 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
795 "@type": [
796 "http://purl.org/spar/fabio/Expression",
797 "http://purl.org/spar/fabio/JournalArticle",
798 ],
799 "http://purl.org/dc/terms/title": [{"@value": "Paged Article"}],
800 "http://purl.org/vocab/frbr/core#embodiment": [
801 {"@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1"}
802 ],
803 }
804 ]
805 }
806 ]
808 re_data = [
809 {
810 "@graph": [
811 {
812 "@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1",
813 "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
814 {"@value": "100"}
815 ],
816 "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
817 {"@value": "120"}
818 ],
819 }
820 ]
821 }
822 ]
824 # Write test data files in correct locations
825 data_files = {"br": br_data, "re": re_data}
827 for entity_type, data in data_files.items():
828 # Create all necessary directories
829 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
830 os.makedirs(dir_path, exist_ok=True)
832 zip_path = os.path.join(dir_path, "1000.zip")
833 with ZipFile(zip_path, "w") as zip_file:
834 zip_file.writestr("1000.json", json.dumps(data))
836 # Run generator
837 generate_csv(
838 input_dir=self.rdf_dir,
839 output_dir=self.output_dir,
840 dir_split_number=10000,
841 items_per_file=1000,
842 zip_output_rdf=True,
843 redis_port=6381,
844 redis_db=5,
845 )
847 # Check output
848 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
849 self.assertEqual(len(output_data), 1)
850 self.assertEqual(output_data[0]["page"], "100-120")
852 def test_malformed_data_handling(self):
853 """Test handling of malformed or incomplete data"""
854 supplier_prefix = "060"
855 br_data = [
856 {
857 "@graph": [
858 {
859 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
860 "@type": [
861 "http://purl.org/spar/fabio/Expression",
862 "http://purl.org/spar/fabio/JournalArticle",
863 ],
864 # Missing title
865 "http://purl.org/spar/pro/isDocumentContextFor": [
866 {"@id": "invalid_uri"}, # Invalid URI
867 ],
868 "http://purl.org/vocab/frbr/core#partOf": [
869 {"@id": "non_existent_venue"} # Non-existent venue
870 ],
871 }
872 ]
873 }
874 ]
876 # Write test data files in correct locations
877 data_files = {"br": br_data}
879 for entity_type, data in data_files.items():
880 # Create all necessary directories
881 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
882 os.makedirs(dir_path, exist_ok=True)
884 zip_path = os.path.join(dir_path, "1000.zip")
885 with ZipFile(zip_path, "w") as zip_file:
886 zip_file.writestr("1000.json", json.dumps(data))
888 # Run generator
889 generate_csv(
890 input_dir=self.rdf_dir,
891 output_dir=self.output_dir,
892 dir_split_number=10000,
893 items_per_file=1000,
894 zip_output_rdf=True,
895 redis_port=6381,
896 redis_db=5,
897 )
899 # Check output
900 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
901 self.assertEqual(len(output_data), 1)
902 # Verify graceful handling of missing/invalid data
903 self.assertEqual(output_data[0]["title"], "")
904 self.assertEqual(output_data[0]["author"], "")
905 self.assertEqual(output_data[0]["venue"], "")
907 def test_br_with_hierarchical_venue_structures(self):
908 """Test different hierarchical venue structures (issue->volume->journal, issue->journal, volume->journal, direct journal)"""
909 supplier_prefix = "060"
911 # Create test data for different hierarchical structures
912 br_data = [
913 {
914 "@graph": [
915 # Article in issue->volume->journal structure
916 {
917 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
918 "@type": [
919 "http://purl.org/spar/fabio/Expression",
920 "http://purl.org/spar/fabio/JournalArticle",
921 ],
922 "http://purl.org/dc/terms/title": [
923 {"@value": "Article in Full Hierarchy"}
924 ],
925 "http://purl.org/vocab/frbr/core#partOf": [
926 {
927 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
928 } # Issue
929 ],
930 },
931 # Article in issue->journal structure (no volume)
932 {
933 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}5",
934 "@type": [
935 "http://purl.org/spar/fabio/Expression",
936 "http://purl.org/spar/fabio/JournalArticle",
937 ],
938 "http://purl.org/dc/terms/title": [
939 {"@value": "Article in Issue-Journal"}
940 ],
941 "http://purl.org/vocab/frbr/core#partOf": [
942 {
943 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6"
944 } # Issue
945 ],
946 },
947 # Article in volume->journal structure (no issue)
948 {
949 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}9",
950 "@type": [
951 "http://purl.org/spar/fabio/Expression",
952 "http://purl.org/spar/fabio/JournalArticle",
953 ],
954 "http://purl.org/dc/terms/title": [
955 {"@value": "Article in Volume-Journal"}
956 ],
957 "http://purl.org/vocab/frbr/core#partOf": [
958 {
959 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10"
960 } # Volume
961 ],
962 },
963 # Article directly in journal
964 {
965 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}13",
966 "@type": [
967 "http://purl.org/spar/fabio/Expression",
968 "http://purl.org/spar/fabio/JournalArticle",
969 ],
970 "http://purl.org/dc/terms/title": [
971 {"@value": "Article in Journal"}
972 ],
973 "http://purl.org/vocab/frbr/core#partOf": [
974 {
975 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
976 } # Journal
977 ],
978 },
979 # Issue in full hierarchy
980 {
981 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
982 "@type": ["http://purl.org/spar/fabio/JournalIssue"],
983 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
984 {"@value": "2"}
985 ],
986 "http://purl.org/vocab/frbr/core#partOf": [
987 {
988 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"
989 } # Volume
990 ],
991 },
992 # Volume in full hierarchy
993 {
994 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
995 "@type": ["http://purl.org/spar/fabio/JournalVolume"],
996 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
997 {"@value": "42"}
998 ],
999 "http://purl.org/vocab/frbr/core#partOf": [
1000 {
1001 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
1002 } # Journal
1003 ],
1004 },
1005 # Journal
1006 {
1007 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4",
1008 "@type": ["http://purl.org/spar/fabio/Journal"],
1009 "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
1010 },
1011 # Issue directly in journal
1012 {
1013 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6",
1014 "@type": ["http://purl.org/spar/fabio/JournalIssue"],
1015 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
1016 {"@value": "3"}
1017 ],
1018 "http://purl.org/vocab/frbr/core#partOf": [
1019 {
1020 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
1021 } # Journal
1022 ],
1023 },
1024 # Volume directly in journal
1025 {
1026 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10",
1027 "@type": ["http://purl.org/spar/fabio/JournalVolume"],
1028 "http://purl.org/spar/fabio/hasSequenceIdentifier": [
1029 {"@value": "5"}
1030 ],
1031 "http://purl.org/vocab/frbr/core#partOf": [
1032 {
1033 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
1034 } # Journal
1035 ],
1036 },
1037 ]
1038 }
1039 ]
1041 # Write test data files
1042 dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
1043 os.makedirs(dir_path, exist_ok=True)
1045 zip_path = os.path.join(dir_path, "1000.zip")
1046 with ZipFile(zip_path, "w") as zip_file:
1047 zip_file.writestr("1000.json", json.dumps(br_data))
1049 # Run generator
1050 generate_csv(
1051 input_dir=self.rdf_dir,
1052 output_dir=self.output_dir,
1053 dir_split_number=10000,
1054 items_per_file=1000,
1055 zip_output_rdf=True,
1056 redis_port=6381,
1057 redis_db=5,
1058 )
1060 # Check output
1061 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1063 # Verify we only have the articles and journal in the output
1064 self.assertEqual(len(output_data), 5) # 4 articles + 1 journal
1066 # Verify no JournalVolume or JournalIssue entries exist
1067 volume_or_issue_entries = [
1068 item
1069 for item in output_data
1070 if item["type"] in ["journal volume", "journal issue"]
1071 ]
1072 self.assertEqual(len(volume_or_issue_entries), 0)
1074 # Find each article by title
1075 full_hierarchy = next(
1076 item for item in output_data if item["title"] == "Article in Full Hierarchy"
1077 )
1078 issue_journal = next(
1079 item for item in output_data if item["title"] == "Article in Issue-Journal"
1080 )
1081 volume_journal = next(
1082 item for item in output_data if item["title"] == "Article in Volume-Journal"
1083 )
1084 direct_journal = next(
1085 item for item in output_data if item["title"] == "Article in Journal"
1086 )
1088 # Test full hierarchy (issue->volume->journal)
1089 self.assertEqual(full_hierarchy["issue"], "2")
1090 self.assertEqual(full_hierarchy["volume"], "42")
1091 self.assertEqual(
1092 full_hierarchy["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1093 )
1095 # Test issue->journal (no volume)
1096 self.assertEqual(issue_journal["issue"], "3")
1097 self.assertEqual(issue_journal["volume"], "")
1098 self.assertEqual(
1099 issue_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1100 )
1102 # Test volume->journal (no issue)
1103 self.assertEqual(volume_journal["issue"], "")
1104 self.assertEqual(volume_journal["volume"], "5")
1105 self.assertEqual(
1106 volume_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1107 )
1109 # Test direct journal connection
1110 self.assertEqual(direct_journal["issue"], "")
1111 self.assertEqual(direct_journal["volume"], "")
1112 self.assertEqual(
1113 direct_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
1114 )
1116 def test_book_in_series(self):
1117 """Test processing of a book that is part of a book series"""
1118 supplier_prefix = "060"
1120 # Create test data for book in series
1121 br_data = [
1122 {
1123 "@graph": [
1124 # Book
1125 {
1126 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1127 "@type": [
1128 "http://purl.org/spar/fabio/Expression",
1129 "http://purl.org/spar/fabio/Book",
1130 ],
1131 "http://purl.org/dc/terms/title": [{"@value": "Test Book"}],
1132 "http://purl.org/vocab/frbr/core#partOf": [
1133 {
1134 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
1135 } # Series
1136 ],
1137 },
1138 # Book Series
1139 {
1140 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
1141 "@type": ["http://purl.org/spar/fabio/BookSeries"],
1142 "http://purl.org/dc/terms/title": [
1143 {"@value": "Test Book Series"}
1144 ],
1145 },
1146 ]
1147 }
1148 ]
1150 # Write test data
1151 dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
1152 os.makedirs(dir_path, exist_ok=True)
1154 zip_path = os.path.join(dir_path, "1000.zip")
1155 with ZipFile(zip_path, "w") as zip_file:
1156 zip_file.writestr("1000.json", json.dumps(br_data))
1158 # Run generator
1159 generate_csv(
1160 input_dir=self.rdf_dir,
1161 output_dir=self.output_dir,
1162 dir_split_number=10000,
1163 items_per_file=1000,
1164 zip_output_rdf=True,
1165 redis_port=6381,
1166 redis_db=5,
1167 )
1169 # Check output
1170 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1172 # Find book entry
1173 book = next(item for item in output_data if item["type"] == "book")
1175 # Verify book is correctly linked to series
1176 self.assertEqual(book["title"], "Test Book")
1177 self.assertEqual(
1178 book["venue"], f"Test Book Series [omid:br/{supplier_prefix}2]"
1179 )
1180 self.assertEqual(book["volume"], "") # Should not have volume
1181 self.assertEqual(book["issue"], "") # Should not have issue
1183 def test_br_with_multiple_roles(self):
1184 """Test processing of BR with authors, editors and publishers"""
1185 supplier_prefix = "060"
1186 br_data = [
1187 {
1188 "@graph": [
1189 {
1190 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1191 "@type": [
1192 "http://purl.org/spar/fabio/Expression",
1193 "http://purl.org/spar/fabio/Book",
1194 ],
1195 "http://purl.org/dc/terms/title": [
1196 {"@value": "Multi-Role Book"}
1197 ],
1198 "http://purl.org/spar/pro/isDocumentContextFor": [
1199 {
1200 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
1201 }, # Author
1202 {
1203 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
1204 }, # Editor
1205 {
1206 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
1207 }, # Publisher
1208 ],
1209 }
1210 ]
1211 }
1212 ]
1214 # Setup agent roles for authors, editors and publishers
1215 ar_data = [
1216 {
1217 "@graph": [
1218 {
1219 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
1220 "http://purl.org/spar/pro/withRole": [
1221 {"@id": "http://purl.org/spar/pro/author"}
1222 ],
1223 "http://purl.org/spar/pro/isHeldBy": [
1224 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
1225 ],
1226 "https://w3id.org/oc/ontology/hasNext": [
1227 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
1228 ],
1229 },
1230 {
1231 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
1232 "http://purl.org/spar/pro/withRole": [
1233 {"@id": "http://purl.org/spar/pro/editor"}
1234 ],
1235 "http://purl.org/spar/pro/isHeldBy": [
1236 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
1237 ],
1238 "https://w3id.org/oc/ontology/hasNext": [
1239 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
1240 ],
1241 },
1242 {
1243 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
1244 "http://purl.org/spar/pro/withRole": [
1245 {"@id": "http://purl.org/spar/pro/publisher"}
1246 ],
1247 "http://purl.org/spar/pro/isHeldBy": [
1248 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
1249 ],
1250 },
1251 ]
1252 }
1253 ]
1255 # Setup responsible agents with different name formats
1256 ra_data = [
1257 {
1258 "@graph": [
1259 {
1260 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
1261 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
1262 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
1263 },
1264 {
1265 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
1266 "http://xmlns.com/foaf/0.1/name": [{"@value": "Editor Name"}],
1267 },
1268 {
1269 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
1270 "http://xmlns.com/foaf/0.1/name": [
1271 {"@value": "Publisher House"}
1272 ],
1273 },
1274 ]
1275 }
1276 ]
1278 # Write test data files
1279 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
1281 for entity_type, data in data_files.items():
1282 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
1283 os.makedirs(dir_path, exist_ok=True)
1285 zip_path = os.path.join(dir_path, "1000.zip")
1286 with ZipFile(zip_path, "w") as zip_file:
1287 zip_file.writestr("1000.json", json.dumps(data))
1289 # Run generator
1290 generate_csv(
1291 input_dir=self.rdf_dir,
1292 output_dir=self.output_dir,
1293 dir_split_number=10000,
1294 items_per_file=1000,
1295 zip_output_rdf=True,
1296 redis_port=6381,
1297 redis_db=5,
1298 )
1300 # Check output
1301 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1302 self.assertEqual(len(output_data), 1)
1304 # Verify all roles are correctly processed
1305 book = output_data[0]
1306 self.assertEqual(book["title"], "Multi-Role Book")
1307 self.assertEqual(book["author"], f"Smith, John [omid:ra/{supplier_prefix}1]")
1308 self.assertEqual(book["editor"], f"Editor Name [omid:ra/{supplier_prefix}2]")
1309 self.assertEqual(
1310 book["publisher"], f"Publisher House [omid:ra/{supplier_prefix}3]"
1311 )
1313 def test_ordered_authors(self):
1314 """Test that authors are ordered according to hasNext relations"""
1315 supplier_prefix = "060"
1316 br_data = [
1317 {
1318 "@graph": [
1319 {
1320 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1321 "@type": [
1322 "http://purl.org/spar/fabio/Expression",
1323 "http://purl.org/spar/fabio/JournalArticle",
1324 ],
1325 "http://purl.org/dc/terms/title": [
1326 {"@value": "Ordered Authors Article"}
1327 ],
1328 "http://purl.org/spar/pro/isDocumentContextFor": [
1329 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
1330 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
1331 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
1332 ],
1333 }
1334 ]
1335 }
1336 ]
1338 # Setup agent roles with hasNext relations
1339 ar_data = [
1340 {
1341 "@graph": [
1342 {
1343 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
1344 "http://purl.org/spar/pro/withRole": [
1345 {"@id": "http://purl.org/spar/pro/author"}
1346 ],
1347 "http://purl.org/spar/pro/isHeldBy": [
1348 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
1349 ],
1350 "https://w3id.org/oc/ontology/hasNext": [
1351 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
1352 ],
1353 },
1354 {
1355 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
1356 "http://purl.org/spar/pro/withRole": [
1357 {"@id": "http://purl.org/spar/pro/author"}
1358 ],
1359 "http://purl.org/spar/pro/isHeldBy": [
1360 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
1361 ],
1362 "https://w3id.org/oc/ontology/hasNext": [
1363 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
1364 ],
1365 },
1366 {
1367 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
1368 "http://purl.org/spar/pro/withRole": [
1369 {"@id": "http://purl.org/spar/pro/author"}
1370 ],
1371 "http://purl.org/spar/pro/isHeldBy": [
1372 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
1373 ],
1374 },
1375 ]
1376 }
1377 ]
1379 # Setup responsible agents with different names
1380 ra_data = [
1381 {
1382 "@graph": [
1383 {
1384 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
1385 "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
1386 },
1387 {
1388 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
1389 "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
1390 },
1391 {
1392 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
1393 "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
1394 },
1395 ]
1396 }
1397 ]
1399 # Write test data files
1400 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
1402 for entity_type, data in data_files.items():
1403 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
1404 os.makedirs(dir_path, exist_ok=True)
1406 zip_path = os.path.join(dir_path, "1000.zip")
1407 with ZipFile(zip_path, "w") as zip_file:
1408 zip_file.writestr("1000.json", json.dumps(data))
1410 # Run generator
1411 generate_csv(
1412 input_dir=self.rdf_dir,
1413 output_dir=self.output_dir,
1414 dir_split_number=10000,
1415 items_per_file=1000,
1416 zip_output_rdf=True,
1417 redis_port=6381,
1418 redis_db=5,
1419 )
1421 # Check output
1422 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1423 self.assertEqual(len(output_data), 1)
1425 # Verify authors are in the correct order
1426 expected_authors = (
1427 f"First Author [omid:ra/{supplier_prefix}1]; "
1428 f"Second Author [omid:ra/{supplier_prefix}2]; "
1429 f"Third Author [omid:ra/{supplier_prefix}3]"
1430 )
1431 self.assertEqual(output_data[0]["author"], expected_authors)
1433 def test_cyclic_hasNext_relations(self):
1434 """Test handling of cyclic hasNext relations between agent roles"""
1435 supplier_prefix = "060"
1436 br_data = [
1437 {
1438 "@graph": [
1439 {
1440 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1441 "@type": [
1442 "http://purl.org/spar/fabio/Expression",
1443 "http://purl.org/spar/fabio/JournalArticle",
1444 ],
1445 "http://purl.org/dc/terms/title": [
1446 {"@value": "Cyclic Authors Article"}
1447 ],
1448 "http://purl.org/spar/pro/isDocumentContextFor": [
1449 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
1450 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
1451 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
1452 ],
1453 }
1454 ]
1455 }
1456 ]
1458 # Setup agent roles with cyclic hasNext relations
1459 ar_data = [
1460 {
1461 "@graph": [
1462 {
1463 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
1464 "http://purl.org/spar/pro/withRole": [
1465 {"@id": "http://purl.org/spar/pro/author"}
1466 ],
1467 "http://purl.org/spar/pro/isHeldBy": [
1468 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
1469 ],
1470 "https://w3id.org/oc/ontology/hasNext": [
1471 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
1472 ],
1473 },
1474 {
1475 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
1476 "http://purl.org/spar/pro/withRole": [
1477 {"@id": "http://purl.org/spar/pro/author"}
1478 ],
1479 "http://purl.org/spar/pro/isHeldBy": [
1480 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
1481 ],
1482 # Creates a cycle: 1 -> 2 -> 3 -> 1
1483 "https://w3id.org/oc/ontology/hasNext": [
1484 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
1485 ],
1486 },
1487 {
1488 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
1489 "http://purl.org/spar/pro/withRole": [
1490 {"@id": "http://purl.org/spar/pro/author"}
1491 ],
1492 "http://purl.org/spar/pro/isHeldBy": [
1493 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
1494 ],
1495 # Cycle completion
1496 "https://w3id.org/oc/ontology/hasNext": [
1497 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
1498 ],
1499 },
1500 ]
1501 }
1502 ]
1504 # Setup responsible agents
1505 ra_data = [
1506 {
1507 "@graph": [
1508 {
1509 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
1510 "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
1511 },
1512 {
1513 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
1514 "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
1515 },
1516 {
1517 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
1518 "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
1519 },
1520 ]
1521 }
1522 ]
1524 # Write test data files
1525 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
1527 for entity_type, data in data_files.items():
1528 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
1529 os.makedirs(dir_path, exist_ok=True)
1531 zip_path = os.path.join(dir_path, "1000.zip")
1532 with ZipFile(zip_path, "w") as zip_file:
1533 zip_file.writestr("1000.json", json.dumps(data))
1535 # Run generator
1536 generate_csv(
1537 input_dir=self.rdf_dir,
1538 output_dir=self.output_dir,
1539 dir_split_number=10000,
1540 items_per_file=1000,
1541 zip_output_rdf=True,
1542 redis_port=6381,
1543 redis_db=5,
1544 )
1546 # Check output
1547 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
1548 self.assertEqual(len(output_data), 1)
1550 # Verify that we get at least some authors before the cycle is detected
1551 # The order should be maintained until the cycle is detected
1552 authors = output_data[0]["author"].split("; ")
1553 self.assertGreater(len(authors), 0)
1555 # Verify the presence and order of authors
1556 self.assertTrue(
1557 any(
1558 f"First Author [omid:ra/{supplier_prefix}1]" in author
1559 for author in authors
1560 )
1561 )
1562 self.assertTrue(
1563 any(
1564 f"Second Author [omid:ra/{supplier_prefix}2]" in author
1565 for author in authors
1566 )
1567 )
1569 # Verify no duplicates in the output
1570 author_set = set(authors)
1571 self.assertEqual(
1572 len(authors),
1573 len(author_set),
1574 "Found duplicate authors in output: each author should appear exactly once",
1575 )
1577 # Verify the exact order and number of authors
1578 expected_authors = [
1579 f"First Author [omid:ra/{supplier_prefix}1]",
1580 f"Second Author [omid:ra/{supplier_prefix}2]",
1581 f"Third Author [omid:ra/{supplier_prefix}3]",
1582 ]
1583 self.assertEqual(
1584 authors,
1585 expected_authors,
1586 "Authors should be in correct order and each should appear exactly once",
1587 )
1589 def test_multiple_input_files(self):
1590 """Test processing of multiple input files with sequential entity IDs"""
1591 supplier_prefix = "060"
1593 # Create test data spanning multiple files
1594 # First file (entities 1-1000)
1595 br_data_1 = [
1596 {
1597 "@graph": [
1598 {
1599 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
1600 "@type": [
1601 "http://purl.org/spar/fabio/Expression",
1602 "http://purl.org/spar/fabio/JournalArticle",
1603 ],
1604 "http://purl.org/dc/terms/title": [{"@value": "Article 1"}],
1605 },
1606 {
1607 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1000",
1608 "@type": [
1609 "http://purl.org/spar/fabio/Expression",
1610 "http://purl.org/spar/fabio/JournalArticle",
1611 ],
1612 "http://purl.org/dc/terms/title": [{"@value": "Article 1000"}],
1613 },
1614 ]
1615 }
1616 ]
1618 # Second file (entities 1001-2000)
1619 br_data_2 = [
1620 {
1621 "@graph": [
1622 {
1623 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1001",
1624 "@type": [
1625 "http://purl.org/spar/fabio/Expression",
1626 "http://purl.org/spar/fabio/JournalArticle",
1627 ],
1628 "http://purl.org/dc/terms/title": [{"@value": "Article 1001"}],
1629 },
1630 {
1631 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2000",
1632 "@type": [
1633 "http://purl.org/spar/fabio/Expression",
1634 "http://purl.org/spar/fabio/JournalArticle",
1635 ],
1636 "http://purl.org/dc/terms/title": [{"@value": "Article 2000"}],
1637 },
1638 ]
1639 }
1640 ]
1642 # Third file (entities 2001-3000)
1643 br_data_3 = [
1644 {
1645 "@graph": [
1646 {
1647 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2001",
1648 "@type": [
1649 "http://purl.org/spar/fabio/Expression",
1650 "http://purl.org/spar/fabio/JournalArticle",
1651 ],
1652 "http://purl.org/dc/terms/title": [{"@value": "Article 2001"}],
1653 "http://purl.org/spar/pro/isDocumentContextFor": [
1654 {
1655 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001"
1656 }
1657 ],
1658 }
1659 ]
1660 }
1661 ]
1663 # Create agent role data in a different file
1664 ar_data = [
1665 {
1666 "@graph": [
1667 {
1668 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001",
1669 "http://purl.org/spar/pro/withRole": [
1670 {"@id": "http://purl.org/spar/pro/author"}
1671 ],
1672 "http://purl.org/spar/pro/isHeldBy": [
1673 {
1674 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001"
1675 }
1676 ],
1677 }
1678 ]
1679 }
1680 ]
1682 # Create responsible agent data in a different file
1683 ra_data = [
1684 {
1685 "@graph": [
1686 {
1687 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001",
1688 "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
1689 }
1690 ]
1691 }
1692 ]
1694 # Write test data to appropriate locations based on ID ranges
1695 os.makedirs(os.path.join(self.br_dir, supplier_prefix, "10000"), exist_ok=True)
1696 os.makedirs(
1697 os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000"), exist_ok=True
1698 )
1699 os.makedirs(
1700 os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000"), exist_ok=True
1701 )
1703 # Write BR files
1704 with ZipFile(
1705 os.path.join(self.br_dir, supplier_prefix, "10000", "1000.zip"), "w"
1706 ) as zip_file:
1707 zip_file.writestr("1000.json", json.dumps(br_data_1))
1708 with ZipFile(
1709 os.path.join(self.br_dir, supplier_prefix, "10000", "2000.zip"), "w"
1710 ) as zip_file:
1711 zip_file.writestr("2000.json", json.dumps(br_data_2))
1712 with ZipFile(
1713 os.path.join(self.br_dir, supplier_prefix, "10000", "3000.zip"), "w"
1714 ) as zip_file:
1715 zip_file.writestr("3000.json", json.dumps(br_data_3))
1717 # Write AR and RA files
1718 with ZipFile(
1719 os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000", "3000.zip"), "w"
1720 ) as zip_file:
1721 zip_file.writestr("3000.json", json.dumps(ar_data))
1722 with ZipFile(
1723 os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000", "3000.zip"), "w"
1724 ) as zip_file:
1725 zip_file.writestr("3000.json", json.dumps(ra_data))
1727 # Run generator
1728 generate_csv(
1729 input_dir=self.rdf_dir,
1730 output_dir=self.output_dir,
1731 dir_split_number=10000,
1732 items_per_file=1000,
1733 zip_output_rdf=True,
1734 redis_port=6381,
1735 redis_db=5,
1736 )
1738 # Check output
1739 output_files = sorted(os.listdir(self.output_dir))
1740 self.assertGreater(len(output_files), 0)
1742 # Collect all output data
1743 all_output_data = []
1744 for output_file in output_files:
1745 all_output_data.extend(
1746 get_csv_data(os.path.join(self.output_dir, output_file))
1747 )
1749 # Verify we have all expected entries
1750 self.assertEqual(len(all_output_data), 5) # Should have 5 articles total
1752 # Verify specific entries
1753 article_1 = next(
1754 item
1755 for item in all_output_data
1756 if item["id"] == f"omid:br/{supplier_prefix}1"
1757 )
1758 article_1000 = next(
1759 item
1760 for item in all_output_data
1761 if item["id"] == f"omid:br/{supplier_prefix}1000"
1762 )
1763 article_1001 = next(
1764 item
1765 for item in all_output_data
1766 if item["id"] == f"omid:br/{supplier_prefix}1001"
1767 )
1768 article_2000 = next(
1769 item
1770 for item in all_output_data
1771 if item["id"] == f"omid:br/{supplier_prefix}2000"
1772 )
1773 article_2001 = next(
1774 item
1775 for item in all_output_data
1776 if item["id"] == f"omid:br/{supplier_prefix}2001"
1777 )
1779 # Check titles
1780 self.assertEqual(article_1["title"], "Article 1")
1781 self.assertEqual(article_1000["title"], "Article 1000")
1782 self.assertEqual(article_1001["title"], "Article 1001")
1783 self.assertEqual(article_2000["title"], "Article 2000")
1784 self.assertEqual(article_2001["title"], "Article 2001")
1786 # Check author for article 2001 (which has related entities)
1787 self.assertEqual(
1788 article_2001["author"], f"Test Author [omid:ra/{supplier_prefix}2001]"
1789 )
1791 def test_max_rows_per_file_and_data_integrity(self):
1792 """Test that output files respect max rows limit and no data is lost in multiprocessing"""
1793 supplier_prefix = "060"
1795 # Create test data with more than 3000 entries
1796 br_data = [
1797 {
1798 "@graph": [
1799 # Generate 3500 test entries
1800 *[
1801 {
1802 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}{i}",
1803 "@type": [
1804 "http://purl.org/spar/fabio/Expression",
1805 "http://purl.org/spar/fabio/JournalArticle",
1806 ],
1807 "http://purl.org/dc/terms/title": [
1808 {"@value": f"Article {i}"}
1809 ],
1810 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
1811 {"@value": "2024-01-01"}
1812 ],
1813 }
1814 for i in range(1, 3501)
1815 ] # This will create 3500 entries
1816 ]
1817 }
1818 ]
1820 # Split data into multiple files to test multiprocessing
1821 entries_per_file = 1000
1822 for i in range(0, 3500, entries_per_file):
1823 file_data = [{"@graph": br_data[0]["@graph"][i : i + entries_per_file]}]
1825 # Create directory structure for the file
1826 file_number = i + entries_per_file
1827 dir_path = os.path.join(self.br_dir, supplier_prefix, "10000")
1828 os.makedirs(dir_path, exist_ok=True)
1830 # Write the file
1831 with ZipFile(os.path.join(dir_path, f"{file_number}.zip"), "w") as zip_file:
1832 zip_file.writestr(f"{file_number}.json", json.dumps(file_data))
1834 # Run generator
1835 generate_csv(
1836 input_dir=self.rdf_dir,
1837 output_dir=self.output_dir,
1838 dir_split_number=10000,
1839 items_per_file=1000,
1840 zip_output_rdf=True,
1841 redis_port=6381,
1842 redis_db=5,
1843 )
1845 # Check output files
1846 output_files = sorted(os.listdir(self.output_dir))
1848 # Verify number of output files
1849 # We expect at least 2 files: 3500 entries should create 2 files (3000 + 500)
1850 self.assertGreaterEqual(
1851 len(output_files), 2, "Should have at least 2 output files for 3500 entries"
1852 )
1854 # Collect all entries from all output files
1855 all_entries = []
1856 for output_file in output_files:
1857 entries = get_csv_data(os.path.join(self.output_dir, output_file))
1859 # Verify each file has at most 3000 rows
1860 self.assertLessEqual(
1861 len(entries),
1862 3000,
1863 f"File {output_file} has more than 3000 rows: {len(entries)}",
1864 )
1866 all_entries.extend(entries)
1868 # Verify total number of entries
1869 self.assertEqual(
1870 len(all_entries),
1871 3500,
1872 f"Expected 3500 total entries, got {len(all_entries)}",
1873 )
1875 # Verify no duplicate entries
1876 unique_ids = {entry["id"] for entry in all_entries}
1877 self.assertEqual(
1878 len(unique_ids),
1879 3500,
1880 f"Expected 3500 unique entries, got {len(unique_ids)}",
1881 )
1883 # Verify all entries are present (no missing entries)
1884 expected_ids = {f"omid:br/{supplier_prefix}{i}" for i in range(1, 3501)}
1885 self.assertEqual(
1886 unique_ids,
1887 expected_ids,
1888 "Some entries are missing or unexpected entries are present",
1889 )
1891 # Verify data integrity
1892 for i in range(1, 3501):
1893 entry = next(
1894 e for e in all_entries if e["id"] == f"omid:br/{supplier_prefix}{i}"
1895 )
1896 self.assertEqual(entry["title"], f"Article {i}")
1897 self.assertEqual(entry["pub_date"], "2024-01-01")
1898 self.assertEqual(entry["type"], "journal article")
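# A minimal, standalone sketch of the chunked-writing behaviour this test asserts:
# entries are flushed to successive output_N.csv files of at most 3000 rows each.
# `write_in_chunks` is a hypothetical helper for illustration only, not the actual
# code path inside generate_csv.
import csv
import os


def write_in_chunks(rows, output_dir, max_rows_per_file=3000, fieldnames=("id", "title")):
    """Write dict rows into output_N.csv files of at most max_rows_per_file rows each."""
    os.makedirs(output_dir, exist_ok=True)
    for file_index, start in enumerate(range(0, len(rows), max_rows_per_file)):
        chunk = rows[start : start + max_rows_per_file]
        path = os.path.join(output_dir, f"output_{file_index}.csv")
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(fieldnames))
            writer.writeheader()
            writer.writerows(chunk)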
1900 def test_csv_field_limit_handling(self):
1901 """Test handling of CSV files with large fields that exceed the default limit"""
1902 # Create a test CSV with a field large enough to exceed csv's default limit
1903 large_field = "omid:br/0601 " + " ".join(
1904 [f"id:{i}" for i in range(20000)]
1905 ) # Roughly 169,000 chars, above the default field_size_limit of 131,072
1906 test_data = {"id": large_field, "title": "Test Large Field"}
1908 os.makedirs(self.output_dir, exist_ok=True)
1909 with open(
1910 os.path.join(self.output_dir, "large_field.csv"),
1911 "w",
1912 newline="",
1913 encoding="utf-8",
1914 ) as f:
1915 writer = csv.DictWriter(f, fieldnames=["id", "title"])
1916 writer.writeheader()
1917 writer.writerow(test_data)
1919 # Try loading the data - this should trigger the field limit increase
1920 count = load_processed_omids_to_redis(self.output_dir, self.redis_client)
1922 # Verify the OMID was loaded despite the large field
1923 self.assertEqual(count, 1)
1924 self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))
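# A minimal, standalone sketch of the limit-raising behaviour this test exercises,
# assuming the common retry pattern of doubling csv.field_size_limit on csv.Error.
# `read_rows_with_growing_limit` is a hypothetical name for illustration, not the
# code used by load_processed_omids_to_redis.
import csv


def read_rows_with_growing_limit(path, cap=2**31 - 1):
    """Read a CSV file, doubling the field size limit until it parses or the cap is hit."""
    while True:
        try:
            with open(path, newline="", encoding="utf-8") as f:
                return list(csv.DictReader(f))
        except csv.Error:
            current = csv.field_size_limit()
            if current >= cap:
                raise  # not a field-size problem after all
            csv.field_size_limit(min(current * 2, cap))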
1926 def test_complex_br_with_missing_authors(self):
1927 """Test processing of a complex BR with multiple related entities where authors might be missing"""
1928 supplier_prefix = "06250"
1929 br_data = [
1930 {
1931 "@graph": [
1932 {
1933 "@id": "https://w3id.org/oc/meta/br/062501777134",
1934 "@type": [
1935 "http://purl.org/spar/fabio/JournalArticle",
1936 "http://purl.org/spar/fabio/Expression",
1937 ],
1938 "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
1939 {
1940 "@type": "http://www.w3.org/2001/XMLSchema#gYearMonth",
1941 "@value": "2020-02",
1942 }
1943 ],
1944 "http://purl.org/dc/terms/title": [
1945 {
1946 "@value": "OpenCitations, An Infrastructure Organization For Open Scholarship"
1947 }
1948 ],
1949 "http://purl.org/spar/datacite/hasIdentifier": [
1950 {"@id": "https://w3id.org/oc/meta/id/062501806985"},
1951 {"@id": "https://w3id.org/oc/meta/id/06850624745"},
1952 ],
1953 "http://purl.org/spar/pro/isDocumentContextFor": [
1954 {"@id": "https://w3id.org/oc/meta/ar/062507977761"},
1955 {"@id": "https://w3id.org/oc/meta/ar/062507977760"},
1956 {"@id": "https://w3id.org/oc/meta/ar/062507977759"},
1957 ],
1958 "http://purl.org/vocab/frbr/core#embodiment": [
1959 {"@id": "https://w3id.org/oc/meta/re/062501477439"}
1960 ],
1961 "http://purl.org/vocab/frbr/core#partOf": [
1962 {"@id": "https://w3id.org/oc/meta/br/062501778111"}
1963 ],
1964 }
1965 ]
1966 }
1967 ]
1969 ar_data = [
1970 {
1971 "@graph": [
1972 {
1973 "@id": "https://w3id.org/oc/meta/ar/062507977761",
1974 "@type": ["http://purl.org/spar/pro/RoleInTime"],
1975 "http://purl.org/spar/pro/isHeldBy": [
1976 {"@id": "https://w3id.org/oc/meta/ra/0610116105"}
1977 ],
1978 "http://purl.org/spar/pro/withRole": [
1979 {"@id": "http://purl.org/spar/pro/publisher"}
1980 ],
1981 },
1982 {
1983 "@id": "https://w3id.org/oc/meta/ar/062507977760",
1984 "@type": ["http://purl.org/spar/pro/RoleInTime"],
1985 "http://purl.org/spar/pro/isHeldBy": [
1986 {"@id": "https://w3id.org/oc/meta/ra/0621010775619"}
1987 ],
1988 "http://purl.org/spar/pro/withRole": [
1989 {"@id": "http://purl.org/spar/pro/author"}
1990 ],
1991 },
1992 {
1993 "@id": "https://w3id.org/oc/meta/ar/062507977759",
1994 "@type": ["http://purl.org/spar/pro/RoleInTime"],
1995 "http://purl.org/spar/pro/isHeldBy": [
1996 {"@id": "https://w3id.org/oc/meta/ra/0614010840729"}
1997 ],
1998 "http://purl.org/spar/pro/withRole": [
1999 {"@id": "http://purl.org/spar/pro/author"}
2000 ],
2001 "https://w3id.org/oc/ontology/hasNext": [
2002 {"@id": "https://w3id.org/oc/meta/ar/062507977760"}
2003 ],
2004 },
2005 ]
2006 }
2007 ]
2009 ra_data_peroni = [
2010 {
2011 "@graph": [
2012 {
2013 "@id": "https://w3id.org/oc/meta/ra/0614010840729",
2014 "@type": ["http://xmlns.com/foaf/0.1/Agent"],
2015 "http://purl.org/spar/datacite/hasIdentifier": [
2016 {"@id": "https://w3id.org/oc/meta/id/06304949238"}
2017 ],
2018 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Peroni"}],
2019 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Silvio"}],
2020 "http://xmlns.com/foaf/0.1/name": [{"@value": "Peroni Silvio"}],
2021 }
2022 ]
2023 }
2024 ]
2026 ra_data_shotton = [
2027 {
2028 "@graph": [
2029 {
2030 "@id": "https://w3id.org/oc/meta/ra/0621010775619",
2031 "@type": ["http://xmlns.com/foaf/0.1/Agent"],
2032 "http://purl.org/spar/datacite/hasIdentifier": [
2033 {"@id": "https://w3id.org/oc/meta/id/062404672414"}
2034 ],
2035 "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Shotton"}],
2036 "http://xmlns.com/foaf/0.1/givenName": [{"@value": "D M"}],
2037 "http://xmlns.com/foaf/0.1/name": [{"@value": "Shotton David"}],
2038 }
2039 ]
2040 }
2041 ]
2043 # Create directory structure for BR data
2044 br_dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "1780000")
2045 os.makedirs(br_dir_path, exist_ok=True)
2047 # Create directory structure for AR data
2048 ar_dir_path = os.path.join(self.rdf_dir, "ar", supplier_prefix, "7980000")
2049 os.makedirs(ar_dir_path, exist_ok=True)
2051 # Create directory structure for RA data (Peroni)
2052 ra_peroni_dir_path = os.path.join(self.rdf_dir, "ra", "06140", "10850000")
2053 os.makedirs(ra_peroni_dir_path, exist_ok=True)
2055 # Create directory structure for RA data (Shotton)
2056 ra_shotton_dir_path = os.path.join(self.rdf_dir, "ra", "06210", "10780000")
2057 os.makedirs(ra_shotton_dir_path, exist_ok=True)
2059 # Write BR data
2060 with ZipFile(os.path.join(br_dir_path, "1778000.zip"), "w") as zip_file:
2061 zip_file.writestr("1778000.json", json.dumps(br_data))
2063 # Write AR data
2064 with ZipFile(os.path.join(ar_dir_path, "7978000.zip"), "w") as zip_file:
2065 zip_file.writestr("7978000.json", json.dumps(ar_data))
2067 # Write RA data (Peroni)
2068 with ZipFile(os.path.join(ra_peroni_dir_path, "10841000.zip"), "w") as zip_file:
2069 zip_file.writestr("10841000.json", json.dumps(ra_data_peroni))
2071 # Write RA data (Shotton)
2072 with ZipFile(
2073 os.path.join(ra_shotton_dir_path, "10776000.zip"), "w"
2074 ) as zip_file:
2075 zip_file.writestr("10776000.json", json.dumps(ra_data_shotton))
2077 # Run generator
2078 generate_csv(
2079 input_dir=self.rdf_dir,
2080 output_dir=self.output_dir,
2081 dir_split_number=10000,
2082 items_per_file=1000,
2083 zip_output_rdf=True,
2084 redis_port=6381,
2085 redis_db=5,
2086 )
2088 # Check output
2089 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
2090 self.assertEqual(len(output_data), 1)
2091 # Verify basic metadata
2092 article = output_data[0]
2093 self.assertEqual(
2094 article["title"],
2095 "OpenCitations, An Infrastructure Organization For Open Scholarship",
2096 )
2097 self.assertEqual(article["pub_date"], "2020-02")
2098 self.assertEqual(article["type"], "journal article")
2099 self.assertEqual(article["id"], "omid:br/062501777134")
2101 # Both authors should be present, in hasNext-chain order (Peroni first, then Shotton)
2102 expected_authors = (
2103 "Peroni, Silvio [omid:ra/0614010840729]; "
2104 "Shotton, D M [omid:ra/0621010775619]"
2105 )
2106 self.assertEqual(article["author"], expected_authors)
2108 # Publisher field should still be empty since we haven't added the publisher RA data
2109 self.assertEqual(article["publisher"], "")
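# A minimal sketch of the author-string format asserted above
# ("Family, Given [omid:ra/...]", falling back to foaf:name when the structured
# name parts are absent). `format_agent` is a hypothetical helper for
# illustration, not the generator's own function.
def format_agent(omid, family_name=None, given_name=None, name=None):
    """Render one responsible agent as '<label> [omid:ra/...]'."""
    if family_name and given_name:
        label = f"{family_name}, {given_name}"
    else:
        label = name or family_name or given_name or ""
    return f"{label} [{omid}]".strip()


# e.g. format_agent("omid:ra/0614010840729", "Peroni", "Silvio")
# -> "Peroni, Silvio [omid:ra/0614010840729]"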
2111 def test_multiple_first_ars(self):
2112 """Test behavior when there are multiple first ARs in the same chain (no hasNext pointing to them).
2113 The current behavior is to process only one of the first ARs and its hasNext chain.
2114 """
2115 supplier_prefix = "060"
2116 br_data = [
2117 {
2118 "@graph": [
2119 {
2120 "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
2121 "@type": [
2122 "http://purl.org/spar/fabio/Expression",
2123 "http://purl.org/spar/fabio/JournalArticle",
2124 ],
2125 "http://purl.org/dc/terms/title": [
2126 {"@value": "Article With Multiple First Authors"}
2127 ],
2128 "http://purl.org/spar/pro/isDocumentContextFor": [
2129 {
2130 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
2131 }, # First potential author (will be processed)
2132 {
2133 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
2134 }, # Second potential author (will be ignored)
2135 {
2136 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
2137 }, # Connected to author 1 (will be processed)
2138 ],
2139 }
2140 ]
2141 }
2142 ]
2144 # Setup agent roles with two potential "first" authors (no hasNext pointing to them)
2145 # and one author connected to the first one
2146 ar_data = [
2147 {
2148 "@graph": [
2149 {
2150 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
2151 "@type": ["http://purl.org/spar/pro/RoleInTime"],
2152 "http://purl.org/spar/pro/withRole": [
2153 {"@id": "http://purl.org/spar/pro/author"}
2154 ],
2155 "http://purl.org/spar/pro/isHeldBy": [
2156 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
2157 ],
2158 "https://w3id.org/oc/ontology/hasNext": [
2159 {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
2160 ],
2161 },
2162 {
2163 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
2164 "@type": ["http://purl.org/spar/pro/RoleInTime"],
2165 "http://purl.org/spar/pro/withRole": [
2166 {"@id": "http://purl.org/spar/pro/author"}
2167 ],
2168 "http://purl.org/spar/pro/isHeldBy": [
2169 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
2170 ],
2171 # This is also a potential first author but will be ignored
2172 },
2173 {
2174 "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
2175 "@type": ["http://purl.org/spar/pro/RoleInTime"],
2176 "http://purl.org/spar/pro/withRole": [
2177 {"@id": "http://purl.org/spar/pro/author"}
2178 ],
2179 "http://purl.org/spar/pro/isHeldBy": [
2180 {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
2181 ],
2182 # This one is connected to author 1 via hasNext and will be processed
2183 },
2184 ]
2185 }
2186 ]
2188 # Setup responsible agents
2189 ra_data = [
2190 {
2191 "@graph": [
2192 {
2193 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
2194 "http://xmlns.com/foaf/0.1/name": [
2195 {"@value": "First Potential Author"}
2196 ],
2197 },
2198 {
2199 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
2200 "http://xmlns.com/foaf/0.1/name": [
2201 {"@value": "Second Potential Author"}
2202 ],
2203 },
2204 {
2205 "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
2206 "http://xmlns.com/foaf/0.1/name": [
2207 {"@value": "Connected Author"}
2208 ],
2209 },
2210 ]
2211 }
2212 ]
2214 # Write test data files
2215 data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}
2217 for entity_type, data in data_files.items():
2218 dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
2219 os.makedirs(dir_path, exist_ok=True)
2221 zip_path = os.path.join(dir_path, "1000.zip")
2222 with ZipFile(zip_path, "w") as zip_file:
2223 zip_file.writestr("1000.json", json.dumps(data))
2225 # Run generator
2226 generate_csv(
2227 input_dir=self.rdf_dir,
2228 output_dir=self.output_dir,
2229 dir_split_number=10000,
2230 items_per_file=1000,
2231 zip_output_rdf=True,
2232 redis_port=6381,
2233 redis_db=5,
2234 )
2236 # Check output
2237 output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
2238 self.assertEqual(len(output_data), 1)
2240 article = output_data[0]
2241 authors = article["author"].split("; ")
2243 # Verify we have exactly two authors (the first one found and its connected author)
2244 self.assertEqual(
2245 len(authors),
2246 2,
2247 "Should have exactly two authors (first author and connected one)",
2248 )
2250 # Verify the specific authors we expect
2251 expected_authors = [
2252 f"First Potential Author [omid:ra/{supplier_prefix}1]",
2253 f"Connected Author [omid:ra/{supplier_prefix}3]",
2254 ]
2255 self.assertEqual(
2256 authors,
2257 expected_authors,
2258 "Should have first author and connected author in correct order",
2259 )
2261 # Verify the second potential author is NOT in the output
2262 self.assertNotIn(
2263 f"Second Potential Author [omid:ra/{supplier_prefix}2]",
2264 article["author"],
2265 "Second potential author should not be in the output",
2266 )
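# A minimal sketch of the chain-walking behaviour documented above: pick an agent
# role that no other role points to via oco:hasNext, then follow hasNext links;
# any other candidate "first" role is never visited. `order_agent_roles` is a
# hypothetical helper for illustration, not the generator's own code.
def order_agent_roles(has_next):
    """Order AR ids by following hasNext from a role with no incoming link.

    `has_next` maps each AR id to the AR id it points to via hasNext (or None).
    """
    targets = {nxt for nxt in has_next.values() if nxt}
    current = next((ar for ar in has_next if ar not in targets), None)
    ordered = []
    while current is not None and current not in ordered:  # guard against cycles
        ordered.append(current)
        current = has_next.get(current)
    return ordered


# e.g. order_agent_roles({"ar/0601": "ar/0603", "ar/0602": None, "ar/0603": None})
# -> ["ar/0601", "ar/0603"]  (the second candidate first role, ar/0602, is skipped)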
2269if __name__ == "__main__":
2270 unittest.main()