Coverage for test/csv_generator_lite_test.py: 99% (455 statements)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2024 Arcangelo Massari <arcangelo.massari@unibo.it>
#
# Permission to use, copy, modify, and/or distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright notice
# and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED 'AS IS' AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.
import csv
import json
import os
import unittest
from shutil import rmtree
from zipfile import ZipFile

import redis
from oc_meta.lib.file_manager import get_csv_data
from oc_meta.plugins.csv_generator_lite.csv_generator_lite import (
    generate_csv,
    init_redis_connection,
    is_omid_processed,
    load_processed_omids_to_redis,
)
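
# These tests exercise the csv_generator_lite plugin end to end: each one
# builds a tiny OpenCitations Meta RDF dump on disk (zipped JSON-LD files,
# split by entity type and supplier prefix), runs generate_csv over it, and
# checks the CSV rows that come out. They assume a disposable Redis server
# reachable on port 6381; db 5 is flushed before and after every test.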


class TestCSVGeneratorLite(unittest.TestCase):
    def setUp(self):
        self.base_dir = os.path.join("test", "csv_generator_lite")
        self.input_dir = os.path.join(self.base_dir, "input")
        self.output_dir = os.path.join(self.base_dir, "output")
        os.makedirs(self.input_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)

        self.rdf_dir = os.path.join(self.input_dir, "rdf")
        self.br_dir = os.path.join(self.rdf_dir, "br")
        os.makedirs(self.br_dir, exist_ok=True)

        self.redis_client = init_redis_connection(port=6381, db=5)
        self.redis_client.flushdb()  # Clear test database

    def tearDown(self):
        if os.path.exists(self.base_dir):
            rmtree(self.base_dir)
        self.redis_client.flushdb()

    def _write_test_data(self, data):
        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        test_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/{item['id'].replace('omid:', '')}",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": item["title"]}],
                    }
                    for item in data
                ]
            }
        ]
        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data))
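
    # The helper above mirrors the dump layout that generate_csv walks:
    #
    #   rdf/br/060/10000/1000.zip  ->  1000.json (a JSON-LD document whose
    #                                  "@graph" holds bibliographic resources)
    #
    # "060" is the supplier prefix, "10000" the directory split, and "1000"
    # the items-per-file bucket, matching the dir_split_number=10000 and
    # items_per_file=1000 arguments passed to generate_csv in every test.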

    def test_redis_connection_and_caching(self):
        redis_client = init_redis_connection(port=6381, db=5)
        self.assertIsInstance(redis_client, redis.Redis)

        # Create a test CSV file with some OMIDs
        test_data = [
            {"id": "omid:br/0601", "title": "Test 1"},
            {"id": "omid:br/0602", "title": "Test 2"},
            {"id": "omid:br/0603 issn:456", "title": "Test 3"},
        ]
        os.makedirs(self.output_dir, exist_ok=True)
        with open(
            os.path.join(self.output_dir, "test.csv"), "w", newline="", encoding="utf-8"
        ) as f:
            writer = csv.DictWriter(f, fieldnames=["id", "title"])
            writer.writeheader()
            writer.writerows(test_data)

        count = load_processed_omids_to_redis(self.output_dir, redis_client)
        self.assertEqual(count, 3)

        self.assertTrue(is_omid_processed("omid:br/0601", redis_client))
        self.assertTrue(is_omid_processed("omid:br/0602", redis_client))
        self.assertTrue(is_omid_processed("omid:br/0603", redis_client))
        self.assertFalse(is_omid_processed("omid:br/0604", redis_client))
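
    # Note the third row above: a CSV "id" cell may hold several
    # space-separated identifiers ("omid:br/0603 issn:456"), and only the
    # omid:br/... token ends up in the Redis cache, which is why the lookup
    # for "omid:br/0603" on its own succeeds.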

    def test_redis_cache_persistence(self):
        # Create initial test data
        test_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/0601",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "First Run"}],
                    }
                ]
            }
        ]

        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))

        # Create new test data
        test_data_2 = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/0601",  # Same OMID as before
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Should Be Skipped"}
                        ],
                    },
                    {
                        "@id": "https://w3id.org/oc/meta/br/0602",  # New OMID
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Should Be Processed"}
                        ],
                    },
                ]
            }
        ]

        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data_2))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = []
        for filename in os.listdir(self.output_dir):
            if filename.endswith(".csv"):
                output_data.extend(
                    get_csv_data(os.path.join(self.output_dir, filename))
                )

        self.assertEqual(len(output_data), 2)

        first_run_entry = next(
            item for item in output_data if item["title"] == "First Run"
        )
        second_run_entry = next(
            item for item in output_data if item["title"] == "Should Be Processed"
        )

        self.assertEqual(first_run_entry["title"], "First Run")
        self.assertEqual(first_run_entry["id"], "omid:br/0601")

        self.assertEqual(second_run_entry["title"], "Should Be Processed")
        self.assertEqual(second_run_entry["id"], "omid:br/0602")

        self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))
        self.assertFalse(is_omid_processed("omid:br/0602", self.redis_client))
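
    # The lifecycle asserted above: during a run the generator consults the
    # cache built from existing output CSVs to skip already-exported OMIDs
    # (br/0601 keeps its "First Run" row), and on successful completion the
    # cache is cleared again, so is_omid_processed returns False between runs
    # even for OMIDs that were just written.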

    def test_redis_cache_cleanup(self):
        input_data = [{"id": "omid:br/0601", "title": "First Entry"}]
        self._write_test_data(input_data)

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        self.assertFalse(is_omid_processed("omid:br/0601", self.redis_client))

        load_processed_omids_to_redis(self.output_dir, self.redis_client)

        self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))

        generate_csv(
            input_dir="/nonexistent/dir",
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        self.assertTrue(
            is_omid_processed("omid:br/0601", self.redis_client),
            "Redis cache should be retained after a failed run",
        )
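
    # Keeping the cache after a failed run is presumably what makes
    # interrupted exports resumable: a later run can reload the processed
    # OMIDs from the output directory and pick up where the failed one
    # left off instead of re-exporting everything.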

    def test_redis_error_handling(self):
        with self.assertRaises(redis.ConnectionError):
            init_redis_connection(port=9999)  # Invalid port

        count = load_processed_omids_to_redis("/nonexistent/dir", self.redis_client)
        self.assertEqual(count, 0)

    def test_concurrent_processing_with_redis(self):
        # Create multiple test files
        test_data = []
        for i in range(100):  # Create 100 test entries
            test_data.append(
                {
                    "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
                    "@type": [
                        "http://purl.org/spar/fabio/Expression",
                        "http://purl.org/spar/fabio/JournalArticle",
                    ],
                    "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
                }
            )

        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        for i in range(0, 100, 10):  # Create 10 files with 10 entries each
            file_data = [{"@graph": test_data[i : i + 10]}]
            with ZipFile(
                os.path.join(self.br_dir, "060", "10000", f"{i+1000}.zip"), "w"
            ) as zip_file:
                zip_file.writestr(f"{i+1000}.json", json.dumps(file_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        # Create more test entries
        more_test_data = []
        for i in range(100, 200):  # Create 100 more test entries
            more_test_data.append(
                {
                    "@id": f"https://w3id.org/oc/meta/br/06{i:02d}",
                    "@type": [
                        "http://purl.org/spar/fabio/Expression",
                        "http://purl.org/spar/fabio/JournalArticle",
                    ],
                    "http://purl.org/dc/terms/title": [{"@value": f"Article {i}"}],
                }
            )

        for i in range(0, 100, 10):
            file_data = [{"@graph": more_test_data[i : i + 10]}]
            with ZipFile(
                os.path.join(self.br_dir, "060", "10000", f"{i+2000}.zip"), "w"
            ) as zip_file:
                zip_file.writestr(f"{i+2000}.json", json.dumps(file_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        all_output_data = []
        for filename in os.listdir(self.output_dir):
            if filename.endswith(".csv"):
                all_output_data.extend(
                    get_csv_data(os.path.join(self.output_dir, filename))
                )

        self.assertEqual(len(all_output_data), 200)

        processed_ids = {row["id"] for row in all_output_data}
        self.assertEqual(len(processed_ids), 200)

    def test_basic_br_processing(self):
        test_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/0601",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Article"}],
                        "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                            {"@value": "2024-01-01"}
                        ],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/0601"}
                        ],
                    }
                ],
                "@id": "https://w3id.org/oc/meta/br/",
            }
        ]

        os.makedirs(os.path.join(self.br_dir, "060", "10000"), exist_ok=True)
        with ZipFile(
            os.path.join(self.br_dir, "060", "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(test_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_files = os.listdir(self.output_dir)
        self.assertEqual(len(output_files), 1)

        output_data = get_csv_data(os.path.join(self.output_dir, output_files[0]))
        self.assertEqual(len(output_data), 1)
        self.assertEqual(output_data[0]["title"], "Test Article")
        self.assertEqual(output_data[0]["pub_date"], "2024-01-01")
        self.assertEqual(output_data[0]["type"], "journal article")
        self.assertEqual(output_data[0]["id"], "omid:br/0601")
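
    # As the assertions throughout this suite show, fabio class IRIs map to
    # the human-readable "type" labels used by OpenCitations Meta CSVs:
    # fabio/JournalArticle becomes "journal article", fabio/Journal
    # "journal", fabio/Book "book", and so on, while the generic
    # fabio/Expression class carried alongside them is not surfaced.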

    def test_complex_br_with_related_entities(self):
        # Create directory structure for each entity type
        supplier_prefix = "060"
        for entity_type in ["br", "ra", "ar", "id"]:
            os.makedirs(
                os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000"),
                exist_ok=True,
            )

        # BR data including both the article and the venue
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Complex Article"}
                        ],
                        "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                            {"@value": "2024-02-01"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {"@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Journal",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
                    },
                ],
                "@id": "https://w3id.org/oc/meta/br/",
            }
        ]

        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                    }
                ],
                "@id": "https://w3id.org/oc/meta/ar/",
            }
        ]

        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
                    }
                ],
                "@id": "https://w3id.org/oc/meta/ra/",
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            zip_path = os.path.join(
                self.rdf_dir, entity_type, supplier_prefix, "10000", "1000.zip"
            )
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 2)  # Should have 2 rows: article and journal

        article = next(
            (item for item in output_data if item["type"] == "journal article"), None
        )
        journal = next(
            (item for item in output_data if item["type"] == "journal"), None
        )

        self.assertIsNotNone(article)
        self.assertEqual(article["title"], "Complex Article")
        self.assertEqual(article["venue"], f"Test Journal [omid:br/{supplier_prefix}3]")
        self.assertEqual(article["author"], "Test Author [omid:ra/0601]")
        self.assertEqual(article["id"], f"omid:br/{supplier_prefix}2")

        self.assertIsNotNone(journal)
        self.assertEqual(journal["title"], "Test Journal")
        self.assertEqual(journal["type"], "journal")
        self.assertEqual(journal["id"], f"omid:br/{supplier_prefix}3")
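
    # Resolution chain exercised here: the article's pro:isDocumentContextFor
    # points at an agent role (ar), whose pro:withRole gives the role
    # (author) and whose pro:isHeldBy points at the responsible agent (ra)
    # carrying the foaf name. The venue comes from frbr:partOf and is
    # rendered as "Title [omid:br/...]".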

    def test_empty_input_directory(self):
        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        self.assertEqual(len(os.listdir(self.output_dir)), 0)

    def test_br_with_multiple_authors_and_editors(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Book",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Multi-Author Book"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
                            },  # First author
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
                            },  # Second author
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
                            },  # First editor
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"
                            },  # Second editor
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles for authors and editors with hasNext relations
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/editor"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}4",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/editor"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Doe"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Jane"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Brown"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Bob"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}4",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Wilson"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Alice"}],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        expected_authors = (
            f"Smith, John [omid:ra/{supplier_prefix}1]; "
            f"Doe, Jane [omid:ra/{supplier_prefix}2]"
        )
        expected_editors = (
            f"Brown, Bob [omid:ra/{supplier_prefix}3]; "
            f"Wilson, Alice [omid:ra/{supplier_prefix}4]"
        )

        self.assertEqual(output_data[0]["author"], expected_authors)
        self.assertEqual(output_data[0]["editor"], expected_editors)
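
    # Author/editor order is driven by the oco:hasNext chain between agent
    # roles (ar1 -> ar2 -> ar3 -> ar4), not by the order of
    # pro:isDocumentContextFor, and agents with foaf:familyName/givenName
    # are rendered as "Family, Given [omid:ra/...]".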

    def test_br_with_identifiers(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article With DOI"}
                        ],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1"},
                            {"@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2"},
                        ],
                    }
                ]
            }
        ]

        id_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}1",
                        "http://purl.org/spar/datacite/usesIdentifierScheme": [
                            {"@id": "http://purl.org/spar/datacite/doi"}
                        ],
                        "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                            {"@value": "10.1234/test.123"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/id/{supplier_prefix}2",
                        "http://purl.org/spar/datacite/usesIdentifierScheme": [
                            {"@id": "http://purl.org/spar/datacite/isbn"}
                        ],
                        "http://www.essepuntato.it/2010/06/literalreification/hasLiteralValue": [
                            {"@value": "978-0-123456-47-2"}
                        ],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "id": id_data}

        for entity_type, data in data_files.items():
            # Create all necessary directories
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        expected_ids = (
            f"omid:br/{supplier_prefix}1 doi:10.1234/test.123 isbn:978-0-123456-47-2"
        )
        self.assertEqual(output_data[0]["id"], expected_ids)
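
    # The "id" column packs all identifiers of a resource into one
    # space-separated string, OMID first, then scheme:value pairs (here
    # doi:... and isbn:...) resolved from the datacite identifier entities.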

    def test_br_with_page_numbers(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Paged Article"}],
                        "http://purl.org/vocab/frbr/core#embodiment": [
                            {"@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1"}
                        ],
                    }
                ]
            }
        ]

        re_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/re/{supplier_prefix}1",
                        "http://prismstandard.org/namespaces/basic/2.0/startingPage": [
                            {"@value": "100"}
                        ],
                        "http://prismstandard.org/namespaces/basic/2.0/endingPage": [
                            {"@value": "120"}
                        ],
                    }
                ]
            }
        ]

        data_files = {"br": br_data, "re": re_data}

        for entity_type, data in data_files.items():
            # Create all necessary directories
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)
        self.assertEqual(output_data[0]["page"], "100-120")

    def test_malformed_data_handling(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        # Missing title
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": "invalid_uri"},  # Invalid URI
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {"@id": "non_existent_venue"}  # Non-existent venue
                        ],
                    }
                ]
            }
        ]

        data_files = {"br": br_data}

        for entity_type, data in data_files.items():
            # Create all necessary directories
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)
        self.assertEqual(output_data[0]["title"], "")
        self.assertEqual(output_data[0]["author"], "")
        self.assertEqual(output_data[0]["venue"], "")

    def test_br_with_hierarchical_venue_structures(self):
        supplier_prefix = "060"

        br_data = [
            {
                "@graph": [
                    # Article in issue->volume->journal structure
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Full Hierarchy"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
                            }  # Issue
                        ],
                    },
                    # Article in issue->journal structure (no volume)
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}5",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Issue-Journal"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6"
                            }  # Issue
                        ],
                    },
                    # Article in volume->journal structure (no issue)
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}9",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Volume-Journal"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10"
                            }  # Volume
                        ],
                    },
                    # Article directly in journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}13",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article in Journal"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                    # Issue in full hierarchy
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
                        "@type": ["http://purl.org/spar/fabio/JournalIssue"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "2"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3"
                            }  # Volume
                        ],
                    },
                    # Volume in full hierarchy
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}3",
                        "@type": ["http://purl.org/spar/fabio/JournalVolume"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "42"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                    # Journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4",
                        "@type": ["http://purl.org/spar/fabio/Journal"],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Journal"}],
                    },
                    # Issue directly in journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}6",
                        "@type": ["http://purl.org/spar/fabio/JournalIssue"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "3"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                    # Volume directly in journal
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}10",
                        "@type": ["http://purl.org/spar/fabio/JournalVolume"],
                        "http://purl.org/spar/fabio/hasSequenceIdentifier": [
                            {"@value": "5"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}4"
                            }  # Journal
                        ],
                    },
                ]
            }
        ]

        dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
        os.makedirs(dir_path, exist_ok=True)

        zip_path = os.path.join(dir_path, "1000.zip")
        with ZipFile(zip_path, "w") as zip_file:
            zip_file.writestr("1000.json", json.dumps(br_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))

        self.assertEqual(len(output_data), 5)  # 4 articles + 1 journal

        volume_or_issue_entries = [
            item
            for item in output_data
            if item["type"] in ["journal volume", "journal issue"]
        ]
        self.assertEqual(len(volume_or_issue_entries), 0)

        full_hierarchy = next(
            item for item in output_data if item["title"] == "Article in Full Hierarchy"
        )
        issue_journal = next(
            item for item in output_data if item["title"] == "Article in Issue-Journal"
        )
        volume_journal = next(
            item for item in output_data if item["title"] == "Article in Volume-Journal"
        )
        direct_journal = next(
            item for item in output_data if item["title"] == "Article in Journal"
        )

        self.assertEqual(full_hierarchy["issue"], "2")
        self.assertEqual(full_hierarchy["volume"], "42")
        self.assertEqual(
            full_hierarchy["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

        self.assertEqual(issue_journal["issue"], "3")
        self.assertEqual(issue_journal["volume"], "")
        self.assertEqual(
            issue_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

        self.assertEqual(volume_journal["issue"], "")
        self.assertEqual(volume_journal["volume"], "5")
        self.assertEqual(
            volume_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )

        self.assertEqual(direct_journal["issue"], "")
        self.assertEqual(direct_journal["volume"], "")
        self.assertEqual(
            direct_journal["venue"], f"Test Journal [omid:br/{supplier_prefix}4]"
        )
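
    # Journal volumes and issues never become rows of their own: they are
    # folded into the containing article's "volume"/"issue" columns by
    # walking frbr:partOf upwards until the journal, which becomes the venue.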

    def test_book_in_series(self):
        supplier_prefix = "060"

        br_data = [
            {
                "@graph": [
                    # Book
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Book",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Test Book"}],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {
                                "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2"
                            }  # Series
                        ],
                    },
                    # Book Series
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2",
                        "@type": ["http://purl.org/spar/fabio/BookSeries"],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Test Book Series"}
                        ],
                    },
                ]
            }
        ]

        dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "10000")
        os.makedirs(dir_path, exist_ok=True)

        zip_path = os.path.join(dir_path, "1000.zip")
        with ZipFile(zip_path, "w") as zip_file:
            zip_file.writestr("1000.json", json.dumps(br_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))

        book = next(item for item in output_data if item["type"] == "book")

        self.assertEqual(book["title"], "Test Book")
        self.assertEqual(
            book["venue"], f"Test Book Series [omid:br/{supplier_prefix}2]"
        )
        self.assertEqual(book["volume"], "")  # Should not have volume
        self.assertEqual(book["issue"], "")  # Should not have issue

    def test_br_with_multiple_roles(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/Book",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Multi-Role Book"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
                            },  # Author
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
                            },  # Editor
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
                            },  # Publisher
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles for authors, editors and publishers
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/editor"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/publisher"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents with different name formats
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Smith"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "John"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Editor Name"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [
                            {"@value": "Publisher House"}
                        ],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        book = output_data[0]
        self.assertEqual(book["title"], "Multi-Role Book")
        self.assertEqual(book["author"], f"Smith, John [omid:ra/{supplier_prefix}1]")
        self.assertEqual(book["editor"], f"Editor Name [omid:ra/{supplier_prefix}2]")
        self.assertEqual(
            book["publisher"], f"Publisher House [omid:ra/{supplier_prefix}3]"
        )

    def test_ordered_authors(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Ordered Authors Article"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles with hasNext relations
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents with different names
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        expected_authors = (
            f"First Author [omid:ra/{supplier_prefix}1]; "
            f"Second Author [omid:ra/{supplier_prefix}2]; "
            f"Third Author [omid:ra/{supplier_prefix}3]"
        )
        self.assertEqual(output_data[0]["author"], expected_authors)

    def test_cyclic_hasNext_relations(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Cyclic Authors Article"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"},
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"},
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles with cyclic hasNext relations
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        # Creates a cycle: 1 -> 2 -> 3 -> 1
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                        # Cycle completion
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"}
                        ],
                    },
                ]
            }
        ]

        # Setup responsible agents
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "First Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Second Author"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Third Author"}],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        # The order should be maintained until the cycle is detected
        authors = output_data[0]["author"].split("; ")
        self.assertGreater(len(authors), 0)

        self.assertTrue(
            any(
                f"First Author [omid:ra/{supplier_prefix}1]" in author
                for author in authors
            )
        )
        self.assertTrue(
            any(
                f"Second Author [omid:ra/{supplier_prefix}2]" in author
                for author in authors
            )
        )

        author_set = set(authors)
        self.assertEqual(
            len(authors),
            len(author_set),
            "Found duplicate authors in output: each author should appear exactly once",
        )

        expected_authors = [
            f"First Author [omid:ra/{supplier_prefix}1]",
            f"Second Author [omid:ra/{supplier_prefix}2]",
            f"Third Author [omid:ra/{supplier_prefix}3]",
        ]
        self.assertEqual(
            authors,
            expected_authors,
            "Authors should be in correct order and each should appear exactly once",
        )
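
    # A malformed dump can make the hasNext chain circular; the assertions
    # above pin down the expected behaviour: follow the chain in order, emit
    # each agent exactly once, and stop as soon as an already-visited role
    # comes up again instead of looping forever.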

    def test_multiple_input_files(self):
        supplier_prefix = "060"

        # First file (entities 1-1000)
        br_data_1 = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Article 1"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1000",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Article 1000"}],
                    },
                ]
            }
        ]

        # Second file (entities 1001-2000)
        br_data_2 = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1001",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Article 1001"}],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2000",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Article 2000"}],
                    },
                ]
            }
        ]

        # Third file (entities 2001-3000)
        br_data_3 = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}2001",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [{"@value": "Article 2001"}],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001"
                            }
                        ],
                    }
                ]
            }
        ]

        # Create agent role data in a different file
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2001",
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001"
                            }
                        ],
                    }
                ]
            }
        ]

        # Create responsible agent data in a different file
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2001",
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Test Author"}],
                    }
                ]
            }
        ]

        os.makedirs(os.path.join(self.br_dir, supplier_prefix, "10000"), exist_ok=True)
        os.makedirs(
            os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000"), exist_ok=True
        )
        os.makedirs(
            os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000"), exist_ok=True
        )

        with ZipFile(
            os.path.join(self.br_dir, supplier_prefix, "10000", "1000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("1000.json", json.dumps(br_data_1))
        with ZipFile(
            os.path.join(self.br_dir, supplier_prefix, "10000", "2000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("2000.json", json.dumps(br_data_2))
        with ZipFile(
            os.path.join(self.br_dir, supplier_prefix, "10000", "3000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("3000.json", json.dumps(br_data_3))

        with ZipFile(
            os.path.join(self.rdf_dir, "ar", supplier_prefix, "10000", "3000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("3000.json", json.dumps(ar_data))
        with ZipFile(
            os.path.join(self.rdf_dir, "ra", supplier_prefix, "10000", "3000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("3000.json", json.dumps(ra_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_files = sorted(os.listdir(self.output_dir))
        self.assertGreater(len(output_files), 0)

        # Collect all output data
        all_output_data = []
        for output_file in output_files:
            all_output_data.extend(
                get_csv_data(os.path.join(self.output_dir, output_file))
            )

        self.assertEqual(len(all_output_data), 5)  # Should have 5 articles total

        article_1 = next(
            item
            for item in all_output_data
            if item["id"] == f"omid:br/{supplier_prefix}1"
        )
        article_1000 = next(
            item
            for item in all_output_data
            if item["id"] == f"omid:br/{supplier_prefix}1000"
        )
        article_1001 = next(
            item
            for item in all_output_data
            if item["id"] == f"omid:br/{supplier_prefix}1001"
        )
        article_2000 = next(
            item
            for item in all_output_data
            if item["id"] == f"omid:br/{supplier_prefix}2000"
        )
        article_2001 = next(
            item
            for item in all_output_data
            if item["id"] == f"omid:br/{supplier_prefix}2001"
        )

        self.assertEqual(article_1["title"], "Article 1")
        self.assertEqual(article_1000["title"], "Article 1000")
        self.assertEqual(article_1001["title"], "Article 1001")
        self.assertEqual(article_2000["title"], "Article 2000")
        self.assertEqual(article_2001["title"], "Article 2001")

        self.assertEqual(
            article_2001["author"], f"Test Author [omid:ra/{supplier_prefix}2001]"
        )

    def test_max_rows_per_file_and_data_integrity(self):
        supplier_prefix = "060"

        br_data = [
            {
                "@graph": [
                    # Generate 3500 test entries
                    *[
                        {
                            "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}{i}",
                            "@type": [
                                "http://purl.org/spar/fabio/Expression",
                                "http://purl.org/spar/fabio/JournalArticle",
                            ],
                            "http://purl.org/dc/terms/title": [
                                {"@value": f"Article {i}"}
                            ],
                            "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                                {"@value": "2024-01-01"}
                            ],
                        }
                        for i in range(1, 3501)
                    ]
                ]
            }
        ]

        entries_per_file = 1000
        for i in range(0, 3500, entries_per_file):
            file_data = [{"@graph": br_data[0]["@graph"][i : i + entries_per_file]}]

            # Create directory structure for the file
            file_number = i + entries_per_file
            dir_path = os.path.join(self.br_dir, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            # Write the file
            with ZipFile(os.path.join(dir_path, f"{file_number}.zip"), "w") as zip_file:
                zip_file.writestr(f"{file_number}.json", json.dumps(file_data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_files = sorted(os.listdir(self.output_dir))

        # We expect at least 2 files: 3500 entries should create 2 files (3000 + 500)
        self.assertGreaterEqual(
            len(output_files), 2, "Should have at least 2 output files for 3500 entries"
        )

        # Collect all entries from all output files
        all_entries = []
        for output_file in output_files:
            entries = get_csv_data(os.path.join(self.output_dir, output_file))

            # Verify each file has at most 3000 rows
            self.assertLessEqual(
                len(entries),
                3000,
                f"File {output_file} has more than 3000 rows: {len(entries)}",
            )

            all_entries.extend(entries)

        self.assertEqual(
            len(all_entries),
            3500,
            f"Expected 3500 total entries, got {len(all_entries)}",
        )

        unique_ids = {entry["id"] for entry in all_entries}
        self.assertEqual(
            len(unique_ids),
            3500,
            f"Expected 3500 unique entries, got {len(unique_ids)}",
        )

        expected_ids = {f"omid:br/{supplier_prefix}{i}" for i in range(1, 3501)}
        self.assertEqual(
            unique_ids,
            expected_ids,
            "Some entries are missing or unexpected entries are present",
        )

        for i in range(1, 3501):
            entry = next(
                e for e in all_entries if e["id"] == f"omid:br/{supplier_prefix}{i}"
            )
            self.assertEqual(entry["title"], f"Article {i}")
            self.assertEqual(entry["pub_date"], "2024-01-01")
            self.assertEqual(entry["type"], "journal article")
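
    # Output CSVs are chunked: per the assertions above, a new output_N.csv
    # is started once the current one reaches 3,000 rows, so 3,500 entries
    # land in at least two files while no row is lost or duplicated across
    # the split.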

    def test_csv_field_limit_handling(self):
        # Create a test CSV with a very large field
        large_field = "omid:br/0601 " + " ".join(
            [f"id:{i}" for i in range(20000)]
        )  # This will create a field > 131072 chars
        test_data = {"id": large_field, "title": "Test Large Field"}

        os.makedirs(self.output_dir, exist_ok=True)
        with open(
            os.path.join(self.output_dir, "large_field.csv"),
            "w",
            newline="",
            encoding="utf-8",
        ) as f:
            writer = csv.DictWriter(f, fieldnames=["id", "title"])
            writer.writeheader()
            writer.writerow(test_data)

        # Try loading the data - this should trigger the field limit increase
        count = load_processed_omids_to_redis(self.output_dir, self.redis_client)

        self.assertEqual(count, 1)
        self.assertTrue(is_omid_processed("omid:br/0601", self.redis_client))
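
    # Python's csv module rejects fields longer than csv.field_size_limit(),
    # which defaults to 131,072 characters; the loader is presumably expected
    # to raise that limit when it hits the "field larger than field limit"
    # error, so oversized id fields in real dumps can still be read back
    # into the cache. (The original fixture used range(10000), which yields
    # only ~79,000 characters and stays under the default limit; range(20000)
    # actually exceeds it, matching the test's stated intent.)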

    def test_complex_br_with_missing_authors(self):
        supplier_prefix = "06250"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/br/062501777134",
                        "@type": [
                            "http://purl.org/spar/fabio/JournalArticle",
                            "http://purl.org/spar/fabio/Expression",
                        ],
                        "http://prismstandard.org/namespaces/basic/2.0/publicationDate": [
                            {
                                "@type": "http://www.w3.org/2001/XMLSchema#gYearMonth",
                                "@value": "2020-02",
                            }
                        ],
                        "http://purl.org/dc/terms/title": [
                            {
                                "@value": "OpenCitations, An Infrastructure Organization For Open Scholarship"
                            }
                        ],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/062501806985"},
                            {"@id": "https://w3id.org/oc/meta/id/06850624745"},
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {"@id": "https://w3id.org/oc/meta/ar/062507977761"},
                            {"@id": "https://w3id.org/oc/meta/ar/062507977760"},
                            {"@id": "https://w3id.org/oc/meta/ar/062507977759"},
                        ],
                        "http://purl.org/vocab/frbr/core#embodiment": [
                            {"@id": "https://w3id.org/oc/meta/re/062501477439"}
                        ],
                        "http://purl.org/vocab/frbr/core#partOf": [
                            {"@id": "https://w3id.org/oc/meta/br/062501778111"}
                        ],
                    }
                ]
            }
        ]

        ar_data = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/ar/062507977761",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": "https://w3id.org/oc/meta/ra/0610116105"}
                        ],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/publisher"}
                        ],
                    },
                    {
                        "@id": "https://w3id.org/oc/meta/ar/062507977760",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": "https://w3id.org/oc/meta/ra/0621010775619"}
                        ],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                    },
                    {
                        "@id": "https://w3id.org/oc/meta/ar/062507977759",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": "https://w3id.org/oc/meta/ra/0614010840729"}
                        ],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": "https://w3id.org/oc/meta/ar/062507977760"}
                        ],
                    },
                ]
            }
        ]

        ra_data_peroni = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/ra/0614010840729",
                        "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/06304949238"}
                        ],
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Peroni"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "Silvio"}],
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Peroni Silvio"}],
                    }
                ]
            }
        ]

        ra_data_shotton = [
            {
                "@graph": [
                    {
                        "@id": "https://w3id.org/oc/meta/ra/0621010775619",
                        "@type": ["http://xmlns.com/foaf/0.1/Agent"],
                        "http://purl.org/spar/datacite/hasIdentifier": [
                            {"@id": "https://w3id.org/oc/meta/id/062404672414"}
                        ],
                        "http://xmlns.com/foaf/0.1/familyName": [{"@value": "Shotton"}],
                        "http://xmlns.com/foaf/0.1/givenName": [{"@value": "D M"}],
                        "http://xmlns.com/foaf/0.1/name": [{"@value": "Shotton David"}],
                    }
                ]
            }
        ]

        # Create directory structure for BR data
        br_dir_path = os.path.join(self.rdf_dir, "br", supplier_prefix, "1780000")
        os.makedirs(br_dir_path, exist_ok=True)

        # Create directory structure for AR data
        ar_dir_path = os.path.join(self.rdf_dir, "ar", supplier_prefix, "7980000")
        os.makedirs(ar_dir_path, exist_ok=True)

        # Create directory structure for RA data (Peroni)
        ra_peroni_dir_path = os.path.join(self.rdf_dir, "ra", "06140", "10850000")
        os.makedirs(ra_peroni_dir_path, exist_ok=True)

        # Create directory structure for RA data (Shotton)
        ra_shotton_dir_path = os.path.join(self.rdf_dir, "ra", "06210", "10780000")
        os.makedirs(ra_shotton_dir_path, exist_ok=True)

        with ZipFile(os.path.join(br_dir_path, "1778000.zip"), "w") as zip_file:
            zip_file.writestr("1778000.json", json.dumps(br_data))

        with ZipFile(os.path.join(ar_dir_path, "7978000.zip"), "w") as zip_file:
            zip_file.writestr("7978000.json", json.dumps(ar_data))

        with ZipFile(os.path.join(ra_peroni_dir_path, "10841000.zip"), "w") as zip_file:
            zip_file.writestr("10841000.json", json.dumps(ra_data_peroni))

        with ZipFile(
            os.path.join(ra_shotton_dir_path, "10776000.zip"), "w"
        ) as zip_file:
            zip_file.writestr("10776000.json", json.dumps(ra_data_shotton))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)
        article = output_data[0]
        self.assertEqual(
            article["title"],
            "OpenCitations, An Infrastructure Organization For Open Scholarship",
        )
        self.assertEqual(article["pub_date"], "2020-02")
        self.assertEqual(article["type"], "journal article")
        self.assertEqual(article["id"], "omid:br/062501777134")

        expected_authors = (
            "Peroni, Silvio [omid:ra/0614010840729]; "
            "Shotton, D M [omid:ra/0621010775619]"
        )
        self.assertEqual(article["author"], expected_authors)

        # Publisher field should still be empty since we haven't added the publisher RA data
        self.assertEqual(article["publisher"], "")

    def test_multiple_first_ars(self):
        supplier_prefix = "060"
        br_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/br/{supplier_prefix}1",
                        "@type": [
                            "http://purl.org/spar/fabio/Expression",
                            "http://purl.org/spar/fabio/JournalArticle",
                        ],
                        "http://purl.org/dc/terms/title": [
                            {"@value": "Article With Multiple First Authors"}
                        ],
                        "http://purl.org/spar/pro/isDocumentContextFor": [
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1"
                            },  # First potential author (will be processed)
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2"
                            },  # Second potential author (will be ignored)
                            {
                                "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"
                            },  # Connected to author 1 (will be processed)
                        ],
                    }
                ]
            }
        ]

        # Setup agent roles with two potential "first" authors (no hasNext pointing to them)
        # and one author connected to the first one
        ar_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}1",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1"}
                        ],
                        "https://w3id.org/oc/ontology/hasNext": [
                            {"@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}2",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2"}
                        ],
                        # This is also a potential first author but will be ignored
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ar/{supplier_prefix}3",
                        "@type": ["http://purl.org/spar/pro/RoleInTime"],
                        "http://purl.org/spar/pro/withRole": [
                            {"@id": "http://purl.org/spar/pro/author"}
                        ],
                        "http://purl.org/spar/pro/isHeldBy": [
                            {"@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3"}
                        ],
                        # This one is connected to author 1 via hasNext and will be processed
                    },
                ]
            }
        ]

        # Setup responsible agents
        ra_data = [
            {
                "@graph": [
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}1",
                        "http://xmlns.com/foaf/0.1/name": [
                            {"@value": "First Potential Author"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}2",
                        "http://xmlns.com/foaf/0.1/name": [
                            {"@value": "Second Potential Author"}
                        ],
                    },
                    {
                        "@id": f"https://w3id.org/oc/meta/ra/{supplier_prefix}3",
                        "http://xmlns.com/foaf/0.1/name": [
                            {"@value": "Connected Author"}
                        ],
                    },
                ]
            }
        ]

        data_files = {"br": br_data, "ra": ra_data, "ar": ar_data}

        for entity_type, data in data_files.items():
            dir_path = os.path.join(self.rdf_dir, entity_type, supplier_prefix, "10000")
            os.makedirs(dir_path, exist_ok=True)

            zip_path = os.path.join(dir_path, "1000.zip")
            with ZipFile(zip_path, "w") as zip_file:
                zip_file.writestr("1000.json", json.dumps(data))

        generate_csv(
            input_dir=self.rdf_dir,
            output_dir=self.output_dir,
            dir_split_number=10000,
            items_per_file=1000,
            redis_port=6381,
            redis_db=5,
        )

        output_data = get_csv_data(os.path.join(self.output_dir, "output_0.csv"))
        self.assertEqual(len(output_data), 1)

        article = output_data[0]
        authors = article["author"].split("; ")

        self.assertEqual(
            len(authors),
            2,
            "Should have exactly two authors (first author and connected one)",
        )

        expected_authors = [
            f"First Potential Author [omid:ra/{supplier_prefix}1]",
            f"Connected Author [omid:ra/{supplier_prefix}3]",
        ]
        self.assertEqual(
            authors,
            expected_authors,
            "Should have first author and connected author in correct order",
        )

        self.assertNotIn(
            f"Second Potential Author [omid:ra/{supplier_prefix}2]",
            article["author"],
            "Second potential author should not be in the output",
        )


if __name__ == "__main__":
    unittest.main()