Coverage for test/fix_provenance_test.py: 99%

357 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-07-14 14:06 +0000

1import json 

2import os 

3import shutil 

4import unittest 

5import zipfile 

6 

7from oc_meta.run.fixer.prov.fix import ProvenanceProcessor 

8from rdflib import ConjunctiveGraph, Literal, Namespace, URIRef 

9from rdflib.namespace import XSD 

10 

11 

12class TestProvenanceFixing(unittest.TestCase): 

def setUp(self):
    """Create the processor under test and a scratch directory with a sample archive.

    Sets ``self.processor``, ``self.temp_dir``, ``self.test_data`` and
    ``self.test_zip_path``. The archive at ``self.test_zip_path`` contains
    ``self.test_data`` serialized to JSON under the member name ``se.json``.
    """
    self.processor = ProvenanceProcessor(log_dir='test/fix_provenance_logs')

    self.temp_dir = "test_temp_dir"
    # exist_ok=True replaces the separate os.path.exists() check, which left a
    # window between the check and the directory creation.
    os.makedirs(self.temp_dir, exist_ok=True)

    # Sample JSON-LD data: snapshot 2 only carries an invalidation time,
    # snapshot 3 is derived from snapshot 2, and snapshot 1 has a generation
    # time written without a timezone offset.
    self.test_data = {
        "@graph": [
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/2",
                "@type": ["http://www.w3.org/ns/prov#Entity"],
                "http://www.w3.org/ns/prov#invalidatedAtTime": [{
                    "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
                    "@value": "2023-12-31T22:08:21+00:00"
                }]
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/3",
                "@type": ["http://www.w3.org/ns/prov#Entity"],
                "http://www.w3.org/ns/prov#generatedAtTime": [{
                    "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
                    "@value": "2023-12-31T22:08:21+00:00"
                }],
                "http://www.w3.org/ns/prov#specializationOf": [{
                    "@id": "https://w3id.org/oc/meta/br/06504122264"
                }],
                "http://www.w3.org/ns/prov#wasDerivedFrom": [{
                    "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/2"
                }]
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
                "@type": ["http://www.w3.org/ns/prov#Entity"],
                "http://www.w3.org/ns/prov#generatedAtTime": [{
                    "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
                    "@value": "2023-12-13T15:05:18.218917"
                }],
                "http://www.w3.org/ns/prov#specializationOf": [{
                    "@id": "https://w3id.org/oc/meta/br/06504122264"
                }]
            }
        ],
        "@id": "https://w3id.org/oc/meta/br/06504122264/prov/"
    }

    # Create the test zip file holding the sample data as 'se.json'
    self.test_zip_path = os.path.join(self.temp_dir, "test_se.zip")
    with zipfile.ZipFile(self.test_zip_path, 'w') as zf:
        zf.writestr('se.json', json.dumps(self.test_data))

64 

def tearDown(self):
    """Delete the scratch directory referenced by ``self.temp_dir`` if it still exists."""
    scratch_dir = self.temp_dir
    if not os.path.exists(scratch_dir):
        # Nothing was left behind, so there is nothing to clean up.
        return
    shutil.rmtree(scratch_dir)

68 

def test_extract_snapshot_number(self):
    """Check the number extracted from snapshot URIs, with 0 expected for a non-snapshot string."""
    expected_by_uri = {
        "https://w3id.org/oc/meta/br/06504122264/prov/se/1": 1,
        "https://w3id.org/oc/meta/br/06504122264/prov/se/42": 42,
        "invalid_uri": 0,
    }

    for uri, expected_number in expected_by_uri.items():
        with self.subTest(uri=uri):
            extracted = self.processor._extract_snapshot_number(uri)
            self.assertEqual(extracted, expected_number)

80 

def test_get_entity_from_prov_graph(self):
    """Check that a provenance graph URI maps to the entity URI without the '/prov/' suffix."""
    entity_by_graph_uri = {
        "https://w3id.org/oc/meta/br/06504122264/prov/":
            "https://w3id.org/oc/meta/br/06504122264",
        "https://example.org/resource/prov/":
            "https://example.org/resource",
    }

    for graph_uri, expected_entity in entity_by_graph_uri.items():
        with self.subTest(graph_uri=graph_uri):
            resolved = self.processor._get_entity_from_prov_graph(graph_uri)
            self.assertEqual(resolved, expected_entity)

93 

def test_collect_snapshot_info(self):
    """Check the snapshot records collected from a graph with two timestamped snapshots.

    Expects two records, numbered 1 and 2 in that order, with the first
    record holding exactly one generation time equal to the one inserted.
    """
    prov = Namespace("http://www.w3.org/ns/prov#")
    stamp = Literal("2023-12-13T15:05:18.218917", datatype=XSD.dateTime)

    # Both snapshots share the same generation time.
    graph = ConjunctiveGraph()
    for snapshot_uri in (
        "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
        "https://w3id.org/oc/meta/br/06504122264/prov/se/2",
    ):
        graph.add((URIRef(snapshot_uri), prov.generatedAtTime, stamp))

    collected = self.processor._collect_snapshot_info(graph)

    self.assertEqual(len(collected), 2)
    self.assertEqual(collected[0]['number'], 1)
    self.assertEqual(collected[1]['number'], 2)

    first_times = collected[0]['generation_times']
    self.assertEqual(len(first_times), 1)
    self.assertEqual(str(first_times[0]), str(stamp))

114 

def test_multiple_timestamps(self):
    """Check that removing multiple timestamps leaves the snapshot with no generatedAtTime."""
    prov = Namespace("http://www.w3.org/ns/prov#")
    subject = URIRef("https://w3id.org/oc/meta/br/06504122264/prov/se/1")
    stamps = [
        Literal("2023-12-13T15:05:18+00:00", datatype=XSD.dateTime),
        Literal("2023-12-13T16:05:18+00:00", datatype=XSD.dateTime),
    ]

    graph = ConjunctiveGraph()
    for stamp in stamps:
        graph.add((subject, prov.generatedAtTime, stamp))

    # Exercise the removal of the multiple timestamps
    self.processor._remove_multiple_timestamps(
        graph, subject, prov.generatedAtTime, stamps)

    # No generation timestamp should be left on the snapshot
    leftover = list(graph.objects(subject, prov.generatedAtTime))
    self.assertEqual(len(leftover), 0)

134 

def test_process_file_with_multiple_timestamps(self):
    """Process an archive whose first snapshot has two generation timestamps.

    The input holds snapshots 1-3 with only a type and generation time(s).
    After processing, the test expects: the same three snapshot ids, a
    wasDerivedFrom chain 3 -> 2 -> 1, a specializationOf link on every
    snapshot, exactly one generatedAtTime on snapshot 1, invalidation
    times on snapshots 1 and 2 equal to the generation time of the next
    snapshot, and a UTC marker on every timestamp.
    """
    entity_type = "http://www.w3.org/ns/prov#Entity"
    generated_at = "http://www.w3.org/ns/prov#generatedAtTime"

    def xsd_datetime(value):
        """Build a JSON-LD xsd:dateTime value object."""
        return {
            "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
            "@value": value
        }

    # Test data with multiple timestamps and the complete snapshot chain
    test_data = {
        "@graph": [
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
                "@type": [entity_type],
                generated_at: [
                    xsd_datetime("2023-12-13T15:05:18+00:00"),
                    xsd_datetime("2023-12-13T16:05:18+00:00")
                ]
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/2",
                "@type": [entity_type],
                generated_at: [xsd_datetime("2023-12-22T18:06:49+00:00")]
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/3",
                "@type": [entity_type],
                generated_at: [xsd_datetime("2023-12-31T22:08:21+00:00")]
            }
        ],
        "@id": "https://w3id.org/oc/meta/br/06504122264/prov/"
    }

    test_file = os.path.join(self.temp_dir, "multiple_timestamps.zip")
    with zipfile.ZipFile(test_file, 'w') as zf:
        zf.writestr('se.json', json.dumps(test_data))

    # Process the file. assertTrue matches the sibling tests; assertIsNotNone
    # would also have accepted a False return value.
    result = self.processor.process_file(test_file, 'test/fix_provenance_logs')
    self.assertTrue(result)

    # Read back the rewritten archive
    with zipfile.ZipFile(test_file, 'r') as zf:
        with zf.open('se.json') as f:
            fixed_data = json.loads(f.read())

    graph_data = fixed_data[0]['@graph']

    def first_object(predicate, field):
        """Map snapshot id -> `field` of the first object of `predicate`, where present."""
        return {
            item['@id']: item[predicate][0].get(field)
            for item in graph_data
            if predicate in item
        }

    # Snapshot ids
    snapshot_ids = {item['@id'] for item in graph_data}
    expected_ids = {
        "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
        "https://w3id.org/oc/meta/br/06504122264/prov/se/2",
        "https://w3id.org/oc/meta/br/06504122264/prov/se/3"
    }
    self.assertEqual(snapshot_ids, expected_ids)

    # wasDerivedFrom relations
    derived_from = first_object('http://www.w3.org/ns/prov#wasDerivedFrom', '@id')
    expected_derived = {
        "https://w3id.org/oc/meta/br/06504122264/prov/se/2":
            "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
        "https://w3id.org/oc/meta/br/06504122264/prov/se/3":
            "https://w3id.org/oc/meta/br/06504122264/prov/se/2"
    }
    self.assertEqual(derived_from, expected_derived)

    # specializationOf relations
    specialization_of = first_object('http://www.w3.org/ns/prov#specializationOf', '@id')
    expected_specialization = {
        "https://w3id.org/oc/meta/br/06504122264/prov/se/1":
            "https://w3id.org/oc/meta/br/06504122264",
        "https://w3id.org/oc/meta/br/06504122264/prov/se/2":
            "https://w3id.org/oc/meta/br/06504122264",
        "https://w3id.org/oc/meta/br/06504122264/prov/se/3":
            "https://w3id.org/oc/meta/br/06504122264"
    }
    self.assertEqual(specialization_of, expected_specialization)

    # Generation timestamps
    generated_times = first_object('http://www.w3.org/ns/prov#generatedAtTime', '@value')

    # The first snapshot must end up with a single timestamp
    first_snapshot = next(item for item in graph_data
                          if item['@id'].endswith('/prov/se/1'))
    self.assertEqual(
        len(first_snapshot.get('http://www.w3.org/ns/prov#generatedAtTime', [])),
        1,
        "First snapshot should have exactly one generatedAtTime"
    )

    # Invalidation timestamps
    invalidated_times = first_object('http://www.w3.org/ns/prov#invalidatedAtTime', '@value')
    expected_invalidated = {
        'https://w3id.org/oc/meta/br/06504122264/prov/se/1':
            '2023-12-22T18:06:49+00:00',
        "https://w3id.org/oc/meta/br/06504122264/prov/se/2":
            "2023-12-31T22:08:21+00:00"
    }
    self.assertEqual(invalidated_times, expected_invalidated)

    # Every timestamp must be expressed in UTC
    for timestamp in generated_times.values():
        self.assertTrue(
            '+00:00' in timestamp or 'Z' in timestamp,
            f"Generated timestamp {timestamp} should be in UTC"
        )

    for timestamp in invalidated_times.values():
        self.assertTrue(
            '+00:00' in timestamp or 'Z' in timestamp,
            f"Invalidated timestamp {timestamp} should be in UTC"
        )

275 

def test_normalize_timestamps(self):
    """Check which timestamps get rewritten by normalization and that rewritten ones are UTC."""
    cases = (
        # (input timestamp, whether normalization is expected to change it)
        ("2023-12-13T15:05:18.218917", True),   # No timezone - should be converted
        ("2023-12-13T15:05:18+00:00", False),   # Already UTC - no change needed
        ("2023-12-13T15:05:18Z", False),        # Already UTC - no change needed
        ("2023-12-13T15:05:18+01:00", True),    # Different timezone - should be converted
    )

    for raw, expect_change in cases:
        with self.subTest(timestamp=raw):
            source = Literal(raw, datatype=XSD.dateTime)
            normalized, changed = self.processor._normalize_timestamp(source)
            self.assertEqual(changed, expect_change)
            if not changed:
                continue
            # Only rewritten values are required to carry a UTC marker.
            rendered = str(normalized)
            self.assertTrue('+00:00' in rendered or 'Z' in rendered)

292 

def test_missing_snapshots(self):
    """Check that a gap in the snapshot sequence (1, 3, 4) gets a snapshot 2 created.

    The created snapshot is expected to be typed as prov:Entity, to
    specialize the entity, to derive from snapshot 1, and to carry both a
    generation and an invalidation time.
    """
    entity_type = "http://www.w3.org/ns/prov#Entity"
    generated = "http://www.w3.org/ns/prov#generatedAtTime"
    invalidated = "http://www.w3.org/ns/prov#invalidatedAtTime"
    specialization = "http://www.w3.org/ns/prov#specializationOf"
    derivation = 'http://www.w3.org/ns/prov#wasDerivedFrom'

    def moment(value):
        """Wrap a timestamp as a single-element JSON-LD xsd:dateTime list."""
        return [{
            "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
            "@value": value,
        }]

    def ref(uri):
        """Wrap a URI as a single-element JSON-LD reference list."""
        return [{"@id": uri}]

    # Snapshot 2 is absent from the sequence 1, 3, 4
    payload = {
        "@graph": [
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
                "@type": [entity_type],
                generated: moment("2023-12-13T15:05:18+00:00"),
                invalidated: moment("2023-12-22T18:06:49+00:00"),
                specialization: ref("https://w3id.org/oc/meta/br/06504122264"),
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/3",
                "@type": [entity_type],
                generated: moment("2023-12-31T22:08:21+00:00"),
                specialization: ref("https://w3id.org/oc/meta/br/06504122264"),
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/4",
                "@type": [entity_type],
                generated: moment("2024-01-15T10:30:00+00:00"),
                specialization: ref("https://w3id.org/oc/meta/br/06504122264"),
            },
        ],
        "@id": "https://w3id.org/oc/meta/br/06504122264/prov/",
    }

    archive = os.path.join(self.temp_dir, "missing_snapshot.zip")
    with zipfile.ZipFile(archive, 'w') as out:
        out.writestr('se.json', json.dumps(payload))

    # Run the processor on the archive
    outcome = self.processor.process_file(archive, 'test/fix_provenance_logs')
    self.assertTrue(outcome)

    # Load the rewritten graph
    with zipfile.ZipFile(archive, 'r') as src:
        with src.open('se.json') as member:
            nodes = json.loads(member.read())[0]['@graph']

    # The missing snapshot 2 must now exist
    self.assertIn(
        "https://w3id.org/oc/meta/br/06504122264/prov/se/2",
        {node['@id'] for node in nodes},
        "Missing snapshot 2 should have been created"
    )
    created = next(node for node in nodes
                   if node['@id'].endswith('/prov/se/2'))

    # Basic typing of the created snapshot
    self.assertIn('@type', created)
    self.assertIn('http://www.w3.org/ns/prov#Entity', created['@type'])

    # Link back to the described entity
    self.assertIn(specialization, created)
    self.assertEqual(
        created[specialization][0]['@id'],
        "https://w3id.org/oc/meta/br/06504122264"
    )

    # Link to the preceding snapshot
    self.assertIn(derivation, created)
    self.assertEqual(
        created[derivation][0]['@id'],
        "https://w3id.org/oc/meta/br/06504122264/prov/se/1"
    )

    # Both timestamps must be present
    self.assertIn(generated, created)
    self.assertIn(invalidated, created)

386 

def test_multiple_missing_snapshots(self):
    """Process an archive whose snapshot sequence is 1, 4, 5 (2 and 3 absent).

    For every snapshot found after processing, checks the prov:Entity
    type, the specializationOf link, a UTC generation time and (for all
    but the lowest-numbered one) a wasDerivedFrom link. For each pair of
    consecutive snapshots, checks that an invalidation time on the earlier
    one equals the generation time of the later one and that the later one
    derives from the earlier one.
    """
    entity_type = "http://www.w3.org/ns/prov#Entity"
    generated = "http://www.w3.org/ns/prov#generatedAtTime"
    invalidated = "http://www.w3.org/ns/prov#invalidatedAtTime"
    specialization = "http://www.w3.org/ns/prov#specializationOf"
    derivation = 'http://www.w3.org/ns/prov#wasDerivedFrom'

    def moment(value):
        """Wrap a timestamp as a single-element JSON-LD xsd:dateTime list."""
        return [{
            "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
            "@value": value
        }]

    def ref(uri):
        """Wrap a URI as a single-element JSON-LD reference list."""
        return [{"@id": uri}]

    # Test data with snapshots 2 and 3 missing from sequence 1,4,5
    test_data = {
        "@graph": [
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
                "@type": [entity_type],
                generated: moment("2023-12-13T15:05:18+00:00"),
                invalidated: moment("2023-12-22T18:06:49+00:00"),
                specialization: ref("https://w3id.org/oc/meta/br/06504122264")
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/4",
                "@type": [entity_type],
                generated: moment("2024-01-15T10:30:00+00:00"),
                specialization: ref("https://w3id.org/oc/meta/br/06504122264")
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/5",
                "@type": [entity_type],
                generated: moment("2024-01-20T14:45:00+00:00"),
                specialization: ref("https://w3id.org/oc/meta/br/06504122264")
            }
        ],
        "@id": "https://w3id.org/oc/meta/br/06504122264/prov/"
    }

    test_file = os.path.join(self.temp_dir, "multiple_missing_snapshots.zip")
    with zipfile.ZipFile(test_file, 'w') as zf:
        zf.writestr('se.json', json.dumps(test_data))

    result = self.processor.process_file(test_file, 'test/fix_provenance_logs')
    self.assertTrue(result)

    with zipfile.ZipFile(test_file, 'r') as zf:
        with zf.open('se.json') as f:
            fixed_data = json.loads(f.read())

    graph_data = fixed_data[0]['@graph']

    # Collect the snapshots keyed by their number
    snapshots = {}
    for item in graph_data:
        if '/prov/se/' in item['@id']:
            num = int(item['@id'].split('/se/')[-1])
            snapshots[num] = item

    # Sorted once and reused by every check below.
    ordered_nums = sorted(snapshots)
    # NOTE(review): nothing here asserts that snapshots 2 and 3 were actually
    # created; consider asserting the expected set of numbers once the
    # intended behaviour is confirmed.

    # Every snapshot must carry the basic properties
    for num, snapshot in snapshots.items():
        # Type
        self.assertIn('@type', snapshot)
        self.assertIn('http://www.w3.org/ns/prov#Entity', snapshot['@type'])

        # specializationOf
        self.assertIn(specialization, snapshot)
        self.assertEqual(
            snapshot[specialization][0]['@id'],
            "https://w3id.org/oc/meta/br/06504122264"
        )

        # Generation timestamp, in UTC
        self.assertIn(generated, snapshot)
        gen_time = snapshot[generated][0]['@value']
        self.assertTrue('+00:00' in gen_time or 'Z' in gen_time)

        # wasDerivedFrom is required on all but the first snapshot
        if num != ordered_nums[0]:
            self.assertIn(derivation, snapshot)

    consecutive = list(zip(ordered_nums, ordered_nums[1:]))

    # Temporal consistency between consecutive snapshots
    for curr_num, next_num in consecutive:
        curr_snapshot = snapshots[curr_num]
        next_snapshot = snapshots[next_num]

        # Only checked when the current snapshot has an invalidation time
        if invalidated in curr_snapshot:
            curr_inv_time = self.processor._convert_to_utc(
                curr_snapshot[invalidated][0]['@value']
            )
            next_gen_time = self.processor._convert_to_utc(
                next_snapshot[generated][0]['@value']
            )
            self.assertEqual(
                curr_inv_time,
                next_gen_time,
                f"Invalidation time of snapshot {curr_num} should match generation time of {next_num}"
            )

    # Each snapshot must derive from the one immediately before it
    for prev_num, num in consecutive:
        derived_from = snapshots[num][derivation][0]['@id']
        expected_derived = f"https://w3id.org/oc/meta/br/06504122264/prov/se/{prev_num}"
        self.assertEqual(
            derived_from,
            expected_derived,
            f"Snapshot {num} should be derived from snapshot {prev_num}"
        )

510 

def test_timestamp_inference(self):
    """Check the timestamps given to a snapshot created between snapshots 1 and 3.

    The created snapshot 2 is expected to be generated at snapshot 1's
    invalidation time and invalidated at snapshot 3's generation time.
    """
    entity_type = "http://www.w3.org/ns/prov#Entity"
    generated = "http://www.w3.org/ns/prov#generatedAtTime"
    invalidated = "http://www.w3.org/ns/prov#invalidatedAtTime"
    specialization = "http://www.w3.org/ns/prov#specializationOf"

    def moment(value):
        """Wrap a timestamp as a single-element JSON-LD xsd:dateTime list."""
        return [{
            "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
            "@value": value,
        }]

    def ref(uri):
        """Wrap a URI as a single-element JSON-LD reference list."""
        return [{"@id": uri}]

    # Input chosen so that the inferred timestamps can be checked exactly
    payload = {
        "@graph": [
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
                "@type": [entity_type],
                generated: moment("2023-12-13T12:00:00+00:00"),
                invalidated: moment("2023-12-13T14:00:00+00:00"),
                specialization: ref("https://w3id.org/oc/meta/br/06504122264"),
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/3",
                "@type": [entity_type],
                generated: moment("2023-12-13T18:00:00+00:00"),
                specialization: ref("https://w3id.org/oc/meta/br/06504122264"),
            },
        ],
        "@id": "https://w3id.org/oc/meta/br/06504122264/prov/",
    }

    archive = os.path.join(self.temp_dir, "timestamp_inference.zip")
    with zipfile.ZipFile(archive, 'w') as out:
        out.writestr('se.json', json.dumps(payload))

    outcome = self.processor.process_file(archive, 'test/fix_provenance_logs')
    self.assertTrue(outcome)

    with zipfile.ZipFile(archive, 'r') as src:
        with src.open('se.json') as member:
            nodes = json.loads(member.read())[0]['@graph']

    # Locate the created snapshot 2
    created = next(node for node in nodes
                   if node['@id'].endswith('/prov/se/2'))

    # Its timestamps must match the neighbouring snapshots
    self.assertIn(generated, created)
    inferred_start = created[generated][0]['@value']
    self.assertEqual(inferred_start, "2023-12-13T14:00:00+00:00")

    self.assertIn(invalidated, created)
    inferred_end = created[invalidated][0]['@value']
    self.assertEqual(inferred_end, "2023-12-13T18:00:00+00:00")

571 

def test_multiple_descriptions_merge(self):
    """Check description cleanup on a snapshot mixing merge and non-merge descriptions.

    Snapshot 2 starts with two "merged with" descriptions and one
    "modified" description; both merge descriptions are expected to stay
    and the "modified" one to be gone.
    """
    entity_type = "http://www.w3.org/ns/prov#Entity"
    description = "http://purl.org/dc/terms/description"
    generated = "http://www.w3.org/ns/prov#generatedAtTime"

    def text(value):
        """Wrap a plain string as a JSON-LD value object."""
        return {"@value": value}

    def moment(value):
        """Wrap a timestamp as a single-element JSON-LD xsd:dateTime list."""
        return [{
            "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
            "@value": value,
        }]

    payload = {
        "@graph": [
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
                "@type": [entity_type],
                description: [
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been created."),
                ],
                generated: moment("2023-12-13T15:05:18+00:00"),
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/2",
                "@type": [entity_type],
                description: [
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been merged with 'https://w3id.org/oc/meta/br/06504122265'."),
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been merged with 'https://w3id.org/oc/meta/br/06504122266'."),
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been modified."),
                ],
                generated: moment("2023-12-31T22:08:21+00:00"),
            },
        ],
        "@id": "https://w3id.org/oc/meta/br/06504122264/prov/",
    }

    archive = os.path.join(self.temp_dir, "multiple_descriptions_merge.zip")
    with zipfile.ZipFile(archive, 'w') as out:
        out.writestr('se.json', json.dumps(payload))

    outcome = self.processor.process_file(archive, 'test/fix_provenance_logs')
    self.assertTrue(outcome)

    with zipfile.ZipFile(archive, 'r') as src:
        with src.open('se.json') as member:
            fixed = json.loads(member.read())

    # Pick snapshot 2 out of the rewritten graph
    second = next(node for node in fixed[0]['@graph']
                  if node['@id'].endswith('/prov/se/2'))
    kept = second.get('http://purl.org/dc/terms/description', [])

    # Both merge descriptions must survive
    merged = [entry for entry in kept
              if "has been merged with" in entry['@value']]
    self.assertEqual(len(merged), 2,
                     "Both merge descriptions should be preserved")

    # The non-merge description must be gone
    modified = [entry for entry in kept
                if "has been modified" in entry['@value']]
    self.assertEqual(len(modified), 0,
                     "Non-merge description should be removed")

639 

def test_multiple_descriptions_first_snapshot(self):
    """Check description cleanup on a first snapshot carrying three descriptions.

    Snapshot 1 starts with "created", "modified" and "merged with"
    descriptions; only the "created" one is expected to remain.
    """
    def text(value):
        """Wrap a plain string as a JSON-LD value object."""
        return {"@value": value}

    payload = {
        "@graph": [
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
                "@type": ["http://www.w3.org/ns/prov#Entity"],
                "http://purl.org/dc/terms/description": [
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been created."),
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been modified."),
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been merged with 'https://w3id.org/oc/meta/br/06504122265'."),
                ],
                "http://www.w3.org/ns/prov#generatedAtTime": [{
                    "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
                    "@value": "2023-12-13T15:05:18+00:00",
                }],
            },
        ],
        "@id": "https://w3id.org/oc/meta/br/06504122264/prov/",
    }

    archive = os.path.join(self.temp_dir, "first_snapshot_descriptions.zip")
    with zipfile.ZipFile(archive, 'w') as out:
        out.writestr('se.json', json.dumps(payload))

    outcome = self.processor.process_file(archive, 'test/fix_provenance_logs')
    self.assertTrue(outcome)

    with zipfile.ZipFile(archive, 'r') as src:
        with src.open('se.json') as member:
            fixed = json.loads(member.read())

    first = next(node for node in fixed[0]['@graph']
                 if node['@id'].endswith('/prov/se/1'))
    kept = first.get('http://purl.org/dc/terms/description', [])

    # Only the creation description may remain
    self.assertEqual(len(kept), 1,
                     "First snapshot should have exactly one description")
    self.assertTrue("has been created" in kept[0]['@value'],
                    "First snapshot should keep only creation description")

688 

def test_multiple_descriptions_last_snapshot(self):
    """Check description cleanup on a last snapshot carrying two descriptions.

    Snapshot 2 (the last one) starts with "modified" and "deleted"
    descriptions; only the "deleted" one is expected to remain.
    """
    entity_type = "http://www.w3.org/ns/prov#Entity"
    description = "http://purl.org/dc/terms/description"
    generated = "http://www.w3.org/ns/prov#generatedAtTime"

    def text(value):
        """Wrap a plain string as a JSON-LD value object."""
        return {"@value": value}

    def moment(value):
        """Wrap a timestamp as a single-element JSON-LD xsd:dateTime list."""
        return [{
            "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
            "@value": value,
        }]

    payload = {
        "@graph": [
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
                "@type": [entity_type],
                description: [
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been created."),
                ],
                generated: moment("2023-12-13T15:05:18+00:00"),
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/2",
                "@type": [entity_type],
                description: [
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been modified."),
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been deleted."),
                ],
                generated: moment("2023-12-31T22:08:21+00:00"),
            },
        ],
        "@id": "https://w3id.org/oc/meta/br/06504122264/prov/",
    }

    archive = os.path.join(self.temp_dir, "last_snapshot_descriptions.zip")
    with zipfile.ZipFile(archive, 'w') as out:
        out.writestr('se.json', json.dumps(payload))

    outcome = self.processor.process_file(archive, 'test/fix_provenance_logs')
    self.assertTrue(outcome)

    with zipfile.ZipFile(archive, 'r') as src:
        with src.open('se.json') as member:
            fixed = json.loads(member.read())

    final = next(node for node in fixed[0]['@graph']
                 if node['@id'].endswith('/prov/se/2'))
    kept = final.get('http://purl.org/dc/terms/description', [])

    # Only the deletion description may remain
    self.assertEqual(len(kept), 1,
                     "Last snapshot should have exactly one description")
    self.assertTrue("has been deleted" in kept[0]['@value'],
                    "Last snapshot should keep deletion description")

745 

def test_multiple_descriptions_middle_snapshot(self):
    """Check description cleanup on a middle snapshot carrying two descriptions.

    Snapshot 2 of 3 starts with "modified" and "created" descriptions;
    only the "modified" one is expected to remain.
    """
    entity_type = "http://www.w3.org/ns/prov#Entity"
    description = "http://purl.org/dc/terms/description"
    generated = "http://www.w3.org/ns/prov#generatedAtTime"

    def text(value):
        """Wrap a plain string as a JSON-LD value object."""
        return {"@value": value}

    def moment(value):
        """Wrap a timestamp as a single-element JSON-LD xsd:dateTime list."""
        return [{
            "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
            "@value": value,
        }]

    payload = {
        "@graph": [
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/1",
                "@type": [entity_type],
                description: [
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been created."),
                ],
                generated: moment("2023-12-13T15:05:18+00:00"),
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/2",
                "@type": [entity_type],
                description: [
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been modified."),
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been created."),
                ],
                generated: moment("2023-12-31T22:08:21+00:00"),
            },
            {
                "@id": "https://w3id.org/oc/meta/br/06504122264/prov/se/3",
                "@type": [entity_type],
                description: [
                    text("The entity 'https://w3id.org/oc/meta/br/06504122264' has been deleted."),
                ],
                generated: moment("2024-01-01T10:00:00+00:00"),
            },
        ],
        "@id": "https://w3id.org/oc/meta/br/06504122264/prov/",
    }

    archive = os.path.join(self.temp_dir, "middle_snapshot_descriptions.zip")
    with zipfile.ZipFile(archive, 'w') as out:
        out.writestr('se.json', json.dumps(payload))

    outcome = self.processor.process_file(archive, 'test/fix_provenance_logs')
    self.assertTrue(outcome)

    with zipfile.ZipFile(archive, 'r') as src:
        with src.open('se.json') as member:
            fixed = json.loads(member.read())

    middle = next(node for node in fixed[0]['@graph']
                  if node['@id'].endswith('/prov/se/2'))
    kept = middle.get('http://purl.org/dc/terms/description', [])

    # Only the modification description may remain
    self.assertEqual(len(kept), 1,
                     "Middle snapshot should have exactly one description")
    self.assertTrue("has been modified" in kept[0]['@value'],
                    "Middle snapshot should keep modification description")

813 

def test_real_case_multiple_timestamps_and_incomplete_snapshot(self):
    """Process the provenance of entity ``id/0623074134`` and verify the repaired archive.

    The fixture holds four snapshots with two visible problems: ``se/1``
    carries two ``generatedAtTime`` values and ``se/2`` is a stub with only
    ``@type`` and ``invalidatedAtTime``. ``se/4`` records a merge, so it is
    derived from two snapshots (its predecessor and a snapshot of the merged
    entity).

    After ``process_file`` rewrites the zip, the test checks that:

    1. ``se/1`` keeps exactly one ``generatedAtTime``;
    2. ``se/2`` has a type, a ``specializationOf`` and derives from ``se/1``;
    3. snapshots 2-4 each derive from their predecessor;
    4. every ``invalidatedAtTime`` equals the next snapshot's ``generatedAtTime``;
    5. the merge-related ``wasDerivedFrom`` of ``se/4`` is preserved.

    Derivation checks use membership rather than ``[0]``: the values of a
    JSON-LD property form an unordered set, so the position of a reference
    inside ``wasDerivedFrom`` is not something the test should depend on.
    """
    prov = "http://www.w3.org/ns/prov#"
    description = "http://purl.org/dc/terms/description"
    generated_at = prov + "generatedAtTime"
    invalidated_at = prov + "invalidatedAtTime"
    specialization_of = prov + "specializationOf"
    derived_from = prov + "wasDerivedFrom"
    entity = "https://w3id.org/oc/meta/id/0623074134"
    merged_entity = "https://w3id.org/oc/meta/id/063301371593"

    def snapshot_iri(number):
        """Return the IRI of the snapshot of ``entity`` with the given number."""
        return f"{entity}/prov/se/{number}"

    def date_time(value):
        """Wrap a timestamp string as a JSON-LD ``xsd:dateTime`` literal."""
        return {"@type": "http://www.w3.org/2001/XMLSchema#dateTime", "@value": value}

    def derived_iris(node):
        """Return every IRI listed in the node's ``wasDerivedFrom``."""
        return [ref['@id'] for ref in node[derived_from]]

    # Key order and list order are kept as in the recorded case so the
    # serialized input is unchanged.
    test_data = {
        "@graph": [
            {
                "@id": snapshot_iri(4),
                "@type": [prov + "Entity"],
                description: [{
                    "@value": f"The entity '{entity}' has been merged with '{merged_entity}'."
                }],
                generated_at: [date_time("2024-10-23T20:52:32+00:00")],
                specialization_of: [{"@id": entity}],
                prov + "wasAttributedTo": [{"@id": "http://orcid.org/0000-0002-8420-0696"}],
                derived_from: [
                    {"@id": snapshot_iri(3)},
                    {"@id": f"{merged_entity}/prov/se/1"},
                ],
            },
            {
                "@id": snapshot_iri(3),
                "@type": [prov + "Entity"],
                description: [{"@value": f"The entity '{entity}' has been modified."}],
                generated_at: [date_time("2024-06-06T18:55:36+00:00")],
                invalidated_at: [date_time("2024-10-23T20:52:32+00:00")],
                specialization_of: [{"@id": entity}],
                derived_from: [{"@id": snapshot_iri(2)}],
            },
            {
                # Two generation times: the processor must reduce them to one.
                "@id": snapshot_iri(1),
                "@type": [prov + "Entity"],
                description: [{"@value": f"The entity '{entity}' has been created."}],
                generated_at: [
                    date_time("2024-03-27T18:03:23+00:00"),
                    date_time("2023-12-13T16:14:48.836637"),
                ],
                specialization_of: [{"@id": entity}],
            },
            {
                # Incomplete snapshot: only a type and an invalidation time.
                "@id": snapshot_iri(2),
                "@type": [prov + "Entity"],
                invalidated_at: [date_time("2024-06-06T18:55:36+00:00")],
            },
        ],
        "@id": f"{entity}/prov/",
    }

    test_file = os.path.join(self.temp_dir, "real_case_test.zip")
    with zipfile.ZipFile(test_file, 'w') as zf:
        zf.writestr('se.json', json.dumps(test_data))

    result = self.processor.process_file(test_file, 'test/fix_provenance_logs')
    self.assertTrue(result)

    # Read back the archive rewritten by the processor.
    with zipfile.ZipFile(test_file, 'r') as zf:
        with zf.open('se.json') as f:
            fixed_data = json.loads(f.read())

    graph_data = fixed_data[0]['@graph']

    # Index the nodes once; a missing snapshot becomes a readable assertion
    # failure instead of a bare StopIteration.
    nodes_by_iri = {node['@id']: node for node in graph_data}
    snapshots = {}
    for number in range(1, 5):
        self.assertIn(snapshot_iri(number), nodes_by_iri,
                      f"Snapshot {number} should be present after processing")
        snapshots[number] = nodes_by_iri[snapshot_iri(number)]

    # 1. Snapshot 1 must keep a single generatedAtTime.
    # NOTE(review): the expected literal corresponds to the naive fixture value
    # 2023-12-13T16:14:48.836637; presumably the processor normalises it to
    # UTC - confirm against ProvenanceProcessor.
    gen_times_1 = snapshots[1].get(generated_at, [])
    self.assertEqual(len(gen_times_1), 1,
                     "Snapshot 1 should have exactly one generatedAtTime")
    self.assertEqual(
        gen_times_1[0]['@value'],
        "2023-12-13T15:14:48.836637+00:00",
        "Should keep the earliest timestamp"
    )

    # 2. Snapshot 2 must have been completed with the required properties.
    snapshot_2 = snapshots[2]
    self.assertIn('@type', snapshot_2)
    self.assertIn(specialization_of, snapshot_2)
    self.assertIn(derived_from, snapshot_2)
    self.assertIn(snapshot_iri(1), derived_iris(snapshot_2))

    # 3. Snapshots 2 through 4 must each derive from their predecessor.
    for number in range(2, 5):
        current = snapshots[number]
        self.assertIn(derived_from, current)
        self.assertIn(
            snapshot_iri(number - 1),
            derived_iris(current),
            f"Snapshot {number} should be derived from snapshot {number-1}"
        )

    # 4. Timestamp consistency between consecutive snapshots. One pass over
    # the pairs (1,2), (2,3), (3,4) covers both directions: "invalidation of
    # n equals generation of n+1" and "generation of n+1 equals invalidation
    # of n" are the same equality.
    for number in range(1, 4):
        current = snapshots[number]
        following = snapshots[number + 1]

        self.assertIn(invalidated_at, current,
                      f"Snapshot {number} should have invalidatedAtTime")
        self.assertIn(generated_at, following,
                      f"Snapshot {number+1} should have generatedAtTime")

        self.assertEqual(
            current[invalidated_at][0]['@value'],
            following[generated_at][0]['@value'],
            f"Invalidation time of snapshot {number} should match generation time of snapshot {number+1}"
        )

    # 5. The reference to the merged entity's snapshot must survive on se/4.
    self.assertIn(
        f"{merged_entity}/prov/se/1",
        derived_iris(snapshots[4]),
        "Merge-related wasDerivedFrom should be preserved"
    )

1001 

def test_original_unresolved_issues_scenario(self):
    """Run the processor on the recorded ``ra/06440227509`` provenance.

    The fixture contains five snapshots: ``se/1`` has two
    ``generatedAtTime`` values and ``se/2`` only has a type and an
    ``invalidatedAtTime``. The test then checks that ``se/1`` ends up with
    one generation time, that ``se/3`` and ``se/4`` each have exactly one
    description, and that the first ``wasDerivedFrom`` of snapshots 2-5
    points at the preceding snapshot number.
    """
    # Test data taken from the initial message of the conversation.
    prov = "http://www.w3.org/ns/prov#"
    description = "http://purl.org/dc/terms/description"
    generated_at = prov + "generatedAtTime"
    invalidated_at = prov + "invalidatedAtTime"
    specialization_of = prov + "specializationOf"
    attributed_to = prov + "wasAttributedTo"
    derived_from = prov + "wasDerivedFrom"
    update_query = "https://w3id.org/oc/ontology/hasUpdateQuery"
    agent = "https://w3id.org/oc/meta/ra/06440227509"
    curator = "https://orcid.org/0000-0002-8420-0696"
    meta_process = "https://w3id.org/oc/meta/prov/pa/1"

    def own(number):
        """Return the IRI of snapshot ``number`` of the agent under test."""
        return f"{agent}/prov/se/{number}"

    def date_time(value):
        """Wrap a timestamp string as a JSON-LD ``xsd:dateTime`` literal."""
        return {"@type": "http://www.w3.org/2001/XMLSchema#dateTime", "@value": value}

    def ref(iri):
        """Wrap an IRI as a JSON-LD node reference."""
        return {"@id": iri}

    fixture = {
        "@graph": [
            {
                "@id": own(5),
                "@type": [prov + "Entity"],
                description: [{"@value": f"The entity '{agent}' has been deleted."}],
                generated_at: [date_time("2024-12-08T01:23:24+00:00")],
                invalidated_at: [date_time("2024-12-08T01:23:24+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(curator)],
                derived_from: [ref(own(4))],
                update_query: [{
                    "@value": "DELETE DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06440227509> <http://xmlns.com/foaf/0.1/givenName> \"R.\" .<https://w3id.org/oc/meta/ra/06440227509> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/063501394354> .<https://w3id.org/oc/meta/ra/06440227509> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/0.1/Agent> .<https://w3id.org/oc/meta/ra/06440227509> <http://xmlns.com/foaf/0.1/familyName> \"Stępniewski\" . } }"
                }],
            },
            {
                "@id": own(1),
                "@type": [prov + "Entity"],
                description: [{"@value": f"The entity '{agent}' has been created."}],
                generated_at: [
                    date_time("2023-12-13T15:53:04.544275"),
                    date_time("2024-03-27T20:20:19+00:00"),
                ],
                prov + "hadPrimarySource": [ref("https://openalex.s3.amazonaws.com/browse.html")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(meta_process)],
            },
            {
                "@id": own(4),
                "@type": [prov + "Entity"],
                description: [{"@value": f"The entity '{agent}' has been modified."}],
                generated_at: [date_time("2024-12-04T21:15:55+00:00")],
                invalidated_at: [date_time("2024-12-08T01:23:24+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(curator)],
                derived_from: [ref(own(3))],
                update_query: [{
                    "@value": "DELETE DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06440227509> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/06904873317> . } }; INSERT DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06440227509> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/063501394354> . } }"
                }],
            },
            {
                "@id": own(2),
                "@type": [prov + "Entity"],
                invalidated_at: [date_time("2023-12-24T23:21:33+00:00")],
            },
            {
                "@id": own(3),
                "@type": [prov + "Entity"],
                description: [{"@value": f"The entity '{agent}' has been modified."}],
                generated_at: [date_time("2023-12-24T23:21:33+00:00")],
                invalidated_at: [date_time("2024-12-04T21:15:55+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(meta_process)],
                derived_from: [ref(own(2))],
                update_query: [{
                    "@value": "DELETE DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06440227509> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/0644082006> . } }; INSERT DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06440227509> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/06904873317> . } }"
                }],
            },
        ],
        "@id": f"{agent}/prov/",
    }

    test_file = os.path.join(self.temp_dir, "original_unresolved_issues.zip")
    with zipfile.ZipFile(test_file, 'w') as archive:
        archive.writestr('se.json', json.dumps(fixture))

    # Process the file with the script.
    # NOTE(review): sibling tests use assertTrue(result); assertIsNotNone
    # also accepts False - confirm which contract is intended here.
    result = self.processor.process_file(test_file, 'test/fix_provenance_logs')
    self.assertIsNotNone(result, "Process should complete without errors")

    # Read back the modified data.
    with zipfile.ZipFile(test_file, 'r') as archive:
        with archive.open('se.json') as handle:
            fixed_data = json.loads(handle.read())

    graph_data = fixed_data[0]['@graph']

    def find_snapshot(suffix):
        """Return the first graph node whose IRI ends with ``suffix``, or None."""
        for node in graph_data:
            if node['@id'].endswith(suffix):
                return node
        return None

    # Check whether the known problems were resolved.
    # 1. Snapshot se/1 had multiple 'generatedAtTime' values; it should keep one.
    #    The script is expected to keep the oldest timestamp; if that did not
    #    happen this assertion fails and exposes the unresolved problem.
    first = find_snapshot('/prov/se/1')
    self.assertIsNotNone(first, "Snapshot se/1 should exist")
    self.assertEqual(
        len(first.get(generated_at, [])), 1,
        "Snapshot se/1 should have only one generatedAtTime after processing"
    )

    # 2. Description consistency on se/3 and se/4: each should keep a single
    #    description.
    for number in (3, 4):
        node = find_snapshot(f'/prov/se/{number}')
        self.assertIsNotNone(node, f"Snapshot se/{number} should exist")
        self.assertEqual(len(node.get(description, [])), 1,
                         f"Snapshot se/{number} should have exactly one description")

    # 3. wasDerivedFrom chain: every snapshot except the first should point
    #    to the previous one. Expected sequence: se/1 (created), se/2, se/3,
    #    se/4, se/5 (deleted), i.e. se/2 -> se/1, se/3 -> se/2, se/4 -> se/3,
    #    se/5 -> se/4. If the problem was not resolved these links may be wrong.
    def parent_number(suffix):
        """Return the snapshot number the first wasDerivedFrom points to, or None."""
        node = find_snapshot(suffix)
        if not node or derived_from not in node:
            return None
        return node[derived_from][0]['@id'].split('/se/')[-1]

    for child, parent in ((2, 1), (3, 2), (4, 3), (5, 4)):
        self.assertEqual(parent_number(f'/prov/se/{child}'), str(parent),
                         f"se/{child} should derive from se/{parent}")

    # If any of these assertions fail, the script did not resolve the problems
    # as expected, which shows its actual behaviour on the provided scenario.

1236 

def test_complex_merge_chain_scenario(self):
    """Test handling of a complex chain of merges with oscillating property values.

    The fixture lists snapshots 2-10 of ``ra/06490509042`` out of order and
    contains no ``se/1``; several snapshots are merges deriving from two
    sources. After processing, the test checks timestamp continuity between
    consecutive snapshots, that merge snapshots keep at least two
    ``wasDerivedFrom`` values, that snapshot numbers form a gap-free
    sequence starting at 1, and that ``se/1`` carries a creation description.
    """
    prov = "http://www.w3.org/ns/prov#"
    description = "http://purl.org/dc/terms/description"
    generated_at = prov + "generatedAtTime"
    invalidated_at = prov + "invalidatedAtTime"
    specialization_of = prov + "specializationOf"
    attributed_to = prov + "wasAttributedTo"
    derived_from = prov + "wasDerivedFrom"
    update_query = "https://w3id.org/oc/ontology/hasUpdateQuery"
    agent = "https://w3id.org/oc/meta/ra/06490509042"
    curator = "https://orcid.org/0000-0002-8420-0696"
    meta_process = "https://w3id.org/oc/meta/prov/pa/1"

    def own(number):
        """Return the IRI of snapshot ``number`` of the agent under test."""
        return f"{agent}/prov/se/{number}"

    def date_time(value):
        """Wrap a timestamp string as a JSON-LD ``xsd:dateTime`` literal."""
        return {"@type": "http://www.w3.org/2001/XMLSchema#dateTime", "@value": value}

    def ref(iri):
        """Wrap an IRI as a JSON-LD node reference."""
        return {"@id": iri}

    test_data = {
        "@graph": [
            {
                "@id": own(9),
                "@type": [prov + "Entity"],
                description: [{"@value": f"The entity '{agent}' has been modified."}],
                generated_at: [date_time("2024-12-04T18:44:08+00:00")],
                invalidated_at: [date_time("2024-12-16T03:14:25+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(curator)],
                derived_from: [ref(own(8))],
                update_query: [{
                    "@value": "DELETE DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/06320156505> . } }; INSERT DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/063201438132> . } }"
                }],
            },
            {
                "@id": own(5),
                "@type": [prov + "Entity"],
                description: [{
                    "@value": f"The entity '{agent}' has been merged with 'https://w3id.org/oc/meta/ra/06530192638'."
                }],
                generated_at: [date_time("2024-02-21T06:29:52+00:00")],
                invalidated_at: [date_time("2024-02-21T06:30:27+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(meta_process)],
                derived_from: [
                    ref(own(4)),
                    ref("https://w3id.org/oc/meta/ra/06530192638/prov/se/2"),
                ],
                update_query: [{
                    "@value": "DELETE DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora E\" . } }; INSERT DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora E.\" . } }"
                }],
            },
            {
                "@id": own(4),
                "@type": [prov + "Entity"],
                description: [{
                    "@value": f"The entity '{agent}' has been merged with 'https://w3id.org/oc/meta/ra/065047414'."
                }],
                generated_at: [date_time("2024-02-21T06:24:47+00:00")],
                invalidated_at: [date_time("2024-02-21T06:29:52+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(meta_process)],
                derived_from: [
                    ref(own(3)),
                    ref("https://w3id.org/oc/meta/ra/065047414/prov/se/2"),
                ],
                update_query: [{
                    "@value": "DELETE DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora Elizabeth\" . } }; INSERT DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora E\" . } }"
                }],
            },
            {
                "@id": own(7),
                "@type": [prov + "Entity"],
                description: [{
                    "@value": f"The entity '{agent}' has been merged with 'https://w3id.org/oc/meta/ra/0612010691345'."
                }],
                generated_at: [date_time("2024-02-21T06:31:00+00:00")],
                invalidated_at: [date_time("2024-02-21T06:31:43+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(meta_process)],
                derived_from: [
                    ref("https://w3id.org/oc/meta/ra/0612010691345/prov/se/1"),
                    ref(own(6)),
                ],
                update_query: [{
                    "@value": "DELETE DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora\" . } }; INSERT DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora E.\" . } }"
                }],
            },
            {
                "@id": own(10),
                "@type": [prov + "Entity"],
                description: [{"@value": f"The entity '{agent}' has been deleted."}],
                generated_at: [date_time("2024-12-16T03:14:25+00:00")],
                invalidated_at: [date_time("2024-12-16T03:14:25+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(curator)],
                derived_from: [ref(own(9))],
                update_query: [{
                    "@value": "DELETE DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora E.\" .<https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/familyName> \"Serralde-Zúñiga\" .<https://w3id.org/oc/meta/ra/06490509042> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://xmlns.com/foaf/0.1/Agent> .<https://w3id.org/oc/meta/ra/06490509042> <http://purl.org/spar/datacite/hasIdentifier> <https://w3id.org/oc/meta/id/063201438132> . } }"
                }],
            },
            {
                "@id": own(3),
                "@type": [prov + "Entity"],
                description: [{"@value": f"The entity '{agent}' has been modified."}],
                generated_at: [date_time("2024-02-21T06:24:19+00:00")],
                invalidated_at: [date_time("2024-02-21T06:24:47+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(meta_process)],
                derived_from: [ref(own(2))],
                update_query: [{
                    "@value": "DELETE DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora E.\" . } }; INSERT DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora Elizabeth\" . } }"
                }],
            },
            {
                # se/8 has neither a description nor an update query in the input.
                "@id": own(8),
                "@type": [prov + "Entity"],
                generated_at: [date_time("2024-02-21T06:31:43+00:00")],
                invalidated_at: [date_time("2024-12-04T18:44:08+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(meta_process)],
                derived_from: [
                    ref("https://w3id.org/oc/meta/ra/06320390920/prov/se/1"),
                    ref(own(7)),
                ],
            },
            {
                "@id": own(6),
                "@type": [prov + "Entity"],
                description: [{
                    "@value": f"The entity '{agent}' has been merged with 'https://w3id.org/oc/meta/ra/06520239458'."
                }],
                generated_at: [date_time("2024-02-21T06:30:27+00:00")],
                invalidated_at: [date_time("2024-02-21T06:31:00+00:00")],
                specialization_of: [ref(agent)],
                attributed_to: [ref(meta_process)],
                derived_from: [
                    ref("https://w3id.org/oc/meta/ra/06520239458/prov/se/1"),
                    ref(own(5)),
                ],
                update_query: [{
                    "@value": "DELETE DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora E.\" . } }; INSERT DATA { GRAPH <https://w3id.org/oc/meta/ra/> { <https://w3id.org/oc/meta/ra/06490509042> <http://xmlns.com/foaf/0.1/givenName> \"Aurora\" . } }"
                }],
            },
            {
                "@id": own(2),
                "@type": [prov + "Entity"],
                generated_at: [date_time("2022-12-20T00:00:00+00:00")],
                invalidated_at: [date_time("2024-02-21T06:24:19+00:00")],
                specialization_of: [ref(agent)],
            },
        ],
        "@id": f"{agent}/prov/",
    }

    test_file = os.path.join(self.temp_dir, "complex_merge_chain.zip")
    with zipfile.ZipFile(test_file, 'w') as archive:
        archive.writestr('se.json', json.dumps(test_data))

    # Process the file.
    result = self.processor.process_file(test_file, 'test/fix_provenance_logs')
    self.assertTrue(result)

    # Load the processed data back from the archive.
    with zipfile.ZipFile(test_file, 'r') as archive:
        with archive.open('se.json') as handle:
            fixed_data = json.loads(handle.read())

    graph_data = fixed_data[0]['@graph']

    def snapshot_number(node):
        """Return the trailing snapshot number of the node IRI, or 0 for non-snapshots."""
        node_iri = node['@id']
        if '/prov/se/' not in node_iri:
            return 0
        return int(node_iri.split('/se/')[-1])

    def first_value(node, predicate):
        """Return the first '@value' found under ``predicate``, or None."""
        for entry in node.get(predicate, []):
            if '@value' in entry:
                return entry['@value']
        return None

    def is_merge(node):
        """Tell whether any description of the node mentions a merge."""
        return any(
            "has been merged with" in str(text.get('@value', ''))
            for text in node.get(description, [])
        )

    # Sort the snapshots by snapshot number.
    ordered = sorted(graph_data, key=snapshot_number)

    # 1. Check the temporal chain: where both timestamps exist, a snapshot's
    #    invalidation time must equal the generation time of the next one.
    for earlier, later in zip(ordered, ordered[1:]):
        invalidated = first_value(earlier, invalidated_at)
        generated = first_value(later, generated_at)
        if invalidated and generated:
            self.assertEqual(invalidated, generated,
                             f"Timestamp mismatch between snapshots {earlier['@id']} and {later['@id']}")

    # 2. Check merge consistency: a merge snapshot derives from at least two sources.
    merge_snapshots = [node for node in graph_data if is_merge(node)]
    for merge in merge_snapshots:
        sources = [source['@id'] for source in merge.get(derived_from, [])]
        self.assertGreaterEqual(len(sources), 2,
                                f"Merge snapshot {merge['@id']} should have at least 2 wasDerivedFrom relations")

    # Verify snapshot sequence completeness.
    snapshot_numbers = {
        snapshot_number(node)
        for node in graph_data
        if '/prov/se/' in node['@id']
    }

    # The sequence must start at 1 ...
    self.assertIn(1, snapshot_numbers,
                  "Snapshot sequence should start with number 1")

    # ... and contain no gaps up to the highest number.
    highest = max(snapshot_numbers)
    self.assertEqual(snapshot_numbers, set(range(1, highest + 1)),
                     f"Snapshot sequence should be continuous from 1 to {highest}")

    # Snapshot 1 must carry a creation description.
    snapshot_1 = None
    for node in graph_data:
        if node['@id'].endswith('/prov/se/1'):
            snapshot_1 = node
            break
    self.assertIsNotNone(snapshot_1, "Snapshot 1 should exist")
    creation_found = any(
        "has been created" in text.get('@value', '')
        for text in snapshot_1.get(description, [])
    )
    self.assertTrue(creation_found, "First snapshot should have creation description")

1618 

1619 

# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()