Coverage for oc_meta / run / find / merged_entities.py: 0%

100 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-03 17:25 +0000

1import argparse 

2import csv 

3import json 

4import os 

5import zipfile 

6from collections import defaultdict 

7from concurrent.futures import ProcessPoolExecutor, as_completed 

8 

9import yaml 

10from rich.console import Console 

11from rich_argparse import RichHelpFormatter 

12from tqdm import tqdm 

13 

# Shared Rich console for all status output in this script.
console = Console()

# PROV-O predicate URIs used as JSON-LD keys in the se.json provenance graphs.
# prov:wasDerivedFrom — links a snapshot to the prior snapshot(s) it came from.
PROV_DERIVED_FROM = "http://www.w3.org/ns/prov#wasDerivedFrom"
# prov:specializationOf — read in process_prov_file as the snapshot's own entity URI.
PROV_SPECIALIZATION_OF = "http://www.w3.org/ns/prov#specializationOf"

18 

19 

def extract_entity_from_snapshot(snapshot_uri: str) -> str:
    """Return the entity URI obtained by cutting *snapshot_uri* at its ``/prov/`` suffix.

    If the marker is absent, the URI is returned unchanged.
    """
    entity_uri, _, _ = snapshot_uri.partition("/prov/")
    return entity_uri

22 

23 

def find_prov_files(rdf_dir: str, entity_type: str) -> list[str]:
    """Collect the path of every ``se.zip`` provenance archive under *rdf_dir*/*entity_type*.

    The directory tree is walked recursively; a missing directory simply
    yields an empty list.
    """
    base_dir = os.path.join(rdf_dir, entity_type)
    return [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(base_dir)
        for filename in filenames
        if filename == "se.zip"
    ]

34 

35 

def process_prov_file(prov_file: str) -> list[tuple[str, str]]:
    """Extract (surviving_entity, merged_entity) pairs from one ``se.zip`` archive.

    A merge is detected when a snapshot was derived from two or more prior
    snapshots; each derivation that points at a different entity than the
    snapshot's own (its ``prov:specializationOf`` target) contributes one
    pair. Unreadable or malformed archives yield an empty list.
    """
    try:
        with zipfile.ZipFile(prov_file, "r") as archive:
            with archive.open("se.json") as handle:
                graphs = json.load(handle)
    except (zipfile.BadZipFile, json.JSONDecodeError, KeyError):
        # Best-effort: skip corrupt archives rather than aborting the scan.
        return []

    pairs: list[tuple[str, str]] = []
    for graph in graphs:
        for snapshot in graph.get("@graph", []):
            sources = snapshot.get(PROV_DERIVED_FROM, [])
            specializations = snapshot.get(PROV_SPECIALIZATION_OF, [])
            # A single derivation is a plain update, not a merge; a snapshot
            # without a specialization target cannot be attributed.
            if len(sources) < 2 or not specializations:
                continue
            survivor = specializations[0]["@id"]
            for source in sources:
                candidate = extract_entity_from_snapshot(source["@id"])
                if candidate != survivor:
                    pairs.append((survivor, candidate))
    return pairs

66 

67 

def build_merge_graph(
    merge_results: list[tuple[str, str]],
) -> dict[str, str]:
    """Map each merged entity to its directly recorded surviving entity.

    When a merged entity appears in several pairs, the last occurrence wins,
    mirroring plain dict assignment order.
    """
    return {merged: surviving for surviving, merged in merge_results}

77 

78 

def find_final_surviving(entity: str, merged_to_surviving: dict[str, str]) -> str:
    """Follow merge links from *entity* to the entity that ultimately survived.

    Traversal is cycle-safe: it stops (returning the last new entity reached)
    as soon as an already-visited entity would be revisited.
    """
    seen = {entity}
    current = entity
    while True:
        successor = merged_to_surviving.get(current)
        if successor is None or successor in seen:
            return current
        seen.add(successor)
        current = successor

91 

92 

def group_by_final_surviving(
    merged_to_surviving: dict[str, str],
) -> dict[str, list[str]]:
    """Group every merged entity under the entity at the end of its merge chain."""
    groups: dict[str, list[str]] = {}
    for merged_entity in merged_to_surviving:
        final = find_final_surviving(merged_entity, merged_to_surviving)
        groups.setdefault(final, []).append(merged_entity)
    return groups

103 

104 

def main():
    """CLI entry point: scan provenance archives, reconstruct merge chains, write CSV.

    Reads the meta configuration YAML for the RDF output directory, fans the
    per-archive scan out over a process pool, collapses the derivation pairs
    into merge chains, and writes one CSV row per final surviving entity.
    """
    parser = argparse.ArgumentParser(
        description="Find all merged entities and reconstruct merge chains from provenance files",
        formatter_class=RichHelpFormatter,
    )
    parser.add_argument("-c", "--config", required=True, help="Path to meta configuration YAML file")
    parser.add_argument("-o", "--output", required=True, help="Output CSV file path")
    parser.add_argument(
        "--entity-type",
        choices=["br", "ra", "id", "ar", "re"],
        required=True,
        help="Entity type to search",
    )
    parser.add_argument("--workers", type=int, default=4, help="Number of parallel workers")
    args = parser.parse_args()

    with open(args.config) as cfg_file:
        settings = yaml.safe_load(cfg_file)

    rdf_dir = os.path.join(settings["output_rdf_dir"], "rdf")

    console.print(f"Scanning for provenance files in: {rdf_dir}/{args.entity_type}")
    prov_files = find_prov_files(rdf_dir, args.entity_type)
    console.print(f"Found {len(prov_files)} provenance files")

    # CPU-bound JSON parsing: fan out across processes, gather as they finish.
    merge_pairs: list[tuple[str, str]] = []
    with ProcessPoolExecutor(max_workers=args.workers) as pool:
        pending = {pool.submit(process_prov_file, path): path for path in prov_files}
        for done in tqdm(as_completed(pending), total=len(pending), desc="Processing files"):
            merge_pairs.extend(done.result())

    console.print(f"Found {len(merge_pairs)} merge derivations")

    merged_to_surviving = build_merge_graph(merge_pairs)
    console.print(f"Found {len(merged_to_surviving)} merged entities")

    final_to_merged = group_by_final_surviving(merged_to_surviving)
    console.print(f"Found {len(final_to_merged)} surviving entities")

    with open(args.output, "w", newline="") as out_file:
        writer = csv.DictWriter(out_file, fieldnames=["surviving_entity", "merged_entities"])
        writer.writeheader()
        for survivor, merged_list in final_to_merged.items():
            writer.writerow({
                "surviving_entity": survivor,
                "merged_entities": "; ".join(merged_list),
            })

    console.print(f"Output written to: {args.output}")


if __name__ == "__main__":
    main()