Coverage for oc_meta / run / merge / compact_output_csv.py: 85%

52 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-21 09:24 +0000

1#!/usr/bin/env python 

2 

3# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

4# 

5# SPDX-License-Identifier: ISC 

6 

7import argparse 

8import csv 

9import os 

10from pathlib import Path 

11from typing import List, Tuple, TypedDict 

12 

13from rich_argparse import RichHelpFormatter 

14from tqdm import tqdm 

15 

16 

# Result record returned by process_merge_file (functional TypedDict form).
MergeFileResult = TypedDict(
    "MergeFileResult",
    {
        # (surviving_entity, merged_entities) pairs from rows marked Done=True
        "valid_entries": List[Tuple[str, str]],
        # total number of data rows read, regardless of Done status
        "total_rows": int,
    },
)

20 

21 

def process_merge_file(input_file: str) -> MergeFileResult:
    """Process a single merge CSV file and return surviving and merged entities.

    Args:
        input_file: Path to the input CSV file

    Returns:
        Dictionary containing:
            'valid_entries': List of tuples (surviving_entity, merged_entities)
                for rows with Done=True (case-insensitive)
            'total_rows': Total number of data rows in the file
    """
    valid_entries: List[Tuple[str, str]] = []
    total_rows = 0

    with open(input_file, "r", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            total_rows += 1
            # csv.DictReader fills missing trailing fields with None (restval),
            # so guard with `or ""` before calling str methods — otherwise a
            # short/malformed row crashes with AttributeError.
            done = (row.get("Done") or "").lower() == "true"
            if done:
                surviving = (row["surviving_entity"] or "").strip()
                merged = (row["merged_entities"] or "").strip()
                valid_entries.append((surviving, merged))

    return {"valid_entries": valid_entries, "total_rows": total_rows}

47 

48 

def process_merge_directory(input_dir: str, output_file: str):
    """Process all CSV files in a directory and create a single output file.

    Args:
        input_dir: Directory containing input CSV files
        output_file: Path where to save the output CSV file
    """
    all_results: List[Tuple[str, str]] = []
    total_rows = 0

    # Sort for deterministic output row order: Path.glob yields entries in
    # arbitrary, filesystem-dependent order.
    csv_files = sorted(Path(input_dir).glob("*.csv"))

    # Process all CSV files in directory with progress bar
    for file in tqdm(csv_files, desc="Processing files", unit="file"):
        file_results = process_merge_file(str(file))
        all_results.extend(file_results["valid_entries"])
        total_rows += file_results["total_rows"]

    # Write combined results to the single output file
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["surviving_entity", "merged_entities"])
        writer.writerows(all_results)

    # Summary statistics for the operator
    valid_entries = len(all_results)
    valid_percentage = (valid_entries / total_rows * 100) if total_rows > 0 else 0

    print(f"\nProcessed {len(csv_files)} files")
    print(f"Total rows processed: {total_rows}")
    print(f"Valid merge entries found: {valid_entries} ({valid_percentage:.1f}%)")
    print(f"Results written to: {output_file}")

83 

84 

def main():
    """CLI entry point: parse arguments and run the directory processing."""
    arg_parser = argparse.ArgumentParser(
        description="Process merge CSV files and combine into single output",
        formatter_class=RichHelpFormatter,
    )
    # Positional arguments, registered table-style.
    for arg_name, arg_help in (
        ("input_dir", "Directory containing input CSV files"),
        ("output_file", "Path for output CSV file"),
    ):
        arg_parser.add_argument(arg_name, help=arg_help)

    parsed = arg_parser.parse_args()

    # Fail fast before touching the output path.
    if not os.path.isdir(parsed.input_dir):
        raise ValueError(f"Input directory does not exist: {parsed.input_dir}")

    process_merge_directory(parsed.input_dir, parsed.output_file)

99 

100 

# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()