Coverage for oc_meta/run/merge/compact_output_csv.py: 85%
52 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-21 09:24 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-04-21 09:24 +0000
1#!/usr/bin/env python
3# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelo.massari@unibo.it>
4#
5# SPDX-License-Identifier: ISC
7import argparse
8import csv
9import os
10from pathlib import Path
11from typing import List, Tuple, TypedDict
13from rich_argparse import RichHelpFormatter
14from tqdm import tqdm
# Result of processing one merge CSV: the surviving/merged pairs that were
# marked Done, plus the total number of data rows scanned.  Declared with the
# functional TypedDict form; identical to the class-based declaration.
MergeFileResult = TypedDict(
    "MergeFileResult",
    {"valid_entries": List[Tuple[str, str]], "total_rows": int},
)
22def process_merge_file(input_file: str) -> MergeFileResult:
23 """Process a single merge CSV file and return list of surviving and merged entities.
25 Args:
26 input_file: Path to the input CSV file
28 Returns:
29 Dictionary containing:
30 'valid_entries': List of tuples (surviving_entity, merged_entities) for rows with Done=True
31 'total_rows': Total number of data rows in the file
32 """
33 results = []
34 total_rows = 0
36 with open(input_file, "r", encoding="utf-8") as f:
37 reader = csv.DictReader(f)
38 for row in reader:
39 total_rows += 1
40 done = row.get("Done", "").lower() == "true"
41 if done:
42 surviving = row["surviving_entity"].strip()
43 merged = row["merged_entities"].strip()
44 results.append((surviving, merged))
46 return {"valid_entries": results, "total_rows": total_rows}
def process_merge_directory(input_dir: str, output_file: str):
    """Combine every merge CSV in a directory into a single output CSV.

    Args:
        input_dir: Directory containing input CSV files
        output_file: Path where to save the output CSV file
    """
    source_files = list(Path(input_dir).glob("*.csv"))

    combined: List[Tuple[str, str]] = []
    row_count = 0

    # Aggregate the Done=True entries (and the row totals) from each file.
    for csv_path in tqdm(source_files, desc="Processing files", unit="file"):
        outcome = process_merge_file(str(csv_path))
        combined.extend(outcome["valid_entries"])
        row_count += outcome["total_rows"]

    # Single output file: header row followed by every surviving/merged pair.
    with open(output_file, "w", encoding="utf-8", newline="") as out_handle:
        out_writer = csv.writer(out_handle)
        out_writer.writerow(["surviving_entity", "merged_entities"])
        out_writer.writerows(combined)

    # Summary statistics for the operator.
    kept = len(combined)
    kept_pct = (kept / row_count * 100) if row_count > 0 else 0

    print(f"\nProcessed {len(source_files)} files")
    print(f"Total rows processed: {row_count}")
    print(f"Valid merge entries found: {kept} ({kept_pct:.1f}%)")
    print(f"Results written to: {output_file}")
def main():
    """CLI entry point: parse arguments, validate the input directory, run the merge."""
    arg_parser = argparse.ArgumentParser(
        description="Process merge CSV files and combine into single output",
        formatter_class=RichHelpFormatter,
    )
    arg_parser.add_argument("input_dir", help="Directory containing input CSV files")
    arg_parser.add_argument("output_file", help="Path for output CSV file")

    parsed = arg_parser.parse_args()

    # Fail fast on a bad directory before touching the output file.
    if not os.path.isdir(parsed.input_dir):
        raise ValueError(f"Input directory does not exist: {parsed.input_dir}")

    process_merge_directory(parsed.input_dir, parsed.output_file)


if __name__ == "__main__":
    main()