Coverage for oc_meta / run / merge / compact_output_csv.py: 85%
52 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-03 17:25 +0000
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# Copyright (c) 2025 Arcangelo Massari <arcangelo.massari@unibo.it>
4#
5# Permission to use, copy, modify, and/or distribute this software for any purpose
6# with or without fee is hereby granted, provided that the above copyright notice
7# and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
12# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
13# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
14# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
15# SOFTWARE.
17import argparse
18import csv
19import os
20from pathlib import Path
21from typing import List, Tuple, TypedDict
23from rich_argparse import RichHelpFormatter
24from tqdm import tqdm
class MergeFileResult(TypedDict):
    """Result of processing a single merge CSV file."""

    # (surviving_entity, merged_entities) pairs taken from rows whose
    # "Done" column equals "true" (case-insensitive)
    valid_entries: List[Tuple[str, str]]
    # total number of data rows read from the file (header excluded)
    total_rows: int
def process_merge_file(input_file: str) -> MergeFileResult:
    """Read one merge CSV file and collect its completed merge entries.

    Args:
        input_file: Path to the input CSV file

    Returns:
        Dictionary containing:
            'valid_entries': List of tuples (surviving_entity, merged_entities)
                for rows whose "Done" column is "true" (case-insensitive)
            'total_rows': Total number of data rows in the file
    """
    valid_entries: List[Tuple[str, str]] = []
    row_count = 0

    with open(input_file, "r", encoding="utf-8") as csv_file:
        for record in csv.DictReader(csv_file):
            row_count += 1
            # Only rows explicitly marked as completed contribute entries.
            if record.get("Done", "").lower() == "true":
                valid_entries.append(
                    (
                        record["surviving_entity"].strip(),
                        record["merged_entities"].strip(),
                    )
                )

    return {"valid_entries": valid_entries, "total_rows": row_count}
def process_merge_directory(input_dir: str, output_file: str):
    """Process all CSV files in a directory and create a single output file.

    Args:
        input_dir: Directory containing input CSV files
        output_file: Path where to save the output CSV file
    """
    all_results: List[Tuple[str, str]] = []
    total_rows = 0
    input_path = Path(input_dir)

    # Sort the file list: Path.glob yields entries in a filesystem-dependent
    # order, which would make the row order of the output CSV non-deterministic
    # across runs and machines.
    csv_files = sorted(input_path.glob("*.csv"))

    # Process all CSV files in directory with progress bar
    for file in tqdm(csv_files, desc="Processing files", unit="file"):
        file_results = process_merge_file(str(file))
        all_results.extend(file_results["valid_entries"])
        total_rows += file_results["total_rows"]

    # Write results to output file
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["surviving_entity", "merged_entities"])
        writer.writerows(all_results)

    # Calculate statistics (guard against division by zero for empty input)
    valid_entries = len(all_results)
    valid_percentage = (valid_entries / total_rows * 100) if total_rows > 0 else 0

    print(f"\nProcessed {len(csv_files)} files")
    print(f"Total rows processed: {total_rows}")
    print(f"Valid merge entries found: {valid_entries} ({valid_percentage:.1f}%)")
    print(f"Results written to: {output_file}")
def main():
    """CLI entry point: parse arguments and combine the merge CSV files."""
    arg_parser = argparse.ArgumentParser(
        description="Process merge CSV files and combine into single output",
        formatter_class=RichHelpFormatter,
    )
    arg_parser.add_argument("input_dir", help="Directory containing input CSV files")
    arg_parser.add_argument("output_file", help="Path for output CSV file")
    args = arg_parser.parse_args()

    # Fail fast with a clear error when the input directory is missing.
    if not os.path.isdir(args.input_dir):
        raise ValueError(f"Input directory does not exist: {args.input_dir}")

    process_merge_directory(args.input_dir, args.output_file)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()