Coverage for oc_meta / run / merge / compact_output_csv.py: 85%

52 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-04-21 09:24 +0000

1#!/usr/bin/env python 

2 

3# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

4# 

5# SPDX-License-Identifier: ISC 

6 

7import argparse 

8import csv 

9import os 

10from pathlib import Path 

11from typing import List, Tuple, TypedDict 

12 

13from rich_argparse import RichHelpFormatter 

14from tqdm import tqdm 

15 

16 

# Result record returned by process_merge_file (functional TypedDict form).
MergeFileResult = TypedDict(
    "MergeFileResult",
    {
        # (surviving_entity, merged_entities) pairs from rows marked Done=True
        "valid_entries": List[Tuple[str, str]],
        # total number of data rows read, regardless of Done status
        "total_rows": int,
    },
)

20 

21 

def process_merge_file(input_file: str) -> MergeFileResult:
    """Process a single merge CSV file and return surviving and merged entities.

    Args:
        input_file: Path to the input CSV file

    Returns:
        Dictionary containing:
            'valid_entries': List of tuples (surviving_entity, merged_entities)
                for rows with Done=True (case-insensitive)
            'total_rows': Total number of data rows in the file
    """
    valid_entries: List[Tuple[str, str]] = []
    total_rows = 0

    with open(input_file, "r", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            total_rows += 1
            # csv.DictReader fills missing trailing fields with None (restval),
            # so guard with `or ""` before calling str methods — otherwise a
            # short/malformed row crashes with AttributeError.
            done = (row.get("Done") or "").lower() == "true"
            if done:
                surviving = (row["surviving_entity"] or "").strip()
                merged = (row["merged_entities"] or "").strip()
                valid_entries.append((surviving, merged))

    return {"valid_entries": valid_entries, "total_rows": total_rows}

47 

48 

def process_merge_directory(input_dir: str, output_file: str):
    """Process all CSV files in a directory and create a single output file.

    Args:
        input_dir: Directory containing input CSV files
        output_file: Path where to save the output CSV file
    """
    all_results: List[Tuple[str, str]] = []
    total_rows = 0

    # Sort for deterministic output row order: Path.glob yields entries in
    # arbitrary, filesystem-dependent order.
    csv_files = sorted(Path(input_dir).glob("*.csv"))

    # Process all CSV files in directory with progress bar
    for file in tqdm(csv_files, desc="Processing files", unit="file"):
        file_results = process_merge_file(str(file))
        all_results.extend(file_results["valid_entries"])
        total_rows += file_results["total_rows"]

    # Write combined results to the single output file
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["surviving_entity", "merged_entities"])
        writer.writerows(all_results)

    # Summary statistics for the operator
    valid_entries = len(all_results)
    valid_percentage = (valid_entries / total_rows * 100) if total_rows > 0 else 0

    print(f"\nProcessed {len(csv_files)} files")
    print(f"Total rows processed: {total_rows}")
    print(f"Valid merge entries found: {valid_entries} ({valid_percentage:.1f}%)")
    print(f"Results written to: {output_file}")

83 

84 

def main():
    """CLI entry point: parse arguments and run the directory processing."""
    arg_parser = argparse.ArgumentParser(
        description="Process merge CSV files and combine into single output",
        formatter_class=RichHelpFormatter,
    )
    # Positional arguments, registered table-style.
    for arg_name, arg_help in (
        ("input_dir", "Directory containing input CSV files"),
        ("output_file", "Path for output CSV file"),
    ):
        arg_parser.add_argument(arg_name, help=arg_help)

    parsed = arg_parser.parse_args()

    # Fail fast before touching the output path.
    if not os.path.isdir(parsed.input_dir):
        raise ValueError(f"Input directory does not exist: {parsed.input_dir}")

    process_merge_directory(parsed.input_dir, parsed.output_file)

99 

100 

# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()