Coverage for oc_meta / run / merge / compact_output_csv.py: 85%

52 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-03 17:25 +0000

1#!/usr/bin/env python 

2# -*- coding: utf-8 -*- 

3# Copyright (c) 2025 Arcangelo Massari <arcangelo.massari@unibo.it> 

4# 

5# Permission to use, copy, modify, and/or distribute this software for any purpose 

6# with or without fee is hereby granted, provided that the above copyright notice 

7# and this permission notice appear in all copies. 

8# 

9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 

10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 

11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, 

12# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 

13# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 

14# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 

15# SOFTWARE. 

16 

17import argparse 

18import csv 

19import os 

20from pathlib import Path 

21from typing import List, Tuple, TypedDict 

22 

23from rich_argparse import RichHelpFormatter 

24from tqdm import tqdm 

25 

26 

# Shape of the record returned for one processed merge CSV:
#   "valid_entries" — (surviving_entity, merged_entities) pairs for Done=True rows
#   "total_rows"    — count of all data rows seen in the file
MergeFileResult = TypedDict(
    "MergeFileResult",
    {
        "valid_entries": List[Tuple[str, str]],
        "total_rows": int,
    },
)

30 

31 

def process_merge_file(input_file: str) -> "MergeFileResult":
    """Process a single merge CSV file and return list of surviving and merged entities.

    Args:
        input_file: Path to the input CSV file

    Returns:
        Dictionary containing:
            'valid_entries': List of tuples (surviving_entity, merged_entities)
                for rows whose Done column is (case-insensitively) "true"
            'total_rows': Total number of data rows in the file
    """
    results: List[Tuple[str, str]] = []
    total_rows = 0

    with open(input_file, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            total_rows += 1
            # csv.DictReader yields None for fields missing from a short row
            # (and .get() returns None for absent columns), so coalesce to ""
            # before calling str methods to avoid AttributeError/KeyError.
            done = (row.get("Done") or "").strip().lower() == "true"
            if done:
                surviving = (row.get("surviving_entity") or "").strip()
                merged = (row.get("merged_entities") or "").strip()
                results.append((surviving, merged))

    return {"valid_entries": results, "total_rows": total_rows}

57 

58 

def process_merge_directory(input_dir: str, output_file: str):
    """Combine every merge CSV in a directory into a single output CSV.

    Args:
        input_dir: Directory containing input CSV files
        output_file: Path where to save the output CSV file
    """
    source_dir = Path(input_dir)
    csv_files = list(source_dir.glob("*.csv"))

    # Accumulate valid (surviving, merged) pairs and the overall row count.
    combined: List[Tuple[str, str]] = []
    rows_seen = 0
    for csv_path in tqdm(csv_files, desc="Processing files", unit="file"):
        outcome = process_merge_file(str(csv_path))
        combined.extend(outcome["valid_entries"])
        rows_seen += outcome["total_rows"]

    # Emit a single two-column CSV with all surviving/merged pairs.
    with open(output_file, "w", encoding="utf-8", newline="") as out:
        sink = csv.writer(out)
        sink.writerow(["surviving_entity", "merged_entities"])
        sink.writerows(combined)

    # Summary statistics for the operator.
    kept = len(combined)
    kept_pct = (kept / rows_seen * 100) if rows_seen > 0 else 0
    print(f"\nProcessed {len(csv_files)} files")
    print(f"Total rows processed: {rows_seen}")
    print(f"Valid merge entries found: {kept} ({kept_pct:.1f}%)")
    print(f"Results written to: {output_file}")

93 

94 

def main():
    """CLI entry point: parse arguments, validate the input directory, and run."""
    arg_parser = argparse.ArgumentParser(
        description="Process merge CSV files and combine into single output",
        formatter_class=RichHelpFormatter,
    )
    arg_parser.add_argument("input_dir", help="Directory containing input CSV files")
    arg_parser.add_argument("output_file", help="Path for output CSV file")
    parsed = arg_parser.parse_args()

    # Fail fast with a clear message before any processing starts.
    if not os.path.isdir(parsed.input_dir):
        raise ValueError(f"Input directory does not exist: {parsed.input_dir}")

    process_merge_directory(parsed.input_dir, parsed.output_file)

109 

110 

# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()