Coverage for meta_prov_fixer / main.py: 0%

50 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-16 15:12 +0000

1#!/usr/bin/env python3 

2import argparse 

3import json 

4import logging 

5import datetime 

6 

7from meta_prov_fixer.src import fix_provenance_process 

8from meta_prov_fixer.virtuoso_watchdog import start_watchdog_thread 

9from meta_prov_fixer.dry_run_utils import create_dry_run_issues_callback 

10 

def load_meta_dumps(json_path: str) -> list[tuple[str, str]]:
    """
    Load meta_dumps_pub_dates from a JSON file.

    The JSON file should contain a list of [date, url] pairs.

    Args:
        json_path: Path to the JSON file to read.

    Returns:
        A list of (date, url) string tuples.

    Raises:
        argparse.ArgumentTypeError: If the file cannot be read, is not valid
            JSON, or is not a list of two-element lists. This exception type
            lets argparse report the failure as a CLI usage error when the
            function is used as an argument ``type=`` converter.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Validate the overall shape before converting, so the error message
        # explains *why* the file was rejected instead of being blank.
        if not isinstance(data, list) or not all(
            isinstance(t, list) and len(t) == 2 for t in data
        ):
            raise ValueError("expected a JSON list of [date, url] pairs")
        return [(str(d[0]), str(d[1])) for d in data]
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise argparse.ArgumentTypeError(
            f"Failed to load meta_dumps_pub_dates from '{json_path}': {e}"
        ) from e

26 

def main():
    """CLI entry point: parse arguments, configure logging, optionally start
    the Virtuoso watchdog, and run the provenance-fixing pipeline."""
    cli = argparse.ArgumentParser(
        description="Run the pipeline for fixing Meta provenance triplestore and RDF files."
    )

    # Today's date is embedded in the default names of both log-like files;
    # compute it once.
    today = datetime.date.today().strftime('%Y-%m-%d')

    cli.add_argument(
        "-e", "--endpoint", type=str, required=True,
        help="SPARQL endpoint URL"
    )
    cli.add_argument(
        "-i", "--data-dir", type=str, required=True,
        help="Path to directory containing the RDF files to process."
    )
    cli.add_argument(
        "-o", "--out-dir", type=str, required=True,
        help="Directory where to save fixed files. If it is the same as data-dir and 'overwrite' is False, an Error will be raised."
    )
    # load_meta_dumps acts as the argparse type converter: the parsed value is
    # already a list of (date, url) tuples.
    cli.add_argument(
        "-m", "--meta-dumps", type=load_meta_dumps, required=True,
        help="Path to JSON file with list of [date, URL] pairs"
    )
    cli.add_argument(
        "--chunk-size", type=int, default=100,
        help="Number of detected issues to process in each SPARQL update query. Default is 100."
    )
    cli.add_argument(
        "--failed-queries-fp", type=str,
        default=f"prov_fix_failed_queries_{today}.txt",
        help="File path to log failed SPARQL update queries. Default is 'prov_fix_failed_queries_<today's date>.txt'."
    )
    cli.add_argument(
        "-l", "--log-fp", type=str,
        default=f"provenance_fix_{today}.log",
        help="File path to log file. Default is 'provenance_fix_<today's date>.log'."
    )
    cli.add_argument(
        "--overwrite-ok", action="store_true",
        help="If specified, allows overwriting the input file with the fixed output without raising errors. "
        "To be overwritten, the input file must still be a decompressed .json file and '--out-dir' must be "
        "the same as '--data-dir'. Default is False."
    )
    cli.add_argument(
        "--checkpoint-fp", type=str, default="fix_prov.checkpoint.json",
        help="File path to store checkpoint information for resuming the process. Default is 'fix_prov.checkpoint.json'."
    )
    cli.add_argument(
        "--cache-fp", type=str, default="filler_issues.cache.json",
        help="File path to store cache of detected issues. Default is 'filler_issues.cache.json'."
    )
    cli.add_argument(
        "-r", "--auto-restart-container", action="store_true",
        help="Enable memory watchdog to auto-restart the Virtuoso Docker container when memory usage is too high."
    )
    cli.add_argument(
        "-v", "--virtuoso-container", type=str, default=None,
        help="Name of the Virtuoso Docker container (required when --auto-restart-container is used)."
    )
    cli.add_argument(
        "--dry-run-db", action="store_true",
        help="If specified, no SPARQL updates are sent to the endpoint. Useful for testing or when you only want to write fixed files."
    )
    cli.add_argument(
        "--dry-run-files", action="store_true",
        help="If specified, no output files are written to out-dir. Useful when you only want to update the database."
    )
    cli.add_argument(
        "--dry-run-issues-dir", type=str, default=None,
        help="Directory where to write issues found during dry-run. If specified with --dry-run-db, creates JSON-Lines files "
        "with issues found in each processed file. Each file contains at most 1000 lines. The callback is only used when "
        "--dry-run-db is enabled."
    )
    cli.add_argument(
        "--dry-run-process-id", type=str, default=None,
        help="Optional identifier for parallel execution (e.g., directory name like 'br', 'ar'). Used to create unique filenames "
        "when running multiple processes with --dry-run-issues-dir to avoid file conflicts."
    )

    opts = cli.parse_args()

    # The watchdog needs a container name to restart; fail fast as a usage error.
    if opts.auto_restart_container and not opts.virtuoso_container:
        cli.error(
            "--virtuoso-container is required when using --auto-restart-container"
        )

    # --- Logging setup ---
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - [%(funcName)s, %(filename)s:%(lineno)d] - %(message)s",
        filename=opts.log_fp
    )

    # --- Start the Virtuoso memory watchdog thread if enabled ---
    if opts.auto_restart_container:
        logging.info("Starting Virtuoso memory watchdog thread...")
        start_watchdog_thread(
            container_name=opts.virtuoso_container,
            endpoint=opts.endpoint
        )
    else:
        logging.info("Auto-restart watchdog disabled.")

    # --- Setup dry-run callback if needed ---
    # Only built when both --dry-run-db and --dry-run-issues-dir are given;
    # otherwise the pipeline receives None and writes no issue files.
    issues_callback = None
    if opts.dry_run_db and opts.dry_run_issues_dir:
        logging.info(f"Creating dry-run issues callback writing to: {opts.dry_run_issues_dir}")
        issues_callback = create_dry_run_issues_callback(
            output_dir=opts.dry_run_issues_dir,
            max_lines_per_file=1000,
            process_id=opts.dry_run_process_id
        )

    fix_provenance_process(
        endpoint=opts.endpoint,
        data_dir=opts.data_dir,
        out_dir=opts.out_dir,
        meta_dumps_register=opts.meta_dumps,
        dry_run_db=opts.dry_run_db,
        dry_run_files=opts.dry_run_files,
        dry_run_callback=issues_callback,
        chunk_size=opts.chunk_size,
        failed_queries_fp=opts.failed_queries_fp,
        overwrite_ok=opts.overwrite_ok,
        resume=True,
        checkpoint_fp=opts.checkpoint_fp,
        cache_fp=opts.cache_fp
    )

170 

171 

# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()

174 

175 

176 

177## Detect and fix provenance issues (with auto-restart watchdog for Virtuoso): 

178## poetry run python meta_prov_fixer/main.py -e http://localhost:8890/sparql/ -i "../meta_prov/br" -o "../fixed/br" -m meta_dumps.json -r -v oc-meta-prov 

179 

180## Run in dry-run mode: only write fixed files, don't update database: 

181## poetry run python meta_prov_fixer/main.py -e http://localhost:8890/sparql/ -i "../meta_prov/br" -o "../fixed/br" -m meta_dumps.json --dry-run-db 

182 

183## Run in dry-run mode with issues logging: only write fixed files and log issues to JSON-Lines: 

184## poetry run python meta_prov_fixer/main.py -e http://localhost:8890/sparql/ -i "../meta_prov/br" -o "../fixed/br" -m meta_dumps.json --dry-run-db --dry-run-issues-dir "issues_output" 

185 

186## Run in dry-run mode with issues logging for parallel execution (avoiding filename conflicts): 

187## poetry run python meta_prov_fixer/main.py -e http://localhost:8890/sparql/ -i "../meta_prov/br" -o "../fixed/br" -m meta_dumps.json --dry-run-db --dry-run-issues-dir "issues_output" --dry-run-process-id "br"