Coverage for meta_prov_fixer / main.py: 0%
50 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-16 15:12 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-16 15:12 +0000
1#!/usr/bin/env python3
2import argparse
3import json
4import logging
5import datetime
7from meta_prov_fixer.src import fix_provenance_process
8from meta_prov_fixer.virtuoso_watchdog import start_watchdog_thread
9from meta_prov_fixer.dry_run_utils import create_dry_run_issues_callback
def load_meta_dumps(json_path: str):
    """
    Load meta_dumps_pub_dates from a JSON file.

    The JSON file must contain a list of [date, url] pairs, e.g.
    [["2024-01-01", "https://example.org/dump.zip"], ...].

    :param json_path: Path to the JSON file.
    :return: List of (date, url) tuples, both elements coerced to str.
    :raises argparse.ArgumentTypeError: if the file cannot be read, is not
        valid JSON, or does not contain a list of two-element pairs.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Validate the overall shape explicitly so the CLI error is precise:
        # a dict or string top-level value would otherwise slip past the
        # per-entry check (iterating a dict yields its keys).
        if not isinstance(data, list):
            raise ValueError("expected a JSON list of [date, url] pairs")
        if not all(isinstance(t, list) and len(t) == 2 for t in data):
            raise ValueError("every entry must be a two-element [date, url] pair")
        return [(str(d), str(u)) for d, u in data]
    except Exception as e:
        # Re-raise as ArgumentTypeError so argparse reports a clean CLI error;
        # chain the cause (`from e`) to preserve the original traceback.
        raise argparse.ArgumentTypeError(
            f"Failed to load meta_dumps_pub_dates from '{json_path}': {e}"
        ) from e
def _build_parser() -> argparse.ArgumentParser:
    """Build and return the CLI argument parser for the provenance-fixing pipeline."""
    parser = argparse.ArgumentParser(
        description="Run the pipeline for fixing Meta provenance triplestore and RDF files."
    )
    # Date stamp used for the default log/failed-queries filenames; computed
    # once here so both defaults agree.
    today = datetime.date.today().strftime('%Y-%m-%d')

    parser.add_argument(
        "-e", "--endpoint", type=str, required=True,
        help="SPARQL endpoint URL"
    )

    parser.add_argument(
        "-i", "--data-dir", type=str, required=True,
        help="Path to directory containing the RDF files to process."
    )

    parser.add_argument(
        "-o", "--out-dir", type=str, required=True,
        help="Directory where to save fixed files. If it is the same as data-dir and 'overwrite' is False, an Error will be raised."
    )

    # type=load_meta_dumps: argparse invokes the loader at parse time, so a
    # bad file surfaces as a clean CLI error rather than a traceback later.
    parser.add_argument(
        "-m", "--meta-dumps", type=load_meta_dumps, required=True,
        help="Path to JSON file with list of [date, URL] pairs"
    )

    parser.add_argument(
        "--chunk-size", type=int, default=100,
        help="Number of detected issues to process in each SPARQL update query. Default is 100."
    )

    parser.add_argument(
        "--failed-queries-fp", type=str, default=f"prov_fix_failed_queries_{today}.txt",
        help="File path to log failed SPARQL update queries. Default is 'prov_fix_failed_queries_<today's date>.txt'."
    )

    parser.add_argument(
        "-l", "--log-fp", type=str,
        default=f"provenance_fix_{today}.log",
        help="File path to log file. Default is 'provenance_fix_<today's date>.log'."
    )

    parser.add_argument(
        "--overwrite-ok", action="store_true",
        help="If specified, allows overwriting the input file with the fixed output without raising errors. "
             "To be overwritten, the input file must still be a decompressed .json file and '--out-dir' must be "
             "the same as '--data-dir'. Default is False."
    )

    parser.add_argument(
        "--checkpoint-fp", type=str, default="fix_prov.checkpoint.json",
        help="File path to store checkpoint information for resuming the process. Default is 'fix_prov.checkpoint.json'."
    )

    parser.add_argument(
        "--cache-fp", type=str, default="filler_issues.cache.json",
        help="File path to store cache of detected issues. Default is 'filler_issues.cache.json'."
    )

    parser.add_argument(
        "-r", "--auto-restart-container", action="store_true",
        help="Enable memory watchdog to auto-restart the Virtuoso Docker container when memory usage is too high."
    )

    parser.add_argument(
        "-v", "--virtuoso-container", type=str, default=None,
        help="Name of the Virtuoso Docker container (required when --auto-restart-container is used)."
    )

    parser.add_argument(
        "--dry-run-db", action="store_true",
        help="If specified, no SPARQL updates are sent to the endpoint. Useful for testing or when you only want to write fixed files."
    )

    parser.add_argument(
        "--dry-run-files", action="store_true",
        help="If specified, no output files are written to out-dir. Useful when you only want to update the database."
    )

    parser.add_argument(
        "--dry-run-issues-dir", type=str, default=None,
        help="Directory where to write issues found during dry-run. If specified with --dry-run-db, creates JSON-Lines files "
             "with issues found in each processed file. Each file contains at most 1000 lines. The callback is only used when "
             "--dry-run-db is enabled."
    )

    parser.add_argument(
        "--dry-run-process-id", type=str, default=None,
        help="Optional identifier for parallel execution (e.g., directory name like 'br', 'ar'). Used to create unique filenames "
             "when running multiple processes with --dry-run-issues-dir to avoid file conflicts."
    )

    return parser


def main():
    """
    CLI entry point: parse arguments, configure logging, optionally start the
    Virtuoso memory watchdog, and run the provenance-fixing pipeline.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # The watchdog needs to know which container to restart; fail fast with a
    # proper argparse error instead of crashing later in the watchdog thread.
    if args.auto_restart_container and not args.virtuoso_container:
        parser.error(
            "--virtuoso-container is required when using --auto-restart-container"
        )

    # --- Logging setup ---
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - [%(funcName)s, %(filename)s:%(lineno)d] - %(message)s",
        filename=args.log_fp
    )

    # --- Start the Virtuoso memory watchdog thread if enabled ---
    if args.auto_restart_container:
        logging.info("Starting Virtuoso memory watchdog thread...")
        start_watchdog_thread(
            container_name=args.virtuoso_container,
            endpoint=args.endpoint
        )
    else:
        logging.info("Auto-restart watchdog disabled.")

    # --- Setup dry-run callback if needed ---
    dry_run_callback = None
    if args.dry_run_db and args.dry_run_issues_dir:
        logging.info(f"Creating dry-run issues callback writing to: {args.dry_run_issues_dir}")
        dry_run_callback = create_dry_run_issues_callback(
            output_dir=args.dry_run_issues_dir,
            max_lines_per_file=1000,
            process_id=args.dry_run_process_id
        )
    elif args.dry_run_issues_dir:
        # The issues callback only fires in DB dry-run mode; warn so the flag
        # is not silently ignored when --dry-run-db was forgotten.
        logging.warning(
            "--dry-run-issues-dir has no effect without --dry-run-db; ignoring it."
        )

    fix_provenance_process(
        endpoint=args.endpoint,
        data_dir=args.data_dir,
        out_dir=args.out_dir,
        meta_dumps_register=args.meta_dumps,
        dry_run_db=args.dry_run_db,
        dry_run_files=args.dry_run_files,
        dry_run_callback=dry_run_callback,
        chunk_size=args.chunk_size,
        failed_queries_fp=args.failed_queries_fp,
        overwrite_ok=args.overwrite_ok,
        resume=True,
        checkpoint_fp=args.checkpoint_fp,
        cache_fp=args.cache_fp
    )
# Allow running this module directly as a script (see usage examples below).
if __name__ == "__main__":
    main()
177## Detect and fix provenance issues (with auto-restart watchdog for Virtuoso):
178## poetry run python meta_prov_fixer/main.py -e http://localhost:8890/sparql/ -i "../meta_prov/br" -o "../fixed/br" -m meta_dumps.json -r -v oc-meta-prov
180## Run in dry-run mode: only write fixed files, don't update database:
181## poetry run python meta_prov_fixer/main.py -e http://localhost:8890/sparql/ -i "../meta_prov/br" -o "../fixed/br" -m meta_dumps.json --dry-run-db
183## Run in dry-run mode with issues logging: only write fixed files and log issues to JSON-Lines:
184## poetry run python meta_prov_fixer/main.py -e http://localhost:8890/sparql/ -i "../meta_prov/br" -o "../fixed/br" -m meta_dumps.json --dry-run-db --dry-run-issues-dir "issues_output"
186## Run in dry-run mode with issues logging for parallel execution (avoiding filename conflicts):
187## poetry run python meta_prov_fixer/main.py -e http://localhost:8890/sparql/ -i "../meta_prov/br" -o "../fixed/br" -m meta_dumps.json --dry-run-db --dry-run-issues-dir "issues_output" --dry-run-process-id "br"