Coverage for oc_validator / interface / gui.py: 0%

101 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-30 15:46 +0000

1# ISC License 

2# 

3# Copyright (c) 2023-2026, Elia Rizzetto, Silvio Peroni 

4# 

5# Permission to use, copy, modify, and/or distribute this software for any 

6# purpose with or without fee is hereby granted, provided that the above 

7# copyright notice and this permission notice appear in all copies. 

8# 

9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 

10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 

11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 

12# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 

13# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 

14# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 

15# PERFORMANCE OF THIS SOFTWARE. 

16 

17from bs4 import BeautifulSoup, Tag 

18from oc_validator.table_reader import MetadataRow, CitationsRow, AgentItem, VenueInfo 

19from typing import Union, List 

20import colorsys 

21import json 

22from oc_validator.helper import CSVStreamReader 

23from jinja2 import Environment, FileSystemLoader 

24from os.path import dirname, abspath, realpath 

25import random 

26from os.path import join 

27 

def generate_error_colors(n: int) -> set[str]:
    """
    Produce *n* distinct random colours encoded as hex strings.

    Colours are sampled in HSV space, with saturation and value confined to
    ranges that keep the resulting tones readable (neither washed out nor
    too dark). Sampling repeats until *n* unique hex strings are collected.

    :param n: Number of distinct colours to generate.
    :type n: int
    :return: Set of hex colour strings (e.g. ``'#a3f2c1'``).
    :rtype: set[str]
    """
    palette: set[str] = set()

    # A set guarantees uniqueness; loop until we have gathered n colours.
    while len(palette) < n:
        hue = random.random()
        sat = random.uniform(0.5, 0.9)
        val = random.uniform(0.7, 0.95)

        red, green, blue = colorsys.hsv_to_rgb(hue, sat, val)

        palette.add(
            '#%02x%02x%02x' % (int(red * 255), int(green * 255), int(blue * 255))
        )

    return palette

56 

57 

def model_row_default(row:Union[MetadataRow, CitationsRow], row_idx: int) -> dict:
    """
    Models a row of the input table as a dictionary with a structure suitable
    for enrichment with error information and rendering in the HTML template.
    Each field value is broken down into items, and each item carries an
    initially-empty list of associated issues that can be populated later.

    The resulting dictionary has the following structure: ::

        {
            "contains_issue": False, # will be updated to True if any error is associated
            "row_idx": 4, # index of the row in the original table
            "fields": {
                "id": [
                    {
                        "raw": "doi:10.4242/x", # original value of the whole item (without separators)
                        "item_id": "4-id-0", # unique identifier for the item
                        "issues": [] # error IDs that affect this item (default empty list)
                    },
                    ...
                ],
                "title": [...],
                ...
            }
        }

    :param row: the original row to be modelled
    :type row: Union[MetadataRow, CitationsRow]
    :param row_idx: 0-based index of the row in the original table, used to create unique item IDs and for error mapping
    :type row_idx: int
    :return: a dictionary representing the modelled row, ready for enrichment with error information and rendering in the HTML template
    :rtype: dict
    """
    modeled = {
        "contains_issue": False,
        "row_idx": row_idx,
        "fields": {}  # <field label>: [dict]
    }

    for label, items in row.flat_serialise().items():
        if items:
            # One model entry per item; the positional index makes item IDs unique.
            modeled['fields'][label] = [
                {
                    "raw": item,
                    "item_id": f"{row_idx}-{label}-{pos}",
                    "issues": []
                }
                for pos, item in enumerate(items)
            ]
        else:
            # An empty field (None in the report) is still represented in the
            # model, via a single "virtual" empty item that whole-field errors
            # can be attached to.
            modeled['fields'][label] = [
                {
                    "raw": "",
                    "item_id": f"{row_idx}-{label}-empty",  # e.g. 4-id-empty
                    "issues": []
                }
            ]

    return modeled

120 

121 

def enrich_row(modeled_row: dict, error_obj: dict, err_id: str) -> dict:
    """
    Enrich the modelled row with error information, by adding the error ID to
    the ``issues`` list of each item involved in the error.

    :param modeled_row: The dictionary representing the modelled row to enrich.
    :type modeled_row: dict
    :param error_obj: The error object from the validation report.
    :type error_obj: dict
    :param err_id: Unique identifier of the error.
    :type err_id: str
    :return: The enriched modelled row (modified in place and returned).
    :rtype: dict
    """
    row_number: str = str(modeled_row['row_idx'])
    for field_label, items_indexes in error_obj['position']['table'][row_number].items():
        if items_indexes is None:
            # If None, the error is related to the whole field, so we
            # associate it with the first (and only) virtual empty item
            # representing the whole field value.
            data_item: dict = modeled_row['fields'][field_label][0]
            if err_id not in data_item['issues']:  # avoid error duplicates
                data_item['issues'].append(err_id)
            # BUGFIX: `continue` instead of `break` — a `break` here aborted
            # the loop entirely, silently skipping every remaining field
            # involved in the same error.
            continue

        for item_idx in items_indexes:
            data_item: dict = modeled_row['fields'][field_label][item_idx]
            if err_id not in data_item['issues']:  # avoid error duplicates
                data_item['issues'].append(err_id)

    modeled_row['contains_issue'] = True

    return modeled_row

155 

156 

def map_errors_to_data(data: List[Union[MetadataRow, CitationsRow]], report: list) -> tuple[list[dict], dict]:
    """
    Map validation report errors to the corresponding data items in the original table.

    :param data: The original table data, as a list of MetadataRow or CitationsRow objects.
    :type data: List[Union[MetadataRow, CitationsRow]]
    :param report: The validation report, as a list of error dictionaries.
    :type report: list
    :return: A tuple ``(enriched_rows, mapped_errors)`` where *enriched_rows* is a
        list of row dictionaries and *mapped_errors* maps error IDs to their metadata.
    :rtype: tuple[list[dict], dict]
    """

    out_data = [model_row_default(row, idx) for idx, row in enumerate(data)]

    # Prefix used to build globally unique error IDs across the two table types.
    table_type: str = 'cits' if isinstance(data[0], CitationsRow) else 'meta'

    del data  # free memory: the modelled rows are all we need from here on

    # One distinct colour per error, used by the template to highlight items.
    colors = generate_error_colors(len(report))

    out_errors = {}

    for err_idx, error_obj in enumerate(report):
        err_id = f"{table_type}-{err_idx}"  # e.g. meta-0, cits-1

        # store error info
        out_errors[err_id] = {
            "message": error_obj['message'],
            "label": error_obj['error_label'],  # can be used for grouping and filtering in HTML
            "level": error_obj['error_type'],  # error|warning
            "color": colors.pop()
        }

        # enrich data with error info
        # BUGFIX: annotation corrected — this is a list of row indexes, not a dict
        invalid_rows_indexes: list[int] = [int(k) for k in error_obj['position']['table'].keys()]
        for i in invalid_rows_indexes:
            out_data[i] = enrich_row(out_data[i], error_obj, err_id)

    return out_data, out_errors

197 

198 

def make_gui(csv_fp: str, report_fp: str, out_fp: str) -> None:
    """
    Generate an HTML document that visualises the validation results.

    Maps errors from the validation report to the corresponding data items
    in the original CSV table and renders them in a user-friendly HTML page.

    :param csv_fp: Path to the original CSV data file.
    :type csv_fp: str
    :param report_fp: Path to the JSON-Lines validation report.
    :type report_fp: str
    :param out_fp: Path where the generated HTML document will be saved.
    :type out_fp: str
    :rtype: None
    """

    # separators for items according to the field,
    # used in the template to reconstruct the whole field value
    # from the list of items
    item_separators = {
        'citing_id': ' ',
        'cited_id': ' ',
        'id': ' ',
        'author': '; ',
        'publisher': '; ',
        'editor': '; '
    }

    current_dir = dirname(abspath(__file__))
    env = Environment(
        loader=FileSystemLoader(current_dir),
        trim_blocks=True,
        lstrip_blocks=True
    )

    # Read JSON-Lines file (one JSON object per line)
    report = []
    with open(report_fp, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # Skip empty lines
                report.append(json.loads(line))

    # ROBUSTNESS: explicit encoding so the assets are read the same way on
    # every platform (other opens in this function already specify it).
    with open(join(current_dir, 'script.js'), 'r', encoding='utf-8') as script_file, \
            open(join(current_dir, 'style.css'), 'r', encoding='utf-8') as style_file:
        script = script_file.read()
        style = style_file.read()

    if not report:
        # BUGFIX: resolve the static page relative to this module (like
        # script.js and style.css above), not the current working directory.
        with open(out_fp, "w", encoding='utf-8') as file, \
                open(join(current_dir, 'valid_page.html'), 'r', encoding='utf-8') as valid_page:
            file.write(valid_page.read())

        print("No errors found: valid HTML generated.")
        return

    # Use streaming to read CSV file efficiently
    csv_stream = CSVStreamReader(csv_fp)

    # Read first row to determine table type.
    # NOTE(review): assumes the CSV has at least one row — first_row stays
    # None on an empty file and the len() below would raise; confirm callers
    # never pass an empty CSV.
    first_row = None
    for row in csv_stream:
        first_row = row
        break

    # Metadata tables have more than 4 columns; citation tables do not.
    table_type = 'meta' if len(list(first_row.keys())) > 4 else 'cits'
    parser = MetadataRow if table_type == 'meta' else CitationsRow

    # Stream and parse all rows
    structured_data = [parser(row) for row in csv_stream.stream()]

    mapped_data, mapped_errors = map_errors_to_data(structured_data, report)

    template = env.get_template('invalid_template.j2')

    html_output = template.render(
        error_count=len(mapped_errors),
        data=mapped_data,
        errors=mapped_errors,
        item_separators=item_separators,
        script=script,
        style=style
    )

    with open(out_fp, "w", encoding='utf-8') as file:
        file.write(html_output)

    print(f"HTML document generated successfully at {realpath(out_fp)}.")

    return None

289 

def merge_html_files(doc1_fp: str, doc2_fp: str, merged_out_fp: str) -> None:
    """
    Merge two HTML documents into a single document.

    The table container of the second document, preceded by its general-info
    section, is spliced in right after the table container of the first
    document; the combined markup of the first document is then written out.

    :param doc1_fp: Path to the first HTML document.
    :type doc1_fp: str
    :param doc2_fp: Path to the second HTML document.
    :type doc2_fp: str
    :param merged_out_fp: Path for the output merged HTML document.
    :type merged_out_fp: str
    :rtype: None
    """
    with open(doc1_fp, 'r', encoding='utf-8') as fh_base, open(doc2_fp, 'r', encoding='utf-8') as fh_extra:
        base_doc = BeautifulSoup(fh_base, 'html.parser')
        extra_doc = BeautifulSoup(fh_extra, 'html.parser')

    # NOTE(review): find() returns None when a div is missing, which would
    # make insert_after raise — assumes both documents were produced by
    # make_gui and always contain these sections; confirm.
    extra_info = extra_doc.find('div', class_='general-info')
    base_table = base_doc.find('div', class_='table-container')
    extra_table = extra_doc.find('div', class_='table-container')

    # Resulting order: <table 1> <general-info 2> <table 2>
    base_table.insert_after(extra_info)
    extra_info.insert_after(extra_table)

    html_out = str(base_doc)
    with open(merged_out_fp, "w", encoding='utf-8') as outf:
        outf.write(html_out)
        print(f"HTML document generated successfully at {realpath(outf.name)}.")