Coverage for oc_validator / interface / gui.py: 0%

101 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-30 15:46 +0000

1# ISC License 

2# 

3# Copyright (c) 2023-2026, Elia Rizzetto, Silvio Peroni 

4# 

5# Permission to use, copy, modify, and/or distribute this software for any 

6# purpose with or without fee is hereby granted, provided that the above 

7# copyright notice and this permission notice appear in all copies. 

8# 

9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 

10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 

11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 

12# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 

13# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 

14# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 

15# PERFORMANCE OF THIS SOFTWARE. 

16 

17from bs4 import BeautifulSoup, Tag 

18from oc_validator.table_reader import MetadataRow, CitationsRow, AgentItem, VenueInfo 

19from typing import Union, List 

20import colorsys 

21import json 

22from oc_validator.helper import CSVStreamReader 

23from jinja2 import Environment, FileSystemLoader 

24from os.path import dirname, abspath, realpath 

25import random 

26from os.path import join 

27 

def generate_error_colors(n: int) -> set[str]:
    """
    Produce *n* distinct random colours encoded as hex strings.

    Colours are sampled in HSV space, with saturation and value confined to
    ranges that keep the resulting tones readable (neither washed out nor
    too dark). Sampling repeats until *n* unique hex strings are collected.

    :param n: Number of distinct colours to generate.
    :type n: int
    :return: Set of hex colour strings (e.g. ``'#a3f2c1'``).
    :rtype: set[str]
    """
    palette: set[str] = set()

    # A set guarantees uniqueness; loop until we have gathered n colours.
    while len(palette) < n:
        hue = random.random()
        sat = random.uniform(0.5, 0.9)
        val = random.uniform(0.7, 0.95)

        red, green, blue = colorsys.hsv_to_rgb(hue, sat, val)

        palette.add(
            '#%02x%02x%02x' % (int(red * 255), int(green * 255), int(blue * 255))
        )

    return palette

56 

57 

def model_row_default(row:Union[MetadataRow, CitationsRow], row_idx: int) -> dict:
    """
    Models a row of the input table as a dictionary with a structure suitable
    for enrichment with error information and rendering in the HTML template.
    Each field value is broken down into items, and each item carries an
    initially-empty list of associated issues that can be populated later.

    The resulting dictionary has the following structure: ::

        {
            "contains_issue": False, # will be updated to True if any error is associated
            "row_idx": 4, # index of the row in the original table
            "fields": {
                "id": [
                    {
                        "raw": "doi:10.4242/x", # original value of the whole item (without separators)
                        "item_id": "4-id-0", # unique identifier for the item
                        "issues": [] # error IDs that affect this item (default empty list)
                    },
                    ...
                ],
                "title": [...],
                ...
            }
        }

    :param row: the original row to be modelled
    :type row: Union[MetadataRow, CitationsRow]
    :param row_idx: 0-based index of the row in the original table, used to create unique item IDs and for error mapping
    :type row_idx: int
    :return: a dictionary representing the modelled row, ready for enrichment with error information and rendering in the HTML template
    :rtype: dict
    """
    modeled = {
        "contains_issue": False,
        "row_idx": row_idx,
        "fields": {}  # <field label>: [dict]
    }

    for label, items in row.flat_serialise().items():
        if items:
            # One model entry per item; the positional index makes item IDs unique.
            modeled['fields'][label] = [
                {
                    "raw": item,
                    "item_id": f"{row_idx}-{label}-{pos}",
                    "issues": []
                }
                for pos, item in enumerate(items)
            ]
        else:
            # An empty field (None in the report) is still represented in the
            # model, via a single "virtual" empty item that whole-field errors
            # can be attached to.
            modeled['fields'][label] = [
                {
                    "raw": "",
                    "item_id": f"{row_idx}-{label}-empty",  # e.g. 4-id-empty
                    "issues": []
                }
            ]

    return modeled

120 

121 

def enrich_row(modeled_row: dict, error_obj: dict, err_id: str) -> dict:
    """
    Enrich the modelled row with error information, by adding the error ID to
    the ``issues`` list of each item involved in the error.

    :param modeled_row: The dictionary representing the modelled row to enrich.
    :type modeled_row: dict
    :param error_obj: The error object from the validation report.
    :type error_obj: dict
    :param err_id: Unique identifier of the error.
    :type err_id: str
    :return: The enriched modelled row (modified in place and returned).
    :rtype: dict
    """
    row_number: str = str(modeled_row['row_idx'])
    for field_label, items_indexes in error_obj['position']['table'][row_number].items():
        if items_indexes is None:
            # If None, the error is related to the whole field, so we
            # associate it with the first (and only) virtual empty item
            # representing the whole field value.
            data_item: dict = modeled_row['fields'][field_label][0]
            if err_id not in data_item['issues']:  # avoid error duplicates
                data_item['issues'].append(err_id)
            # BUGFIX: `continue` instead of `break` — a `break` here aborted
            # the loop entirely, silently skipping every remaining field
            # involved in the same error.
            continue

        for item_idx in items_indexes:
            data_item: dict = modeled_row['fields'][field_label][item_idx]
            if err_id not in data_item['issues']:  # avoid error duplicates
                data_item['issues'].append(err_id)

    modeled_row['contains_issue'] = True

    return modeled_row

155 

156 

def map_errors_to_data(data: List[Union[MetadataRow, CitationsRow]], report: list) -> tuple[list[dict], dict]:
    """
    Map validation report errors to the corresponding data items in the original table.

    :param data: The original table data, as a list of MetadataRow or CitationsRow objects.
    :type data: List[Union[MetadataRow, CitationsRow]]
    :param report: The validation report, as a list of error dictionaries.
    :type report: list
    :return: A tuple ``(enriched_rows, mapped_errors)`` where *enriched_rows* is a
        list of row dictionaries and *mapped_errors* maps error IDs to their metadata.
    :rtype: tuple[list[dict], dict]
    """

    out_data = [model_row_default(row, idx) for idx, row in enumerate(data)]

    # Prefix used to build globally unique error IDs across the two table types.
    table_type: str = 'cits' if isinstance(data[0], CitationsRow) else 'meta'

    del data  # free memory: the modelled rows are all we need from here on

    # One distinct colour per error, used by the template to highlight items.
    colors = generate_error_colors(len(report))

    out_errors = {}

    for err_idx, error_obj in enumerate(report):
        err_id = f"{table_type}-{err_idx}"  # e.g. meta-0, cits-1

        # store error info
        out_errors[err_id] = {
            "message": error_obj['message'],
            "label": error_obj['error_label'],  # can be used for grouping and filtering in HTML
            "level": error_obj['error_type'],  # error|warning
            "color": colors.pop()
        }

        # enrich data with error info
        # BUGFIX: annotation corrected — this is a list of row indexes, not a dict
        invalid_rows_indexes: list[int] = [int(k) for k in error_obj['position']['table'].keys()]
        for i in invalid_rows_indexes:
            out_data[i] = enrich_row(out_data[i], error_obj, err_id)

    return out_data, out_errors

197 

198 

def make_gui(csv_fp: str, report_fp: str, out_fp: str) -> None:
    """
    Generate an HTML document that visualises the validation results.

    Maps errors from the validation report to the corresponding data items
    in the original CSV table and renders them in a user-friendly HTML page.

    :param csv_fp: Path to the original CSV data file.
    :type csv_fp: str
    :param report_fp: Path to the JSON-Lines validation report.
    :type report_fp: str
    :param out_fp: Path where the generated HTML document will be saved.
    :type out_fp: str
    :rtype: None
    """

    # separators for items according to the field,
    # used in the template to reconstruct the whole field value
    # from the list of items
    item_separators = {
        'citing_id': ' ',
        'cited_id': ' ',
        'id': ' ',
        'author': '; ',
        'publisher': '; ',
        'editor': '; '
    }

    current_dir = dirname(abspath(__file__))
    env = Environment(
        loader=FileSystemLoader(current_dir),
        trim_blocks=True,
        lstrip_blocks=True
    )

    # Read JSON-Lines file (one JSON object per line)
    report = []
    with open(report_fp, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # Skip empty lines
                report.append(json.loads(line))

    # ROBUSTNESS: explicit encoding so the assets are read the same way on
    # every platform (other opens in this function already specify it).
    with open(join(current_dir, 'script.js'), 'r', encoding='utf-8') as script_file, \
            open(join(current_dir, 'style.css'), 'r', encoding='utf-8') as style_file:
        script = script_file.read()
        style = style_file.read()

    if not report:
        # BUGFIX: resolve the static page relative to this module (like
        # script.js and style.css above), not the current working directory.
        with open(out_fp, "w", encoding='utf-8') as file, \
                open(join(current_dir, 'valid_page.html'), 'r', encoding='utf-8') as valid_page:
            file.write(valid_page.read())

        print("No errors found: valid HTML generated.")
        return

    # Use streaming to read CSV file efficiently
    csv_stream = CSVStreamReader(csv_fp)

    # Read first row to determine table type.
    # NOTE(review): assumes the CSV has at least one row — first_row stays
    # None on an empty file and the len() below would raise; confirm callers
    # never pass an empty CSV.
    first_row = None
    for row in csv_stream:
        first_row = row
        break

    # Metadata tables have more than 4 columns; citation tables do not.
    table_type = 'meta' if len(list(first_row.keys())) > 4 else 'cits'
    parser = MetadataRow if table_type == 'meta' else CitationsRow

    # Stream and parse all rows
    structured_data = [parser(row) for row in csv_stream.stream()]

    mapped_data, mapped_errors = map_errors_to_data(structured_data, report)

    template = env.get_template('invalid_template.j2')

    html_output = template.render(
        error_count=len(mapped_errors),
        data=mapped_data,
        errors=mapped_errors,
        item_separators=item_separators,
        script=script,
        style=style
    )

    with open(out_fp, "w", encoding='utf-8') as file:
        file.write(html_output)

    print(f"HTML document generated successfully at {realpath(out_fp)}.")

    return None

289 

def merge_html_files(doc1_fp: str, doc2_fp: str, merged_out_fp: str) -> None:
    """
    Merge two HTML documents into a single document.

    The table container of the second document, preceded by its general-info
    section, is spliced in right after the table container of the first
    document; the combined markup of the first document is then written out.

    :param doc1_fp: Path to the first HTML document.
    :type doc1_fp: str
    :param doc2_fp: Path to the second HTML document.
    :type doc2_fp: str
    :param merged_out_fp: Path for the output merged HTML document.
    :type merged_out_fp: str
    :rtype: None
    """
    with open(doc1_fp, 'r', encoding='utf-8') as fh_base, open(doc2_fp, 'r', encoding='utf-8') as fh_extra:
        base_doc = BeautifulSoup(fh_base, 'html.parser')
        extra_doc = BeautifulSoup(fh_extra, 'html.parser')

    # NOTE(review): find() returns None when a div is missing, which would
    # make insert_after raise — assumes both documents were produced by
    # make_gui and always contain these sections; confirm.
    extra_info = extra_doc.find('div', class_='general-info')
    base_table = base_doc.find('div', class_='table-container')
    extra_table = extra_doc.find('div', class_='table-container')

    # Resulting order: <table 1> <general-info 2> <table 2>
    base_table.insert_after(extra_info)
    extra_info.insert_after(extra_table)

    html_out = str(base_doc)
    with open(merged_out_fp, "w", encoding='utf-8') as outf:
        outf.write(html_out)
        print(f"HTML document generated successfully at {realpath(outf.name)}.")