Coverage for oc_validator / interface / gui.py: 0%
101 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-30 15:46 +0000
1# ISC License
2#
3# Copyright (c) 2023-2026, Elia Rizzetto, Silvio Peroni
4#
5# Permission to use, copy, modify, and/or distribute this software for any
6# purpose with or without fee is hereby granted, provided that the above
7# copyright notice and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
10# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
11# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
12# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
14# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15# PERFORMANCE OF THIS SOFTWARE.
17from bs4 import BeautifulSoup, Tag
18from oc_validator.table_reader import MetadataRow, CitationsRow, AgentItem, VenueInfo
19from typing import Union, List
20import colorsys
21import json
22from oc_validator.helper import CSVStreamReader
23from jinja2 import Environment, FileSystemLoader
24from os.path import dirname, abspath, realpath
25import random
26from os.path import join
def generate_error_colors(n: int) -> set[str]:
    """
    Produce *n* distinct random colours, returned as hex strings.

    Hue is drawn uniformly, while saturation and value are restricted to
    mid-high ranges so every colour stays readable against light text.

    :param n: How many distinct colours to generate.
    :type n: int
    :return: Set of hex colour strings such as ``'#a3f2c1'``.
    :rtype: set[str]
    """
    palette: set[str] = set()
    # A set deduplicates automatically; keep drawing until n unique colours exist.
    while len(palette) < n:
        hue = random.random()
        saturation = random.uniform(0.5, 0.9)
        value = random.uniform(0.7, 0.95)
        red, green, blue = colorsys.hsv_to_rgb(hue, saturation, value)
        channels = (int(red * 255), int(green * 255), int(blue * 255))
        palette.add('#{:02x}{:02x}{:02x}'.format(*channels))
    return palette
def model_row_default(row:Union[MetadataRow, CitationsRow], row_idx: int) -> dict:
    """
    Turn one table row into the dictionary structure consumed by the HTML
    template, ready to be enriched with error information.

    Every field value is split into its items; each item carries its raw
    text, a globally unique ``item_id`` and an (initially empty) list of
    issue IDs. The resulting shape is: ::

        {
            "contains_issue": False,   # flipped to True when an error is attached
            "row_idx": 4,              # index of the row in the original table
            "fields": {
                "id": [
                    {
                        "raw": "doi:10.4242/x",  # item value, separators stripped
                        "item_id": "4-id-0",     # unique item identifier
                        "issues": []             # error IDs affecting this item
                    },
                    ...
                ],
                "title": [...],
                ...
            }
        }

    :param row: the original row to be modelled
    :type row: Union[MetadataRow, CitationsRow]
    :param row_idx: 0-based index of the row in the original table, used to create unique item IDs and for error mapping
    :type row_idx: int
    :return: dictionary representation of the row, ready for enrichment and rendering
    :rtype: dict
    """
    modelled = {
        "contains_issue": False,
        "row_idx": row_idx,
        "fields": {}  # <field label>:[dict]
    }
    for label, items in row.flat_serialise().items():
        if items:
            modelled['fields'][label] = [
                {
                    "raw": value,
                    "item_id": f"{row_idx}-{label}-{pos}",
                    "issues": []
                }
                for pos, value in enumerate(items)
            ]
        else:
            # Empty fields (None in the report) still get one virtual item so
            # that whole-field errors have something to attach to.
            modelled['fields'][label] = [{
                "raw": "",
                "item_id": f"{row_idx}-{label}-empty",  # e.g. 4-id-empty
                "issues": []
            }]
    return modelled
def enrich_row(modeled_row: dict, error_obj: dict, err_id: str) -> dict:
    """
    Enrich the modelled row with error information, by adding the error ID to
    the ``issues`` list of each item involved in the error.

    :param modeled_row: The dictionary representing the modelled row to enrich.
    :type modeled_row: dict
    :param error_obj: The error object from the validation report.
    :type error_obj: dict
    :param err_id: Unique identifier of the error.
    :type err_id: str
    :return: The enriched modelled row (modified in place and returned).
    :rtype: dict
    """
    row_number: str = str(modeled_row['row_idx'])
    for field_label, items_indexes in error_obj['position']['table'][row_number].items():
        if items_indexes is None:
            # The error concerns the whole field: attach it to the single
            # (possibly virtual empty) item representing the field value.
            # BUGFIX: this used to `break`, which aborted the loop and silently
            # skipped every remaining field involved in the same error;
            # `continue` moves on to the next field instead.
            data_item: dict = modeled_row['fields'][field_label][0]
            if err_id not in data_item['issues']:  # avoid error duplicates
                data_item['issues'].append(err_id)
            continue
        for item_idx in items_indexes:
            data_item: dict = modeled_row['fields'][field_label][item_idx]
            if err_id not in data_item['issues']:  # avoid error duplicates
                data_item['issues'].append(err_id)
    modeled_row['contains_issue'] = True
    return modeled_row
def map_errors_to_data(data: List[Union[MetadataRow, CitationsRow]], report: list) -> tuple[list[dict], dict]:
    """
    Map validation report errors to the corresponding data items in the original table.

    :param data: The original table data, as a list of MetadataRow or CitationsRow objects.
    :type data: List[Union[MetadataRow, CitationsRow]]
    :param report: The validation report, as a list of error dictionaries.
    :type report: list
    :return: A tuple ``(enriched_rows, mapped_errors)`` where *enriched_rows* is a
        list of row dictionaries and *mapped_errors* maps error IDs to their metadata.
    :rtype: tuple[list[dict], dict]
    """
    out_data = [model_row_default(row, idx) for idx, row in enumerate(data)]

    table_type: str = 'cits' if isinstance(data[0], CitationsRow) else 'meta'

    del data  # free memory: only the modelled rows are needed from here on

    colors = generate_error_colors(len(report))

    out_errors = {}

    for err_idx, error_obj in enumerate(report):
        err_id = f"{table_type}-{err_idx}"  # e.g. meta-0, cits-1

        # store error info
        out_errors[err_id] = {
            "message": error_obj['message'],
            "label": error_obj['error_label'],  # can be used for grouping and filtering in HTML
            "level": error_obj['error_type'],  # error|warning
            "color": colors.pop()  # arbitrary (but unique) colour from the pre-generated set
        }

        # enrich data with error info
        # BUGFIX: annotation corrected from `dict` to `list[int]` — the value
        # is a list of row indexes, not a dictionary.
        invalid_rows_indexes: list[int] = [int(k) for k in error_obj['position']['table'].keys()]
        for i in invalid_rows_indexes:
            out_data[i] = enrich_row(out_data[i], error_obj, err_id)

    return out_data, out_errors
def make_gui(csv_fp: str, report_fp: str, out_fp: str) -> None:
    """
    Generate an HTML document that visualises the validation results.

    Maps errors from the validation report to the corresponding data items
    in the original CSV table and renders them in a user-friendly HTML page.

    :param csv_fp: Path to the original CSV data file.
    :type csv_fp: str
    :param report_fp: Path to the JSON-Lines validation report.
    :type report_fp: str
    :param out_fp: Path where the generated HTML document will be saved.
    :type out_fp: str
    :rtype: None
    """
    # separators for items according to the field,
    # used in the template to reconstruct the whole field value
    # from the list of items
    item_separators = {
        'citing_id': ' ',
        'cited_id': ' ',
        'id': ' ',
        'author': '; ',
        'publisher': '; ',
        'editor': '; '
    }

    current_dir = dirname(abspath(__file__))
    env = Environment(
        loader=FileSystemLoader(current_dir),
        trim_blocks=True,
        lstrip_blocks=True
    )

    # Read JSON-Lines file (one JSON object per line)
    report = []
    with open(report_fp, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # Skip empty lines
                report.append(json.loads(line))

    with open(join(current_dir, 'script.js'), 'r', encoding='utf-8') as script_file, \
            open(join(current_dir, 'style.css'), 'r', encoding='utf-8') as style_file:
        script = script_file.read()
        style = style_file.read()

    if not report:
        # No errors: emit the static "valid" page as-is.
        # BUGFIX: the page lives next to this module (like script.js and
        # style.css), so it must be resolved against current_dir rather than
        # the process working directory.
        with open(out_fp, "w", encoding='utf-8') as file, \
                open(join(current_dir, 'valid_page.html'), 'r', encoding='utf-8') as valid_page:
            file.write(valid_page.read())
        print("No errors found: valid HTML generated.")
        return

    # Use streaming to read CSV file efficiently
    csv_stream = CSVStreamReader(csv_fp)

    # Read first row to determine table type
    # (metadata tables have more than 4 columns, citation tables do not)
    first_row = None
    for row in csv_stream:
        first_row = row
        break

    table_type = 'meta' if len(list(first_row.keys())) > 4 else 'cits'
    parser = MetadataRow if table_type == 'meta' else CitationsRow

    # Stream and parse all rows
    # NOTE(review): assumes CSVStreamReader.stream() restarts from the first
    # data row even after the peek above — confirm against its implementation.
    structured_data = [parser(row) for row in csv_stream.stream()]

    mapped_data, mapped_errors = map_errors_to_data(structured_data, report)

    template = env.get_template('invalid_template.j2')

    html_output = template.render(
        error_count=len(mapped_errors),
        data=mapped_data,
        errors=mapped_errors,
        item_separators=item_separators,
        script=script,
        style=style
    )

    with open(out_fp, "w", encoding='utf-8') as file:
        file.write(html_output)

    print(f"HTML document generated successfully at {realpath(out_fp)}.")
    return None
def merge_html_files(doc1_fp: str, doc2_fp: str, merged_out_fp: str) -> None:
    """
    Merge two HTML documents into a single document.

    The second document's table container is appended after the first
    document's table, preceded by the second document's general-info section,
    so the merged page presents both tables with their own summaries.

    :param doc1_fp: Path to the first HTML document.
    :type doc1_fp: str
    :param doc2_fp: Path to the second HTML document.
    :type doc2_fp: str
    :param merged_out_fp: Path for the output merged HTML document.
    :type merged_out_fp: str
    :rtype: None
    """
    with open(doc1_fp, 'r', encoding='utf-8') as base_handle, \
            open(doc2_fp, 'r', encoding='utf-8') as extra_handle:
        base_soup = BeautifulSoup(base_handle, 'html.parser')
        extra_soup = BeautifulSoup(extra_handle, 'html.parser')

    # Splice doc2's general-info and table into doc1, right after doc1's table.
    extra_info = extra_soup.find('div', class_='general-info')
    base_table = base_soup.find('div', class_='table-container')
    extra_table = extra_soup.find('div', class_='table-container')
    base_table.insert_after(extra_info)
    extra_info.insert_after(extra_table)

    with open(merged_out_fp, "w", encoding='utf-8') as outf:
        outf.write(str(base_soup))
    print(f"HTML document generated successfully at {realpath(outf.name)}.")