Coverage for heritrace / extensions.py: 99%

296 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-21 12:56 +0000

1# SPDX-FileCopyrightText: 2024-2026 Arcangelo Massari <arcangelo.massari@unibo.it> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5# heritrace/extensions.py 

6 

7import json 

8import logging 

9import os 

10import time 

11from collections import defaultdict 

12from datetime import datetime, timedelta 

13from typing import Dict 

14from urllib.parse import urlparse, urlunparse 

15 

16import yaml 

17from flask import Flask, current_app, g, redirect, session, url_for 

18from flask_babel import Babel 

19from flask_login import LoginManager 

20from flask_login.signals import user_loaded_from_cookie 

21from heritrace.models import User 

22from heritrace.services.resource_lock_manager import ResourceLockManager 

23from heritrace.uri_generator.uri_generator import URIGenerator 

24from heritrace.utils.filters import Filter 

25from rdflib import Graph 

26from rdflib_ocdm.counter_handler.counter_handler import CounterHandler 

27from redis import Redis 

28from SPARQLWrapper import JSON, SPARQLWrapper 

29from time_agnostic_library.support import generate_config_file 

30 

# Global module-level state, populated once by init_extensions() and exposed
# read-only through the get_*() accessor functions at the bottom of this module.
initialization_done = False  # guards one-time setup in init_sparql_services()
dataset_endpoint = None  # dataset SPARQL endpoint URL (Docker-adjusted)
provenance_endpoint = None  # provenance SPARQL endpoint URL (Docker-adjusted)
sparql = None  # SPARQLWrapperWithRetry for the dataset endpoint
provenance_sparql = None  # SPARQLWrapperWithRetry for the provenance endpoint
change_tracking_config = None  # dict loaded from the time-agnostic-library config
form_fields_cache = None  # form fields derived from the SHACL shapes graph
custom_filter = None  # Filter instance registered as Jinja template filters
redis_client = None  # shared Redis client (locks, counters, cache timestamps)
display_rules = None  # 'rules' list loaded from DISPLAY_RULES_PATH YAML
dataset_is_quadstore = None  # bool mirror of the DATASET_IS_QUADSTORE config
shacl_graph = None  # rdflib Graph parsed from SHACL_PATH
classes_with_multiple_shapes = None  # set of class URIs with >1 visible shape

45 

46 

class SPARQLWrapperWithRetry(SPARQLWrapper):
    """
    Extension of SPARQLWrapper that includes automatic retry functionality and timeout handling.
    Uses SPARQLWrapper's built-in timeout functionality.
    """

    def __init__(self, endpoint, **kwargs):
        """
        Args:
            endpoint: SPARQL endpoint URL, forwarded to SPARQLWrapper.
            max_attempts: maximum number of query attempts (default 3).
            initial_delay: seconds to wait before the first retry (default 1.0).
            backoff_factor: multiplier applied to the delay after each failed
                attempt (default 2.0).
            timeout: per-query timeout in seconds (default 5.0). Any remaining
                keyword arguments are forwarded to SPARQLWrapper.
        """
        self.max_attempts = kwargs.pop('max_attempts', 3)
        self.initial_delay = kwargs.pop('initial_delay', 1.0)
        self.backoff_factor = kwargs.pop('backoff_factor', 2.0)
        query_timeout = kwargs.pop('timeout', 5.0)

        super().__init__(endpoint, **kwargs)

        # SPARQLWrapper expects an integer number of seconds.
        self.setTimeout(int(query_timeout))

    def query(self):
        """
        Override the query method to include retry logic with SPARQLWrapper's built-in timeout.
        Returns the original SPARQLWrapper.QueryResult so that convert() can be called on it.

        Raises:
            Exception: the last exception encountered once all attempts fail.
            RuntimeError: if max_attempts < 1, i.e. no attempt was ever made.
        """
        logger = logging.getLogger(__name__)

        delay = self.initial_delay
        last_exception = None

        for attempt in range(1, self.max_attempts + 1):
            try:
                return super().query()
            except Exception as e:
                last_exception = e
                # Lazy %-style args avoid formatting cost when the level is disabled.
                logger.warning(
                    "SPARQL query attempt %d/%d failed: %s",
                    attempt, self.max_attempts, e,
                )
                if attempt < self.max_attempts:
                    logger.info("Retrying in %.2f seconds...", delay)
                    time.sleep(delay)
                    delay *= self.backoff_factor

        logger.error("All %d SPARQL query attempts failed", self.max_attempts)
        if last_exception is None:
            # Original code would have raised None (a TypeError) here when
            # max_attempts < 1; fail with an explicit error instead.
            raise RuntimeError("SPARQL query was never attempted (max_attempts < 1)")
        raise last_exception

91 

def init_extensions(app: Flask, babel: Babel, login_manager: LoginManager, redis: Redis):
    """
    Initialize Flask extensions and configure shared objects.

    Args:
        app: Flask application instance
        babel: Babel extension instance
        login_manager: LoginManager instance
        redis: Redis client instance
    """
    global redis_client
    redis_client = redis

    # Babel: serve the language stored in the session, defaulting to English.
    babel.init_app(
        app=app,
        locale_selector=lambda: session.get('lang', 'en'),
        default_translation_directories=app.config['BABEL_TRANSLATION_DIRECTORIES'],
    )

    # Wire up the remaining services in dependency order.
    init_login_manager(app, login_manager)
    init_sparql_services(app)
    init_filters(app)
    init_request_handlers(app)

    # Expose the extensions on the app object for convenient access elsewhere.
    app.babel = babel
    app.login_manager = login_manager
    app.redis_client = redis_client

129 

def init_login_manager(app, login_manager: LoginManager):
    """Configure the Flask-Login extension."""
    login_manager.init_app(app)
    login_manager.login_view = 'auth.login'
    login_manager.unauthorized_handler(lambda: redirect(url_for('auth.login')))

    @login_manager.user_loader
    def load_user(user_id):
        # The display name lives in the session; the id doubles as the ORCID.
        display_name = session.get('user_name', 'Unknown User')
        return User(id=user_id, name=display_name, orcid=user_id)

    @user_loaded_from_cookie.connect
    def rotate_session_token(sender, user):
        # Mark the session dirty so Flask re-issues the cookie on this response.
        session.modified = True

144 

def initialize_change_tracking_config(app: Flask, adjusted_dataset_endpoint=None, adjusted_provenance_endpoint=None):
    """
    Initialize and return the change tracking configuration JSON.
    Uses pre-adjusted endpoints if provided to avoid redundant adjustments.

    Args:
        app: Flask application instance
        adjusted_dataset_endpoint: Dataset endpoint URL already adjusted for Docker
        adjusted_provenance_endpoint: Provenance endpoint URL already adjusted for Docker

    Returns:
        dict: The loaded configuration dictionary

    Raises:
        RuntimeError: if the configuration cannot be generated or read.
    """
    config_needs_generation = False
    config_path = None
    config = None

    # Reuse an existing config file when one is configured and still present.
    if 'CHANGE_TRACKING_CONFIG' in app.config:
        config_path = app.config['CHANGE_TRACKING_CONFIG']
        if not os.path.exists(config_path):
            app.logger.warning(f"Change tracking configuration file not found at specified path: {config_path}")
            config_needs_generation = True
    else:
        config_needs_generation = True
        config_path = os.path.join(app.instance_path, 'change_tracking_config.json')
        os.makedirs(app.instance_path, exist_ok=True)

    if config_needs_generation:
        dataset_urls = [adjusted_dataset_endpoint] if adjusted_dataset_endpoint else []
        provenance_urls = [adjusted_provenance_endpoint] if adjusted_provenance_endpoint else []

        db_triplestore = app.config.get('DATASET_DB_TRIPLESTORE', '').lower()
        text_index_enabled = app.config.get('DATASET_DB_TEXT_INDEX_ENABLED', False)

        # Full-text search is only enabled for the triplestore actually in use.
        blazegraph_search = db_triplestore == 'blazegraph' and text_index_enabled
        fuseki_search = db_triplestore == 'fuseki' and text_index_enabled
        virtuoso_search = db_triplestore == 'virtuoso' and text_index_enabled

        graphdb_connector = ''  # TODO: Add graphdb support

        try:
            config = generate_config_file(
                config_path=config_path,
                dataset_urls=dataset_urls,
                dataset_dirs=app.config.get('DATASET_DIRS', []),
                dataset_is_quadstore=app.config.get('DATASET_IS_QUADSTORE', False),
                provenance_urls=provenance_urls,
                provenance_is_quadstore=app.config.get('PROVENANCE_IS_QUADSTORE', False),
                provenance_dirs=app.config.get('PROVENANCE_DIRS', []),
                blazegraph_full_text_search=blazegraph_search,
                fuseki_full_text_search=fuseki_search,
                virtuoso_full_text_search=virtuoso_search,
                graphdb_connector_name=graphdb_connector,
            )
            app.logger.info(f"Generated new change tracking configuration at: {config_path}")
        except Exception as e:
            # Chain the original exception so the real cause stays in the traceback.
            raise RuntimeError(f"Failed to generate change tracking configuration: {str(e)}") from e

    # Load the configuration from disk when generation did not already return it.
    try:
        if not config:
            with open(config_path, 'r', encoding='utf8') as f:
                config = json.load(f)

    except json.JSONDecodeError as e:
        raise RuntimeError(f"Invalid change tracking configuration JSON at {config_path}: {str(e)}") from e
    except Exception as e:
        raise RuntimeError(f"Error reading change tracking configuration at {config_path}: {str(e)}") from e

    app.config['CHANGE_TRACKING_CONFIG'] = config_path
    return config

217 

def need_initialization(app: Flask):
    """
    Check if counter handler initialization is needed.

    When using external Redis (non-default REDIS_URL), assumes counters are already
    populated and returns False. For internal Redis, checks cache validity.
    """
    uri_generator = app.config['URI_GENERATOR']

    # URI generators without a counter handler never need counter setup.
    if not hasattr(uri_generator, "counter_handler"):
        return False

    default_redis_url = 'redis://localhost:6379/0'
    redis_url = os.environ.get('REDIS_URL', default_redis_url)

    if redis_url != default_redis_url:
        app.logger.info(f"Using external Redis at {redis_url} - skipping counter initialization")
        return False

    # Internal Redis: re-initialize when the cached timestamp is stale or absent.
    cache_validity_days = app.config['CACHE_VALIDITY_DAYS']

    try:
        last_init_str = redis_client.get('heritrace:last_initialization')
        if not last_init_str:
            return True

        last_init = datetime.fromisoformat(last_init_str.decode('utf-8'))
        return datetime.now() - last_init > timedelta(days=cache_validity_days)
    except Exception:
        # Any Redis or parsing failure means the cache cannot be trusted.
        return True

249 

def update_cache(app: Flask):
    """
    Update Redis with current initialization timestamp.
    """
    # Store an ISO-8601 timestamp so need_initialization() can parse it back.
    timestamp = datetime.now().isoformat()
    redis_client.set('heritrace:last_initialization', timestamp)
    redis_client.set('heritrace:cache_version', '1.0')

257 

def initialize_counter_handler(app: Flask):
    """
    Initialize the counter handler for URI generation if needed.
    Skips initialization for external Redis (detected automatically in need_initialization).
    """
    if not need_initialization(app):
        return

    uri_generator: URIGenerator = app.config['URI_GENERATOR']
    counter_handler: CounterHandler = uri_generator.counter_handler

    # Initialize the URI generator's own counters from the dataset endpoint.
    uri_generator.initialize_counters(sparql)

    # Count the snapshots recorded in the provenance store for each entity.
    # NOTE(review): the OPTIONAL wasDerivedFrom clause binds ?prev but the
    # aggregate counts DISTINCT ?snapshot, so the clause appears vestigial —
    # confirm before removing it.
    prov_query = """
    SELECT ?entity (COUNT(DISTINCT ?snapshot) as ?count)
    WHERE {
        ?snapshot a <http://www.w3.org/ns/prov#Entity> ;
                  <http://www.w3.org/ns/prov#specializationOf> ?entity .
        OPTIONAL {
            ?snapshot <http://www.w3.org/ns/prov#wasDerivedFrom> ?prev .
        }
    }
    GROUP BY ?entity
    """

    # Run the query against the provenance endpoint and set one snapshot
    # counter per entity.
    provenance_sparql.setQuery(prov_query)
    provenance_sparql.setReturnFormat(JSON)
    prov_results = provenance_sparql.query().convert()

    for result in prov_results["results"]["bindings"]:
        entity = result["entity"]["value"]
        count = int(result["count"]["value"])
        counter_handler.set_counter(count, entity)

    # Record the initialization timestamp so later startups can skip this work.
    update_cache(app)

298 

def identify_classes_with_multiple_shapes():
    """
    Identify classes that have multiple VISIBLE shapes associated with them.
    Only returns classes where multiple shapes are actually visible to avoid unnecessary processing.

    Returns:
        Set[str]: Set of class URIs that have multiple visible shapes
    """
    global display_rules, shacl_graph

    # The early return guarantees a non-empty shacl_graph below; the original
    # re-checked it inside the loop redundantly.
    if not display_rules or not shacl_graph:
        return set()

    # Imported lazily to avoid a circular import at module load time.
    from heritrace.utils.display_rules_utils import is_entity_type_visible

    class_to_shapes = defaultdict(set)

    def _record(class_uri, shape_uri):
        # Only count (class, shape) pairings that are actually visible.
        if is_entity_type_visible((class_uri, shape_uri)):
            class_to_shapes[class_uri].add(shape_uri)

    for rule in display_rules:
        target = rule.get("target", {})

        if "class" in target:
            class_uri = target["class"]
            # Forward lookup: every shape whose sh:targetClass is this class.
            query_string = f"""
                SELECT DISTINCT ?shape WHERE {{
                    ?shape <http://www.w3.org/ns/shacl#targetClass> <{class_uri}> .
                }}
            """
            for row in shacl_graph.query(query_string):
                _record(class_uri, str(row.shape))

        elif "shape" in target:
            shape_uri = target["shape"]
            # Reverse lookup: every class this shape targets.
            query_string = f"""
                SELECT DISTINCT ?class WHERE {{
                    <{shape_uri}> <http://www.w3.org/ns/shacl#targetClass> ?class .
                }}
            """
            for row in shacl_graph.query(query_string):
                _record(str(row[0]), shape_uri)

    return {class_uri for class_uri, shapes in class_to_shapes.items() if len(shapes) > 1}

350 

def initialize_global_variables(app: Flask):
    """
    Initialize all global variables including form fields cache, display rules,
    and dataset configuration from SHACL shapes graph and configuration files.

    Args:
        app: Flask application instance

    Raises:
        RuntimeError: if the display rules or SHACL form fields cannot be loaded.
    """
    global shacl_graph, form_fields_cache, display_rules, dataset_is_quadstore, classes_with_multiple_shapes

    try:
        dataset_is_quadstore = app.config.get('DATASET_IS_QUADSTORE', False)

        if app.config.get('DISPLAY_RULES_PATH'):
            if not os.path.exists(app.config['DISPLAY_RULES_PATH']):
                app.logger.warning(f"Display rules file not found at: {app.config['DISPLAY_RULES_PATH']}")
            else:
                try:
                    # Explicit encoding, consistent with the other config reads.
                    with open(app.config['DISPLAY_RULES_PATH'], 'r', encoding='utf-8') as f:
                        yaml_content = yaml.safe_load(f)
                        display_rules = yaml_content['rules']
                except Exception as e:
                    app.logger.error(f"Error loading display rules: {str(e)}")
                    # Chain the cause so the original traceback is preserved.
                    raise RuntimeError(f"Failed to load display rules: {str(e)}") from e
        else:
            display_rules = []

        if app.config.get('SHACL_PATH'):
            if not os.path.exists(app.config['SHACL_PATH']):
                app.logger.warning(f"SHACL file not found at: {app.config['SHACL_PATH']}")
                # NOTE(review): returning here leaves form_fields_cache and
                # classes_with_multiple_shapes unset (None); getters tolerate
                # this — behavior intentionally preserved.
                return

            try:
                shacl_graph = Graph()
                shacl_graph.parse(source=app.config['SHACL_PATH'], format="turtle")

                # Imported lazily to avoid a circular import at module load time.
                from heritrace.utils.shacl_utils import \
                    get_form_fields_from_shacl
                form_fields_cache = get_form_fields_from_shacl(shacl_graph, display_rules, app=app)
            except Exception as e:
                app.logger.error(f"Error initializing form fields from SHACL: {str(e)}")
                raise RuntimeError(f"Failed to initialize form fields: {str(e)}") from e
        else:
            # No SHACL configured: fall back to an empty graph and no form fields.
            shacl_graph = Graph()
            form_fields_cache = {}

        classes_with_multiple_shapes = identify_classes_with_multiple_shapes()

        app.logger.info("Global variables initialized successfully")

    except Exception as e:
        app.logger.error(f"Error during global variables initialization: {str(e)}")
        raise RuntimeError(f"Global variables initialization failed: {str(e)}") from e

404 

def init_sparql_services(app: Flask):
    """Initialize SPARQL endpoints and related services."""
    global initialization_done, dataset_endpoint, provenance_endpoint, sparql, provenance_sparql, change_tracking_config

    # Guard clause: this setup must only ever run once per process.
    if initialization_done:
        return

    # Rewrite localhost-style URLs when running inside Docker.
    dataset_endpoint = adjust_endpoint_url(app.config['DATASET_DB_URL'])
    provenance_endpoint = adjust_endpoint_url(app.config['PROVENANCE_DB_URL'])

    sparql = SPARQLWrapperWithRetry(dataset_endpoint, timeout=30.0)
    provenance_sparql = SPARQLWrapperWithRetry(provenance_endpoint, timeout=30.0)

    # Pass the already-adjusted endpoints to avoid adjusting them twice.
    change_tracking_config = initialize_change_tracking_config(
        app,
        adjusted_dataset_endpoint=dataset_endpoint,
        adjusted_provenance_endpoint=provenance_endpoint,
    )

    initialize_counter_handler(app)
    initialize_global_variables(app)
    initialization_done = True

425 

def init_filters(app: Flask):
    """Initialize custom template filters."""
    global custom_filter

    # The JSON-LD context ships alongside the package source.
    context_path = os.path.join(os.path.dirname(__file__), "utils", "context.json")
    with open(context_path, "r") as config_file:
        context = json.load(config_file)["@context"]

    # NOTE: this re-reads the display-rules YAML independently of the module
    # global of the same name.
    rules = None
    if app.config["DISPLAY_RULES_PATH"]:
        with open(app.config["DISPLAY_RULES_PATH"], 'r') as f:
            yaml_content = yaml.safe_load(f)
            rules = yaml_content.get('rules', [])

    custom_filter = Filter(context, rules, dataset_endpoint)

    # Register the filter methods with Jinja under their template names.
    jinja_filters = app.jinja_env.filters
    jinja_filters['human_readable_predicate'] = custom_filter.human_readable_predicate
    jinja_filters['human_readable_class'] = custom_filter.human_readable_class
    jinja_filters['human_readable_entity'] = custom_filter.human_readable_entity
    jinja_filters['human_readable_primary_source'] = custom_filter.human_readable_primary_source
    jinja_filters['format_datetime'] = custom_filter.human_readable_datetime
    from heritrace.utils.filters import split_namespace
    jinja_filters['split_ns'] = split_namespace
    jinja_filters['format_source_reference'] = custom_filter.format_source_reference
    jinja_filters['format_agent_reference'] = custom_filter.format_agent_reference

450 

def init_request_handlers(app):
    """Initialize before_request and teardown_request handlers."""

    def initialize_lock_manager():
        """Initialize the resource lock manager for each request."""
        # Created lazily once per request context and shared via flask.g.
        if not hasattr(g, 'resource_lock_manager'):
            g.resource_lock_manager = ResourceLockManager(redis_client)

    def close_redis_connection(error):
        """Close Redis connection when the request context ends."""
        if hasattr(g, 'resource_lock_manager'):
            del g.resource_lock_manager

    # Explicit registration instead of decorator syntax.
    app.before_request(initialize_lock_manager)
    app.teardown_appcontext(close_redis_connection)

465 

def adjust_endpoint_url(url: str) -> str:
    """
    Adjust endpoint URLs to work properly within Docker containers.

    Loopback hosts are rewritten to ``host.docker.internal`` so a containerized
    app can reach services running on the Docker host.

    Args:
        url: The endpoint URL to adjust

    Returns:
        The adjusted URL if running in Docker, original URL otherwise
    """
    if not running_in_docker():
        return url

    local_hosts = {'localhost', '127.0.0.1', '0.0.0.0'}
    parsed_url = urlparse(url)

    # Compare the parsed hostname exactly: the previous substring test on the
    # netloc also rewrote hosts such as "mylocalhost.com" or "10.0.0.0".
    if parsed_url.hostname in local_hosts:
        port = parsed_url.port
        new_netloc = f'host.docker.internal:{port}' if port else 'host.docker.internal'
        url_parts = list(parsed_url)
        url_parts[1] = new_netloc
        return urlunparse(url_parts)

    return url

490 

def running_in_docker() -> bool:
    """Check if the application is running inside a Docker container."""
    # Docker creates this marker file at the container filesystem root.
    marker = '/.dockerenv'
    return os.path.exists(marker)

494 

def get_dataset_endpoint() -> str:
    """Get the configured dataset endpoint URL."""
    # Reading a module-level name needs no ``global`` declaration.
    return dataset_endpoint

500 

def get_sparql() -> SPARQLWrapperWithRetry:
    """Get the configured SPARQL wrapper for the dataset endpoint with built-in retry mechanism."""
    # Reading a module-level name needs no ``global`` declaration.
    return sparql

506 

def get_provenance_endpoint() -> str:
    """Get the configured provenance endpoint URL."""
    # Reading a module-level name needs no ``global`` declaration.
    return provenance_endpoint

512 

def get_provenance_sparql() -> SPARQLWrapperWithRetry:
    """Get the configured SPARQL wrapper for the provenance endpoint with built-in retry mechanism."""
    # Reading a module-level name needs no ``global`` declaration.
    return provenance_sparql

518 

def get_counter_handler() -> CounterHandler:
    """
    Get the configured CounterHandler instance from the URIGenerator.
    Assumes URIGenerator and its counter_handler are initialized in app.config.
    """
    uri_generator: URIGenerator = current_app.config.get('URI_GENERATOR')
    if not uri_generator or not hasattr(uri_generator, 'counter_handler'):
        # Not initialized or misconfigured: fail loudly rather than return None.
        current_app.logger.error("CounterHandler not found in URIGenerator config.")
        raise RuntimeError("CounterHandler is not available. Initialization might have failed.")
    return uri_generator.counter_handler

531 

def get_custom_filter() -> Filter:
    """Get the configured custom filter instance."""
    # Reading a module-level name needs no ``global`` declaration.
    return custom_filter

537 

def get_change_tracking_config() -> Dict:
    """Get the change tracking configuration."""
    # Reading a module-level name needs no ``global`` declaration.
    return change_tracking_config

543 

def get_display_rules() -> Dict:
    """Get the display_rules configuration."""
    # Reading a module-level name needs no ``global`` declaration.
    return display_rules

549 

def get_form_fields() -> Dict:
    """Get the form_fields configuration."""
    # Reading a module-level name needs no ``global`` declaration.
    return form_fields_cache

555 

def get_dataset_is_quadstore() -> bool:
    """Check if the dataset is a quadstore."""
    # Reading a module-level name needs no ``global`` declaration.
    return dataset_is_quadstore

561 

def get_shacl_graph() -> Graph:
    """Get the SHACL shapes graph."""
    # Reading a module-level name needs no ``global`` declaration.
    return shacl_graph

567 

def get_classes_with_multiple_shapes() -> set:
    """Get the set of classes that have multiple visible shapes."""
    # Fall back to an empty set while initialization has not populated it.
    return set() if not classes_with_multiple_shapes else classes_with_multiple_shapes