Coverage for heritrace/extensions.py: 99%

302 statements  

coverage.py v7.6.12, created at 2025-10-13 17:12 +0000

# heritrace/extensions.py

import json
import logging
import os
import time
from collections import defaultdict
from datetime import datetime, timedelta
from typing import Dict
from urllib.parse import urlparse, urlunparse

import yaml
from flask import Flask, current_app, g, redirect, session, url_for
from flask_babel import Babel
from flask_login import LoginManager
from flask_login.signals import user_loaded_from_cookie
from heritrace.models import User
from heritrace.services.resource_lock_manager import ResourceLockManager
from heritrace.uri_generator.uri_generator import URIGenerator
from heritrace.utils.filters import Filter
from rdflib import Graph
from rdflib_ocdm.counter_handler.counter_handler import CounterHandler
from redis import Redis
from SPARQLWrapper import JSON, SPARQLWrapper
from time_agnostic_library.support import generate_config_file

# Global variables
initialization_done = False
dataset_endpoint = None
provenance_endpoint = None
sparql = None
provenance_sparql = None
change_tracking_config = None
form_fields_cache = None
custom_filter = None
redis_client = None
display_rules = None
dataset_is_quadstore = None
shacl_graph = None
classes_with_multiple_shapes = None


class SPARQLWrapperWithRetry(SPARQLWrapper):
    """
    Extension of SPARQLWrapper that includes automatic retry functionality and timeout handling.
    Uses SPARQLWrapper's built-in timeout functionality.
    """
    def __init__(self, endpoint, **kwargs):
        self.max_attempts = kwargs.pop('max_attempts', 3)
        self.initial_delay = kwargs.pop('initial_delay', 1.0)
        self.backoff_factor = kwargs.pop('backoff_factor', 2.0)
        query_timeout = kwargs.pop('timeout', 5.0)

        super().__init__(endpoint, **kwargs)

        self.setTimeout(int(query_timeout))

    def query(self):
        """
        Override the query method to include retry logic with SPARQLWrapper's built-in timeout.
        Returns the original SPARQLWrapper.QueryResult so that convert() can be called on it.
        """
        logger = logging.getLogger(__name__)

        attempt = 1
        delay = self.initial_delay
        last_exception = None

        while attempt <= self.max_attempts:
            try:
                result = super().query()
                return result

            except Exception as e:
                last_exception = e
                logger.warning(f"SPARQL query attempt {attempt}/{self.max_attempts} failed: {str(e)}")

                if attempt < self.max_attempts:
                    logger.info(f"Retrying in {delay:.2f} seconds...")
                    time.sleep(delay)
                    delay *= self.backoff_factor

                attempt += 1

        logger.error(f"All {self.max_attempts} SPARQL query attempts failed")
        raise last_exception
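
# Illustrative usage (not part of the original module): a SPARQLWrapperWithRetry
# instance behaves like a regular SPARQLWrapper, so results are converted the same
# way; the endpoint URL below is a placeholder assumption.
#
#     wrapper = SPARQLWrapperWithRetry(
#         "http://localhost:8890/sparql",
#         max_attempts=3, initial_delay=1.0, backoff_factor=2.0, timeout=10.0,
#     )
#     wrapper.setQuery("SELECT ?s WHERE { ?s ?p ?o } LIMIT 5")
#     wrapper.setReturnFormat(JSON)
#     bindings = wrapper.query().convert()["results"]["bindings"]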

def init_extensions(app: Flask, babel: Babel, login_manager: LoginManager, redis: Redis):
    """
    Initialize Flask extensions and configure shared objects.

    Args:
        app: Flask application instance
        babel: Babel extension instance
        login_manager: LoginManager instance
        redis: Redis client instance
    """
    global redis_client

    redis_client = redis

    # Initialize Babel
    babel.init_app(
        app=app,
        locale_selector=lambda: session.get('lang', 'en'),
        default_translation_directories=app.config['BABEL_TRANSLATION_DIRECTORIES']
    )

    # Initialize LoginManager
    init_login_manager(app, login_manager)

    # Initialize SPARQL endpoints and other services
    init_sparql_services(app)

    # Initialize filters
    init_filters(app)

    # Register before_request handlers
    init_request_handlers(app)

    # Store extensions in app context
    app.babel = babel
    app.login_manager = login_manager
    app.redis_client = redis_client
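
# Illustrative wiring (assumed app-factory names and config object, not part of this
# module), showing how init_extensions could be called:
#
#     app = Flask(__name__)
#     app.config.from_object('config.Config')
#     init_extensions(
#         app,
#         Babel(),
#         LoginManager(),
#         Redis.from_url(os.environ.get('REDIS_URL', 'redis://localhost:6379/0')),
#     )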

def init_login_manager(app, login_manager: LoginManager):
    """Configure the Flask-Login extension."""
    login_manager.init_app(app)
    login_manager.login_view = 'auth.login'
    login_manager.unauthorized_handler(lambda: redirect(url_for('auth.login')))

    @login_manager.user_loader
    def load_user(user_id):
        user_name = session.get('user_name', 'Unknown User')
        return User(id=user_id, name=user_name, orcid=user_id)

    @user_loaded_from_cookie.connect
    def rotate_session_token(sender, user):
        session.modified = True

def initialize_change_tracking_config(app: Flask, adjusted_dataset_endpoint=None, adjusted_provenance_endpoint=None):
    """
    Initialize and return the change tracking configuration JSON.
    Uses pre-adjusted endpoints if provided to avoid redundant adjustments.

    Args:
        app: Flask application instance
        adjusted_dataset_endpoint: Dataset endpoint URL already adjusted for Docker
        adjusted_provenance_endpoint: Provenance endpoint URL already adjusted for Docker

    Returns:
        dict: The loaded configuration dictionary
    """
    config_needs_generation = False
    config_path = None
    config = None

    # Check if we have a config path in app.config
    if 'CHANGE_TRACKING_CONFIG' in app.config:
        config_path = app.config['CHANGE_TRACKING_CONFIG']
        if not os.path.exists(config_path):
            app.logger.warning(f"Change tracking configuration file not found at specified path: {config_path}")
            config_needs_generation = True
    else:
        config_needs_generation = True
        config_path = os.path.join(app.instance_path, 'change_tracking_config.json')
        os.makedirs(app.instance_path, exist_ok=True)

    if config_needs_generation:
        dataset_urls = [adjusted_dataset_endpoint] if adjusted_dataset_endpoint else []
        provenance_urls = [adjusted_provenance_endpoint] if adjusted_provenance_endpoint else []

        cache_endpoint = adjust_endpoint_url(app.config.get('CACHE_ENDPOINT', ''))
        cache_update_endpoint = adjust_endpoint_url(app.config.get('CACHE_UPDATE_ENDPOINT', ''))

        db_triplestore = app.config.get('DATASET_DB_TRIPLESTORE', '').lower()
        text_index_enabled = app.config.get('DATASET_DB_TEXT_INDEX_ENABLED', False)

        blazegraph_search = db_triplestore == 'blazegraph' and text_index_enabled
        fuseki_search = db_triplestore == 'fuseki' and text_index_enabled
        virtuoso_search = db_triplestore == 'virtuoso' and text_index_enabled

        graphdb_connector = ''  # TODO: Add graphdb support

        try:
            config = generate_config_file(
                config_path=config_path,
                dataset_urls=dataset_urls,
                dataset_dirs=app.config.get('DATASET_DIRS', []),
                dataset_is_quadstore=app.config.get('DATASET_IS_QUADSTORE', False),
                provenance_urls=provenance_urls,
                provenance_is_quadstore=app.config.get('PROVENANCE_IS_QUADSTORE', False),
                provenance_dirs=app.config.get('PROVENANCE_DIRS', []),
                blazegraph_full_text_search=blazegraph_search,
                fuseki_full_text_search=fuseki_search,
                virtuoso_full_text_search=virtuoso_search,
                graphdb_connector_name=graphdb_connector,
                cache_endpoint=cache_endpoint,
                cache_update_endpoint=cache_update_endpoint
            )
            app.logger.info(f"Generated new change tracking configuration at: {config_path}")
        except Exception as e:
            raise RuntimeError(f"Failed to generate change tracking configuration: {str(e)}")

    # Load and validate the configuration
    try:
        if not config:
            with open(config_path, 'r', encoding='utf8') as f:
                config = json.load(f)

        # Adjust cache URLs if needed
        if config['cache_triplestore_url'].get('endpoint'):
            config['cache_triplestore_url']['endpoint'] = adjust_endpoint_url(
                config['cache_triplestore_url']['endpoint']
            )

        if config['cache_triplestore_url'].get('update_endpoint'):
            config['cache_triplestore_url']['update_endpoint'] = adjust_endpoint_url(
                config['cache_triplestore_url']['update_endpoint']
            )

    except json.JSONDecodeError as e:
        raise RuntimeError(f"Invalid change tracking configuration JSON at {config_path}: {str(e)}")
    except Exception as e:
        raise RuntimeError(f"Error reading change tracking configuration at {config_path}: {str(e)}")

    app.config['CHANGE_TRACKING_CONFIG'] = config_path
    return config
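
# Illustrative fragment (assumed values; only the keys shown are read back by this
# module) of the configuration dictionary returned above:
#
#     {
#         "cache_triplestore_url": {
#             "endpoint": "http://localhost:29999/sparql",
#             "update_endpoint": "http://localhost:29999/sparql"
#         },
#         ...
#     }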

def need_initialization(app: Flask):
    """
    Check if counter handler initialization is needed.

    When using external Redis (non-default REDIS_URL), assumes counters are already
    populated and returns False. For internal Redis, checks cache validity.
    """
    uri_generator = app.config['URI_GENERATOR']

    if not hasattr(uri_generator, "counter_handler"):
        return False

    redis_url = os.environ.get('REDIS_URL', 'redis://localhost:6379/0')
    is_external_redis = redis_url != 'redis://localhost:6379/0'

    if is_external_redis:
        app.logger.info(f"Using external Redis at {redis_url} - skipping counter initialization")
        return False

    # For internal Redis, check cache validity
    cache_validity_days = app.config['CACHE_VALIDITY_DAYS']

    try:
        last_init_str = redis_client.get('heritrace:last_initialization')
        if not last_init_str:
            return True

        last_init = datetime.fromisoformat(last_init_str.decode('utf-8'))
        return datetime.now() - last_init > timedelta(days=cache_validity_days)
    except Exception:
        return True

def update_cache(app: Flask):
    """
    Update Redis with current initialization timestamp.
    """
    current_time = datetime.now().isoformat()
    redis_client.set('heritrace:last_initialization', current_time)
    redis_client.set('heritrace:cache_version', '1.0')
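
# Illustrative check (mirrors need_initialization above, assuming a local Redis and a
# CACHE_VALIDITY_DAYS of 7):
#
#     from datetime import datetime, timedelta
#     from redis import Redis
#
#     r = Redis()
#     raw = r.get('heritrace:last_initialization')
#     stale = (raw is None or
#              datetime.now() - datetime.fromisoformat(raw.decode('utf-8')) > timedelta(days=7))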

def initialize_counter_handler(app: Flask):
    """
    Initialize the counter handler for URI generation if needed.
    Skips initialization for external Redis (detected automatically in need_initialization).
    """
    if not need_initialization(app):
        return

    uri_generator: URIGenerator = app.config['URI_GENERATOR']
    counter_handler: CounterHandler = uri_generator.counter_handler

    # Initialize the counters specific to the URI generator
    uri_generator.initialize_counters(sparql)

    # Query to count the snapshots in the provenance graph: for each entity, count the
    # distinct snapshots that specialize it (the first snapshot has no wasDerivedFrom
    # link, which is why that pattern is OPTIONAL)
    prov_query = """
        SELECT ?entity (COUNT(DISTINCT ?snapshot) as ?count)
        WHERE {
            ?snapshot a <http://www.w3.org/ns/prov#Entity> ;
                      <http://www.w3.org/ns/prov#specializationOf> ?entity .
            OPTIONAL {
                ?snapshot <http://www.w3.org/ns/prov#wasDerivedFrom> ?prev .
            }
        }
        GROUP BY ?entity
    """

    # Run the query against the provenance store and set the snapshot counters
    provenance_sparql.setQuery(prov_query)
    provenance_sparql.setReturnFormat(JSON)
    prov_results = provenance_sparql.query().convert()

    for result in prov_results["results"]["bindings"]:
        entity = result["entity"]["value"]
        count = int(result["count"]["value"])
        counter_handler.set_counter(count, entity)

    update_cache(app)
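
# Illustrative binding (assumed values, standard SPARQL 1.1 JSON results format) as
# consumed by the loop above:
#
#     {
#         "entity": {"type": "uri", "value": "https://example.org/entity/1"},
#         "count": {"type": "literal", "value": "3",
#                   "datatype": "http://www.w3.org/2001/XMLSchema#integer"}
#     }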

def identify_classes_with_multiple_shapes():
    """
    Identify classes that have multiple VISIBLE shapes associated with them.
    Only returns classes where multiple shapes are actually visible to avoid unnecessary processing.

    Returns:
        Set[str]: Set of class URIs that have multiple visible shapes
    """
    global display_rules, shacl_graph

    if not display_rules or not shacl_graph:
        return set()

    from heritrace.utils.display_rules_utils import is_entity_type_visible

    class_to_shapes = defaultdict(set)

    for rule in display_rules:
        target = rule.get("target", {})

        if "class" in target:
            class_uri = target["class"]
            if shacl_graph:
                query_string = f"""
                    SELECT DISTINCT ?shape WHERE {{
                        ?shape <http://www.w3.org/ns/shacl#targetClass> <{class_uri}> .
                    }}
                """
                results = shacl_graph.query(query_string)
                for row in results:
                    shape_uri = str(row.shape)
                    entity_key = (class_uri, shape_uri)
                    if is_entity_type_visible(entity_key):
                        class_to_shapes[class_uri].add(shape_uri)

        elif "shape" in target:
            shape_uri = target["shape"]
            if shacl_graph:
                query_string = f"""
                    SELECT DISTINCT ?class WHERE {{
                        <{shape_uri}> <http://www.w3.org/ns/shacl#targetClass> ?class .
                    }}
                """
                results = shacl_graph.query(query_string)
                for row in results:
                    class_uri = str(row[0])
                    entity_key = (class_uri, shape_uri)
                    if is_entity_type_visible(entity_key):
                        class_to_shapes[class_uri].add(shape_uri)

    return {class_uri for class_uri, shapes in class_to_shapes.items() if len(shapes) > 1}
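
# Illustrative pairing (URIs are assumptions, not taken from the project's data) of a
# display rule target and the SHACL shape the queries above resolve it to:
#
#     display rule target:  {"class": "http://example.org/Article"}
#     SHACL (Turtle):       <http://example.org/shapes/ArticleShape>
#                               sh:targetClass <http://example.org/Article> .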

def initialize_global_variables(app: Flask):
    """
    Initialize all global variables including form fields cache, display rules,
    and dataset configuration from SHACL shapes graph and configuration files.

    Args:
        app: Flask application instance
    """
    global shacl_graph, form_fields_cache, display_rules, dataset_is_quadstore, classes_with_multiple_shapes

    try:
        dataset_is_quadstore = app.config.get('DATASET_IS_QUADSTORE', False)

        if app.config.get('DISPLAY_RULES_PATH'):
            if not os.path.exists(app.config['DISPLAY_RULES_PATH']):
                app.logger.warning(f"Display rules file not found at: {app.config['DISPLAY_RULES_PATH']}")
            else:
                try:
                    with open(app.config['DISPLAY_RULES_PATH'], 'r') as f:
                        yaml_content = yaml.safe_load(f)
                        display_rules = yaml_content['rules']
                except Exception as e:
                    app.logger.error(f"Error loading display rules: {str(e)}")
                    raise RuntimeError(f"Failed to load display rules: {str(e)}")
        else:
            display_rules = []

        if app.config.get('SHACL_PATH'):
            if not os.path.exists(app.config['SHACL_PATH']):
                app.logger.warning(f"SHACL file not found at: {app.config['SHACL_PATH']}")
                return

            try:
                shacl_graph = Graph()
                shacl_graph.parse(source=app.config['SHACL_PATH'], format="turtle")

                from heritrace.utils.shacl_utils import get_form_fields_from_shacl
                form_fields_cache = get_form_fields_from_shacl(shacl_graph, display_rules, app=app)
            except Exception as e:
                app.logger.error(f"Error initializing form fields from SHACL: {str(e)}")
                raise RuntimeError(f"Failed to initialize form fields: {str(e)}")
        else:
            shacl_graph = Graph()
            form_fields_cache = {}

        classes_with_multiple_shapes = identify_classes_with_multiple_shapes()

        app.logger.info("Global variables initialized successfully")

    except Exception as e:
        app.logger.error(f"Error during global variables initialization: {str(e)}")
        raise RuntimeError(f"Global variables initialization failed: {str(e)}")

def init_sparql_services(app: Flask):
    """Initialize SPARQL endpoints and related services."""
    global initialization_done, dataset_endpoint, provenance_endpoint, sparql, provenance_sparql, change_tracking_config

    if not initialization_done:
        dataset_endpoint = adjust_endpoint_url(app.config['DATASET_DB_URL'])
        provenance_endpoint = adjust_endpoint_url(app.config['PROVENANCE_DB_URL'])

        sparql = SPARQLWrapperWithRetry(dataset_endpoint, timeout=30.0)
        provenance_sparql = SPARQLWrapperWithRetry(provenance_endpoint, timeout=30.0)

        change_tracking_config = initialize_change_tracking_config(
            app,
            adjusted_dataset_endpoint=dataset_endpoint,
            adjusted_provenance_endpoint=provenance_endpoint
        )

        initialize_counter_handler(app)
        initialize_global_variables(app)
        initialization_done = True

def init_filters(app: Flask):
    """Initialize custom template filters."""
    global custom_filter

    with open(os.path.join(os.path.dirname(__file__), "utils", "context.json"), "r") as config_file:
        context = json.load(config_file)["@context"]

    display_rules = None
    if app.config["DISPLAY_RULES_PATH"]:
        with open(app.config["DISPLAY_RULES_PATH"], 'r') as f:
            yaml_content = yaml.safe_load(f)
            display_rules = yaml_content.get('rules', [])

    custom_filter = Filter(context, display_rules, dataset_endpoint)

    app.jinja_env.filters['human_readable_predicate'] = custom_filter.human_readable_predicate
    app.jinja_env.filters['human_readable_class'] = custom_filter.human_readable_class
    app.jinja_env.filters['human_readable_entity'] = custom_filter.human_readable_entity
    app.jinja_env.filters['human_readable_primary_source'] = custom_filter.human_readable_primary_source
    app.jinja_env.filters['format_datetime'] = custom_filter.human_readable_datetime
    from heritrace.utils.filters import split_namespace
    app.jinja_env.filters['split_ns'] = split_namespace
    app.jinja_env.filters['format_source_reference'] = custom_filter.format_source_reference
    app.jinja_env.filters['format_agent_reference'] = custom_filter.format_agent_reference

def init_request_handlers(app):
    """Initialize before_request and teardown_appcontext handlers."""

    @app.before_request
    def initialize_lock_manager():
        """Initialize the resource lock manager for each request."""
        if not hasattr(g, 'resource_lock_manager'):
            g.resource_lock_manager = ResourceLockManager(redis_client)

    @app.teardown_appcontext
    def close_redis_connection(error):
        """Drop the resource lock manager when the application context ends."""
        if hasattr(g, 'resource_lock_manager'):
            del g.resource_lock_manager

def adjust_endpoint_url(url: str) -> str:
    """
    Adjust endpoint URLs to work properly within Docker containers.

    Args:
        url: The endpoint URL to adjust

    Returns:
        The adjusted URL if running in Docker, original URL otherwise
    """
    if not running_in_docker():
        return url

    local_patterns = ['localhost', '127.0.0.1', '0.0.0.0']
    parsed_url = urlparse(url)

    if any(pattern in parsed_url.netloc for pattern in local_patterns):
        netloc_parts = parsed_url.netloc.split(':')
        new_netloc = f'host.docker.internal:{netloc_parts[1]}' if len(netloc_parts) > 1 else 'host.docker.internal'
        url_parts = list(parsed_url)
        url_parts[1] = new_netloc
        return urlunparse(url_parts)

    return url
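
# Illustrative transformations performed above when running inside Docker
# (example URLs are placeholders):
#
#     http://localhost:9999/blazegraph/sparql  ->  http://host.docker.internal:9999/blazegraph/sparql
#     http://127.0.0.1:3030/dataset/sparql     ->  http://host.docker.internal:3030/dataset/sparql
#     http://triplestore:8890/sparql           ->  unchanged (no local pattern in the host)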

def running_in_docker() -> bool:
    """Check if the application is running inside a Docker container."""
    return os.path.exists('/.dockerenv')

def get_dataset_endpoint() -> str:
    """Get the configured dataset endpoint URL."""

    global dataset_endpoint
    return dataset_endpoint

def get_sparql() -> SPARQLWrapperWithRetry:
    """Get the configured SPARQL wrapper for the dataset endpoint with built-in retry mechanism."""

    global sparql
    return sparql

def get_provenance_endpoint() -> str:
    """Get the configured provenance endpoint URL."""

    global provenance_endpoint
    return provenance_endpoint

def get_provenance_sparql() -> SPARQLWrapperWithRetry:
    """Get the configured SPARQL wrapper for the provenance endpoint with built-in retry mechanism."""

    global provenance_sparql
    return provenance_sparql

def get_counter_handler() -> CounterHandler:
    """
    Get the configured CounterHandler instance from the URIGenerator.
    Assumes URIGenerator and its counter_handler are initialized in app.config.
    """
    uri_generator: URIGenerator = current_app.config.get('URI_GENERATOR')
    if uri_generator and hasattr(uri_generator, 'counter_handler'):
        return uri_generator.counter_handler
    else:
        # Handle cases where it might not be initialized yet or configured
        current_app.logger.error("CounterHandler not found in URIGenerator config.")
        raise RuntimeError("CounterHandler is not available. Initialization might have failed.")

def get_custom_filter() -> Filter:
    """Get the configured custom filter instance."""

    global custom_filter
    return custom_filter

def get_change_tracking_config() -> Dict:
    """Get the change tracking configuration."""

    global change_tracking_config
    return change_tracking_config

def get_display_rules() -> Dict:
    """Get the display_rules configuration."""

    global display_rules
    return display_rules

def get_form_fields() -> Dict:
    """Get the form_fields configuration."""

    global form_fields_cache
    return form_fields_cache

def get_dataset_is_quadstore() -> bool:
    """Check if the dataset is a quadstore."""

    global dataset_is_quadstore
    return dataset_is_quadstore

def get_shacl_graph() -> Graph:
    """Get the SHACL shapes graph."""

    global shacl_graph
    return shacl_graph

def get_classes_with_multiple_shapes() -> set:
    """Get the set of classes that have multiple visible shapes."""

    global classes_with_multiple_shapes
    return classes_with_multiple_shapes or set()