Coverage for heritrace/extensions.py: 99%
302 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-10-13 17:12 +0000
1# heritrace/extensions.py
3import json
4import logging
5import os
6import time
7from collections import defaultdict
8from datetime import datetime, timedelta
9from typing import Dict
10from urllib.parse import urlparse, urlunparse
12import yaml
13from flask import Flask, current_app, g, redirect, session, url_for
14from flask_babel import Babel
15from flask_login import LoginManager
16from flask_login.signals import user_loaded_from_cookie
17from heritrace.models import User
18from heritrace.services.resource_lock_manager import ResourceLockManager
19from heritrace.uri_generator.uri_generator import URIGenerator
20from heritrace.utils.filters import Filter
21from rdflib import Graph
22from rdflib_ocdm.counter_handler.counter_handler import CounterHandler
23from redis import Redis
24from SPARQLWrapper import JSON, SPARQLWrapper
25from time_agnostic_library.support import generate_config_file
# Global variables
# Module-level shared state. Populated during init_sparql_services() /
# initialize_global_variables() and read elsewhere through the get_*()
# accessor functions defined at the bottom of this module.
initialization_done = False  # guards one-time setup in init_sparql_services()
dataset_endpoint = None  # dataset SPARQL endpoint URL (Docker-adjusted)
provenance_endpoint = None  # provenance SPARQL endpoint URL (Docker-adjusted)
sparql = None  # SPARQLWrapperWithRetry for the dataset endpoint
provenance_sparql = None  # SPARQLWrapperWithRetry for the provenance endpoint
change_tracking_config = None  # dict produced by initialize_change_tracking_config()
form_fields_cache = None  # form fields derived from the SHACL shapes graph
custom_filter = None  # Filter instance whose methods back the Jinja filters
redis_client = None  # Redis client, also used by ResourceLockManager
display_rules = None  # 'rules' list loaded from the DISPLAY_RULES_PATH YAML
dataset_is_quadstore = None  # whether the dataset store is a quadstore
shacl_graph = None  # rdflib Graph holding the SHACL shapes
classes_with_multiple_shapes = None  # set of class URIs with >1 visible shape
class SPARQLWrapperWithRetry(SPARQLWrapper):
    """
    SPARQLWrapper subclass that adds automatic retries with exponential
    backoff and applies a per-query timeout via the wrapper's own
    setTimeout() mechanism.
    """

    def __init__(self, endpoint, **kwargs):
        # Pull the retry/timeout tuning knobs out of kwargs before the
        # remainder is forwarded to the SPARQLWrapper constructor.
        self.max_attempts = kwargs.pop('max_attempts', 3)
        self.initial_delay = kwargs.pop('initial_delay', 1.0)
        self.backoff_factor = kwargs.pop('backoff_factor', 2.0)
        timeout_seconds = kwargs.pop('timeout', 5.0)

        super().__init__(endpoint, **kwargs)
        self.setTimeout(int(timeout_seconds))

    def query(self):
        """
        Execute the query, retrying up to max_attempts times with an
        exponentially growing delay between attempts.

        Returns the original SPARQLWrapper.QueryResult so that convert()
        can still be called on it; re-raises the last failure once every
        attempt has been exhausted.
        """
        logger = logging.getLogger(__name__)
        last_exception = None
        delay = self.initial_delay

        for attempt in range(1, self.max_attempts + 1):
            try:
                return super().query()
            except Exception as e:
                last_exception = e
                logger.warning(f"SPARQL query attempt {attempt}/{self.max_attempts} failed: {str(e)}")
                if attempt < self.max_attempts:
                    logger.info(f"Retrying in {delay:.2f} seconds...")
                    time.sleep(delay)
                    delay *= self.backoff_factor

        logger.error(f"All {self.max_attempts} SPARQL query attempts failed")
        raise last_exception
def init_extensions(app: Flask, babel: Babel, login_manager: LoginManager, redis: Redis):
    """
    Wire up the Flask extensions and the shared module-level services.

    Args:
        app: Flask application instance
        babel: Babel extension instance
        login_manager: LoginManager instance
        redis: Redis client instance
    """
    global redis_client
    redis_client = redis

    # Babel: the UI language is taken from the session (default English).
    babel.init_app(
        app=app,
        locale_selector=lambda: session.get('lang', 'en'),
        default_translation_directories=app.config['BABEL_TRANSLATION_DIRECTORIES']
    )

    init_login_manager(app, login_manager)  # Flask-Login setup
    init_sparql_services(app)               # SPARQL endpoints + related services
    init_filters(app)                       # Jinja template filters
    init_request_handlers(app)              # per-request hooks

    # Expose the extensions on the app object for later access.
    app.babel = babel
    app.login_manager = login_manager
    app.redis_client = redis_client
def init_login_manager(app, login_manager: LoginManager):
    """Configure the Flask-Login extension."""
    login_manager.init_app(app)
    login_manager.login_view = 'auth.login'

    @login_manager.unauthorized_handler
    def _unauthorized():
        # Anonymous users hitting a protected view go to the login page.
        return redirect(url_for('auth.login'))

    @login_manager.user_loader
    def load_user(user_id):
        # The ORCID doubles as the user id; the display name lives in session.
        display_name = session.get('user_name', 'Unknown User')
        return User(id=user_id, name=display_name, orcid=user_id)

    @user_loaded_from_cookie.connect
    def rotate_session_token(sender, user):
        # Mark the session dirty so Flask reissues the cookie.
        session.modified = True
def initialize_change_tracking_config(app: Flask, adjusted_dataset_endpoint=None, adjusted_provenance_endpoint=None):
    """
    Initialize and return the change tracking configuration JSON.
    Uses pre-adjusted endpoints if provided to avoid redundant adjustments.

    Args:
        app: Flask application instance
        adjusted_dataset_endpoint: Dataset endpoint URL already adjusted for Docker
        adjusted_provenance_endpoint: Provenance endpoint URL already adjusted for Docker

    Returns:
        dict: The loaded configuration dictionary

    Raises:
        RuntimeError: if the configuration cannot be generated, parsed or read.
    """
    config_needs_generation = False
    config_path = None
    config = None

    # Reuse an existing config file when one is configured and present;
    # otherwise fall back to generating one under the instance path.
    if 'CHANGE_TRACKING_CONFIG' in app.config:
        config_path = app.config['CHANGE_TRACKING_CONFIG']
        if not os.path.exists(config_path):
            app.logger.warning(f"Change tracking configuration file not found at specified path: {config_path}")
            config_needs_generation = True
    else:
        config_needs_generation = True
        config_path = os.path.join(app.instance_path, 'change_tracking_config.json')
        os.makedirs(app.instance_path, exist_ok=True)

    if config_needs_generation:
        dataset_urls = [adjusted_dataset_endpoint] if adjusted_dataset_endpoint else []
        provenance_urls = [adjusted_provenance_endpoint] if adjusted_provenance_endpoint else []

        cache_endpoint = adjust_endpoint_url(app.config.get('CACHE_ENDPOINT', ''))
        cache_update_endpoint = adjust_endpoint_url(app.config.get('CACHE_UPDATE_ENDPOINT', ''))

        # Full-text search flags depend on both the triplestore flavour
        # and the text-index toggle.
        db_triplestore = app.config.get('DATASET_DB_TRIPLESTORE', '').lower()
        text_index_enabled = app.config.get('DATASET_DB_TEXT_INDEX_ENABLED', False)

        blazegraph_search = db_triplestore == 'blazegraph' and text_index_enabled
        fuseki_search = db_triplestore == 'fuseki' and text_index_enabled
        virtuoso_search = db_triplestore == 'virtuoso' and text_index_enabled

        graphdb_connector = '' #TODO: Add graphdb support

        try:
            config = generate_config_file(
                config_path=config_path,
                dataset_urls=dataset_urls,
                dataset_dirs=app.config.get('DATASET_DIRS', []),
                dataset_is_quadstore=app.config.get('DATASET_IS_QUADSTORE', False),
                provenance_urls=provenance_urls,
                provenance_is_quadstore=app.config.get('PROVENANCE_IS_QUADSTORE', False),
                provenance_dirs=app.config.get('PROVENANCE_DIRS', []),
                blazegraph_full_text_search=blazegraph_search,
                fuseki_full_text_search=fuseki_search,
                virtuoso_full_text_search=virtuoso_search,
                graphdb_connector_name=graphdb_connector,
                cache_endpoint=cache_endpoint,
                cache_update_endpoint=cache_update_endpoint
            )
            app.logger.info(f"Generated new change tracking configuration at: {config_path}")
        except Exception as e:
            # Chain the original exception so the root cause is preserved.
            raise RuntimeError(f"Failed to generate change tracking configuration: {str(e)}") from e

    # Load and validate the configuration
    try:
        if not config:
            with open(config_path, 'r', encoding='utf8') as f:
                config = json.load(f)

        # Adjust cache URLs if needed
        if config['cache_triplestore_url'].get('endpoint'):
            config['cache_triplestore_url']['endpoint'] = adjust_endpoint_url(
                config['cache_triplestore_url']['endpoint']
            )

        if config['cache_triplestore_url'].get('update_endpoint'):
            config['cache_triplestore_url']['update_endpoint'] = adjust_endpoint_url(
                config['cache_triplestore_url']['update_endpoint']
            )

    except json.JSONDecodeError as e:
        raise RuntimeError(f"Invalid change tracking configuration JSON at {config_path}: {str(e)}") from e
    except Exception as e:
        raise RuntimeError(f"Error reading change tracking configuration at {config_path}: {str(e)}") from e

    app.config['CHANGE_TRACKING_CONFIG'] = config_path
    return config
def need_initialization(app: Flask):
    """
    Decide whether counter handler initialization is needed.

    With an external Redis (REDIS_URL differing from the default) the
    counters are assumed to be already populated and False is returned.
    With the internal Redis, the cached initialization timestamp is
    checked against CACHE_VALIDITY_DAYS.
    """
    uri_generator = app.config['URI_GENERATOR']
    if not hasattr(uri_generator, "counter_handler"):
        return False

    default_redis_url = 'redis://localhost:6379/0'
    redis_url = os.environ.get('REDIS_URL', default_redis_url)
    if redis_url != default_redis_url:
        app.logger.info(f"Using external Redis at {redis_url} - skipping counter initialization")
        return False

    # Internal Redis: compare the stored timestamp against the validity window.
    cache_validity_days = app.config['CACHE_VALIDITY_DAYS']
    try:
        last_init_str = redis_client.get('heritrace:last_initialization')
        if not last_init_str:
            return True
        last_init = datetime.fromisoformat(last_init_str.decode('utf-8'))
        return datetime.now() - last_init > timedelta(days=cache_validity_days)
    except Exception:
        # Any cache read/parse failure means the cache cannot be trusted.
        return True
def update_cache(app: Flask):
    """Record the current initialization timestamp (and cache version) in Redis."""
    redis_client.set('heritrace:last_initialization', datetime.now().isoformat())
    redis_client.set('heritrace:cache_version', '1.0')
def initialize_counter_handler(app: Flask):
    """
    Initialize the counter handler used for URI generation, if needed.

    The whole step is skipped when need_initialization() reports that the
    cached counters are still valid (or that an external Redis is in use).
    """
    if not need_initialization(app):
        return

    uri_generator: URIGenerator = app.config['URI_GENERATOR']
    counter_handler: CounterHandler = uri_generator.counter_handler

    # Seed the URI-generator-specific counters from the dataset endpoint.
    uri_generator.initialize_counters(sparql)

    # Count the snapshots per entity in the provenance store: distinct
    # snapshot resources specializing each entity (the first snapshot has
    # no wasDerivedFrom, hence the OPTIONAL clause).
    prov_query = """
        SELECT ?entity (COUNT(DISTINCT ?snapshot) as ?count)
        WHERE {
            ?snapshot a <http://www.w3.org/ns/prov#Entity> ;
                      <http://www.w3.org/ns/prov#specializationOf> ?entity .
            OPTIONAL {
                ?snapshot <http://www.w3.org/ns/prov#wasDerivedFrom> ?prev .
            }
        }
        GROUP BY ?entity
    """

    # Run the query against the provenance endpoint and set the snapshot
    # counter for every entity found.
    provenance_sparql.setQuery(prov_query)
    provenance_sparql.setReturnFormat(JSON)
    prov_results = provenance_sparql.query().convert()

    for binding in prov_results["results"]["bindings"]:
        entity_uri = binding["entity"]["value"]
        snapshot_count = int(binding["count"]["value"])
        counter_handler.set_counter(snapshot_count, entity_uri)

    update_cache(app)
def identify_classes_with_multiple_shapes():
    """
    Identify classes that have multiple VISIBLE shapes associated with them.

    Only classes whose visible-shape count exceeds one are returned, which
    avoids unnecessary downstream processing.

    Returns:
        Set[str]: Set of class URIs that have multiple visible shapes
    """
    global display_rules, shacl_graph

    if not display_rules or not shacl_graph:
        return set()

    from heritrace.utils.display_rules_utils import is_entity_type_visible

    shapes_by_class = defaultdict(set)

    def record_if_visible(class_uri, shape_uri):
        # Only (class, shape) pairs that are visible count toward the total.
        if is_entity_type_visible((class_uri, shape_uri)):
            shapes_by_class[class_uri].add(shape_uri)

    for rule in display_rules:
        target = rule.get("target", {})

        if "class" in target:
            # Rule targets a class: look up every shape targeting it.
            class_uri = target["class"]
            if shacl_graph:
                query_string = f"""
                SELECT DISTINCT ?shape WHERE {{
                    ?shape <http://www.w3.org/ns/shacl#targetClass> <{class_uri}> .
                }}
                """
                for row in shacl_graph.query(query_string):
                    record_if_visible(class_uri, str(row.shape))

        elif "shape" in target:
            # Rule targets a shape: look up every class that shape targets.
            shape_uri = target["shape"]
            if shacl_graph:
                query_string = f"""
                SELECT DISTINCT ?class WHERE {{
                    <{shape_uri}> <http://www.w3.org/ns/shacl#targetClass> ?class .
                }}
                """
                for row in shacl_graph.query(query_string):
                    record_if_visible(str(row[0]), shape_uri)

    return {class_uri for class_uri, shapes in shapes_by_class.items() if len(shapes) > 1}
def initialize_global_variables(app: Flask):
    """
    Initialize all global variables including form fields cache, display rules,
    and dataset configuration from SHACL shapes graph and configuration files.

    Note: when SHACL_PATH is configured but the file is missing, the function
    logs a warning and returns early, leaving form_fields_cache and
    classes_with_multiple_shapes unset (None).

    Args:
        app: Flask application instance

    Raises:
        RuntimeError: if loading the display rules or the SHACL shapes fails.
    """
    global shacl_graph, form_fields_cache, display_rules, dataset_is_quadstore, classes_with_multiple_shapes

    try:
        dataset_is_quadstore = app.config.get('DATASET_IS_QUADSTORE', False)

        # Display rules: optional YAML file whose 'rules' key is loaded.
        if app.config.get('DISPLAY_RULES_PATH'):
            if not os.path.exists(app.config['DISPLAY_RULES_PATH']):
                app.logger.warning(f"Display rules file not found at: {app.config['DISPLAY_RULES_PATH']}")
            else:
                try:
                    with open(app.config['DISPLAY_RULES_PATH'], 'r') as f:
                        yaml_content = yaml.safe_load(f)
                        display_rules = yaml_content['rules']
                except Exception as e:
                    app.logger.error(f"Error loading display rules: {str(e)}")
                    raise RuntimeError(f"Failed to load display rules: {str(e)}")
        else:
            display_rules = []

        # SHACL shapes: parsed into a graph and turned into form fields.
        if app.config.get('SHACL_PATH'):
            if not os.path.exists(app.config['SHACL_PATH']):
                app.logger.warning(f"SHACL file not found at: {app.config['SHACL_PATH']}")
                # Early return: see the docstring note on partially-set globals.
                return

            try:
                shacl_graph = Graph()
                shacl_graph.parse(source=app.config['SHACL_PATH'], format="turtle")

                from heritrace.utils.shacl_utils import \
                    get_form_fields_from_shacl
                form_fields_cache = get_form_fields_from_shacl(shacl_graph, display_rules, app=app)
            except Exception as e:
                app.logger.error(f"Error initializing form fields from SHACL: {str(e)}")
                raise RuntimeError(f"Failed to initialize form fields: {str(e)}")
        else:
            # No SHACL configured: empty graph and no form fields.
            shacl_graph = Graph()
            form_fields_cache = {}

        classes_with_multiple_shapes = identify_classes_with_multiple_shapes()

        app.logger.info("Global variables initialized successfully")

    except Exception as e:
        app.logger.error(f"Error during global variables initialization: {str(e)}")
        raise RuntimeError(f"Global variables initialization failed: {str(e)}")
def init_sparql_services(app: Flask):
    """Initialize SPARQL endpoints and related services (runs at most once)."""
    global initialization_done, dataset_endpoint, provenance_endpoint, sparql, provenance_sparql, change_tracking_config

    if initialization_done:
        return

    # Endpoint URLs are rewritten for Docker networking when necessary.
    dataset_endpoint = adjust_endpoint_url(app.config['DATASET_DB_URL'])
    provenance_endpoint = adjust_endpoint_url(app.config['PROVENANCE_DB_URL'])

    sparql = SPARQLWrapperWithRetry(dataset_endpoint, timeout=30.0)
    provenance_sparql = SPARQLWrapperWithRetry(provenance_endpoint, timeout=30.0)

    change_tracking_config = initialize_change_tracking_config(
        app,
        adjusted_dataset_endpoint=dataset_endpoint,
        adjusted_provenance_endpoint=provenance_endpoint
    )

    initialize_counter_handler(app)
    initialize_global_variables(app)
    initialization_done = True
def init_filters(app: Flask):
    """Build the custom Filter instance and register the Jinja template filters."""
    global custom_filter

    context_path = os.path.join(os.path.dirname(__file__), "utils", "context.json")
    with open(context_path, "r") as config_file:
        context = json.load(config_file)["@context"]

    # NOTE: this local shadows the module-level display_rules; the filter
    # gets its own copy read directly from the YAML file.
    display_rules = None
    if app.config["DISPLAY_RULES_PATH"]:
        with open(app.config["DISPLAY_RULES_PATH"], 'r') as f:
            display_rules = yaml.safe_load(f).get('rules', [])

    custom_filter = Filter(context, display_rules, dataset_endpoint)

    from heritrace.utils.filters import split_namespace

    # Register every filter in one pass.
    app.jinja_env.filters.update({
        'human_readable_predicate': custom_filter.human_readable_predicate,
        'human_readable_class': custom_filter.human_readable_class,
        'human_readable_entity': custom_filter.human_readable_entity,
        'human_readable_primary_source': custom_filter.human_readable_primary_source,
        'format_datetime': custom_filter.human_readable_datetime,
        'split_ns': split_namespace,
        'format_source_reference': custom_filter.format_source_reference,
        'format_agent_reference': custom_filter.format_agent_reference,
    })
def init_request_handlers(app):
    """Register per-request setup and app-context teardown hooks."""

    @app.before_request
    def _attach_lock_manager():
        """Attach a ResourceLockManager to the request context (once)."""
        if not hasattr(g, 'resource_lock_manager'):
            g.resource_lock_manager = ResourceLockManager(redis_client)

    @app.teardown_appcontext
    def _release_lock_manager(error):
        """Drop the lock manager when the app context is torn down."""
        if hasattr(g, 'resource_lock_manager'):
            del g.resource_lock_manager
def adjust_endpoint_url(url: str) -> str:
    """
    Adjust endpoint URLs to work properly within Docker containers.

    When running in Docker, a URL whose host is a local address
    (localhost, 127.0.0.1, 0.0.0.0) is rewritten to host.docker.internal
    so the container can reach services running on the host machine.

    Args:
        url: The endpoint URL to adjust

    Returns:
        The adjusted URL if running in Docker and the host is local,
        the original URL otherwise
    """
    if not running_in_docker():
        return url

    local_hosts = {'localhost', '127.0.0.1', '0.0.0.0'}
    parsed_url = urlparse(url)

    # Compare the exact hostname instead of substring-matching the netloc,
    # so hosts such as "notlocalhost.example" are not rewritten by mistake
    # (and userinfo like user:pass@host no longer confuses the port logic).
    if parsed_url.hostname in local_hosts:
        port = parsed_url.port
        new_netloc = f'host.docker.internal:{port}' if port is not None else 'host.docker.internal'
        url_parts = list(parsed_url)
        url_parts[1] = new_netloc
        return urlunparse(url_parts)

    return url


def running_in_docker() -> bool:
    """Check if the application is running inside a Docker container."""
    return os.path.exists('/.dockerenv')
def get_dataset_endpoint() -> str:
    """Get the configured dataset endpoint URL."""
    # Read-only access: no `global` declaration required.
    return dataset_endpoint


def get_sparql() -> SPARQLWrapperWithRetry:
    """Get the configured SPARQL wrapper for the dataset endpoint with built-in retry mechanism."""
    return sparql


def get_provenance_endpoint() -> str:
    """Get the configured provenance endpoint URL."""
    return provenance_endpoint


def get_provenance_sparql() -> SPARQLWrapperWithRetry:
    """Get the configured SPARQL wrapper for the provenance endpoint with built-in retry mechanism."""
    return provenance_sparql
def get_counter_handler() -> CounterHandler:
    """
    Get the configured CounterHandler instance from the URIGenerator.

    Raises:
        RuntimeError: when the URIGenerator is missing or has no counter_handler.
    """
    uri_generator: URIGenerator = current_app.config.get('URI_GENERATOR')
    if not (uri_generator and hasattr(uri_generator, 'counter_handler')):
        # Either the generator was never configured or it lacks a handler.
        current_app.logger.error("CounterHandler not found in URIGenerator config.")
        raise RuntimeError("CounterHandler is not available. Initialization might have failed.")
    return uri_generator.counter_handler
def get_custom_filter() -> Filter:
    """Get the configured custom filter instance."""
    # Read-only access: no `global` declaration required.
    return custom_filter


def get_change_tracking_config() -> Dict:
    """Get the change tracking configuration."""
    return change_tracking_config


def get_display_rules() -> Dict:
    """Get the display_rules configuration."""
    return display_rules


def get_form_fields() -> Dict:
    """Get the form_fields configuration."""
    return form_fields_cache


def get_dataset_is_quadstore() -> bool:
    """Check if the dataset is a quadstore."""
    return dataset_is_quadstore


def get_shacl_graph() -> Graph:
    """Get the SHACL shapes graph."""
    return shacl_graph


def get_classes_with_multiple_shapes() -> set:
    """Get the set of classes that have multiple visible shapes."""
    # Falls back to an empty set when initialization never populated it.
    return classes_with_multiple_shapes or set()