Add metrics for tracking when instance key renewal succeeds and fails, as well as when instance key *lookup* fails
This commit is contained in:
parent
a927ce3e0f
commit
bbdf9e074c
10 changed files with 61 additions and 24 deletions
|
@ -4,7 +4,6 @@ from health.models_pre_oci import pre_oci_model as model
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _check_gunicorn(endpoint):
|
||||
def fn(app):
|
||||
""" Returns the status of the gunicorn workers. """
|
||||
|
@ -23,7 +22,9 @@ def _check_gunicorn(endpoint):
|
|||
registry_url = '%s://localhost%s/%s' % (scheme, port, endpoint)
|
||||
try:
|
||||
status_code = client.get(registry_url, verify=False, timeout=2).status_code
|
||||
return (status_code == 200, 'Got non-200 response for worker: %s' % status_code)
|
||||
okay = status_code == 200
|
||||
message = 'Got non-200 response for worker: %s' % status_code if not okay else None
|
||||
return (okay, message)
|
||||
except Exception as ex:
|
||||
logger.exception('Exception when checking worker health: %s', registry_url)
|
||||
return (False, 'Exception when checking worker health: %s' % registry_url)
|
||||
|
@ -61,23 +62,24 @@ def _check_service_key(app):
|
|||
has expired, then will return False.
|
||||
"""
|
||||
if not app.config.get('SETUP_COMPLETE', False):
|
||||
return (True, 'Stack not fully setup')
|
||||
return (True, 'Stack not fully setup; skipping check')
|
||||
|
||||
try:
|
||||
kid = instance_keys.local_key_id
|
||||
except IOError as ex:
|
||||
# Key has not been created yet.
|
||||
return (True, 'Stack not fully setup')
|
||||
return (True, 'Stack not fully setup; skipping check')
|
||||
|
||||
try:
|
||||
result = bool(instance_keys.get_service_key_public_key(kid))
|
||||
return (result, 'Could not find valid instance service key %s' % kid)
|
||||
key_is_valid = bool(instance_keys.get_service_key_public_key(kid))
|
||||
message = 'Could not find valid instance service key %s' % kid if not key_is_valid else None
|
||||
return (key_is_valid, message)
|
||||
except Exception as ex:
|
||||
logger.exception('Got exception when trying to retrieve the instance key')
|
||||
|
||||
# NOTE: We return *True* here if there was an exception when retrieving the key, as it means
|
||||
# the database is down, which will be handled by the database health check.
|
||||
return (True, 'Failed to get instance key due to a database issue')
|
||||
return (True, 'Failed to get instance key due to a database issue; skipping check')
|
||||
|
||||
|
||||
_SERVICES = {
|
||||
|
|
Reference in a new issue