Add metrics for tracking when instance key renewal succeeds and fails, as well as when instance key *lookup* fails

This commit is contained in:
Joseph Schorr 2017-11-10 15:46:09 -05:00 committed by Joseph Schorr
parent a927ce3e0f
commit bbdf9e074c
10 changed files with 61 additions and 24 deletions

View file

@ -4,7 +4,6 @@ from health.models_pre_oci import pre_oci_model as model
logger = logging.getLogger(__name__)
def _check_gunicorn(endpoint):
def fn(app):
""" Returns the status of the gunicorn workers. """
@ -23,7 +22,9 @@ def _check_gunicorn(endpoint):
registry_url = '%s://localhost%s/%s' % (scheme, port, endpoint)
try:
status_code = client.get(registry_url, verify=False, timeout=2).status_code
return (status_code == 200, 'Got non-200 response for worker: %s' % status_code)
okay = status_code == 200
message = 'Got non-200 response for worker: %s' % status_code if not okay else None
return (okay, message)
except Exception as ex:
logger.exception('Exception when checking worker health: %s', registry_url)
return (False, 'Exception when checking worker health: %s' % registry_url)
@ -61,23 +62,24 @@ def _check_service_key(app):
has expired, then will return False.
"""
if not app.config.get('SETUP_COMPLETE', False):
return (True, 'Stack not fully setup')
return (True, 'Stack not fully setup; skipping check')
try:
kid = instance_keys.local_key_id
except IOError as ex:
# Key has not been created yet.
return (True, 'Stack not fully setup')
return (True, 'Stack not fully setup; skipping check')
try:
result = bool(instance_keys.get_service_key_public_key(kid))
return (result, 'Could not find valid instance service key %s' % kid)
key_is_valid = bool(instance_keys.get_service_key_public_key(kid))
message = 'Could not find valid instance service key %s' % kid if not key_is_valid else None
return (key_is_valid, message)
except Exception as ex:
logger.exception('Got exception when trying to retrieve the instance key')
# NOTE: We return *True* here if there was an exception when retrieving the key, as it means
# the database is down, which will be handled by the database health check.
return (True, 'Failed to get instance key due to a database issue')
return (True, 'Failed to get instance key due to a database issue; skipping check')
_SERVICES = {