Make health check failures report their reasons
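Note that the expanded service info (each service's status plus its failure reason) is added as a new `services_expanded` block alongside the existing boolean `services` block, to avoid breaking compatibility with existing callers of the health endpoint.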
This commit is contained in:
parent
e44a503bd0
commit
4ad3682b9c
4 changed files with 32 additions and 20 deletions
|
@ -120,15 +120,14 @@ class RedisBuildLogs(object):
|
||||||
|
|
||||||
connection = redis.StrictRedis(**args)
|
connection = redis.StrictRedis(**args)
|
||||||
if not connection.ping() == True:
|
if not connection.ping() == True:
|
||||||
return False
|
return (False, 'Could not ping redis')
|
||||||
|
|
||||||
# Ensure we can write and read a key.
|
# Ensure we can write and read a key.
|
||||||
connection.set(self._health_key(), time.time())
|
connection.set(self._health_key(), time.time())
|
||||||
connection.get(self._health_key())
|
connection.get(self._health_key())
|
||||||
|
return (True, None)
|
||||||
return True
|
except redis.RedisError as re:
|
||||||
except redis.RedisError:
|
return (False, 'Could not connect to redis: %s' % re.message)
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
class BuildLogs(object):
|
class BuildLogs(object):
|
||||||
|
|
|
@ -11,12 +11,11 @@ def check_health(app_config):
|
||||||
# check).
|
# check).
|
||||||
try:
|
try:
|
||||||
validate_database_url(app_config['DB_URI'], {}, connect_timeout=3)
|
validate_database_url(app_config['DB_URI'], {}, connect_timeout=3)
|
||||||
except Exception:
|
except Exception as ex:
|
||||||
logger.exception('Could not connect to the database')
|
return (False, 'Could not connect to the database: %s', ex.message)
|
||||||
return False
|
|
||||||
|
|
||||||
# We will connect to the db, check that it contains some team role kinds
|
# We will connect to the db, check that it contains some team role kinds
|
||||||
try:
|
try:
|
||||||
return bool(list(TeamRole.select().limit(1)))
|
return (bool(list(TeamRole.select().limit(1))), 'Could not connect to the database')
|
||||||
except:
|
except Exception as ex:
|
||||||
return False
|
return (False, 'Could not connect to the database: %s', ex.message)
|
||||||
|
|
|
@ -46,15 +46,29 @@ class HealthCheck(object):
|
||||||
is_healthy = True
|
is_healthy = True
|
||||||
notes = notes or []
|
notes = notes or []
|
||||||
|
|
||||||
|
service_statuses_bools = {}
|
||||||
|
service_status_expanded = {}
|
||||||
|
|
||||||
for service_name in service_statuses:
|
for service_name in service_statuses:
|
||||||
|
status, err = service_statuses[service_name]
|
||||||
|
|
||||||
|
service_statuses_bools[service_name] = status
|
||||||
|
service_status_expanded[service_name] = {
|
||||||
|
'status': status,
|
||||||
|
}
|
||||||
|
|
||||||
|
if not status:
|
||||||
|
service_status_expanded[service_name]['failure'] = err
|
||||||
|
|
||||||
if skip and service_name in skip:
|
if skip and service_name in skip:
|
||||||
notes.append('%s skipped in compute health' % service_name)
|
notes.append('%s skipped in compute health' % service_name)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
is_healthy = is_healthy and service_statuses[service_name]
|
is_healthy = is_healthy and status
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
'services': service_statuses,
|
'services': service_statuses_bools,
|
||||||
|
'services_expanded': service_status_expanded,
|
||||||
'notes': notes,
|
'notes': notes,
|
||||||
'is_testing': self.app.config['TESTING'],
|
'is_testing': self.app.config['TESTING'],
|
||||||
'config_provider': self.config_provider.provider_id,
|
'config_provider': self.config_provider.provider_id,
|
||||||
|
|
|
@ -21,10 +21,11 @@ def _check_registry_gunicorn(app):
|
||||||
|
|
||||||
registry_url = '%s://localhost%s/v1/_internal_ping' % (scheme, port)
|
registry_url = '%s://localhost%s/v1/_internal_ping' % (scheme, port)
|
||||||
try:
|
try:
|
||||||
return client.get(registry_url, verify=False, timeout=2).status_code == 200
|
status_code = client.get(registry_url, verify=False, timeout=2).status_code
|
||||||
except Exception:
|
return (status_code == 200, 'Got non-200 response for registry: %s' % status_code)
|
||||||
|
except Exception as ex:
|
||||||
logger.exception('Exception when checking registry health: %s', registry_url)
|
logger.exception('Exception when checking registry health: %s', registry_url)
|
||||||
return False
|
return (False, 'Exception when checking registry health: %s' % registry_url)
|
||||||
|
|
||||||
|
|
||||||
def _check_database(app):
|
def _check_database(app):
|
||||||
|
@ -41,15 +42,14 @@ def _check_storage(app):
|
||||||
""" Returns the status of storage, as accessed from this instance. """
|
""" Returns the status of storage, as accessed from this instance. """
|
||||||
try:
|
try:
|
||||||
storage.validate(storage.preferred_locations, app.config['HTTPCLIENT'])
|
storage.validate(storage.preferred_locations, app.config['HTTPCLIENT'])
|
||||||
return True
|
return (True, None)
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.exception('Storage check failed with exception %s', ex)
|
logger.exception('Storage check failed with exception %s', ex)
|
||||||
return False
|
return (False, 'Storage check failed with exception %s' % ex.message)
|
||||||
|
|
||||||
def _check_auth(app):
|
def _check_auth(app):
|
||||||
""" Returns the status of the auth engine, as accessed from this instance. """
|
""" Returns the status of the auth engine, as accessed from this instance. """
|
||||||
(status, _) = authentication.ping()
|
return authentication.ping()
|
||||||
return status
|
|
||||||
|
|
||||||
|
|
||||||
_SERVICES = {
|
_SERVICES = {
|
||||||
|
|
Reference in a new issue