From 4ad3682b9c59178be491a52caf4ed26554d901b8 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 10 May 2017 21:05:14 -0700 Subject: [PATCH] Make health check failures report their reasons Note that we add a new block with expanded service info, to avoid breaking compatibility with existing callers of the health endpoint --- data/buildlogs.py | 9 ++++----- data/model/health.py | 11 +++++------ health/healthcheck.py | 18 ++++++++++++++++-- health/services.py | 14 +++++++------- 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/data/buildlogs.py b/data/buildlogs.py index ba8b1ce78..f989977e3 100644 --- a/data/buildlogs.py +++ b/data/buildlogs.py @@ -120,15 +120,14 @@ class RedisBuildLogs(object): connection = redis.StrictRedis(**args) if not connection.ping() == True: - return False + return (False, 'Could not ping redis') # Ensure we can write and read a key. connection.set(self._health_key(), time.time()) connection.get(self._health_key()) - - return True - except redis.RedisError: - return False + return (True, None) + except redis.RedisError as re: + return (False, 'Could not connect to redis: %s' % re.message) class BuildLogs(object): diff --git a/data/model/health.py b/data/model/health.py index 0d177b400..ca796a48b 100644 --- a/data/model/health.py +++ b/data/model/health.py @@ -11,12 +11,11 @@ def check_health(app_config): # check). try: validate_database_url(app_config['DB_URI'], {}, connect_timeout=3) - except Exception: - logger.exception('Could not connect to the database') - return False + except Exception as ex: + return (False, 'Could not connect to the database: %s', ex.message) # We will connect to the db, check that it contains some team role kinds try: - return bool(list(TeamRole.select().limit(1))) - except: - return False + return (bool(list(TeamRole.select().limit(1))), 'Could not connect to the database') + except Exception as ex: + return (False, 'Could not connect to the database: %s', ex.message) diff --git a/health/healthcheck.py b/health/healthcheck.py index cfb427a31..632bb5920 100644 --- a/health/healthcheck.py +++ b/health/healthcheck.py @@ -46,15 +46,29 @@ class HealthCheck(object): is_healthy = True notes = notes or [] + service_statuses_bools = {} + service_status_expanded = {} + for service_name in service_statuses: + status, err = service_statuses[service_name] + + service_statuses_bools[service_name] = status + service_status_expanded[service_name] = { + 'status': status, + } + + if not status: + service_status_expanded[service_name]['failure'] = err + if skip and service_name in skip: notes.append('%s skipped in compute health' % service_name) continue - is_healthy = is_healthy and service_statuses[service_name] + is_healthy = is_healthy and status data = { - 'services': service_statuses, + 'services': service_statuses_bools, + 'services_expanded': service_status_expanded, 'notes': notes, 'is_testing': self.app.config['TESTING'], 'config_provider': self.config_provider.provider_id, diff --git a/health/services.py b/health/services.py index 95d4ba824..b1191032a 100644 --- a/health/services.py +++ b/health/services.py @@ -21,10 +21,11 @@ def _check_registry_gunicorn(app): registry_url = '%s://localhost%s/v1/_internal_ping' % (scheme, port) try: - return client.get(registry_url, verify=False, timeout=2).status_code == 200 - except Exception: + status_code = client.get(registry_url, verify=False, timeout=2).status_code + return (status_code == 200, 'Got non-200 response for registry: %s' % status_code) + except Exception as ex: logger.exception('Exception when checking registry health: %s', registry_url) - return False + return (False, 'Exception when checking registry health: %s' % registry_url) def _check_database(app): @@ -41,15 +42,14 @@ def _check_storage(app): """ Returns the status of storage, as accessed from this instance. """ try: storage.validate(storage.preferred_locations, app.config['HTTPCLIENT']) - return True + return (True, None) except Exception as ex: logger.exception('Storage check failed with exception %s', ex) - return False + return (False, 'Storage check failed with exception %s' % ex.message) def _check_auth(app): """ Returns the status of the auth engine, as accessed from this instance. """ - (status, _) = authentication.ping() - return status + return authentication.ping() _SERVICES = {