From cac5f44d151354ac0686d3d9fda9b2ff7954790a Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Mon, 4 Feb 2019 13:17:59 -0500 Subject: [PATCH] Improvements to health checks - Adds a warning endpoint for warning-only checks - Changes the default for the disk space check to 1%, instead of 10% - Removes instance services from the overall health check endpoint --- endpoints/web.py | 13 +++++-- health/healthcheck.py | 14 ++++++-- health/services.py | 79 +++++++++++++++++++++++++++++-------------- 3 files changed, 75 insertions(+), 31 deletions(-) diff --git a/endpoints/web.py b/endpoints/web.py index 3982c509d..be22dfeb7 100644 --- a/endpoints/web.py +++ b/endpoints/web.py @@ -257,7 +257,6 @@ def privacy(): return index('') -# TODO(jschorr): Remove this mirrored endpoint once we migrate ELB. @web.route('/health', methods=['GET']) @web.route('/health/instance', methods=['GET']) @process_auth_or_cookie @@ -270,7 +269,6 @@ def instance_health(): return response -# TODO(jschorr): Remove this mirrored endpoint once we migrate pingdom. @web.route('/status', methods=['GET']) @web.route('/health/endtoend', methods=['GET']) @process_auth_or_cookie @@ -283,6 +281,17 @@ def endtoend_health(): return response +@web.route('/health/warning', methods=['GET']) +@process_auth_or_cookie +@no_cache +def warning_health(): + checker = get_healthchecker(app, config_provider, instance_keys) + (data, status_code) = checker.check_warning() + response = jsonify(dict(data=data, status_code=status_code)) + response.status_code = status_code + return response + + @web.route('/health/dbrevision', methods=['GET']) @route_show_if(features.BILLING) # Since this is only used in production. @process_auth_or_cookie diff --git a/health/healthcheck.py b/health/healthcheck.py index 99671e74a..a449eba50 100644 --- a/health/healthcheck.py +++ b/health/healthcheck.py @@ -3,7 +3,7 @@ import logging from auth.permissions import SuperUserPermission from flask import session -from health.services import check_all_services +from health.services import check_all_services, check_warning_services logger = logging.getLogger(__name__) @@ -20,12 +20,20 @@ class HealthCheck(object): self.instance_keys = instance_keys self.instance_skips = instance_skips or [] + def check_warning(self): + """ + Conducts a check on the warnings, returning a dict representing the HealthCheck + output and a number indicating the health check response code. + """ + service_statuses = check_warning_services(self.app, []) + return self.get_instance_health(service_statuses) + def check_instance(self): """ Conducts a check on this specific instance, returning a dict representing the HealthCheck output and a number indicating the health check response code. """ - service_statuses = check_all_services(self.app, self.instance_skips) + service_statuses = check_all_services(self.app, self.instance_skips, for_instance=True) return self.get_instance_health(service_statuses) def check_endtoend(self): @@ -33,7 +41,7 @@ class HealthCheck(object): Conducts a check on all services, returning a dict representing the HealthCheck output and a number indicating the health check response code. """ - service_statuses = check_all_services(self.app, []) + service_statuses = check_all_services(self.app, [], for_instance=False) return self.calculate_overall_health(service_statuses) def get_instance_health(self, service_statuses): diff --git a/health/services.py b/health/services.py index cbd9d80ea..0aed369c7 100644 --- a/health/services.py +++ b/health/services.py @@ -94,50 +94,77 @@ def _disk_within_threshold(path, threshold): return (1.0 - (usage.percent / 100.0)) >= threshold -def _check_disk_space(app): - """ Returns the status of the disk space for this instance. If the available disk space is below - a certain threshold, then will return False. - """ - if not app.config.get('SETUP_COMPLETE', False): - return (True, 'Stack not fully setup; skipping check') +def _check_disk_space(for_warning): + def _check_disk_space(app): + """ Returns the status of the disk space for this instance. If the available disk space is below + a certain threshold, then will return False. + """ + if not app.config.get('SETUP_COMPLETE', False): + return (True, 'Stack not fully setup; skipping check') - # Check the directory in which we're running. - currentfile = os.path.abspath(__file__) - if not _disk_within_threshold(currentfile, app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)): - stats = psutil.disk_usage(currentfile) - logger.debug('Disk space on main volume: %s', stats) - return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent) + config_key = ('DISKSPACE_HEALTH_WARNING_THRESHOLD' + if for_warning else 'DISKSPACE_HEALTH_THRESHOLD') + default_threshold = 0.1 if for_warning else 0.01 - # Check the temp directory as well. - tempdir = tempfile.gettempdir() - if tempdir is not None: - if not _disk_within_threshold(tempdir, app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)): - stats = psutil.disk_usage(tempdir) - logger.debug('Disk space on temp volume: %s', stats) - return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent) + # Check the directory in which we're running. + currentfile = os.path.abspath(__file__) + if not _disk_within_threshold(currentfile, app.config.get(config_key, default_threshold)): + stats = psutil.disk_usage(currentfile) + logger.debug('Disk space on main volume: %s', stats) + return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent) - return (True, '') + # Check the temp directory as well. + tempdir = tempfile.gettempdir() + if tempdir is not None: + if not _disk_within_threshold(tempdir, app.config.get(config_key, default_threshold)): + stats = psutil.disk_usage(tempdir) + logger.debug('Disk space on temp volume: %s', stats) + return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent) + + return (True, '') + + return _check_disk_space -_SERVICES = { +_INSTANCE_SERVICES = { 'registry_gunicorn': _check_gunicorn('v1/_internal_ping'), 'web_gunicorn': _check_gunicorn('_internal_ping'), 'verbs_gunicorn': _check_gunicorn('c1/_internal_ping'), + 'service_key': _check_service_key, + 'disk_space': _check_disk_space(for_warning=False), +} + +_GLOBAL_SERVICES = { 'database': _check_database, 'redis': _check_redis, 'storage': _check_storage, 'auth': _check_auth, - 'service_key': _check_service_key, - 'disk_space': _check_disk_space, } -def check_all_services(app, skip): +_WARNING_SERVICES = { + 'disk_space_warning': _check_disk_space(for_warning=True), +} + +def check_all_services(app, skip, for_instance=False): """ Returns a dictionary containing the status of all the services defined. """ + if for_instance: + services = dict(_INSTANCE_SERVICES) + services.update(_GLOBAL_SERVICES) + else: + services = _GLOBAL_SERVICES + + return _check_services(app, skip, services) + +def check_warning_services(app, skip): + """ Returns a dictionary containing the status of all the warning services defined. """ + return _check_services(app, skip, _WARNING_SERVICES) + +def _check_services(app, skip, services): status = {} - for name in _SERVICES: + for name in services: if name in skip: continue - status[name] = _SERVICES[name](app) + status[name] = services[name](app) return status