Improvements to health checks

- Adds a warning endpoint for warning-only checks
- Changes the default for the disk space check to 1%, instead of 10%
- Removes instance services from the overall health check endpoint
This commit is contained in:
Joseph Schorr 2019-02-04 13:17:59 -05:00
parent 6661ee8119
commit cac5f44d15
3 changed files with 75 additions and 31 deletions

View file

@ -257,7 +257,6 @@ def privacy():
return index('') return index('')
# TODO(jschorr): Remove this mirrored endpoint once we migrate ELB.
@web.route('/health', methods=['GET']) @web.route('/health', methods=['GET'])
@web.route('/health/instance', methods=['GET']) @web.route('/health/instance', methods=['GET'])
@process_auth_or_cookie @process_auth_or_cookie
@ -270,7 +269,6 @@ def instance_health():
return response return response
# TODO(jschorr): Remove this mirrored endpoint once we migrate pingdom.
@web.route('/status', methods=['GET']) @web.route('/status', methods=['GET'])
@web.route('/health/endtoend', methods=['GET']) @web.route('/health/endtoend', methods=['GET'])
@process_auth_or_cookie @process_auth_or_cookie
@ -283,6 +281,17 @@ def endtoend_health():
return response return response
@web.route('/health/warning', methods=['GET'])
@process_auth_or_cookie
@no_cache
def warning_health():
checker = get_healthchecker(app, config_provider, instance_keys)
(data, status_code) = checker.check_warning()
response = jsonify(dict(data=data, status_code=status_code))
response.status_code = status_code
return response
@web.route('/health/dbrevision', methods=['GET']) @web.route('/health/dbrevision', methods=['GET'])
@route_show_if(features.BILLING) # Since this is only used in production. @route_show_if(features.BILLING) # Since this is only used in production.
@process_auth_or_cookie @process_auth_or_cookie

View file

@ -3,7 +3,7 @@ import logging
from auth.permissions import SuperUserPermission from auth.permissions import SuperUserPermission
from flask import session from flask import session
from health.services import check_all_services from health.services import check_all_services, check_warning_services
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -20,12 +20,20 @@ class HealthCheck(object):
self.instance_keys = instance_keys self.instance_keys = instance_keys
self.instance_skips = instance_skips or [] self.instance_skips = instance_skips or []
def check_warning(self):
"""
Conducts a check on the warnings, returning a dict representing the HealthCheck
output and a number indicating the health check response code.
"""
service_statuses = check_warning_services(self.app, [])
return self.get_instance_health(service_statuses)
def check_instance(self): def check_instance(self):
""" """
Conducts a check on this specific instance, returning a dict representing the HealthCheck Conducts a check on this specific instance, returning a dict representing the HealthCheck
output and a number indicating the health check response code. output and a number indicating the health check response code.
""" """
service_statuses = check_all_services(self.app, self.instance_skips) service_statuses = check_all_services(self.app, self.instance_skips, for_instance=True)
return self.get_instance_health(service_statuses) return self.get_instance_health(service_statuses)
def check_endtoend(self): def check_endtoend(self):
@ -33,7 +41,7 @@ class HealthCheck(object):
Conducts a check on all services, returning a dict representing the HealthCheck Conducts a check on all services, returning a dict representing the HealthCheck
output and a number indicating the health check response code. output and a number indicating the health check response code.
""" """
service_statuses = check_all_services(self.app, []) service_statuses = check_all_services(self.app, [], for_instance=False)
return self.calculate_overall_health(service_statuses) return self.calculate_overall_health(service_statuses)
def get_instance_health(self, service_statuses): def get_instance_health(self, service_statuses):

View file

@ -94,6 +94,7 @@ def _disk_within_threshold(path, threshold):
return (1.0 - (usage.percent / 100.0)) >= threshold return (1.0 - (usage.percent / 100.0)) >= threshold
def _check_disk_space(for_warning):
def _check_disk_space(app): def _check_disk_space(app):
""" Returns the status of the disk space for this instance. If the available disk space is below """ Returns the status of the disk space for this instance. If the available disk space is below
a certain threshold, then will return False. a certain threshold, then will return False.
@ -101,9 +102,13 @@ def _check_disk_space(app):
if not app.config.get('SETUP_COMPLETE', False): if not app.config.get('SETUP_COMPLETE', False):
return (True, 'Stack not fully setup; skipping check') return (True, 'Stack not fully setup; skipping check')
config_key = ('DISKSPACE_HEALTH_WARNING_THRESHOLD'
if for_warning else 'DISKSPACE_HEALTH_THRESHOLD')
default_threshold = 0.1 if for_warning else 0.01
# Check the directory in which we're running. # Check the directory in which we're running.
currentfile = os.path.abspath(__file__) currentfile = os.path.abspath(__file__)
if not _disk_within_threshold(currentfile, app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)): if not _disk_within_threshold(currentfile, app.config.get(config_key, default_threshold)):
stats = psutil.disk_usage(currentfile) stats = psutil.disk_usage(currentfile)
logger.debug('Disk space on main volume: %s', stats) logger.debug('Disk space on main volume: %s', stats)
return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent) return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent)
@ -111,33 +116,55 @@ def _check_disk_space(app):
# Check the temp directory as well. # Check the temp directory as well.
tempdir = tempfile.gettempdir() tempdir = tempfile.gettempdir()
if tempdir is not None: if tempdir is not None:
if not _disk_within_threshold(tempdir, app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)): if not _disk_within_threshold(tempdir, app.config.get(config_key, default_threshold)):
stats = psutil.disk_usage(tempdir) stats = psutil.disk_usage(tempdir)
logger.debug('Disk space on temp volume: %s', stats) logger.debug('Disk space on temp volume: %s', stats)
return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent) return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent)
return (True, '') return (True, '')
return _check_disk_space
_SERVICES = {
_INSTANCE_SERVICES = {
'registry_gunicorn': _check_gunicorn('v1/_internal_ping'), 'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
'web_gunicorn': _check_gunicorn('_internal_ping'), 'web_gunicorn': _check_gunicorn('_internal_ping'),
'verbs_gunicorn': _check_gunicorn('c1/_internal_ping'), 'verbs_gunicorn': _check_gunicorn('c1/_internal_ping'),
'service_key': _check_service_key,
'disk_space': _check_disk_space(for_warning=False),
}
_GLOBAL_SERVICES = {
'database': _check_database, 'database': _check_database,
'redis': _check_redis, 'redis': _check_redis,
'storage': _check_storage, 'storage': _check_storage,
'auth': _check_auth, 'auth': _check_auth,
'service_key': _check_service_key,
'disk_space': _check_disk_space,
} }
def check_all_services(app, skip): _WARNING_SERVICES = {
'disk_space_warning': _check_disk_space(for_warning=True),
}
def check_all_services(app, skip, for_instance=False):
""" Returns a dictionary containing the status of all the services defined. """ """ Returns a dictionary containing the status of all the services defined. """
if for_instance:
services = dict(_INSTANCE_SERVICES)
services.update(_GLOBAL_SERVICES)
else:
services = _GLOBAL_SERVICES
return _check_services(app, skip, services)
def check_warning_services(app, skip):
""" Returns a dictionary containing the status of all the warning services defined. """
return _check_services(app, skip, _WARNING_SERVICES)
def _check_services(app, skip, services):
status = {} status = {}
for name in _SERVICES: for name in services:
if name in skip: if name in skip:
continue continue
status[name] = _SERVICES[name](app) status[name] = services[name](app)
return status return status