Improvements to health checks

- Adds a warning endpoint for warning-only checks
- Changes the default minimum free-space threshold for the disk space check from 10% to 1% (the new warning-only check defaults to 10%)
- Removes instance services from the overall health check endpoint
This commit is contained in:
Joseph Schorr 2019-02-04 13:17:59 -05:00
parent 6661ee8119
commit cac5f44d15
3 changed files with 75 additions and 31 deletions

View file

@ -3,7 +3,7 @@ import logging
from auth.permissions import SuperUserPermission
from flask import session
from health.services import check_all_services
from health.services import check_all_services, check_warning_services
logger = logging.getLogger(__name__)
@ -20,12 +20,20 @@ class HealthCheck(object):
self.instance_keys = instance_keys
self.instance_skips = instance_skips or []
def check_warning(self):
    """
    Runs only the warning-level checks for this application.

    No warning check is skipped (an empty skip list is passed). The collected
    statuses are handed to get_instance_health, whose result (a dict
    representing the HealthCheck output and a number indicating the health
    check response code) is returned unchanged.
    """
    warning_statuses = check_warning_services(self.app, [])
    return self.get_instance_health(warning_statuses)
def check_instance(self):
    """
    Conducts a check on this specific instance, returning a dict representing the HealthCheck
    output and a number indicating the health check response code.

    The services named in self.instance_skips are excluded from the check.
    """
    # for_instance=True asks check_all_services to include the per-instance
    # checks in addition to the global ones. The services are checked exactly
    # once; the superseded call without for_instance is not repeated here.
    service_statuses = check_all_services(self.app, self.instance_skips, for_instance=True)
    return self.get_instance_health(service_statuses)
def check_endtoend(self):
@ -33,7 +41,7 @@ class HealthCheck(object):
Conducts a check on all services, returning a dict representing the HealthCheck
output and a number indicating the health check response code.
"""
service_statuses = check_all_services(self.app, [])
service_statuses = check_all_services(self.app, [], for_instance=False)
return self.calculate_overall_health(service_statuses)
def get_instance_health(self, service_statuses):

View file

@ -94,50 +94,77 @@ def _disk_within_threshold(path, threshold):
return (1.0 - (usage.percent / 100.0)) >= threshold
def _check_disk_space(app):
""" Returns the status of the disk space for this instance. If the available disk space is below
a certain threshold, then will return False.
"""
if not app.config.get('SETUP_COMPLETE', False):
return (True, 'Stack not fully setup; skipping check')
def _check_disk_space(for_warning):
def _check_disk_space(app):
""" Returns the status of the disk space for this instance. If the available disk space is below
a certain threshold, then will return False.
"""
if not app.config.get('SETUP_COMPLETE', False):
return (True, 'Stack not fully setup; skipping check')
# Check the directory in which we're running.
currentfile = os.path.abspath(__file__)
if not _disk_within_threshold(currentfile, app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)):
stats = psutil.disk_usage(currentfile)
logger.debug('Disk space on main volume: %s', stats)
return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent)
config_key = ('DISKSPACE_HEALTH_WARNING_THRESHOLD'
if for_warning else 'DISKSPACE_HEALTH_THRESHOLD')
default_threshold = 0.1 if for_warning else 0.01
# Check the temp directory as well.
tempdir = tempfile.gettempdir()
if tempdir is not None:
if not _disk_within_threshold(tempdir, app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)):
stats = psutil.disk_usage(tempdir)
logger.debug('Disk space on temp volume: %s', stats)
return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent)
# Check the directory in which we're running.
currentfile = os.path.abspath(__file__)
if not _disk_within_threshold(currentfile, app.config.get(config_key, default_threshold)):
stats = psutil.disk_usage(currentfile)
logger.debug('Disk space on main volume: %s', stats)
return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent)
return (True, '')
# Check the temp directory as well.
tempdir = tempfile.gettempdir()
if tempdir is not None:
if not _disk_within_threshold(tempdir, app.config.get(config_key, default_threshold)):
stats = psutil.disk_usage(tempdir)
logger.debug('Disk space on temp volume: %s', stats)
return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent)
return (True, '')
return _check_disk_space
# Checks that probe this specific instance. check_all_services() includes them
# only when it is called with for_instance=True.
_INSTANCE_SERVICES = {
    'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
    'web_gunicorn': _check_gunicorn('_internal_ping'),
    'verbs_gunicorn': _check_gunicorn('c1/_internal_ping'),
    'service_key': _check_service_key,
    'disk_space': _check_disk_space(for_warning=False),
}

# Checks that check_all_services() always runs, for instance and overall
# health alike. Instance-level entries ('service_key', 'disk_space') live only
# in _INSTANCE_SERVICES; note that _check_disk_space is a factory and must be
# called with for_warning before it can be registered as a check.
_GLOBAL_SERVICES = {
    'database': _check_database,
    'redis': _check_redis,
    'storage': _check_storage,
    'auth': _check_auth,
}

# Warning-only checks, run by check_warning_services().
_WARNING_SERVICES = {
    'disk_space_warning': _check_disk_space(for_warning=True),
}
def check_all_services(app, skip, for_instance=False):
    """ Returns a dictionary containing the status of the defined services.

    Without for_instance only the global services are checked; with it, the
    instance services are checked as well. Names listed in skip are left out.
    """
    if not for_instance:
        return _check_services(app, skip, _GLOBAL_SERVICES)

    # Merge into a copy so the module-level registries are never mutated.
    combined = _INSTANCE_SERVICES.copy()
    combined.update(_GLOBAL_SERVICES)
    return _check_services(app, skip, combined)
def check_warning_services(app, skip):
    """ Returns a dictionary mapping each warning service that is not listed in
        skip to its status.
    """
    return _check_services(app, skip, services=_WARNING_SERVICES)
def _check_services(app, skip, services):
status = {}
for name in _SERVICES:
for name in services:
if name in skip:
continue
status[name] = _SERVICES[name](app)
status[name] = services[name](app)
return status