Improvements to health checks
- Adds a warning endpoint for warning-only checks - Changes the default for the disk space check to 1%, instead of 10% - Removes instance services from the overall health check endpoint
This commit is contained in:
parent
6661ee8119
commit
cac5f44d15
3 changed files with 75 additions and 31 deletions
|
@ -257,7 +257,6 @@ def privacy():
|
||||||
return index('')
|
return index('')
|
||||||
|
|
||||||
|
|
||||||
# TODO(jschorr): Remove this mirrored endpoint once we migrate ELB.
|
|
||||||
@web.route('/health', methods=['GET'])
|
@web.route('/health', methods=['GET'])
|
||||||
@web.route('/health/instance', methods=['GET'])
|
@web.route('/health/instance', methods=['GET'])
|
||||||
@process_auth_or_cookie
|
@process_auth_or_cookie
|
||||||
|
@ -270,7 +269,6 @@ def instance_health():
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
# TODO(jschorr): Remove this mirrored endpoint once we migrate pingdom.
|
|
||||||
@web.route('/status', methods=['GET'])
|
@web.route('/status', methods=['GET'])
|
||||||
@web.route('/health/endtoend', methods=['GET'])
|
@web.route('/health/endtoend', methods=['GET'])
|
||||||
@process_auth_or_cookie
|
@process_auth_or_cookie
|
||||||
|
@ -283,6 +281,17 @@ def endtoend_health():
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
@web.route('/health/warning', methods=['GET'])
|
||||||
|
@process_auth_or_cookie
|
||||||
|
@no_cache
|
||||||
|
def warning_health():
|
||||||
|
checker = get_healthchecker(app, config_provider, instance_keys)
|
||||||
|
(data, status_code) = checker.check_warning()
|
||||||
|
response = jsonify(dict(data=data, status_code=status_code))
|
||||||
|
response.status_code = status_code
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
@web.route('/health/dbrevision', methods=['GET'])
|
@web.route('/health/dbrevision', methods=['GET'])
|
||||||
@route_show_if(features.BILLING) # Since this is only used in production.
|
@route_show_if(features.BILLING) # Since this is only used in production.
|
||||||
@process_auth_or_cookie
|
@process_auth_or_cookie
|
||||||
|
|
|
@ -3,7 +3,7 @@ import logging
|
||||||
|
|
||||||
from auth.permissions import SuperUserPermission
|
from auth.permissions import SuperUserPermission
|
||||||
from flask import session
|
from flask import session
|
||||||
from health.services import check_all_services
|
from health.services import check_all_services, check_warning_services
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -20,12 +20,20 @@ class HealthCheck(object):
|
||||||
self.instance_keys = instance_keys
|
self.instance_keys = instance_keys
|
||||||
self.instance_skips = instance_skips or []
|
self.instance_skips = instance_skips or []
|
||||||
|
|
||||||
|
def check_warning(self):
|
||||||
|
"""
|
||||||
|
Conducts a check on the warnings, returning a dict representing the HealthCheck
|
||||||
|
output and a number indicating the health check response code.
|
||||||
|
"""
|
||||||
|
service_statuses = check_warning_services(self.app, [])
|
||||||
|
return self.get_instance_health(service_statuses)
|
||||||
|
|
||||||
def check_instance(self):
|
def check_instance(self):
|
||||||
"""
|
"""
|
||||||
Conducts a check on this specific instance, returning a dict representing the HealthCheck
|
Conducts a check on this specific instance, returning a dict representing the HealthCheck
|
||||||
output and a number indicating the health check response code.
|
output and a number indicating the health check response code.
|
||||||
"""
|
"""
|
||||||
service_statuses = check_all_services(self.app, self.instance_skips)
|
service_statuses = check_all_services(self.app, self.instance_skips, for_instance=True)
|
||||||
return self.get_instance_health(service_statuses)
|
return self.get_instance_health(service_statuses)
|
||||||
|
|
||||||
def check_endtoend(self):
|
def check_endtoend(self):
|
||||||
|
@ -33,7 +41,7 @@ class HealthCheck(object):
|
||||||
Conducts a check on all services, returning a dict representing the HealthCheck
|
Conducts a check on all services, returning a dict representing the HealthCheck
|
||||||
output and a number indicating the health check response code.
|
output and a number indicating the health check response code.
|
||||||
"""
|
"""
|
||||||
service_statuses = check_all_services(self.app, [])
|
service_statuses = check_all_services(self.app, [], for_instance=False)
|
||||||
return self.calculate_overall_health(service_statuses)
|
return self.calculate_overall_health(service_statuses)
|
||||||
|
|
||||||
def get_instance_health(self, service_statuses):
|
def get_instance_health(self, service_statuses):
|
||||||
|
|
|
@ -94,50 +94,77 @@ def _disk_within_threshold(path, threshold):
|
||||||
return (1.0 - (usage.percent / 100.0)) >= threshold
|
return (1.0 - (usage.percent / 100.0)) >= threshold
|
||||||
|
|
||||||
|
|
||||||
def _check_disk_space(app):
|
def _check_disk_space(for_warning):
|
||||||
""" Returns the status of the disk space for this instance. If the available disk space is below
|
def _check_disk_space(app):
|
||||||
a certain threshold, then will return False.
|
""" Returns the status of the disk space for this instance. If the available disk space is below
|
||||||
"""
|
a certain threshold, then will return False.
|
||||||
if not app.config.get('SETUP_COMPLETE', False):
|
"""
|
||||||
return (True, 'Stack not fully setup; skipping check')
|
if not app.config.get('SETUP_COMPLETE', False):
|
||||||
|
return (True, 'Stack not fully setup; skipping check')
|
||||||
|
|
||||||
# Check the directory in which we're running.
|
config_key = ('DISKSPACE_HEALTH_WARNING_THRESHOLD'
|
||||||
currentfile = os.path.abspath(__file__)
|
if for_warning else 'DISKSPACE_HEALTH_THRESHOLD')
|
||||||
if not _disk_within_threshold(currentfile, app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)):
|
default_threshold = 0.1 if for_warning else 0.01
|
||||||
stats = psutil.disk_usage(currentfile)
|
|
||||||
logger.debug('Disk space on main volume: %s', stats)
|
|
||||||
return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent)
|
|
||||||
|
|
||||||
# Check the temp directory as well.
|
# Check the directory in which we're running.
|
||||||
tempdir = tempfile.gettempdir()
|
currentfile = os.path.abspath(__file__)
|
||||||
if tempdir is not None:
|
if not _disk_within_threshold(currentfile, app.config.get(config_key, default_threshold)):
|
||||||
if not _disk_within_threshold(tempdir, app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)):
|
stats = psutil.disk_usage(currentfile)
|
||||||
stats = psutil.disk_usage(tempdir)
|
logger.debug('Disk space on main volume: %s', stats)
|
||||||
logger.debug('Disk space on temp volume: %s', stats)
|
return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent)
|
||||||
return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent)
|
|
||||||
|
|
||||||
return (True, '')
|
# Check the temp directory as well.
|
||||||
|
tempdir = tempfile.gettempdir()
|
||||||
|
if tempdir is not None:
|
||||||
|
if not _disk_within_threshold(tempdir, app.config.get(config_key, default_threshold)):
|
||||||
|
stats = psutil.disk_usage(tempdir)
|
||||||
|
logger.debug('Disk space on temp volume: %s', stats)
|
||||||
|
return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent)
|
||||||
|
|
||||||
|
return (True, '')
|
||||||
|
|
||||||
|
return _check_disk_space
|
||||||
|
|
||||||
|
|
||||||
_SERVICES = {
|
_INSTANCE_SERVICES = {
|
||||||
'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
|
'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
|
||||||
'web_gunicorn': _check_gunicorn('_internal_ping'),
|
'web_gunicorn': _check_gunicorn('_internal_ping'),
|
||||||
'verbs_gunicorn': _check_gunicorn('c1/_internal_ping'),
|
'verbs_gunicorn': _check_gunicorn('c1/_internal_ping'),
|
||||||
|
'service_key': _check_service_key,
|
||||||
|
'disk_space': _check_disk_space(for_warning=False),
|
||||||
|
}
|
||||||
|
|
||||||
|
_GLOBAL_SERVICES = {
|
||||||
'database': _check_database,
|
'database': _check_database,
|
||||||
'redis': _check_redis,
|
'redis': _check_redis,
|
||||||
'storage': _check_storage,
|
'storage': _check_storage,
|
||||||
'auth': _check_auth,
|
'auth': _check_auth,
|
||||||
'service_key': _check_service_key,
|
|
||||||
'disk_space': _check_disk_space,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def check_all_services(app, skip):
|
_WARNING_SERVICES = {
|
||||||
|
'disk_space_warning': _check_disk_space(for_warning=True),
|
||||||
|
}
|
||||||
|
|
||||||
|
def check_all_services(app, skip, for_instance=False):
|
||||||
""" Returns a dictionary containing the status of all the services defined. """
|
""" Returns a dictionary containing the status of all the services defined. """
|
||||||
|
if for_instance:
|
||||||
|
services = dict(_INSTANCE_SERVICES)
|
||||||
|
services.update(_GLOBAL_SERVICES)
|
||||||
|
else:
|
||||||
|
services = _GLOBAL_SERVICES
|
||||||
|
|
||||||
|
return _check_services(app, skip, services)
|
||||||
|
|
||||||
|
def check_warning_services(app, skip):
|
||||||
|
""" Returns a dictionary containing the status of all the warning services defined. """
|
||||||
|
return _check_services(app, skip, _WARNING_SERVICES)
|
||||||
|
|
||||||
|
def _check_services(app, skip, services):
|
||||||
status = {}
|
status = {}
|
||||||
for name in _SERVICES:
|
for name in services:
|
||||||
if name in skip:
|
if name in skip:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
status[name] = _SERVICES[name](app)
|
status[name] = services[name](app)
|
||||||
|
|
||||||
return status
|
return status
|
||||||
|
|
Reference in a new issue