Add health check for node disk space

If a node runs out of disk space, nginx can no longer swap, which can cause issues with large pushes.

Fixes https://jira.coreos.com/browse/QUAY-1047
This commit is contained in:
Joseph Schorr 2018-09-05 17:57:22 -04:00
parent 56809ef125
commit a94f657cb7

View file

@ -1,4 +1,9 @@
import logging import logging
import os
import tempfile
import psutil
from app import build_logs, storage, authentication, instance_keys from app import build_logs, storage, authentication, instance_keys
from health.models_pre_oci import pre_oci_model as model from health.models_pre_oci import pre_oci_model as model
@ -83,6 +88,37 @@ def _check_service_key(app):
return (True, 'Failed to get instance key due to a database issue; skipping check') return (True, 'Failed to get instance key due to a database issue; skipping check')
def _free_fraction(usage):
    """ Returns the fraction (0.0 - 1.0) of a volume that is still free, derived from the
        `percent` (used percentage) attribute of a psutil disk usage result.
    """
    return 1.0 - (usage.percent / 100.0)


def _disk_within_threshold(path, threshold):
    """ Returns True if the free fraction of the volume containing `path` is at least
        `threshold` (a fraction, e.g. 0.1 for 10% free), and False otherwise.
    """
    return _free_fraction(psutil.disk_usage(path)) >= threshold


def _check_disk_space(app):
    """ Returns the status of the disk space for this instance, as a (healthy, message) tuple.
        If the available disk space on either the volume holding this module or the temp
        volume is below the threshold configured under DISKSPACE_HEALTH_THRESHOLD (default:
        0.1, i.e. 10% free), then the status will be False and the message will contain the
        used percentage of the offending volume.
    """
    if not app.config.get('SETUP_COMPLETE', False):
        return (True, 'Stack not fully setup; skipping check')

    threshold = app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)

    # Check the volume containing this module, i.e. the one we're running from. The usage is
    # sampled exactly once so that the value reported is the same one that failed the check.
    main_usage = psutil.disk_usage(os.path.abspath(__file__))
    if _free_fraction(main_usage) < threshold:
        logger.debug('Disk space on main volume: %s', main_usage)
        return (False, 'Disk space has gone below threshold on main volume: %s' % main_usage.percent)

    # Check the temp directory as well. It is only looked up once the main volume has passed.
    temp_usage = psutil.disk_usage(tempfile.gettempdir())
    if _free_fraction(temp_usage) < threshold:
        logger.debug('Disk space on temp volume: %s', temp_usage)
        return (False, 'Disk space has gone below threshold on temp volume: %s' % temp_usage.percent)

    return (True, '')
_SERVICES = { _SERVICES = {
'registry_gunicorn': _check_gunicorn('v1/_internal_ping'), 'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
'web_gunicorn': _check_gunicorn('_internal_ping'), 'web_gunicorn': _check_gunicorn('_internal_ping'),
@ -92,6 +128,7 @@ _SERVICES = {
'storage': _check_storage, 'storage': _check_storage,
'auth': _check_auth, 'auth': _check_auth,
'service_key': _check_service_key, 'service_key': _check_service_key,
'disk_space': _check_disk_space,
} }
def check_all_services(app, skip): def check_all_services(app, skip):