Add health check for node disk space
If a node runs out of disk space, nginx can no longer swap, which can cause issues with large pushes. Fixes https://jira.coreos.com/browse/QUAY-1047
This commit is contained in:
parent
56809ef125
commit
a94f657cb7
1 changed files with 37 additions and 0 deletions
|
@ -1,4 +1,9 @@
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
|
||||||
from app import build_logs, storage, authentication, instance_keys
|
from app import build_logs, storage, authentication, instance_keys
|
||||||
from health.models_pre_oci import pre_oci_model as model
|
from health.models_pre_oci import pre_oci_model as model
|
||||||
|
|
||||||
|
@ -83,6 +88,37 @@ def _check_service_key(app):
|
||||||
return (True, 'Failed to get instance key due to a database issue; skipping check')
|
return (True, 'Failed to get instance key due to a database issue; skipping check')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _disk_within_threshold(path, threshold):
    """ Returns whether the volume containing `path` still has at least `threshold` of its space
        free, where `threshold` is a fraction (e.g. 0.1 means 10% must remain free).
    """
    usage = psutil.disk_usage(path)

    # Compare in percentage points and round both sides. The fractional form is not exact in
    # binary floating point: 1.0 - (90.0 / 100.0) evaluates to 0.09999999999999998, which is
    # < 0.1, so a volume sitting exactly on the threshold would wrongly be reported unhealthy.
    free_percent = round(100.0 - usage.percent, 6)
    required_percent = round(threshold * 100.0, 6)
    return free_percent >= required_percent
|
||||||
|
|
||||||
|
|
||||||
|
def _check_disk_space(app):
    """ Returns the status of the disk space for this instance as a (healthy, message) tuple.
        If the available disk space on either the volume holding this file or the temp
        directory's volume is below the configured threshold, then will return False along
        with a message naming the volume.
    """
    if not app.config.get('SETUP_COMPLETE', False):
        return (True, 'Stack not fully setup; skipping check')

    # Read the threshold once so both volumes are judged against the same value.
    threshold = app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)

    def _failure_for(volume, path):
        """ Returns the failure tuple for the volume containing `path`, or None if it is fine. """
        if _disk_within_threshold(path, threshold):
            return None

        stats = psutil.disk_usage(path)
        logger.debug('Disk space on %s volume: %s', volume, stats)
        return (False,
                'Disk space has gone below threshold on %s volume: %s' % (volume, stats.percent))

    # Check the directory in which we're running.
    failure = _failure_for('main', os.path.abspath(__file__))

    # Check the temp directory as well. It is only looked up once the main volume has passed,
    # so the first failing volume is the one reported. gettempdir() always returns a path (or
    # raises), so no None guard is needed.
    if failure is None:
        failure = _failure_for('temp', tempfile.gettempdir())

    if failure is not None:
        return failure

    return (True, '')
|
||||||
|
|
||||||
|
|
||||||
_SERVICES = {
|
_SERVICES = {
|
||||||
'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
|
'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
|
||||||
'web_gunicorn': _check_gunicorn('_internal_ping'),
|
'web_gunicorn': _check_gunicorn('_internal_ping'),
|
||||||
|
@ -92,6 +128,7 @@ _SERVICES = {
|
||||||
'storage': _check_storage,
|
'storage': _check_storage,
|
||||||
'auth': _check_auth,
|
'auth': _check_auth,
|
||||||
'service_key': _check_service_key,
|
'service_key': _check_service_key,
|
||||||
|
'disk_space': _check_disk_space,
|
||||||
}
|
}
|
||||||
|
|
||||||
def check_all_services(app, skip):
|
def check_all_services(app, skip):
|
||||||
|
|
Reference in a new issue