Add health check for node disk space
If a node runs out of disk space, nginx can no longer swap, and this can cause issues with large pushes. Fixes https://jira.coreos.com/browse/QUAY-1047
This commit is contained in:
parent
56809ef125
commit
a94f657cb7
1 changed files with 37 additions and 0 deletions
|
@ -1,4 +1,9 @@
|
|||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import psutil
|
||||
|
||||
from app import build_logs, storage, authentication, instance_keys
|
||||
from health.models_pre_oci import pre_oci_model as model
|
||||
|
||||
|
@ -83,6 +88,37 @@ def _check_service_key(app):
|
|||
return (True, 'Failed to get instance key due to a database issue; skipping check')
|
||||
|
||||
|
||||
|
||||
def _disk_within_threshold(path, threshold):
    """ Returns whether the volume containing `path` still has at least `threshold` of its space free.

        `threshold` is a fraction between 0 and 1 (e.g. 0.1 means "at least 10% free"). The free
        fraction is derived from the used percentage reported by psutil.disk_usage(path). Any
        error raised by psutil.disk_usage (e.g. for a missing path) propagates to the caller.
    """
    usage = psutil.disk_usage(path)
    free_fraction = 1.0 - (usage.percent / 100.0)

    # Binary floating point cannot represent most decimal fractions exactly: at 90.0% used,
    # 1.0 - 0.9 evaluates to 0.09999999999999998, which would compare as *below* a threshold of
    # 0.1. The tolerance is far smaller than the 0.1% resolution of the reported percentage, so
    # it only affects the exact-boundary case.
    tolerance = 1e-9
    return free_fraction + tolerance >= threshold
|
||||
|
||||
|
||||
def _check_disk_space(app):
|
||||
""" Returns the status of the disk space for this instance. If the available disk space is below
|
||||
a certain threshold, then will return False.
|
||||
"""
|
||||
if not app.config.get('SETUP_COMPLETE', False):
|
||||
return (True, 'Stack not fully setup; skipping check')
|
||||
|
||||
# Check the directory in which we're running.
|
||||
currentfile = os.path.abspath(__file__)
|
||||
if not _disk_within_threshold(currentfile, app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)):
|
||||
stats = psutil.disk_usage(currentfile)
|
||||
logger.debug('Disk space on main volume: %s', stats)
|
||||
return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent)
|
||||
|
||||
# Check the temp directory as well.
|
||||
tempdir = tempfile.gettempdir()
|
||||
if tempdir is not None:
|
||||
if not _disk_within_threshold(tempdir, app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)):
|
||||
stats = psutil.disk_usage(tempdir)
|
||||
logger.debug('Disk space on temp volume: %s', stats)
|
||||
return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent)
|
||||
|
||||
return (True, '')
|
||||
|
||||
|
||||
_SERVICES = {
|
||||
'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
|
||||
'web_gunicorn': _check_gunicorn('_internal_ping'),
|
||||
|
@ -92,6 +128,7 @@ _SERVICES = {
|
|||
'storage': _check_storage,
|
||||
'auth': _check_auth,
|
||||
'service_key': _check_service_key,
|
||||
'disk_space': _check_disk_space,
|
||||
}
|
||||
|
||||
def check_all_services(app, skip):
|
||||
|
|
Reference in a new issue