From a94f657cb7e7df0e18a5b063e74367b02e2f2864 Mon Sep 17 00:00:00 2001
From: Joseph Schorr
Date: Wed, 5 Sep 2018 17:57:22 -0400
Subject: [PATCH] Add health check for node disk space

If a node runs out of disk space, nginx can no longer swap, and this can
cause issues with large pushes

Fixes https://jira.coreos.com/browse/QUAY-1047
---
Review note: the hunk adding the disk space check was revised during review,
so the post-image blob id on the "index" line predates that revision.
Indentation and blank context lines were reconstructed and should be
confirmed against health/services.py before applying.

 health/services.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/health/services.py b/health/services.py
index 4f638d1b6..cbd9d80ea 100644
--- a/health/services.py
+++ b/health/services.py
@@ -1,4 +1,9 @@
 import logging
+import os
+import tempfile
+
+import psutil
+
 from app import build_logs, storage, authentication, instance_keys
 from health.models_pre_oci import pre_oci_model as model
 
@@ -83,6 +88,43 @@ def _check_service_key(app):
     return (True, 'Failed to get instance key due to a database issue; skipping check')
 
 
+
+def _disk_within_threshold(path, threshold):
+  """ Returns whether the fraction of free space on the volume containing `path` is at least
+      `threshold` (a fraction between 0 and 1).
+  """
+  usage = psutil.disk_usage(path)
+
+  # Round away binary floating point noise: 1.0 - (90.0 / 100.0) is 0.09999999999999998, which
+  # would otherwise fail a threshold of exactly 0.1.
+  return round(1.0 - (usage.percent / 100.0), 6) >= threshold
+
+
+def _check_disk_space(app):
+  """ Returns the status of the disk space for this instance as a (healthy, message) tuple.
+      If the free disk space on either checked volume is below the configured threshold
+      (DISKSPACE_HEALTH_THRESHOLD, default 0.1), then will return False.
+  """
+  if not app.config.get('SETUP_COMPLETE', False):
+    return (True, 'Stack not fully setup; skipping check')
+
+  threshold = app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)
+
+  # Check the volume holding this source file first, then the temp directory.
+  volumes = (
+    ('main', os.path.abspath(__file__)),
+    ('temp', tempfile.gettempdir()),
+  )
+  for name, path in volumes:
+    if not _disk_within_threshold(path, threshold):
+      stats = psutil.disk_usage(path)
+      logger.debug('Disk space on %s volume: %s', name, stats)
+      return (False,
+              'Disk space has gone below threshold on %s volume: %s' % (name, stats.percent))
+
+  return (True, '')
+
+
 _SERVICES = {
   'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
   'web_gunicorn': _check_gunicorn('_internal_ping'),
@@ -92,6 +134,7 @@ _SERVICES = {
   'storage': _check_storage,
   'auth': _check_auth,
   'service_key': _check_service_key,
+  'disk_space': _check_disk_space,
 }
 
 def check_all_services(app, skip):