This repository has been archived on 2020-03-24. You can view files and clone it, but cannot push or open issues or pull requests.
quay/health/services.py
Joseph Schorr a94f657cb7 Add health check for node disk space
If a node runs out of disk space, nginx can no longer swap, and this can cause issues with large pushes

Fixes https://jira.coreos.com/browse/QUAY-1047
2018-09-05 17:57:22 -04:00

143 lines
4.9 KiB
Python

import logging
import os
import tempfile
import psutil
from app import build_logs, storage, authentication, instance_keys
from health.models_pre_oci import pre_oci_model as model
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def _check_gunicorn(endpoint):
    """ Builds a health check for the gunicorn worker that serves `endpoint`.

        The returned callable takes the application, issues a GET for `endpoint` against
        localhost, and returns an `(okay, message)` tuple; `message` is None on success.
    """
    def fn(app):
        """ Returns the status of the gunicorn workers. """
        config = app.config
        http_client = config['HTTPCLIENT']

        # The request always targets localhost. A port is carried over only when
        # SERVER_HOSTNAME is itself of the form `localhost:<port>`.
        host_pieces = config['SERVER_HOSTNAME'].split(':')
        is_local_with_port = host_pieces[0] == 'localhost' and len(host_pieces) == 2
        port = ':' + host_pieces[1] if is_local_with_port else ''

        # The preferred scheme is overridden with plain http when TLS is terminated
        # externally.
        preferred_scheme = config['PREFERRED_URL_SCHEME']
        scheme = 'http' if config.get('EXTERNAL_TLS_TERMINATION', False) else preferred_scheme

        registry_url = '%s://localhost%s/%s' % (scheme, port, endpoint)

        try:
            response = http_client.get(registry_url, verify=False, timeout=2)
            status_code = response.status_code
            okay = status_code == 200
            if okay:
                return (okay, None)
            return (okay, 'Got non-200 response for worker: %s' % status_code)
        except Exception:
            # Any failure to reach the worker is reported as unhealthy rather than raised.
            logger.exception('Exception when checking worker health: %s', registry_url)
            return (False, 'Exception when checking worker health: %s' % registry_url)

    return fn
def _check_database(app):
    """ Reports the health of the database, as seen from this instance.

        Delegates to the health model's `check_health`, handing it the application config.
    """
    database_status = model.check_health(app.config)
    return database_status
def _check_redis(app):
    """ Reports the health of Redis, as seen from this instance.

        Delegates to the build logs' `check_health`; `app` is accepted only so that every
        check shares the same signature.
    """
    redis_status = build_logs.check_health()
    return redis_status
def _check_storage(app):
    """ Returns the status of storage, as accessed from this instance.

        Validates the preferred storage locations using the configured HTTP client and
        returns `(True, None)` on success. Any exception raised while validating is logged
        and reported as `(False, <message>)` instead of being propagated.
    """
    try:
        storage.validate(storage.preferred_locations, app.config['HTTPCLIENT'])
    except Exception as ex:
        logger.exception('Storage check failed with exception %s', ex)

        # `BaseException.message` was deprecated in Python 2.6 and removed in Python 3, and
        # is empty for multi-argument exceptions. Use it when present and non-empty,
        # otherwise fall back to the exception's string form so that building the failure
        # message can never itself raise AttributeError.
        detail = getattr(ex, 'message', None) or str(ex)
        return (False, 'Storage check failed with exception %s' % detail)

    return (True, None)
def _check_auth(app):
    """ Reports the health of the auth engine, as seen from this instance.

        Delegates to the authentication engine's `ping`; `app` is accepted only so that
        every check shares the same signature.
    """
    auth_status = authentication.ping()
    return auth_status
def _check_service_key(app):
    """ Reports whether this instance's service key is still present and valid.

        Returns an `(okay, message)` tuple. The check is skipped (and reported as okay)
        while setup is incomplete, while the local key has not been created yet, and when
        the key lookup itself raises.
    """
    skipped = (True, 'Stack not fully setup; skipping check')

    if not app.config.get('SETUP_COMPLETE', False):
        return skipped

    try:
        kid = instance_keys.local_key_id
    except IOError:
        # Key has not been created yet.
        return skipped

    try:
        public_key = instance_keys.get_service_key_public_key(kid)
        key_is_valid = bool(public_key)
        if key_is_valid:
            return (key_is_valid, None)
        return (key_is_valid, 'Could not find valid instance service key %s' % kid)
    except Exception:
        logger.exception('Got exception when trying to retrieve the instance key')

        # NOTE: We return *True* here if there was an exception when retrieving the key, as
        # it means the database is down, which will be handled by the database health check.
        return (True, 'Failed to get instance key due to a database issue; skipping check')
def _disk_within_threshold(path, threshold):
    """ Returns whether the free fraction of the disk holding `path` is at least `threshold`.

        `threshold` is compared against a fraction (1.0 minus the used percentage over 100),
        so it is expected to be expressed as a fraction as well, e.g. 0.1 for 10% free.
    """
    used_fraction = psutil.disk_usage(path).percent / 100.0
    free_fraction = 1.0 - used_fraction
    return free_fraction >= threshold
def _check_disk_space(app):
    """ Reports whether this instance still has enough free disk space.

        Checks the volume holding this source file and then the volume holding the temp
        directory. Returns `(False, message)` as soon as one of them has less free space
        than `DISKSPACE_HEALTH_THRESHOLD` (default 0.1), and `(True, '')` otherwise. The
        check is skipped (and reported as okay) while setup is incomplete.
    """
    if not app.config.get('SETUP_COMPLETE', False):
        return (True, 'Stack not fully setup; skipping check')

    min_free = app.config.get('DISKSPACE_HEALTH_THRESHOLD', 0.1)

    # Check the volume on which the code itself lives.
    main_path = os.path.abspath(__file__)
    if not _disk_within_threshold(main_path, min_free):
        usage = psutil.disk_usage(main_path)
        logger.debug('Disk space on main volume: %s', usage)
        return (False, 'Disk space has gone below threshold on main volume: %s' % usage.percent)

    # Check the temp directory as well.
    temp_path = tempfile.gettempdir()
    if temp_path is None or _disk_within_threshold(temp_path, min_free):
        return (True, '')

    usage = psutil.disk_usage(temp_path)
    logger.debug('Disk space on temp volume: %s', usage)
    return (False, 'Disk space has gone below threshold on temp volume: %s' % usage.percent)
# Registry of health checks, keyed by service name. Every value is a callable that takes
# the application and returns the status of that service.
_SERVICES = {
    # Gunicorn workers, each probed through its own internal ping endpoint.
    'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
    'web_gunicorn': _check_gunicorn('_internal_ping'),
    'verbs_gunicorn': _check_gunicorn('c1/_internal_ping'),

    # Backing services, as accessed from this instance.
    'database': _check_database,
    'redis': _check_redis,
    'storage': _check_storage,
    'auth': _check_auth,

    # State local to this instance.
    'service_key': _check_service_key,
    'disk_space': _check_disk_space,
}
def check_all_services(app, skip):
    """ Runs every registered health check and returns the results keyed by service name.

        Services whose name appears in `skip` are not run and are absent from the result.
    """
    return {
        name: check(app)
        for name, check in _SERVICES.items()
        if name not in skip
    }