initial import for Open Source 🎉
This commit is contained in:
parent
1898c361f3
commit
9c0dd3b722
2048 changed files with 218743 additions and 0 deletions
0
health/__init__.py
Normal file
0
health/__init__.py
Normal file
182
health/healthcheck.py
Normal file
182
health/healthcheck.py
Normal file
|
@ -0,0 +1,182 @@
|
|||
import logging
|
||||
import socket
|
||||
|
||||
import boto.rds2
|
||||
|
||||
from auth.permissions import SuperUserPermission
|
||||
from flask import session
|
||||
from health.services import check_all_services, check_warning_services
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_healthchecker(app, config_provider, instance_keys):
    """
    Builds the health checker for the given app.

    This is a thin module-level entry point: the selection of the concrete checker is
    delegated entirely to `HealthCheck.get_checker`, and its result is returned as-is.
    """
    checker = HealthCheck.get_checker(app, config_provider, instance_keys)
    return checker
|
||||
|
||||
|
||||
class HealthCheck(object):
    """
    Base health checker. Runs groups of service checks and folds their results into a
    response payload plus an HTTP status code.

    `get_checker` selects a direct subclass by calling its `check_names()` classmethod, so
    every selectable subclass must define one.
    """

    def __init__(self, app, config_provider, instance_keys, instance_skips=None):
        self.app = app
        self.config_provider = config_provider
        self.instance_keys = instance_keys

        # Names of the services that `check_instance` leaves out of its run.
        if not instance_skips:
            instance_skips = []
        self.instance_skips = instance_skips

    def check_warning(self):
        """
        Runs the warning-level checks (nothing skipped) and returns a (data, status code)
        tuple describing the outcome, as produced by `get_instance_health`.
        """
        return self.get_instance_health(check_warning_services(self.app, []))

    def check_instance(self):
        """
        Runs the checks relevant to this specific instance, leaving out the services listed
        in `instance_skips`, and returns a (data, status code) tuple as produced by
        `get_instance_health`.
        """
        statuses = check_all_services(self.app, self.instance_skips, for_instance=True)
        return self.get_instance_health(statuses)

    def check_endtoend(self):
        """
        Runs the non-instance checks with nothing skipped and returns a (data, status code)
        tuple. Unlike the other entry points this bypasses `get_instance_health` and goes
        straight to `calculate_overall_health`.
        """
        statuses = check_all_services(self.app, [], for_instance=False)
        return self.calculate_overall_health(statuses)

    def get_instance_health(self, service_statuses):
        """
        Hook that turns the given service statuses into a (data, status code) tuple.

        The base implementation requires every service to be healthy; subclasses may
        override it to exempt particular services.
        """
        return self.calculate_overall_health(service_statuses)

    def calculate_overall_health(self, service_statuses, skip=None, notes=None):
        """
        Folds the per-service results into the health check response.

        `service_statuses` maps a service name to a (status, message) pair. Services named
        in `skip` are still reported but cannot make the overall result unhealthy; a note is
        recorded for each of them. Returns a (data, status code) tuple, where the code is
        200 if every non-skipped service reported a truthy status and 503 otherwise.
        """
        notes = notes or []
        exempt = skip or ()

        all_healthy = True
        status_by_service = {}
        detail_by_service = {}

        for name, (status, message) in service_statuses.items():
            status_by_service[name] = status

            detail = {'status': status}
            if not status:
                detail['failure'] = message
            elif message:
                detail['message'] = message
            detail_by_service[name] = detail

            if name in exempt:
                notes.append('%s skipped in compute health' % name)
            elif not status:
                all_healthy = False

        payload = {'services': status_by_service}

        debug_payload = {
            'services_expanded': detail_by_service,
            'notes': notes,
            'is_testing': self.app.config['TESTING'],
            'config_provider': self.config_provider.provider_id,
            'local_service_key_id': self.instance_keys.local_key_id,
            'hostname': socket.gethostname(),
        }

        # The expanded details are only exposed to superusers or to sessions that have the
        # 'health_debug' flag set.
        if SuperUserPermission().can() or session.get('health_debug', False):
            payload.update(debug_payload)

        # Failures are always logged with the full details, whether or not they are returned.
        if not all_healthy:
            logger.warning('[FAILED HEALTH CHECK] %s', debug_payload)

        status_code = 200 if all_healthy else 503
        return (payload, status_code)

    @classmethod
    def get_checker(cls, app, config_provider, instance_keys):
        """
        Instantiates the checker named by the app's HEALTH_CHECKER config entry.

        The entry is indexed as [name, parameters]; the parameters (if any) are passed to
        the chosen class as keyword arguments. Only direct subclasses are considered, and
        the first one whose `check_names()` contains the name wins. Raises an Exception if
        none matches.
        """
        checker_config = app.config['HEALTH_CHECKER']
        name = checker_config[0]
        parameters = checker_config[1] or {}

        candidates = (subc for subc in cls.__subclasses__() if name in subc.check_names())
        chosen = next(candidates, None)
        if chosen is None:
            raise Exception('Unknown health check with name %s' % name)

        return chosen(app, config_provider, instance_keys, **parameters)
|
||||
|
||||
|
||||
class LocalHealthCheck(HealthCheck):
    """
    Health checker registered under the name 'LocalHealthCheck'. Its instance check leaves
    out the 'redis' and 'storage' services.
    """

    def __init__(self, app, config_provider, instance_keys):
        skipped_services = ['redis', 'storage']
        super(LocalHealthCheck, self).__init__(app, config_provider, instance_keys,
                                               skipped_services)

    @classmethod
    def check_names(cls):
        """ Lists the HEALTH_CHECKER names that select this checker. """
        return ['LocalHealthCheck']
|
||||
|
||||
|
||||
class RDSAwareHealthCheck(HealthCheck):
    """
    Health checker that consults AWS RDS before failing an instance over a database error.

    Registered under the names 'RDSAwareHealthCheck' and 'ProductionHealthCheck'. When the
    database check fails but RDS itself is not 'available', the database result is excluded
    from the overall health so that the instance is not taken out of service for a problem
    it cannot fix.
    """

    def __init__(self, app, config_provider, instance_keys, access_key, secret_key,
                 db_instance='quay', region='us-east-1'):
        # Note: We skip the redis check because if redis is down, we don't want ELB taking the
        # machines out of service. Redis is not considered a high availability-required service.
        # NOTE(review): 'storage' is skipped here as well; presumably for the same reason.
        super(RDSAwareHealthCheck, self).__init__(app, config_provider, instance_keys, [
            'redis', 'storage'])

        self.access_key = access_key
        self.secret_key = secret_key
        self.db_instance = db_instance
        self.region = region

    @classmethod
    def check_names(cls):
        """ Lists the HEALTH_CHECKER names that select this checker. """
        return ['RDSAwareHealthCheck', 'ProductionHealthCheck']

    def get_instance_health(self, service_statuses):
        """
        Computes instance health, exempting the database when RDS itself is at fault.

        `service_statuses` maps a service name to a (status, message) pair. Returns the
        (data, status code) tuple produced by `calculate_overall_health`.
        """
        skip = []
        notes = []

        # If the database is marked as unhealthy, check the status of RDS directly. If RDS is
        # reporting as available, then the problem is with this instance. Otherwise, the problem is
        # with RDS, and so we skip the DB status so we can keep this machine as 'healthy'.
        if 'database' in service_statuses:
            # Each entry is a (status, message) pair. Unpack it: testing the pair itself would
            # always be truthy and the RDS fallback below would never run.
            db_healthy, _ = service_statuses['database']
            if not db_healthy:
                rds_status = self._get_rds_status()
                notes.append('DB reports unhealthy; RDS status: %s' % rds_status)

                # If the RDS is in any state but available, then we skip the DB check since it will
                # fail and bring down the instance.
                if rds_status != 'available':
                    skip.append('database')

        return self.calculate_overall_health(service_statuses, skip=skip, notes=notes)

    def _get_rds_status(self):
        """
        Returns the status of the RDS instance as reported by AWS.

        Returns the string 'error' if the configured instance is not listed or if the lookup
        fails for any reason; failures are logged rather than raised.
        """
        try:
            region = boto.rds2.connect_to_region(self.region, aws_access_key_id=self.access_key,
                                                 aws_secret_access_key=self.secret_key)

            response = region.describe_db_instances()['DescribeDBInstancesResponse']
            result = response['DescribeDBInstancesResult']
            instances = [
                i for i in result['DBInstances'] if i['DBInstanceIdentifier'] == self.db_instance]
            if not instances:
                return 'error'

            return instances[0]['DBInstanceStatus']
        except Exception:
            # Deliberately broad: any failure talking to AWS is reported as 'error'. Unlike a
            # bare except, this lets KeyboardInterrupt and SystemExit propagate.
            logger.exception("Exception while checking RDS status")
            return 'error'
|
14
health/models_interface.py
Normal file
14
health/models_interface.py
Normal file
|
@ -0,0 +1,14 @@
|
|||
from abc import ABCMeta, abstractmethod
|
||||
from six import add_metaclass
|
||||
|
||||
|
||||
@add_metaclass(ABCMeta)
class HealthCheckDataInterface(object):
    """
    Abstract contract covering every data store interaction that health checks rely on.
    """

    @abstractmethod
    def check_health(self, app_config):
        """
        Reports whether the connection to the database is healthy.

        Returns True if the connection is healthy and False otherwise.
        NOTE(review): the health check code unpacks each service result as a
        (status, message) pair, so implementations presumably return such a pair rather than
        a bare boolean - confirm against the concrete implementation.
        """
|
10
health/models_pre_oci.py
Normal file
10
health/models_pre_oci.py
Normal file
|
@ -0,0 +1,10 @@
|
|||
from data.model import health
|
||||
from health.models_interface import HealthCheckDataInterface
|
||||
|
||||
|
||||
class PreOCIModel(HealthCheckDataInterface):
    """ Implementation of the health check data interface backed by `data.model.health`. """

    def check_health(self, app_config):
        """ Forwards to `health.check_health` and returns its result unchanged. """
        result = health.check_health(app_config)
        return result


# Shared module-level instance of the model.
pre_oci_model = PreOCIModel()
|
197
health/services.py
Normal file
197
health/services.py
Normal file
|
@ -0,0 +1,197 @@
|
|||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import psutil
|
||||
|
||||
from app import build_logs, storage, authentication, instance_keys
|
||||
from health.models_pre_oci import pre_oci_model as model
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def _compute_internal_endpoint(app, endpoint):
|
||||
# Compute the URL for checking the endpoint. We append a port if and only if the
|
||||
# hostname contains one.
|
||||
hostname_parts = app.config['SERVER_HOSTNAME'].split(':')
|
||||
port = ''
|
||||
if hostname_parts[0] == 'localhost':
|
||||
if len(hostname_parts) == 2:
|
||||
port = ':' + hostname_parts[1]
|
||||
|
||||
scheme = app.config['PREFERRED_URL_SCHEME']
|
||||
if app.config.get('EXTERNAL_TLS_TERMINATION', False):
|
||||
scheme = 'http'
|
||||
|
||||
if port == '':
|
||||
if scheme == 'http':
|
||||
port = ':8080'
|
||||
else:
|
||||
port = ':8443'
|
||||
|
||||
return '%s://localhost%s/%s' % (scheme, port, endpoint)
|
||||
|
||||
|
||||
def _check_gunicorn(endpoint):
    """
    Builds a health check for the gunicorn worker that serves `endpoint`.

    The returned callable takes the app and returns an (okay, message) pair: okay is true
    only when the internal endpoint answers with HTTP 200, and message describes the failure
    (None on success).
    """
    def fn(app):
        """ Returns the status of the gunicorn workers. """
        http_client = app.config['HTTPCLIENT']
        url = _compute_internal_endpoint(app, endpoint)

        try:
            # Certificate verification is disabled and the request is given a 2 second timeout.
            code = http_client.get(url, verify=False, timeout=2).status_code
            if code == 200:
                return (True, None)

            return (False, 'Got non-200 response for worker: %s' % code)
        except Exception:
            logger.exception('Exception when checking worker health: %s', url)
            return (False, 'Exception when checking worker health: %s' % url)

    return fn
|
||||
|
||||
|
||||
def _check_jwt_proxy(app):
    """
    Returns the status of JWT proxy in the container as an (okay, message) pair.

    The internal 'secscan' endpoint is requested and the proxy is considered healthy only
    when the response code is 403.
    """
    http_client = app.config['HTTPCLIENT']
    url = _compute_internal_endpoint(app, 'secscan')

    try:
        code = http_client.get(url, verify=False, timeout=2).status_code
        if code == 403:
            return (True, None)

        return (False, 'Got non-403 response for JWT proxy: %s' % code)
    except Exception:
        logger.exception('Exception when checking jwtproxy health: %s', url)
        return (False, 'Exception when checking jwtproxy health: %s' % url)
|
||||
|
||||
|
||||
def _check_database(app):
    """ Reports database health as seen from this instance, based on the app's config. """
    app_config = app.config
    return model.check_health(app_config)
|
||||
|
||||
|
||||
def _check_redis(app):
    """
    Reports Redis health as seen from this instance, by asking the build logs client.
    `app` is unused; it is accepted so this check has the same signature as the others.
    """
    redis_status = build_logs.check_health()
    return redis_status
|
||||
|
||||
|
||||
def _check_storage(app):
|
||||
""" Returns the status of storage, as accessed from this instance. """
|
||||
if app.config.get('REGISTRY_STATE', 'normal') == 'readonly':
|
||||
return (True, 'Storage check disabled for readonly mode')
|
||||
|
||||
try:
|
||||
storage.validate(storage.preferred_locations, app.config['HTTPCLIENT'])
|
||||
return (True, None)
|
||||
except Exception as ex:
|
||||
logger.exception('Storage check failed with exception %s', ex)
|
||||
return (False, 'Storage check failed with exception %s' % ex.message)
|
||||
|
||||
|
||||
def _check_auth(app):
    """
    Reports the health of the auth engine as seen from this instance, by pinging it.
    `app` is unused; it is accepted so this check has the same signature as the others.
    """
    auth_status = authentication.ping()
    return auth_status
|
||||
|
||||
|
||||
def _check_service_key(app):
|
||||
""" Returns the status of the service key for this instance. If the key has disappeared or
|
||||
has expired, then will return False.
|
||||
"""
|
||||
if not app.config.get('SETUP_COMPLETE', False):
|
||||
return (True, 'Stack not fully setup; skipping check')
|
||||
|
||||
try:
|
||||
kid = instance_keys.local_key_id
|
||||
except IOError as ex:
|
||||
# Key has not been created yet.
|
||||
return (True, 'Stack not fully setup; skipping check')
|
||||
|
||||
try:
|
||||
key_is_valid = bool(instance_keys.get_service_key_public_key(kid))
|
||||
message = 'Could not find valid instance service key %s' % kid if not key_is_valid else None
|
||||
return (key_is_valid, message)
|
||||
except Exception as ex:
|
||||
logger.exception('Got exception when trying to retrieve the instance key')
|
||||
|
||||
# NOTE: We return *True* here if there was an exception when retrieving the key, as it means
|
||||
# the database is down, which will be handled by the database health check.
|
||||
return (True, 'Failed to get instance key due to a database issue; skipping check')
|
||||
|
||||
|
||||
|
||||
def _disk_within_threshold(path, threshold):
    """
    Returns True when the unused fraction of the disk holding `path` (1.0 minus psutil's
    usage percentage scaled to 0..1) is at least `threshold`.
    """
    used_fraction = psutil.disk_usage(path).percent / 100.0
    free_fraction = 1.0 - used_fraction
    return free_fraction >= threshold
|
||||
|
||||
|
||||
def _check_disk_space(for_warning):
|
||||
def _check_disk_space(app):
|
||||
""" Returns the status of the disk space for this instance. If the available disk space is below
|
||||
a certain threshold, then will return False.
|
||||
"""
|
||||
if not app.config.get('SETUP_COMPLETE', False):
|
||||
return (True, 'Stack not fully setup; skipping check')
|
||||
|
||||
config_key = ('DISKSPACE_HEALTH_WARNING_THRESHOLD'
|
||||
if for_warning else 'DISKSPACE_HEALTH_THRESHOLD')
|
||||
default_threshold = 0.1 if for_warning else 0.01
|
||||
|
||||
# Check the directory in which we're running.
|
||||
currentfile = os.path.abspath(__file__)
|
||||
if not _disk_within_threshold(currentfile, app.config.get(config_key, default_threshold)):
|
||||
stats = psutil.disk_usage(currentfile)
|
||||
logger.debug('Disk space on main volume: %s', stats)
|
||||
return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent)
|
||||
|
||||
# Check the temp directory as well.
|
||||
tempdir = tempfile.gettempdir()
|
||||
if tempdir is not None:
|
||||
if not _disk_within_threshold(tempdir, app.config.get(config_key, default_threshold)):
|
||||
stats = psutil.disk_usage(tempdir)
|
||||
logger.debug('Disk space on temp volume: %s', stats)
|
||||
return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent)
|
||||
|
||||
return (True, '')
|
||||
|
||||
return _check_disk_space
|
||||
|
||||
|
||||
# Checks that are only run when checking a specific instance (for_instance=True).
_INSTANCE_SERVICES = dict(
    registry_gunicorn=_check_gunicorn('v1/_internal_ping'),
    web_gunicorn=_check_gunicorn('_internal_ping'),
    verbs_gunicorn=_check_gunicorn('c1/_internal_ping'),
    service_key=_check_service_key,
    disk_space=_check_disk_space(for_warning=False),
    jwtproxy=_check_jwt_proxy,
)

# Checks that are always part of `check_all_services`.
_GLOBAL_SERVICES = dict(
    database=_check_database,
    redis=_check_redis,
    storage=_check_storage,
    auth=_check_auth,
)

# Checks run by `check_warning_services`.
_WARNING_SERVICES = dict(
    disk_space_warning=_check_disk_space(for_warning=True),
)
|
||||
|
||||
def check_all_services(app, skip, for_instance=False):
    """
    Runs the defined service checks and returns a dictionary of their statuses by name.

    The global checks are always run; with `for_instance` set, the instance-specific checks
    are run as well. Services named in `skip` are left out.
    """
    if not for_instance:
        return _check_services(app, skip, _GLOBAL_SERVICES)

    combined = dict(_INSTANCE_SERVICES)
    combined.update(_GLOBAL_SERVICES)
    return _check_services(app, skip, combined)
|
||||
|
||||
def check_warning_services(app, skip):
    """
    Runs the warning-level service checks, leaving out those named in `skip`, and returns a
    dictionary of their statuses by name.
    """
    warning_statuses = _check_services(app, skip, _WARNING_SERVICES)
    return warning_statuses
|
||||
|
||||
def _check_services(app, skip, services):
|
||||
status = {}
|
||||
for name in services:
|
||||
if name in skip:
|
||||
continue
|
||||
|
||||
status[name] = services[name](app)
|
||||
|
||||
return status
|
Reference in a new issue