initial import for Open Source 🎉

This commit is contained in:
Jimmy Zelinskie 2019-11-12 11:09:47 -05:00
parent 1898c361f3
commit 9c0dd3b722
2048 changed files with 218743 additions and 0 deletions

0
health/__init__.py Normal file
View file

182
health/healthcheck.py Normal file
View file

@ -0,0 +1,182 @@
import logging
import socket
import boto.rds2
from auth.permissions import SuperUserPermission
from flask import session
from health.services import check_all_services, check_warning_services
logger = logging.getLogger(__name__)
def get_healthchecker(app, config_provider, instance_keys):
    """ Builds the health checker configured for the given app via HealthCheck.get_checker. """
    checker = HealthCheck.get_checker(app, config_provider, instance_keys)
    return checker
class HealthCheck(object):
    """
    Base health checker. Runs the service checks and folds their results into a response
    payload plus an HTTP status code. Subclasses provide `check_names()` so that
    `get_checker` can select them by configured name.
    """

    def __init__(self, app, config_provider, instance_keys, instance_skips=None):
        self.app = app
        self.config_provider = config_provider
        self.instance_keys = instance_keys
        # Services left out of the per-instance check; a missing value means "skip nothing".
        self.instance_skips = instance_skips or []

    def check_warning(self):
        """
        Runs the warning checks and returns a (data, status_code) pair: a dict representing
        the HealthCheck output and a number indicating the health check response code.
        """
        return self.get_instance_health(check_warning_services(self.app, []))

    def check_instance(self):
        """
        Runs the checks for this specific instance (minus `instance_skips`) and returns a
        (data, status_code) pair: a dict representing the HealthCheck output and a number
        indicating the health check response code.
        """
        statuses = check_all_services(self.app, self.instance_skips, for_instance=True)
        return self.get_instance_health(statuses)

    def check_endtoend(self):
        """
        Runs the non-instance checks with nothing skipped and returns a (data, status_code)
        pair: a dict representing the HealthCheck output and a number indicating the health
        check response code.
        """
        statuses = check_all_services(self.app, [], for_instance=False)
        return self.calculate_overall_health(statuses)

    def get_instance_health(self, service_statuses):
        """
        Hook for turning service statuses into the (data, status_code) response. The base
        implementation applies no special handling: every service must report healthy.
        """
        return self.calculate_overall_health(service_statuses)

    def calculate_overall_health(self, service_statuses, skip=None, notes=None):
        """
        Folds the given service statuses into a (data, status_code) pair.

        `service_statuses` maps a service name to a (status, message) pair. Services named in
        `skip` are still reported, but they do not count against the overall result and a note
        is recorded for each. The status code is 200 when every counted service is healthy and
        503 otherwise.
        """
        notes = notes or []
        is_healthy = True
        simple_statuses = {}
        detailed_statuses = {}

        for service_name, result in service_statuses.items():
            status, message = result
            simple_statuses[service_name] = status

            detail = {'status': status}
            if not status:
                detail['failure'] = message
            elif message:
                detail['message'] = message
            detailed_statuses[service_name] = detail

            if skip and service_name in skip:
                notes.append('%s skipped in compute health' % service_name)
            else:
                is_healthy = is_healthy and status

        data = {'services': simple_statuses}

        # Always built, since it is also what gets logged when the check fails.
        expanded_data = {
            'services_expanded': detailed_statuses,
            'notes': notes,
            'is_testing': self.app.config['TESTING'],
            'config_provider': self.config_provider.provider_id,
            'local_service_key_id': self.instance_keys.local_key_id,
            'hostname': socket.gethostname(),
        }

        # The expanded details are only returned to superusers or sessions flagged for debug.
        if SuperUserPermission().can() or session.get('health_debug', False):
            data.update(expanded_data)

        if not is_healthy:
            logger.warning('[FAILED HEALTH CHECK] %s', expanded_data)

        status_code = 200 if is_healthy else 503
        return (data, status_code)

    @classmethod
    def get_checker(cls, app, config_provider, instance_keys):
        """
        Instantiates the direct subclass whose `check_names()` contains the name configured
        in HEALTH_CHECKER[0], passing HEALTH_CHECKER[1] (if any) as keyword arguments.
        Raises an Exception when no subclass matches.
        """
        checker_config = app.config['HEALTH_CHECKER']
        name = checker_config[0]
        parameters = checker_config[1] or {}

        for candidate in cls.__subclasses__():
            if name in candidate.check_names():
                return candidate(app, config_provider, instance_keys, **parameters)

        raise Exception('Unknown health check with name %s' % name)
class LocalHealthCheck(HealthCheck):
    """
    Health checker selected by the name 'LocalHealthCheck'. It leaves the redis and storage
    checks out of the per-instance health check.
    """

    def __init__(self, app, config_provider, instance_keys):
        skipped_services = ['redis', 'storage']
        super(LocalHealthCheck, self).__init__(app, config_provider, instance_keys,
                                               skipped_services)

    @classmethod
    def check_names(cls):
        """ Names under which this checker can be selected in the HEALTH_CHECKER config. """
        return ['LocalHealthCheck']
class RDSAwareHealthCheck(HealthCheck):
    """
    Health checker that asks AWS RDS for the database status before counting a failing
    database check against this instance.
    """

    def __init__(self, app, config_provider, instance_keys, access_key, secret_key,
                 db_instance='quay', region='us-east-1'):
        # Note: We skip the redis check because if redis is down, we don't want ELB taking the
        # machines out of service. Redis is not considered a high availability-required service.
        super(RDSAwareHealthCheck, self).__init__(app, config_provider, instance_keys,
                                                  ['redis', 'storage'])

        self.access_key = access_key
        self.secret_key = secret_key
        self.db_instance = db_instance
        self.region = region

    @classmethod
    def check_names(cls):
        """ Names under which this checker can be selected in the HEALTH_CHECKER config. """
        return ['RDSAwareHealthCheck', 'ProductionHealthCheck']

    def get_instance_health(self, service_statuses):
        """
        Computes the (data, status_code) response for this instance, excusing a failing
        database check when RDS itself does not report the database as available.
        """
        skip = []
        notes = []

        # If the database is marked as unhealthy, check the status of RDS directly. If RDS is
        # reporting as available, then the problem is with this instance. Otherwise, the problem
        # is with RDS, and so we skip the DB status so we can keep this machine as 'healthy'.
        if 'database' in service_statuses:
            # Each entry is a (status, message) pair. A non-empty tuple is always truthy, so the
            # status flag must be unpacked before it is tested; testing the tuple itself would
            # never reach the RDS lookup.
            db_healthy, _ = service_statuses['database']
            if not db_healthy:
                rds_status = self._get_rds_status()
                notes.append('DB reports unhealthy; RDS status: %s' % rds_status)

                # If the RDS is in any state but available, then we skip the DB check since it
                # will fail and bring down the instance.
                if rds_status != 'available':
                    skip.append('database')

        return self.calculate_overall_health(service_statuses, skip=skip, notes=notes)

    def _get_rds_status(self):
        """
        Returns the status of the RDS instance as reported by AWS, or 'error' when the
        instance cannot be found or the lookup fails.
        """
        try:
            connection = boto.rds2.connect_to_region(self.region,
                                                     aws_access_key_id=self.access_key,
                                                     aws_secret_access_key=self.secret_key)
            response = connection.describe_db_instances()['DescribeDBInstancesResponse']
            result = response['DescribeDBInstancesResult']
            instances = [i for i in result['DBInstances']
                         if i['DBInstanceIdentifier'] == self.db_instance]
            if not instances:
                return 'error'

            return instances[0]['DBInstanceStatus']
        except Exception:
            # Best effort: any failure talking to AWS is logged and reported as 'error'.
            # Interpreter-exit signals (KeyboardInterrupt, SystemExit) are left to propagate.
            logger.exception("Exception while checking RDS status")
            return 'error'

View file

@ -0,0 +1,14 @@
from abc import ABCMeta, abstractmethod
from six import add_metaclass
@add_metaclass(ABCMeta)
class HealthCheckDataInterface(object):
    """
    Abstract contract for the data store interactions that the health checks depend on.
    """

    @abstractmethod
    def check_health(self, app_config):
        """
        Reports whether the connection to the database is healthy for the given app config.

        NOTE(review): this was originally documented as returning a bare True/False; confirm
        the exact return shape against the concrete implementation and its callers.
        """

10
health/models_pre_oci.py Normal file
View file

@ -0,0 +1,10 @@
from data.model import health
from health.models_interface import HealthCheckDataInterface
class PreOCIModel(HealthCheckDataInterface):
    """ HealthCheckDataInterface implementation that delegates to data.model.health. """

    def check_health(self, app_config):
        """ Hands the database health check off to data.model.health.check_health. """
        result = health.check_health(app_config)
        return result
# Shared module-level instance; health/services.py imports it under the name `model`.
pre_oci_model = PreOCIModel()

197
health/services.py Normal file
View file

@ -0,0 +1,197 @@
import logging
import os
import tempfile
import psutil
from app import build_logs, storage, authentication, instance_keys
from health.models_pre_oci import pre_oci_model as model
logger = logging.getLogger(__name__)
def _compute_internal_endpoint(app, endpoint):
# Compute the URL for checking the endpoint. We append a port if and only if the
# hostname contains one.
hostname_parts = app.config['SERVER_HOSTNAME'].split(':')
port = ''
if hostname_parts[0] == 'localhost':
if len(hostname_parts) == 2:
port = ':' + hostname_parts[1]
scheme = app.config['PREFERRED_URL_SCHEME']
if app.config.get('EXTERNAL_TLS_TERMINATION', False):
scheme = 'http'
if port == '':
if scheme == 'http':
port = ':8080'
else:
port = ':8443'
return '%s://localhost%s/%s' % (scheme, port, endpoint)
def _check_gunicorn(endpoint):
def fn(app):
""" Returns the status of the gunicorn workers. """
client = app.config['HTTPCLIENT']
registry_url = _compute_internal_endpoint(app, endpoint)
try:
status_code = client.get(registry_url, verify=False, timeout=2).status_code
okay = status_code == 200
message = ('Got non-200 response for worker: %s' % status_code) if not okay else None
return (okay, message)
except Exception as ex:
logger.exception('Exception when checking worker health: %s', registry_url)
return (False, 'Exception when checking worker health: %s' % registry_url)
return fn
def _check_jwt_proxy(app):
    """
    Returns the status of JWT proxy in the container as a (status, message) tuple. The
    `secscan` endpoint is requested and an HTTP 403 is the healthy answer.
    """
    client = app.config['HTTPCLIENT']
    registry_url = _compute_internal_endpoint(app, 'secscan')
    try:
        status_code = client.get(registry_url, verify=False, timeout=2).status_code
        if status_code == 403:
            return (True, None)

        return (False, 'Got non-403 response for JWT proxy: %s' % status_code)
    except Exception:
        logger.exception('Exception when checking jwtproxy health: %s', registry_url)
        return (False, 'Exception when checking jwtproxy health: %s' % registry_url)
def _check_database(app):
    """ Reports the database status seen from this instance, as given by the data model. """
    database_status = model.check_health(app.config)
    return database_status
def _check_redis(app):
    """ Reports the Redis status seen from this instance, as given by build_logs. """
    redis_status = build_logs.check_health()
    return redis_status
def _check_storage(app):
""" Returns the status of storage, as accessed from this instance. """
if app.config.get('REGISTRY_STATE', 'normal') == 'readonly':
return (True, 'Storage check disabled for readonly mode')
try:
storage.validate(storage.preferred_locations, app.config['HTTPCLIENT'])
return (True, None)
except Exception as ex:
logger.exception('Storage check failed with exception %s', ex)
return (False, 'Storage check failed with exception %s' % ex.message)
def _check_auth(app):
    """ Reports the auth engine status seen from this instance, by pinging it. """
    auth_status = authentication.ping()
    return auth_status
def _check_service_key(app):
""" Returns the status of the service key for this instance. If the key has disappeared or
has expired, then will return False.
"""
if not app.config.get('SETUP_COMPLETE', False):
return (True, 'Stack not fully setup; skipping check')
try:
kid = instance_keys.local_key_id
except IOError as ex:
# Key has not been created yet.
return (True, 'Stack not fully setup; skipping check')
try:
key_is_valid = bool(instance_keys.get_service_key_public_key(kid))
message = 'Could not find valid instance service key %s' % kid if not key_is_valid else None
return (key_is_valid, message)
except Exception as ex:
logger.exception('Got exception when trying to retrieve the instance key')
# NOTE: We return *True* here if there was an exception when retrieving the key, as it means
# the database is down, which will be handled by the database health check.
return (True, 'Failed to get instance key due to a database issue; skipping check')
def _disk_within_threshold(path, threshold):
    """
    Returns whether the free fraction of the disk holding `path` is at least `threshold`,
    where the free fraction is one minus the used percentage reported by psutil.
    """
    used_fraction = psutil.disk_usage(path).percent / 100.0
    free_fraction = 1.0 - used_fraction
    return free_fraction >= threshold
def _check_disk_space(for_warning):
def _check_disk_space(app):
""" Returns the status of the disk space for this instance. If the available disk space is below
a certain threshold, then will return False.
"""
if not app.config.get('SETUP_COMPLETE', False):
return (True, 'Stack not fully setup; skipping check')
config_key = ('DISKSPACE_HEALTH_WARNING_THRESHOLD'
if for_warning else 'DISKSPACE_HEALTH_THRESHOLD')
default_threshold = 0.1 if for_warning else 0.01
# Check the directory in which we're running.
currentfile = os.path.abspath(__file__)
if not _disk_within_threshold(currentfile, app.config.get(config_key, default_threshold)):
stats = psutil.disk_usage(currentfile)
logger.debug('Disk space on main volume: %s', stats)
return (False, 'Disk space has gone below threshold on main volume: %s' % stats.percent)
# Check the temp directory as well.
tempdir = tempfile.gettempdir()
if tempdir is not None:
if not _disk_within_threshold(tempdir, app.config.get(config_key, default_threshold)):
stats = psutil.disk_usage(tempdir)
logger.debug('Disk space on temp volume: %s', stats)
return (False, 'Disk space has gone below threshold on temp volume: %s' % stats.percent)
return (True, '')
return _check_disk_space
# Checks added by check_all_services only when it is called with for_instance=True.
_INSTANCE_SERVICES = {
    'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
    'web_gunicorn': _check_gunicorn('_internal_ping'),
    'verbs_gunicorn': _check_gunicorn('c1/_internal_ping'),
    'service_key': _check_service_key,
    'disk_space': _check_disk_space(for_warning=False),
    'jwtproxy': _check_jwt_proxy,
}
# Checks that check_all_services always includes, with or without for_instance.
_GLOBAL_SERVICES = {
    'database': _check_database,
    'redis': _check_redis,
    'storage': _check_storage,
    'auth': _check_auth,
}
# Checks run by check_warning_services.
_WARNING_SERVICES = {
    'disk_space_warning': _check_disk_space(for_warning=True),
}
def check_all_services(app, skip, for_instance=False):
    """
    Returns a dictionary containing the status of the defined services, minus those in `skip`.
    The global services are always checked; the instance services only when `for_instance`
    is set.
    """
    if not for_instance:
        return _check_services(app, skip, _GLOBAL_SERVICES)

    # Work on a copy so the module-level registry is never mutated.
    combined = dict(_INSTANCE_SERVICES)
    combined.update(_GLOBAL_SERVICES)
    return _check_services(app, skip, combined)
def check_warning_services(app, skip):
    """ Returns a dictionary with the status of each warning service not listed in `skip`. """
    warning_statuses = _check_services(app, skip, _WARNING_SERVICES)
    return warning_statuses
def _check_services(app, skip, services):
status = {}
for name in services:
if name in skip:
continue
status[name] = services[name](app)
return status