183 lines
6.4 KiB
Python
183 lines
6.4 KiB
Python
|
import logging
|
||
|
import socket
|
||
|
|
||
|
import boto.rds2
|
||
|
|
||
|
from auth.permissions import SuperUserPermission
|
||
|
from flask import session
|
||
|
from health.services import check_all_services, check_warning_services
|
||
|
|
||
|
logger = logging.getLogger(__name__)
|
||
|
|
||
|
|
||
|
def get_healthchecker(app, config_provider, instance_keys):
    """
    Builds the health checker configured for the given app.

    This is a module-level convenience wrapper: the arguments are forwarded unchanged to
    HealthCheck.get_checker and whatever it returns (or raises) is passed straight through.
    """
    checker = HealthCheck.get_checker(app, config_provider, instance_keys)
    return checker
|
||
|
|
||
|
|
||
|
class HealthCheck(object):
    """
    Base health checker.

    Concrete checkers subclass this type and expose a `check_names()` classmethod so that
    `get_checker` can select them by the name found in the app config. Subclasses may override
    `get_instance_health` to change how per-instance health is derived.
    """

    def __init__(self, app, config_provider, instance_keys, instance_skips=None):
        self.app = app
        self.config_provider = config_provider
        self.instance_keys = instance_keys
        # Names of services passed to check_all_services() for the instance-level check.
        self.instance_skips = instance_skips if instance_skips else []

    def check_warning(self):
        """
        Runs the warning-level service checks and returns a tuple of (dict describing the
        health check output, HTTP-style response code).
        """
        return self.get_instance_health(check_warning_services(self.app, []))

    def check_instance(self):
        """
        Runs the service checks for this specific instance (passing the configured
        instance skips) and returns a tuple of (dict describing the health check output,
        HTTP-style response code).
        """
        statuses = check_all_services(self.app, self.instance_skips, for_instance=True)
        return self.get_instance_health(statuses)

    def check_endtoend(self):
        """
        Runs the checks across all services with nothing skipped and returns a tuple of
        (dict describing the health check output, HTTP-style response code).
        """
        statuses = check_all_services(self.app, [], for_instance=False)
        return self.calculate_overall_health(statuses)

    def get_instance_health(self, service_statuses):
        """
        Converts the given service statuses into a tuple of (health check output dict,
        response code). The base implementation requires every service to be healthy;
        subclasses may override this to relax the requirement.
        """
        return self.calculate_overall_health(service_statuses)

    def calculate_overall_health(self, service_statuses, skip=None, notes=None):
        """
        Computes the overall health from a mapping of service name -> (status, message).

        Returns a tuple (data, code) where code is 200 if every non-skipped service reported
        a truthy status and 503 otherwise. `data` always contains the per-service status
        flags; the expanded details are merged in only for superusers or when the session
        has `health_debug` set. Services named in `skip` are still reported but do not
        affect the overall result; a note is recorded for each of them.
        """
        notes = notes or []
        status_by_service = {}
        details_by_service = {}
        all_healthy = True

        for name in service_statuses:
            ok, detail = service_statuses[name]

            # A failing service reports its message as the failure reason; a passing one
            # only carries the message along when there is one.
            entry = {
                'status': ok,
            }
            if not ok:
                entry['failure'] = detail
            elif detail:
                entry['message'] = detail

            status_by_service[name] = ok
            details_by_service[name] = entry

            if skip and name in skip:
                notes.append('%s skipped in compute health' % name)
            else:
                all_healthy = all_healthy and ok

        response = {
            'services': status_by_service,
        }

        debug_payload = {
            'services_expanded': details_by_service,
            'notes': notes,
            'is_testing': self.app.config['TESTING'],
            'config_provider': self.config_provider.provider_id,
            'local_service_key_id': self.instance_keys.local_key_id,
            'hostname': socket.gethostname(),
        }

        # Only privileged callers (or sessions explicitly flagged for debugging) get the
        # expanded payload in the response.
        if SuperUserPermission().can() or session.get('health_debug', False):
            response.update(debug_payload)

        # The full details are always logged on failure, regardless of who asked.
        if not all_healthy:
            logger.warning('[FAILED HEALTH CHECK] %s', debug_payload)

        status_code = 200 if all_healthy else 503
        return (response, status_code)

    @classmethod
    def get_checker(cls, app, config_provider, instance_keys):
        """
        Instantiates the direct subclass whose `check_names()` contains the name configured
        in `app.config['HEALTH_CHECKER'][0]`, passing `app.config['HEALTH_CHECKER'][1]` (if
        any) as keyword arguments. Raises Exception if no subclass matches.
        """
        checker_config = app.config['HEALTH_CHECKER']
        name = checker_config[0]
        parameters = checker_config[1] or {}

        for candidate in cls.__subclasses__():
            if name not in candidate.check_names():
                continue
            return candidate(app, config_provider, instance_keys, **parameters)

        raise Exception('Unknown health check with name %s' % name)
|
||
|
|
||
|
|
||
|
class LocalHealthCheck(HealthCheck):
    """ Health checker selected by the name 'LocalHealthCheck'. """

    def __init__(self, app, config_provider, instance_keys):
        # The redis and storage services are passed as the instance-level skips.
        skipped_services = ['redis', 'storage']
        super(LocalHealthCheck, self).__init__(app, config_provider, instance_keys,
                                               skipped_services)

    @classmethod
    def check_names(cls):
        """ Returns the configuration names under which this checker can be selected. """
        return ['LocalHealthCheck']
|
||
|
|
||
|
|
||
|
class RDSAwareHealthCheck(HealthCheck):
    """
    Health checker that consults AWS RDS when the database check fails, so that an RDS-side
    problem does not cause this instance to be reported as unhealthy.
    """

    def __init__(self, app, config_provider, instance_keys, access_key, secret_key,
                 db_instance='quay', region='us-east-1'):
        # Note: We skip the redis check because if redis is down, we don't want ELB taking the
        # machines out of service. Redis is not considered a high availability-required service.
        # The storage service is passed as an instance-level skip as well.
        super(RDSAwareHealthCheck, self).__init__(app, config_provider, instance_keys, [
            'redis', 'storage'])

        self.access_key = access_key
        self.secret_key = secret_key
        self.db_instance = db_instance
        self.region = region

    @classmethod
    def check_names(cls):
        """ Returns the configuration names under which this checker can be selected. """
        return ['RDSAwareHealthCheck', 'ProductionHealthCheck']

    def get_instance_health(self, service_statuses):
        """
        Returns a tuple of (health check output dict, response code) for this instance.

        `service_statuses` maps service name -> (status, message). When the database entry
        reports a falsy status, RDS is queried directly; if RDS is in any state other than
        'available' (including a failed lookup, reported as 'error'), the database is
        excluded from the overall health computation.
        """
        skip = []
        notes = []

        # If the database is marked as unhealthy, check the status of RDS directly. If RDS is
        # reporting as available, then the problem is with this instance. Otherwise, the problem is
        # with RDS, and so we skip the DB status so we can keep this machine as 'healthy'.
        if 'database' in service_statuses:
            # Each entry is a (status, message) pair. The pair itself is always truthy, so the
            # status flag must be unpacked before testing it.
            db_healthy, _ = service_statuses['database']
            if not db_healthy:
                rds_status = self._get_rds_status()
                notes.append('DB reports unhealthy; RDS status: %s' % rds_status)

                # If the RDS is in any state but available, then we skip the DB check since it
                # will fail and bring down the instance.
                if rds_status != 'available':
                    skip.append('database')

        return self.calculate_overall_health(service_statuses, skip=skip, notes=notes)

    def _get_rds_status(self):
        """
        Returns the status of the RDS instance as reported by AWS, or 'error' if the instance
        is not found or the lookup fails for any reason.
        """
        try:
            connection = boto.rds2.connect_to_region(self.region,
                                                     aws_access_key_id=self.access_key,
                                                     aws_secret_access_key=self.secret_key)

            response = connection.describe_db_instances()['DescribeDBInstancesResponse']
            result = response['DescribeDBInstancesResult']
            instances = [
                i for i in result['DBInstances'] if i['DBInstanceIdentifier'] == self.db_instance]
            if not instances:
                return 'error'

            return instances[0]['DBInstanceStatus']
        except Exception:
            # Best effort: any failure talking to AWS is logged and reported as 'error'.
            # Exception (rather than a bare except) lets KeyboardInterrupt/SystemExit propagate.
            logger.exception("Exception while checking RDS status")
            return 'error'
|