Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend.

This commit is contained in:
Joseph Schorr 2015-01-20 16:53:05 -05:00
parent 92d32bc636
commit b74b7de197
3 changed files with 146 additions and 88 deletions

View file

@ -1,48 +1,83 @@
import boto.rds2
import logging
from health.services import check_all_services
logger = logging.getLogger(__name__)
class HealthCheck(object):
def __init__(self):
pass
def get_healthchecker(app):
    """ Looks up and returns the health checker for the given app.

        The lookup itself is delegated to HealthCheck.get_checker.
    """
    checker = HealthCheck.get_checker(app)
    return checker
def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy):
class HealthCheck(object):
def __init__(self, app):
self.app = app
def check_instance(self):
"""
Conducts any custom healthcheck work, returning a dict representing the HealthCheck
output and a boolean indicating whether the instance is healthy.
Conducts a check on this specific instance, returning a dict representing the HealthCheck
output and a number indicating the health check response code.
"""
raise NotImplementedError
service_statuses = check_all_services(self.app)
return self.get_instance_health(service_statuses)
def check_endtoend(self):
    """
    Runs the end-to-end check: collects the status of all services for this app and
    returns the result of calculate_overall_health on them, i.e. a dict representing
    the HealthCheck output together with the health check response code.
    """
    return self.calculate_overall_health(check_all_services(self.app))
def get_instance_health(self, service_statuses):
    """
    Computes the instance-level health result for the given service statuses.

    This default implementation applies no special handling: it hands the statuses
    straight to calculate_overall_health and returns that method's result (a dict
    representing the HealthCheck output and the health check response code).
    """
    result = self.calculate_overall_health(service_statuses)
    return result
def calculate_overall_health(self, service_statuses, skip=None, notes=None):
    """
    Combines the given service statuses into a single health result.

    service_statuses: dict mapping each service name to a truthy (healthy) or falsy
                      (unhealthy) value.
    skip: optional collection of service names to leave out of the health decision. A note
          is recorded for every skipped service that appears in service_statuses.
    notes: optional list of notes to include in the output. The list is copied, so the
           caller's list is never modified.

    Returns a tuple of (data, code). data is a dict with the keys 'services' (the statuses
    as given) and 'notes'; code is 200 if every non-skipped service is healthy and 503
    otherwise. With no services to check, the result is healthy.
    """
    # Copy so the skip notes appended below do not leak into the caller's list.
    notes = list(notes) if notes else []

    considered = []
    for service_name in service_statuses:
        if skip and service_name in skip:
            notes.append('%s skipped in compute health' % service_name)
        else:
            considered.append(service_statuses[service_name])

    data = {
        'services': service_statuses,
        'notes': notes
    }

    return (data, 200 if all(considered) else 503)
@classmethod
def get_check(cls, name, parameters):
def get_checker(cls, app):
name = app.config['HEALTH_CHECKER'][0]
parameters = app.config['HEALTH_CHECKER'][1] or {}
for subc in cls.__subclasses__():
if subc.check_name() == name:
return subc(**parameters)
return subc(app, **parameters)
raise Exception('Unknown health check with name %s' % name)
class LocalHealthCheck(HealthCheck):
def __init__(self):
pass
@classmethod
def check_name(cls):
return 'LocalHealthCheck'
def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy):
data = {
'db_healthy': db_healthy,
'buildlogs_healthy': buildlogs_healthy,
'registry_healthy': registry_healthy
}
return (data, db_healthy and buildlogs_healthy)
class ProductionHealthCheck(HealthCheck):
def __init__(self, access_key, secret_key):
def __init__(self, app, access_key, secret_key):
super(ProductionHealthCheck, self).__init__(app)
self.access_key = access_key
self.secret_key = secret_key
@ -50,37 +85,38 @@ class ProductionHealthCheck(HealthCheck):
def check_name(cls):
return 'ProductionHealthCheck'
def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy):
data = {
'db_healthy': db_healthy,
'buildlogs_healthy': buildlogs_healthy,
'registry_healthy': registry_healthy
}
def get_instance_health(self, service_statuses):
# Note: We skip the redis check because if redis is down, we don't want ELB taking the
# machines out of service. Redis is not considered a high availability-required service.
skip = ['redis']
notes = []
# Only report unhealthy if the machine cannot connect to the DB. Redis isn't required for
# mission critical/high availability operations.
# If the database is marked as unhealthy, check the status of RDS directly. If RDS is
# reporting as available, then the problem is with this instance. Otherwise, the problem is
# with RDS, and so we skip the DB status so we can keep this machine as 'healthy'.
db_healthy = service_statuses['database']
if not db_healthy:
# If the database is marked as unhealthy, check the status of RDS directly. If RDS is
# reporting as available, then the problem is with this instance. Otherwise, the problem is
# with RDS, and we can keep this machine as 'healthy'.
is_rds_working = False
try:
region = boto.rds2.connect_to_region('us-east-1',
aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
response = region.describe_db_instances()['DescribeDBInstancesResponse']
result = response['DescribeDBInstancesResult']
instances = result['DBInstances']
status = instances[0]['DBInstanceStatus']
is_rds_working = status == 'available'
except:
logger.exception("Exception while checking RDS status")
pass
rds_status = self._get_rds_status()
notes.append('DB reports unhealthy; RDS status: %s' % rds_status)
data['db_available_checked'] = True
data['db_available_status'] = is_rds_working
# If the RDS is in any state but available, then we skip the DB check since it will
# fail and bring down the instance.
if rds_status != 'available':
skip.append('database')
# If RDS is down, then we still report the machine as healthy, so that it can handle
# requests once RDS comes back up.
return (data, not is_rds_working and registry_healthy)
return self.calculate_overall_health(service_statuses, skip=skip, notes=notes)
return (data, db_healthy and registry_healthy)
def _get_rds_status(self):
    """ Returns the status of the RDS instance as reported by AWS.

        The value is the 'DBInstanceStatus' of the first instance listed by
        describe_db_instances in the 'us-east-1' region, queried with this checker's
        access and secret keys. If the lookup fails, the failure is logged and the
        string 'error' is returned instead of raising.
    """
    try:
        region = boto.rds2.connect_to_region('us-east-1',
                                             aws_access_key_id=self.access_key,
                                             aws_secret_access_key=self.secret_key)
        response = region.describe_db_instances()['DescribeDBInstancesResponse']
        result = response['DescribeDBInstancesResult']
        instances = result['DBInstances']
        return instances[0]['DBInstanceStatus']
    except Exception:
        # Deliberately broad: any failure while querying AWS (connection problems, an
        # unexpected response shape, no instances) is reported as 'error'. It is not a
        # bare except, so KeyboardInterrupt and SystemExit still propagate.
        logger.exception("Exception while checking RDS status")
        return 'error'