import logging

import boto.rds2

from health.services import check_all_services

logger = logging.getLogger(__name__)


def get_healthchecker(app):
    """ Returns a HealthCheck instance for the given app. """
    return HealthCheck.get_checker(app)


class HealthCheck(object):
    """
    Base health checker. Concrete checkers subclass this, expose a `check_name()` classmethod
    and are selected by name through `get_checker`.
    """

    def __init__(self, app):
        self.app = app

    def check_instance(self):
        """
        Conducts a check on this specific instance, returning a dict representing the HealthCheck
        output and a number indicating the health check response code.
        """
        service_statuses = check_all_services(self.app)
        return self.get_instance_health(service_statuses)

    def check_endtoend(self):
        """
        Conducts a check on all services, returning a dict representing the HealthCheck
        output and a number indicating the health check response code.
        """
        service_statuses = check_all_services(self.app)
        return self.calculate_overall_health(service_statuses)

    def get_instance_health(self, service_statuses):
        """
        For the given service statuses, returns a dict representing the HealthCheck
        output and a number indicating the health check response code. By default,
        this simply ensures that all services are reporting as healthy.
        """
        return self.calculate_overall_health(service_statuses)

    def calculate_overall_health(self, service_statuses, skip=None, notes=None):
        """
        Combines the given service statuses into an overall health result.

        service_statuses: mapping of service name -> status, where a truthy status means healthy.
        skip: optional collection of service names to leave out of the overall computation.
        notes: optional list of note strings to include in the output. The list is copied, so
               the caller's list is never modified.

        Returns a tuple of (data, status_code). `data` is a dict holding the raw service statuses,
        the notes (including one note per skipped service) and the app's TESTING flag.
        `status_code` is 200 if and only if every non-skipped service reports as healthy, and
        503 otherwise.
        """
        skip = skip or ()

        # Work on a copy so that notes appended below never leak into the caller's list.
        notes = list(notes) if notes else []

        is_healthy = True
        for service_name in service_statuses:
            if service_name in skip:
                notes.append('%s skipped in compute health' % service_name)
                continue

            if not service_statuses[service_name]:
                is_healthy = False

        data = {
            'services': service_statuses,
            'notes': notes,
            'is_testing': self.app.config['TESTING'],
        }

        return (data, 200 if is_healthy else 503)

    @classmethod
    def get_checker(cls, app):
        """
        Instantiates the checker named by app.config['HEALTH_CHECKER'], which holds the checker
        name at index 0 and its keyword parameters (or a falsy value for none) at index 1.

        Only direct subclasses of this class are considered. Raises an Exception if no subclass
        reports the configured name.
        """
        checker_config = app.config['HEALTH_CHECKER']
        name = checker_config[0]
        parameters = checker_config[1] or {}

        for subc in cls.__subclasses__():
            if subc.check_name() == name:
                return subc(app, **parameters)

        raise Exception('Unknown health check with name %s' % name)


class LocalHealthCheck(HealthCheck):
    """ Health checker that uses the default behavior: all services must report as healthy. """

    @classmethod
    def check_name(cls):
        return 'LocalHealthCheck'


class ProductionHealthCheck(HealthCheck):
    """
    Health checker that ignores redis for instance health and consults RDS directly when the
    database reports as unhealthy.
    """

    def __init__(self, app, access_key, secret_key, db_instance='quay', region='us-east-1'):
        """
        access_key / secret_key: AWS credentials used to query RDS.
        db_instance: identifier of the RDS instance whose status is checked.
        region: AWS region in which to look up the RDS instance.
        """
        super(ProductionHealthCheck, self).__init__(app)
        self.access_key = access_key
        self.secret_key = secret_key
        self.db_instance = db_instance
        self.region = region

    @classmethod
    def check_name(cls):
        return 'ProductionHealthCheck'

    def get_instance_health(self, service_statuses):
        """
        Computes the health of this instance, returning the (data, status_code) tuple produced by
        calculate_overall_health. Redis is always skipped, and the database is skipped when RDS
        itself is not reporting as available.
        """
        # Note: We skip the redis check because if redis is down, we don't want ELB taking the
        # machines out of service. Redis is not considered a high availability-required service.
        skip = ['redis']
        notes = []

        # If the database is marked as unhealthy, check the status of RDS directly. If RDS is
        # reporting as available, then the problem is with this instance. Otherwise, the problem is
        # with RDS, and so we skip the DB status so we can keep this machine as 'healthy'.
        db_healthy = service_statuses['database']
        if not db_healthy:
            rds_status = self._get_rds_status()
            notes.append('DB reports unhealthy; RDS status: %s' % rds_status)

            # If the RDS is in any state but available, then we skip the DB check since it will
            # fail and bring down the instance.
            if rds_status != 'available':
                skip.append('database')

        return self.calculate_overall_health(service_statuses, skip=skip, notes=notes)

    def _get_rds_status(self):
        """
        Returns the status of the RDS instance as reported by AWS, or 'error' if the instance
        could not be found or the lookup failed for any reason.
        """
        try:
            connection = boto.rds2.connect_to_region(self.region,
                                                     aws_access_key_id=self.access_key,
                                                     aws_secret_access_key=self.secret_key)
            response = connection.describe_db_instances()['DescribeDBInstancesResponse']
            result = response['DescribeDBInstancesResult']
            instances = [i for i in result['DBInstances']
                         if i['DBInstanceIdentifier'] == self.db_instance]
            if not instances:
                return 'error'

            return instances[0]['DBInstanceStatus']
        except Exception:
            # Best-effort lookup: any failure talking to AWS is logged and reported as 'error'.
            # Exception (rather than a bare except) lets KeyboardInterrupt/SystemExit propagate.
            logger.exception("Exception while checking RDS status")
            return 'error'