127 lines
		
	
	
	
		
			4.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			127 lines
		
	
	
	
		
			4.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import boto.rds2
 | |
| import logging
 | |
| from health.services import check_all_services
 | |
| 
 | |
| logger = logging.getLogger(__name__)
 | |
| 
 | |
def get_healthchecker(app):
  """
  Looks up the health checker configured for the given app by delegating to
  HealthCheck.get_checker, and hands back the resulting instance.
  """
  checker = HealthCheck.get_checker(app)
  return checker
 | |
| 
 | |
| 
 | |
class HealthCheck(object):
  """
  Base health checker. Gathers the per-service statuses for an app and folds them into a
  (data, status_code) pair. Concrete checkers subclass this, expose a `check_name()`
  classmethod, and are selected by name through `get_checker`.
  """

  def __init__(self, app):
    # The app is expected to expose a dict-like `config` (read in calculate_overall_health
    # and get_checker).
    self.app = app

  def check_instance(self):
    """
    Conducts a check on this specific instance, returning a dict representing the HealthCheck
    output and a number indicating the health check response code.
    """
    service_statuses = check_all_services(self.app)
    return self.get_instance_health(service_statuses)

  def check_endtoend(self):
    """
    Conducts a check on all services, returning a dict representing the HealthCheck
    output and a number indicating the health check response code.
    """
    service_statuses = check_all_services(self.app)
    return self.calculate_overall_health(service_statuses)

  def get_instance_health(self, service_statuses):
    """
    For the given service statuses, returns a dict representing the HealthCheck
    output and a number indicating the health check response code. By default,
    this simply ensures that all services are reporting as healthy; subclasses may
    override it to skip selected services.
    """
    return self.calculate_overall_health(service_statuses)

  def calculate_overall_health(self, service_statuses, skip=None, notes=None):
    """
    Folds the given service statuses into a single health result.

    service_statuses: mapping of service name -> truthy (healthy) / falsy (unhealthy).
    skip: optional collection of service names to leave out of the computation; each
          skipped service that is present in service_statuses adds a note.
    notes: optional list of notes to include in the output.

    Returns a tuple (data, status_code), where data is a dict with the keys 'services',
    'notes' and 'is_testing', and status_code is 200 if and only if every non-skipped
    service reports as healthy, otherwise 503.
    """
    notes = notes or []
    is_healthy = True

    for service_name in service_statuses:
      if skip and service_name in skip:
        notes.append('%s skipped in compute health' % service_name)
        continue

      if not service_statuses[service_name]:
        is_healthy = False

    data = {
      'services': service_statuses,
      'notes': notes,
      'is_testing': self.app.config['TESTING']
    }

    return (data, 200 if is_healthy else 503)

  @classmethod
  def get_checker(cls, app):
    """
    Instantiates the checker named by app.config['HEALTH_CHECKER'], which is read as a
    (name, parameters) pair; parameters may be None and is otherwise passed to the
    checker's constructor as keyword arguments.

    Subclasses are searched level by level (direct subclasses first, in definition order),
    so checkers that derive from other checkers can be selected as well. Subclasses that
    do not define `check_name` are not selectable themselves, but their own subclasses
    are still searched.

    Raises Exception if no subclass reports the configured name.
    """
    name = app.config['HEALTH_CHECKER'][0]
    parameters = app.config['HEALTH_CHECKER'][1] or {}

    level = cls.__subclasses__()
    while level:
      next_level = []
      for subc in level:
        check_name = getattr(subc, 'check_name', None)
        if check_name is not None and check_name() == name:
          return subc(app, **parameters)

        next_level.extend(subc.__subclasses__())

      level = next_level

    raise Exception('Unknown health check with name %s' % name)
 | |
| 
 | |
| 
 | |
class LocalHealthCheck(HealthCheck):
  """ Checker selected by the name 'LocalHealthCheck'; inherits all health logic unchanged. """

  @classmethod
  def check_name(cls):
    """ Name under which this checker is looked up. """
    registered_name = 'LocalHealthCheck'
    return registered_name
 | |
| 
 | |
| 
 | |
class ProductionHealthCheck(HealthCheck):
  """
  Checker selected by the name 'ProductionHealthCheck'. For the per-instance check it
  ignores redis and, when the database reports unhealthy, consults AWS RDS to decide
  whether the database status should count against this instance.
  """

  def __init__(self, app, access_key, secret_key, db_instance='quay', region='us-east-1'):
    """
    access_key / secret_key: AWS credentials used to query RDS.
    db_instance: identifier of the RDS instance whose status is looked up.
    region: AWS region name to connect to when querying RDS.
    """
    super(ProductionHealthCheck, self).__init__(app)
    self.access_key = access_key
    self.secret_key = secret_key
    self.db_instance = db_instance
    self.region = region

  @classmethod
  def check_name(cls):
    """ Name under which this checker is looked up. """
    return 'ProductionHealthCheck'

  def get_instance_health(self, service_statuses):
    """
    Computes the health of this instance from the given service statuses, returning the
    (data, status_code) pair produced by calculate_overall_health.

    Expects service_statuses to contain a 'database' entry.
    """
    # Note: We skip the redis check because if redis is down, we don't want ELB taking the
    # machines out of service. Redis is not considered a high availability-required service.
    skip = ['redis']
    notes = []

    # If the database is marked as unhealthy, check the status of RDS directly. If RDS is
    # reporting as available, then the problem is with this instance. Otherwise, the problem is
    # with RDS, and so we skip the DB status so we can keep this machine as 'healthy'.
    db_healthy = service_statuses['database']
    if not db_healthy:
      rds_status = self._get_rds_status()
      notes.append('DB reports unhealthy; RDS status: %s' % rds_status)

      # If the RDS is in any state but available, then we skip the DB check since it will
      # fail and bring down the instance. Note that this includes the 'error' status returned
      # when the RDS lookup itself fails.
      if rds_status != 'available':
        skip.append('database')

    return self.calculate_overall_health(service_statuses, skip=skip, notes=notes)

  def _get_rds_status(self):
    """
    Returns the status of the RDS instance as reported by AWS, or 'error' if the region is
    unknown, the instance is not found, or the lookup fails for any other reason.
    """
    try:
      connection = boto.rds2.connect_to_region(self.region,
        aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
      if connection is None:
        # connect_to_region yields None rather than raising for an unrecognized region name.
        logger.error("Could not connect to RDS region %s", self.region)
        return 'error'

      response = connection.describe_db_instances()['DescribeDBInstancesResponse']
      result = response['DescribeDBInstancesResult']
      instances = [i for i in result['DBInstances']
                   if i['DBInstanceIdentifier'] == self.db_instance]
      if not instances:
        return 'error'

      return instances[0]['DBInstanceStatus']
    except Exception:
      # Best-effort lookup: any failure is reported as 'error', but interpreter-level
      # signals such as KeyboardInterrupt and SystemExit are allowed to propagate.
      logger.exception("Exception while checking RDS status")
      return 'error'
 |