quay/health/healthcheck.py

import boto.rds2
import logging
from health.services import check_all_services

logger = logging.getLogger(__name__)

def get_healthchecker(app, config_provider, instance_keys):
  """
  Builds the health checker for the given app by handing all three arguments,
  unchanged, to HealthCheck.get_checker and returning whatever it produces.
  """
  checker = HealthCheck.get_checker(app, config_provider, instance_keys)
  return checker


class HealthCheck(object):
  """
  Base health checker. Holds the app, config provider and instance keys, and turns a
  mapping of service name -> status into a (data, HTTP status code) pair.
  """
  def __init__(self, app, config_provider, instance_keys, instance_skips=None):
    self.app = app
    self.config_provider = config_provider
    self.instance_keys = instance_keys
    # Falsy values (None, empty list) are normalized to a fresh empty list.
    self.instance_skips = instance_skips if instance_skips else []

  def check_instance(self):
    """
    Runs check_all_services with this checker's instance_skips and feeds the resulting
    statuses to get_instance_health. Returns that method's (data, status code) tuple.
    """
    statuses = check_all_services(self.app, self.instance_skips)
    return self.get_instance_health(statuses)

  def check_endtoend(self):
    """
    Runs check_all_services with an empty skip list and feeds the resulting statuses
    straight to calculate_overall_health. Returns its (data, status code) tuple.
    """
    statuses = check_all_services(self.app, [])
    return self.calculate_overall_health(statuses)

  def get_instance_health(self, service_statuses):
    """
    Instance-level hook which subclasses may override. The base implementation applies
    no skips or notes and simply defers to calculate_overall_health.
    """
    return self.calculate_overall_health(service_statuses)

  def calculate_overall_health(self, service_statuses, skip=None, notes=None):
    """
    Returns a tuple of (data, status code). The status code is 200 when every service
    not named in `skip` has a truthy status, and 503 otherwise. `data` is a dict holding
    the service statuses, the notes (one is appended for each skipped service), the
    app's TESTING config value, the config provider's ID and the local service key ID.
    """
    notes = notes if notes else []

    # Split off the skipped services first, recording a note for each one.
    considered = []
    for name in service_statuses:
      if skip and name in skip:
        notes.append('%s skipped in compute health' % name)
      else:
        considered.append(name)

    # Healthy only if every remaining status is truthy; stops at the first falsy one.
    all_healthy = all(service_statuses[name] for name in considered)
    status_code = 200 if all_healthy else 503

    payload = {
      'services': service_statuses,
      'notes': notes,
      'is_testing': self.app.config['TESTING'],
      'config_provider': self.config_provider.provider_id,
      'local_service_key_id': self.instance_keys.local_key_id,
    }

    return (payload, status_code)


  @classmethod
  def get_checker(cls, app, config_provider, instance_keys):
    """
    Instantiates the checker selected by app.config['HEALTH_CHECKER'], which is read as
    a (name, parameters) pair. The first direct subclass whose check_names() contains
    the name is constructed with the parameters as keyword arguments (a falsy parameters
    value means no extra arguments). Raises Exception if no subclass matches.
    """
    checker_name = app.config['HEALTH_CHECKER'][0]
    checker_kwargs = app.config['HEALTH_CHECKER'][1] or {}

    # Lazily scan the direct subclasses, stopping at the first one that claims the name.
    chosen = next((candidate for candidate in cls.__subclasses__()
                   if checker_name in candidate.check_names()), None)
    if chosen is None:
      raise Exception('Unknown health check with name %s' % checker_name)

    return chosen(app, config_provider, instance_keys, **checker_kwargs)


class LocalHealthCheck(HealthCheck):
  """
  Health checker registered under the name 'LocalHealthCheck'. It hands 'redis' and
  'storage' to the base class as the instance-level skip list.
  """
  def __init__(self, app, config_provider, instance_keys):
    skipped_services = ['redis', 'storage']
    super(LocalHealthCheck, self).__init__(app, config_provider, instance_keys,
                                           skipped_services)

  @classmethod
  def check_names(cls):
    """ Returns the list of configuration names which select this checker. """
    names = ['LocalHealthCheck']
    return names


class RDSAwareHealthCheck(HealthCheck):
  """
  Health checker which, when the database is reported as unhealthy, asks AWS for the
  status of the configured RDS instance. If RDS itself is not 'available', the database
  status is left out of the instance health so the instance is not marked unhealthy
  for a database-side problem.
  """
  def __init__(self, app, config_provider, instance_keys, access_key, secret_key,
               db_instance='quay', region='us-east-1'):
    # 'redis' and 'storage' are handed to the base class as the instance-level skip list.
    super(RDSAwareHealthCheck, self).__init__(app, config_provider, instance_keys,
                                              ['redis', 'storage'])

    self.access_key = access_key
    self.secret_key = secret_key
    self.db_instance = db_instance
    self.region = region

  @classmethod
  def check_names(cls):
    """ Returns the list of configuration names which select this checker. """
    return ['RDSAwareHealthCheck', 'ProductionHealthCheck']

  def get_instance_health(self, service_statuses):
    """
    Returns the (data, status code) tuple for this instance. The 'database' status is
    excluded from the calculation when the database is unhealthy and RDS reports any
    status other than 'available' (including the 'error' value returned on lookup failure).
    """
    # Note: Redis is not listed here; it is already part of the instance skip list given
    # to the base constructor, because if redis is down, we don't want ELB taking the
    # machines out of service. Redis is not considered a high availability-required service.
    skip = []
    notes = []

    # If the database is marked as unhealthy, check the status of RDS directly. If RDS is
    # reporting as available, then the problem is with this instance. Otherwise, the problem is
    # with RDS, and so we skip the DB status so we can keep this machine as 'healthy'.
    db_healthy = service_statuses['database']
    if not db_healthy:
      rds_status = self._get_rds_status()
      notes.append('DB reports unhealthy; RDS status: %s' % rds_status)

      # If the RDS is in any state but available, then we skip the DB check since it will
      # fail and bring down the instance.
      if rds_status != 'available':
        skip.append('database')

    return self.calculate_overall_health(service_statuses, skip=skip, notes=notes)


  def _get_rds_status(self):
    """
    Returns the status of the RDS instance as reported by AWS, or 'error' if no instance
    with the configured identifier is found or the lookup fails for any reason.
    """
    try:
      connection = boto.rds2.connect_to_region(self.region, aws_access_key_id=self.access_key,
                                               aws_secret_access_key=self.secret_key)

      response = connection.describe_db_instances()['DescribeDBInstancesResponse']
      result = response['DescribeDBInstancesResult']
      instances = [i for i in result['DBInstances'] if i['DBInstanceIdentifier'] == self.db_instance]
      if not instances:
        return 'error'

      return instances[0]['DBInstanceStatus']
    except Exception:
      # Deliberately broad: any failure talking to AWS or parsing its response is reported
      # as 'error'. Unlike a bare except, this lets SystemExit, KeyboardInterrupt and
      # GeneratorExit propagate instead of being swallowed by the health check.
      logger.exception("Exception while checking RDS status")
      return 'error'
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00			`import boto.rds2`
			`import logging`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`from health.services import check_all_services`
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00
			`logger = logging.getLogger(__name__)`

Add instance key ID to the health check endpoint Fixes #1429 2016-07-05 18:14:22 +00:00			`def get_healthchecker(app, config_provider, instance_keys):`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`""" Returns a HealthCheck instance for the given app. """`
Add instance key ID to the health check endpoint Fixes #1429 2016-07-05 18:14:22 +00:00			`return HealthCheck.get_checker(app, config_provider, instance_keys)`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00

Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00			`class HealthCheck(object):`
Add instance key ID to the health check endpoint Fixes #1429 2016-07-05 18:14:22 +00:00			`def __init__(self, app, config_provider, instance_keys, instance_skips=None):`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`self.app = app`
Add Kubernetes configuration provider which writes config to a secret Fixes #145 2015-07-27 15:17:44 +00:00			`self.config_provider = config_provider`
Add instance key ID to the health check endpoint Fixes #1429 2016-07-05 18:14:22 +00:00			`self.instance_keys = instance_keys`
I hate Redis! - Remove redis check from our health endpoint in prod entirely - Have the redis check have a maximum timeout of 1 second 2015-10-22 17:24:56 +00:00			`self.instance_skips = instance_skips or []`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00
			`def check_instance(self):`
			`"""`
			`Conducts a check on this specific instance, returning a dict representing the HealthCheck`
			`output and a number indicating the health check response code.`
			`"""`
Add missing arg 2015-10-22 19:57:34 +00:00			`service_statuses = check_all_services(self.app, self.instance_skips)`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`return self.get_instance_health(service_statuses)`

			`def check_endtoend(self):`
			`"""`
			`Conducts a check on all services, returning a dict representing the HealthCheck`
			`output and a number indicating the health check response code.`
			`"""`
Add missing arg 2015-10-22 19:57:34 +00:00			`service_statuses = check_all_services(self.app, [])`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`return self.calculate_overall_health(service_statuses)`
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`def get_instance_health(self, service_statuses):`
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00			`"""`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`For the given service statuses, returns a dict representing the HealthCheck`
			`output and a number indicating the health check response code. By default,`
			`this simply ensures that all services are reporting as healthy.`
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00			`"""`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`return self.calculate_overall_health(service_statuses)`

			`def calculate_overall_health(self, service_statuses, skip=None, notes=None):`
			`""" Returns true if and only if all the given service statuses report as healthy. """`
			`is_healthy = True`
			`notes = notes or []`

			`for service_name in service_statuses:`
			`if skip and service_name in skip:`
			`notes.append('%s skipped in compute health' % service_name)`
			`continue`

			`is_healthy = is_healthy and service_statuses[service_name]`

			`data = {`
			`'services': service_statuses,`
Add is_testing info and mirror the moved endpoints so we can migrate safely. 2015-01-20 21:58:29 +00:00			`'notes': notes,`
Add Kubernetes configuration provider which writes config to a secret Fixes #145 2015-07-27 15:17:44 +00:00			`'is_testing': self.app.config['TESTING'],`
Add instance key ID to the health check endpoint Fixes #1429 2016-07-05 18:14:22 +00:00			`'config_provider': self.config_provider.provider_id,`
			`'local_service_key_id': self.instance_keys.local_key_id,`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`}`

			`return (data, 200 if is_healthy else 503)`

Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00
			`@classmethod`
Add instance key ID to the health check endpoint Fixes #1429 2016-07-05 18:14:22 +00:00			`def get_checker(cls, app, config_provider, instance_keys):`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`name = app.config['HEALTH_CHECKER'][0]`
			`parameters = app.config['HEALTH_CHECKER'][1] or {}`

Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00			`for subc in cls.__subclasses__():`
Add `RDSAwareHealthCheck` as alias for `ProductionHealthCheck` 2016-03-25 19:25:42 +00:00			`if name in subc.check_names():`
Add instance key ID to the health check endpoint Fixes #1429 2016-07-05 18:14:22 +00:00			`return subc(app, config_provider, instance_keys, **parameters)`
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00
			`raise Exception('Unknown health check with name %s' % name)`


			`class LocalHealthCheck(HealthCheck):`
Add storage validation to the status endpoint Fixes #1659 2016-08-01 17:02:15 +00:00			`def __init__(self, app, config_provider, instance_keys):`
			`super(LocalHealthCheck, self).__init__(app, config_provider, instance_keys,`
			`['redis', 'storage'])`

Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00			`@classmethod`
Add `RDSAwareHealthCheck` as alias for `ProductionHealthCheck` 2016-03-25 19:25:42 +00:00			`def check_names(cls):`
			`return ['LocalHealthCheck']`
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00

Add `RDSAwareHealthCheck` as alias for `ProductionHealthCheck` 2016-03-25 19:25:42 +00:00			`class RDSAwareHealthCheck(HealthCheck):`
Add missing constructor argument 2016-07-06 20:17:02 +00:00			`def __init__(self, app, config_provider, instance_keys, access_key, secret_key,`
			`db_instance='quay', region='us-east-1'):`
Add storage validation to the status endpoint Fixes #1659 2016-08-01 17:02:15 +00:00			`super(RDSAwareHealthCheck, self).__init__(app, config_provider, instance_keys,`
			`['redis', 'storage'])`

Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00			`self.access_key = access_key`
			`self.secret_key = secret_key`
Fix the DB health check Make sure to search for the proper DB identifier 2015-05-20 21:40:43 +00:00			`self.db_instance = db_instance`
Add `RDSAwareHealthCheck` as alias for `ProductionHealthCheck` 2016-03-25 19:25:42 +00:00			`self.region = region`
Strip whitespace from ALL the things. 2014-11-24 21:07:38 +00:00
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00			`@classmethod`
Add `RDSAwareHealthCheck` as alias for `ProductionHealthCheck` 2016-03-25 19:25:42 +00:00			`def check_names(cls):`
			`return ['RDSAwareHealthCheck', 'ProductionHealthCheck']`
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`def get_instance_health(self, service_statuses):`
			`# Note: We skip the redis check because if redis is down, we don't want ELB taking the`
			`# machines out of service. Redis is not considered a high avaliability-required service.`
I hate Redis! - Remove redis check from our health endpoint in prod entirely - Have the redis check have a maximum timeout of 1 second 2015-10-22 17:24:56 +00:00			`skip = []`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`notes = []`
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`# If the database is marked as unhealthy, check the status of RDS directly. If RDS is`
			`# reporting as available, then the problem is with this instance. Otherwise, the problem is`
			`# with RDS, and so we skip the DB status so we can keep this machine as 'healthy'.`
			`db_healthy = service_statuses['database']`
Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues 2014-11-02 20:06:17 +00:00			`if not db_healthy:`
Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`rds_status = self._get_rds_status()`
			`notes.append('DB reports unhealthy; RDS status: %s' % rds_status)`

			`# If the RDS is in any state but available, then we skip the DB check since it will`
			`# fail and bring down the instance.`
			`if rds_status != 'available':`
			`skip.append('database')`

			`return self.calculate_overall_health(service_statuses, skip=skip, notes=notes)`


			`def _get_rds_status(self):`
			`""" Returns the status of the RDS instance as reported by AWS. """`
			`try:`
Add `RDSAwareHealthCheck` as alias for `ProductionHealthCheck` 2016-03-25 19:25:42 +00:00			`region = boto.rds2.connect_to_region(self.region, aws_access_key_id=self.access_key,`
			`aws_secret_access_key=self.secret_key)`

Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`response = region.describe_db_instances()['DescribeDBInstancesResponse']`
			`result = response['DescribeDBInstancesResult']`
Fix the DB health check Make sure to search for the proper DB identifier 2015-05-20 21:40:43 +00:00			`instances = [i for i in result['DBInstances'] if i['DBInstanceIdentifier'] == self.db_instance]`
			`if not instances:`
			`return 'error'`

Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend. 2015-01-20 21:53:05 +00:00			`status = instances[0]['DBInstanceStatus']`
			`return status`
			`except:`
			`logger.exception("Exception while checking RDS status")`
			`return 'error'`