Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend.

2015-01-20 16:53:05 -05:00 · 2015-01-20 16:53:05 -05:00 · b74b7de197
commit b74b7de197
parent 92d32bc636
3 changed files with 146 additions and 88 deletions
--- a/health/healthcheck.py
+++ b/health/healthcheck.py
@@ -1,48 +1,83 @@
 import boto.rds2
 import logging
+from health.services import check_all_services

 logger = logging.getLogger(__name__)

-class HealthCheck(object):
-  def __init__(self):
-    pass
+def get_healthchecker(app):
+  """ Returns a HealthCheck instance for the given app. """
+  return HealthCheck.get_checker(app)

-  def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy):
+
+class HealthCheck(object):
+  def __init__(self, app):
+    self.app = app
+
+  def check_instance(self):
    """
-    Conducts any custom healthcheck work, returning a dict representing the HealthCheck
-    output and a boolean indicating whether the instance is healthy.
+    Conducts a check on this specific instance, returning a dict representing the HealthCheck
+    output and a number indicating the health check response code.
    """
-    raise NotImplementedError
+    service_statuses = check_all_services(self.app)
+    return self.get_instance_health(service_statuses)
+
+  def check_endtoend(self):
+    """
+    Conducts a check on all services, returning a dict representing the HealthCheck
+    output and a number indicating the health check response code.
+    """
+    service_statuses = check_all_services(self.app)
+    return self.calculate_overall_health(service_statuses)
+
+  def get_instance_health(self, service_statuses):
+    """
+    For the given service statuses, returns a dict representing the HealthCheck
+    output and a number indicating the health check response code. By default,
+    this simply ensures that all services are reporting as healthy.
+    """
+    return self.calculate_overall_health(service_statuses)
+
+  def calculate_overall_health(self, service_statuses, skip=None, notes=None):
+    """ Returns a dict holding the service statuses and notes, plus a response code: 200 if and only if every non-skipped service reports as healthy, otherwise 503. """
+    is_healthy = True
+    notes = notes or []
+
+    for service_name in service_statuses:
+      if skip and service_name in skip:
+        notes.append('%s skipped in compute health' % service_name)
+        continue
+
+      is_healthy = is_healthy and service_statuses[service_name]
+
+    data = {
+      'services': service_statuses,
+      'notes': notes
+    }
+
+    return (data, 200 if is_healthy else 503)
+

  @classmethod
-  def get_check(cls, name, parameters):
+  def get_checker(cls, app):
+    name = app.config['HEALTH_CHECKER'][0]
+    parameters = app.config['HEALTH_CHECKER'][1] or {}
+
    for subc in cls.__subclasses__():
      if subc.check_name() == name:
-        return subc(**parameters)
+        return subc(app, **parameters)

    raise Exception('Unknown health check with name %s' % name)


 class LocalHealthCheck(HealthCheck):
-  def __init__(self):
-    pass
-
  @classmethod
  def check_name(cls):
    return 'LocalHealthCheck'

-  def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy):
-    data = {
-      'db_healthy': db_healthy,
-      'buildlogs_healthy': buildlogs_healthy,
-      'registry_healthy': registry_healthy
-    }
-
-    return (data, db_healthy and buildlogs_healthy)
-

 class ProductionHealthCheck(HealthCheck):
-  def __init__(self, access_key, secret_key):
+  def __init__(self, app, access_key, secret_key):
+    super(ProductionHealthCheck, self).__init__(app)
    self.access_key = access_key
    self.secret_key = secret_key

@@ -50,37 +85,38 @@ class ProductionHealthCheck(HealthCheck):
  def check_name(cls):
    return 'ProductionHealthCheck'

-  def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy):
-    data = {
-      'db_healthy': db_healthy,
-      'buildlogs_healthy': buildlogs_healthy,
-      'registry_healthy': registry_healthy
-    }
+  def get_instance_health(self, service_statuses):
+    # Note: We skip the redis check because if redis is down, we don't want ELB taking the
+    # machines out of service. Redis is not considered a high availability-required service.
+    skip = ['redis']
+    notes = []

-    # Only report unhealthy if the machine cannot connect to the DB. Redis isn't required for
-    # mission critical/high avaliability operations.
+    # If the database is marked as unhealthy, check the status of RDS directly. If RDS is
+    # reporting as available, then the problem is with this instance. Otherwise, the problem is
+    # with RDS, and so we skip the DB status so we can keep this machine as 'healthy'.
+    db_healthy = service_statuses['database']
    if not db_healthy:
-      # If the database is marked as unhealthy, check the status of RDS directly. If RDS is
-      # reporting as available, then the problem is with this instance. Otherwise, the problem is
-      # with RDS, and we can keep this machine as 'healthy'.
-      is_rds_working = False
-      try:
-        region = boto.rds2.connect_to_region('us-east-1',
-          aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
-        response = region.describe_db_instances()['DescribeDBInstancesResponse']
-        result = response['DescribeDBInstancesResult']
-        instances = result['DBInstances']
-        status = instances[0]['DBInstanceStatus']
-        is_rds_working = status == 'available'
-      except:
-        logger.exception("Exception while checking RDS status")
-        pass
+      rds_status = self._get_rds_status()
+      notes.append('DB reports unhealthy; RDS status: %s' % rds_status)

-      data['db_available_checked'] = True
-      data['db_available_status'] = is_rds_working
+      # If the RDS is in any state but available, then we skip the DB check since it will
+      # fail and bring down the instance.
+      if rds_status != 'available':
+        skip.append('database')

-      # If RDS is down, then we still report the machine as healthy, so that it can handle
-      # requests once RDS comes back up.
-      return (data, not is_rds_working and registry_healthy)
+    return self.calculate_overall_health(service_statuses, skip=skip, notes=notes)

-    return (data, db_healthy and registry_healthy)
+
+  def _get_rds_status(self):
+    """ Returns the status of the RDS instance as reported by AWS. """
+    try:
+      region = boto.rds2.connect_to_region('us-east-1',
+        aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
+      response = region.describe_db_instances()['DescribeDBInstancesResponse']
+      result = response['DescribeDBInstancesResult']
+      instances = result['DBInstances']
+      status = instances[0]['DBInstanceStatus']
+      return status
+    except:
+      logger.exception("Exception while checking RDS status")
+      return 'error'