From 98602a2d0cb7736329f4a29c2ae647f4dc5d5d4d Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Sun, 2 Nov 2014 15:06:17 -0500 Subject: [PATCH] Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues --- config.py | 3 ++ endpoints/web.py | 15 ++++++++ health/__init__.py | 0 health/healthcheck.py | 84 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+) create mode 100644 health/__init__.py create mode 100644 health/healthcheck.py diff --git a/config.py b/config.py index 221f808dd..72f262415 100644 --- a/config.py +++ b/config.py @@ -179,6 +179,9 @@ class DefaultConfig(object): DISTRIBUTED_STORAGE_PREFERENCE = ['local_us'] + # Health checker. + HEALTH_CHECKER = ('LocalHealthCheck', {}) + # Userfiles USERFILES_LOCATION = 'local_us' USERFILES_PATH = 'userfiles/' diff --git a/endpoints/web.py b/endpoints/web.py index e7ce571fa..b355f5ec9 100644 --- a/endpoints/web.py +++ b/endpoints/web.py @@ -5,6 +5,7 @@ from flask import (abort, redirect, request, url_for, make_response, Response, Blueprint, send_from_directory, jsonify) from flask.ext.login import current_user from urlparse import urlparse +from health.healthcheck import HealthCheck from data import model from data.model.oauth import DatabaseAuthorizationProvider @@ -151,6 +152,20 @@ def v1(): return index('') +@web.route('/health', methods=['GET']) +@no_cache +def health(): + db_healthy = model.check_health() + buildlogs_healthy = build_logs.check_health() + + check = HealthCheck.get_check(app.config['HEALTH_CHECKER'][0], app.config['HEALTH_CHECKER'][1]) + (data, is_healthy) = check.conduct_healthcheck(db_healthy, buildlogs_healthy) + + response = jsonify(dict(data = data, is_healthy = is_healthy)) + response.status_code = 200 if is_healthy else 503 + return response + + @web.route('/status', methods=['GET']) @no_cache def status(): diff --git a/health/__init__.py b/health/__init__.py new file mode 100644 index 
000000000..e69de29bb diff --git a/health/healthcheck.py b/health/healthcheck.py new file mode 100644 index 000000000..be532a2fd --- /dev/null +++ b/health/healthcheck.py @@ -0,0 +1,84 @@ +import boto.rds2 +import logging + +logger = logging.getLogger(__name__) + +class HealthCheck(object): + def __init__(self): + pass + + def conduct_healthcheck(self, db_healthy, buildlogs_healthy): + """ + Conducts any custom healthcheck work, returning a dict representing the HealthCheck + output and a boolean indicating whether the instance is healthy. + """ + raise NotImplementedError + + @classmethod + def get_check(cls, name, parameters): + for subc in cls.__subclasses__(): + if subc.check_name() == name: + return subc(**parameters) + + raise Exception('Unknown health check with name %s' % name) + + +class LocalHealthCheck(HealthCheck): + def __init__(self): + pass + + @classmethod + def check_name(cls): + return 'LocalHealthCheck' + + def conduct_healthcheck(self, db_healthy, buildlogs_healthy): + data = { + 'db_healthy': db_healthy, + 'buildlogs_healthy': buildlogs_healthy + } + + return (data, db_healthy and buildlogs_healthy) + + +class ProductionHealthCheck(HealthCheck): + def __init__(self, access_key, secret_key): + self.access_key = access_key + self.secret_key = secret_key + + @classmethod + def check_name(cls): + return 'ProductionHealthCheck' + + def conduct_healthcheck(self, db_healthy, buildlogs_healthy): + """ Returns (data, is_healthy); a failing DB only marks this instance unhealthy if RDS is up. """ + data = { + 'db_healthy': db_healthy, + 'buildlogs_healthy': buildlogs_healthy + } + + # Only report unhealthy if the machine cannot connect to the DB. Redis isn't required for + # mission critical/high availability operations. + if not db_healthy: + # If the database is marked as unhealthy, check the status of RDS directly. If RDS is + # reporting as available, then the problem is with this instance. Otherwise, the problem is + # with RDS, and we can keep this machine as 'healthy'. 
+ is_rds_working = False + try: + region = boto.rds2.connect_to_region('us-east-1', + aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key) + response = region.describe_db_instances()['DescribeDBInstancesResponse'] + result = response['DescribeDBInstancesResult'] + instances = result['DBInstances'] + status = instances[0]['DBInstanceStatus'] + is_rds_working = status == 'available' + except Exception: + # Best-effort probe: if RDS cannot be queried, is_rds_working stays False (instance kept up). + logger.exception("Exception while checking RDS status") + + data['db_available_checked'] = True + data['db_available_status'] = is_rds_working + + # DB unreachable while RDS is available: this instance is at fault. Otherwise stay healthy. + return (data, not is_rds_working) + + return (data, db_healthy) \ No newline at end of file