diff --git a/endpoints/web.py b/endpoints/web.py index b91b3745e..55d5cd0a7 100644 --- a/endpoints/web.py +++ b/endpoints/web.py @@ -6,7 +6,7 @@ from flask import (abort, redirect, request, url_for, make_response, Response, from avatar_generator import Avatar from flask.ext.login import current_user from urlparse import urlparse -from health.healthcheck import HealthCheck +from health.healthcheck import get_healthchecker from data import model from data.model.oauth import DatabaseAuthorizationProvider @@ -156,47 +156,23 @@ def v1(): return index('') -@web.route('/health', methods=['GET']) +@web.route('/health/instance', methods=['GET']) @no_cache -def health(): - client = app.config['HTTPCLIENT'] - - db_healthy = model.check_health(app.config) - buildlogs_healthy = build_logs.check_health() - - hostname_parts = app.config['SERVER_HOSTNAME'].split(':') - port = '' - if len(hostname_parts) == 2: - port = ':' + hostname_parts[1] - - registry_url = '%s://localhost%s/v1/_internal_ping' % (app.config['PREFERRED_URL_SCHEME'], port) - registry_healthy = False - try: - registry_healthy = client.get(registry_url, verify=False, timeout=2).status_code == 200 - except Exception: - logger.exception('Exception when checking registry health: %s', registry_url) - - check = HealthCheck.get_check(app.config['HEALTH_CHECKER'][0], app.config['HEALTH_CHECKER'][1]) - (data, is_healthy) = check.conduct_healthcheck(db_healthy, buildlogs_healthy, registry_healthy) - - response = jsonify(dict(data=data, is_healthy=is_healthy)) - response.status_code = 200 if is_healthy else 503 +def instance_health(): + checker = get_healthchecker(app) + (data, status_code) = checker.check_instance() + response = jsonify(dict(data=data, status_code=status_code)) + response.status_code = status_code return response -@web.route('/status', methods=['GET']) +@web.route('/health/endtoend', methods=['GET']) @no_cache -def status(): - db_healthy = model.check_health(app.config) - buildlogs_healthy = 
build_logs.check_health() - - response = jsonify({ - 'db_healthy': db_healthy, - 'buildlogs_healthy': buildlogs_healthy, - 'is_testing': app.config['TESTING'], - }) - response.status_code = 200 if db_healthy and buildlogs_healthy else 503 - +def endtoend_health(): + checker = get_healthchecker(app) + (data, status_code) = checker.check_endtoend() + response = jsonify(dict(data=data, status_code=status_code)) + response.status_code = status_code return response diff --git a/health/healthcheck.py b/health/healthcheck.py index cc76c76c7..aa4c48199 100644 --- a/health/healthcheck.py +++ b/health/healthcheck.py @@ -1,48 +1,83 @@ import boto.rds2 import logging +from health.services import check_all_services logger = logging.getLogger(__name__) -class HealthCheck(object): - def __init__(self): - pass +def get_healthchecker(app): + """ Returns a HealthCheck instance for the given app. """ + return HealthCheck.get_checker(app) - def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy): + +class HealthCheck(object): + def __init__(self, app): + self.app = app + + def check_instance(self): """ - Conducts any custom healthcheck work, returning a dict representing the HealthCheck - output and a boolean indicating whether the instance is healthy. + Conducts a check on this specific instance, returning a dict representing the HealthCheck + output and a number indicating the health check response code. """ - raise NotImplementedError + service_statuses = check_all_services(self.app) + return self.get_instance_health(service_statuses) + + def check_endtoend(self): + """ + Conducts a check on all services, returning a dict representing the HealthCheck + output and a number indicating the health check response code. 
+ """ + service_statuses = check_all_services(self.app) + return self.calculate_overall_health(service_statuses) + + def get_instance_health(self, service_statuses): + """ + For the given service statuses, returns a dict representing the HealthCheck + output and a number indicating the health check response code. By default, + this simply ensures that all services are reporting as healthy. + """ + return self.calculate_overall_health(service_statuses) + + def calculate_overall_health(self, service_statuses, skip=None, notes=None): + """ Returns a (data, status code) tuple; the code is 200 if and only if every non-skipped service status reports as healthy, and 503 otherwise. """ + is_healthy = True + notes = notes or [] + + for service_name in service_statuses: + if skip and service_name in skip: + notes.append('%s skipped in compute health' % service_name) + continue + + is_healthy = is_healthy and service_statuses[service_name] + + data = { + 'services': service_statuses, + 'notes': notes + } + + return (data, 200 if is_healthy else 503) + @classmethod - def get_check(cls, name, parameters): + def get_checker(cls, app): + name = app.config['HEALTH_CHECKER'][0] + parameters = app.config['HEALTH_CHECKER'][1] or {} + for subc in cls.__subclasses__(): if subc.check_name() == name: - return subc(**parameters) + return subc(app, **parameters) raise Exception('Unknown health check with name %s' % name) class LocalHealthCheck(HealthCheck): - def __init__(self): - pass - @classmethod def check_name(cls): return 'LocalHealthCheck' - def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy): - data = { - 'db_healthy': db_healthy, - 'buildlogs_healthy': buildlogs_healthy, - 'registry_healthy': registry_healthy - } - - return (data, db_healthy and buildlogs_healthy) - class ProductionHealthCheck(HealthCheck): - def __init__(self, access_key, secret_key): + def __init__(self, app, access_key, secret_key): + super(ProductionHealthCheck, self).__init__(app) self.access_key = access_key self.secret_key = secret_key @@ 
-50,37 +85,38 @@ class ProductionHealthCheck(HealthCheck): def check_name(cls): return 'ProductionHealthCheck' - def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy): - data = { - 'db_healthy': db_healthy, - 'buildlogs_healthy': buildlogs_healthy, - 'registry_healthy': registry_healthy - } + def get_instance_health(self, service_statuses): + # Note: We skip the redis check because if redis is down, we don't want ELB taking the + # machines out of service. Redis is not considered a high availability-required service. + skip = ['redis'] + notes = [] - # Only report unhealthy if the machine cannot connect to the DB. Redis isn't required for - # mission critical/high avaliability operations. + # If the database is marked as unhealthy, check the status of RDS directly. If RDS is + # reporting as available, then the problem is with this instance. Otherwise, the problem is + # with RDS, and so we skip the DB status so we can keep this machine as 'healthy'. + db_healthy = service_statuses['database'] if not db_healthy: - # If the database is marked as unhealthy, check the status of RDS directly. If RDS is - # reporting as available, then the problem is with this instance. Otherwise, the problem is - # with RDS, and we can keep this machine as 'healthy'. 
- is_rds_working = False - try: - region = boto.rds2.connect_to_region('us-east-1', - aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key) - response = region.describe_db_instances()['DescribeDBInstancesResponse'] - result = response['DescribeDBInstancesResult'] - instances = result['DBInstances'] - status = instances[0]['DBInstanceStatus'] - is_rds_working = status == 'available' - except: - logger.exception("Exception while checking RDS status") - pass + rds_status = self._get_rds_status() + notes.append('DB reports unhealthy; RDS status: %s' % rds_status) - data['db_available_checked'] = True - data['db_available_status'] = is_rds_working + # If the RDS is in any state but available, then we skip the DB check since it will + # fail and bring down the instance. + if rds_status != 'available': + skip.append('database') - # If RDS is down, then we still report the machine as healthy, so that it can handle - # requests once RDS comes back up. - return (data, not is_rds_working and registry_healthy) + return self.calculate_overall_health(service_statuses, skip=skip, notes=notes) - return (data, db_healthy and registry_healthy) + + def _get_rds_status(self): + """ Returns the status of the RDS instance as reported by AWS. 
""" + try: + region = boto.rds2.connect_to_region('us-east-1', + aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key) + response = region.describe_db_instances()['DescribeDBInstancesResponse'] + result = response['DescribeDBInstancesResult'] + instances = result['DBInstances'] + status = instances[0]['DBInstanceStatus'] + return status + except: + logger.exception("Exception while checking RDS status") + return 'error' diff --git a/health/services.py b/health/services.py new file mode 100644 index 000000000..9aab53488 --- /dev/null +++ b/health/services.py @@ -0,0 +1,46 @@ +import logging +from data import model +from app import build_logs + +logger = logging.getLogger(__name__) + +def _check_registry_gunicorn(app): + """ Returns the status of the registry gunicorn workers. """ + # Compute the URL for checking the registry endpoint. We append a port if and only if the + # hostname contains one. + client = app.config['HTTPCLIENT'] + hostname_parts = app.config['SERVER_HOSTNAME'].split(':') + port = '' + if len(hostname_parts) == 2: + port = ':' + hostname_parts[1] + + registry_url = '%s://localhost%s/v1/_internal_ping' % (app.config['PREFERRED_URL_SCHEME'], port) + try: + return client.get(registry_url, verify=False, timeout=2).status_code == 200 + except Exception: + logger.exception('Exception when checking registry health: %s', registry_url) + return False + + +def _check_database(app): + """ Returns the status of the database, as accessed from this instance. """ + return model.check_health(app.config) + +def _check_redis(app): + """ Returns the status of Redis, as accessed from this instance. """ + return build_logs.check_health() + + +_SERVICES = { + 'registry_gunicorn': _check_registry_gunicorn, + 'database': _check_database, + 'redis': _check_redis +} + +def check_all_services(app): + """ Returns a dictionary containing the status of all the services defined. 
""" + status = {} + for name in _SERVICES: + status[name] = _SERVICES[name](app) + + return status \ No newline at end of file