Clean up the health checking code and move the endpoints to /health/instance and /health/endtoend.
This commit is contained in:
parent
92d32bc636
commit
b74b7de197
3 changed files with 146 additions and 88 deletions
|
@ -6,7 +6,7 @@ from flask import (abort, redirect, request, url_for, make_response, Response,
|
||||||
from avatar_generator import Avatar
|
from avatar_generator import Avatar
|
||||||
from flask.ext.login import current_user
|
from flask.ext.login import current_user
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
from health.healthcheck import HealthCheck
|
from health.healthcheck import get_healthchecker
|
||||||
|
|
||||||
from data import model
|
from data import model
|
||||||
from data.model.oauth import DatabaseAuthorizationProvider
|
from data.model.oauth import DatabaseAuthorizationProvider
|
||||||
|
@ -156,47 +156,23 @@ def v1():
|
||||||
return index('')
|
return index('')
|
||||||
|
|
||||||
|
|
||||||
@web.route('/health', methods=['GET'])
|
@web.route('/health/instance', methods=['GET'])
|
||||||
@no_cache
|
@no_cache
|
||||||
def health():
|
def instance_health():
|
||||||
client = app.config['HTTPCLIENT']
|
checker = get_healthchecker(app)
|
||||||
|
(data, status_code) = checker.check_instance()
|
||||||
db_healthy = model.check_health(app.config)
|
response = jsonify(dict(data=data, status_code=status_code))
|
||||||
buildlogs_healthy = build_logs.check_health()
|
response.status_code = status_code
|
||||||
|
|
||||||
hostname_parts = app.config['SERVER_HOSTNAME'].split(':')
|
|
||||||
port = ''
|
|
||||||
if len(hostname_parts) == 2:
|
|
||||||
port = ':' + hostname_parts[1]
|
|
||||||
|
|
||||||
registry_url = '%s://localhost%s/v1/_internal_ping' % (app.config['PREFERRED_URL_SCHEME'], port)
|
|
||||||
registry_healthy = False
|
|
||||||
try:
|
|
||||||
registry_healthy = client.get(registry_url, verify=False, timeout=2).status_code == 200
|
|
||||||
except Exception:
|
|
||||||
logger.exception('Exception when checking registry health: %s', registry_url)
|
|
||||||
|
|
||||||
check = HealthCheck.get_check(app.config['HEALTH_CHECKER'][0], app.config['HEALTH_CHECKER'][1])
|
|
||||||
(data, is_healthy) = check.conduct_healthcheck(db_healthy, buildlogs_healthy, registry_healthy)
|
|
||||||
|
|
||||||
response = jsonify(dict(data=data, is_healthy=is_healthy))
|
|
||||||
response.status_code = 200 if is_healthy else 503
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
@web.route('/status', methods=['GET'])
|
@web.route('/health/endtoend', methods=['GET'])
|
||||||
@no_cache
|
@no_cache
|
||||||
def status():
|
def endtoend_health():
|
||||||
db_healthy = model.check_health(app.config)
|
checker = get_healthchecker(app)
|
||||||
buildlogs_healthy = build_logs.check_health()
|
(data, status_code) = checker.check_endtoend()
|
||||||
|
response = jsonify(dict(data=data, status_code=status_code))
|
||||||
response = jsonify({
|
response.status_code = status_code
|
||||||
'db_healthy': db_healthy,
|
|
||||||
'buildlogs_healthy': buildlogs_healthy,
|
|
||||||
'is_testing': app.config['TESTING'],
|
|
||||||
})
|
|
||||||
response.status_code = 200 if db_healthy and buildlogs_healthy else 503
|
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,48 +1,83 @@
|
||||||
import boto.rds2
|
import boto.rds2
|
||||||
import logging
|
import logging
|
||||||
|
from health.services import check_all_services
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class HealthCheck(object):
|
def get_healthchecker(app):
|
||||||
def __init__(self):
|
""" Returns a HealthCheck instance for the given app. """
|
||||||
pass
|
return HealthCheck.get_checker(app)
|
||||||
|
|
||||||
def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy):
|
|
||||||
|
class HealthCheck(object):
|
||||||
|
def __init__(self, app):
|
||||||
|
self.app = app
|
||||||
|
|
||||||
|
def check_instance(self):
|
||||||
"""
|
"""
|
||||||
Conducts any custom healthcheck work, returning a dict representing the HealthCheck
|
Conducts a check on this specific instance, returning a dict representing the HealthCheck
|
||||||
output and a boolean indicating whether the instance is healthy.
|
output and a number indicating the health check response code.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
service_statuses = check_all_services(self.app)
|
||||||
|
return self.get_instance_health(service_statuses)
|
||||||
|
|
||||||
|
def check_endtoend(self):
    """
    Conducts the end-to-end check: gathers the status of every service via
    check_all_services(self.app) and hands the result to calculate_overall_health,
    whose return value (a dict of health data plus a response code) is passed
    straight back to the caller.
    """
    return self.calculate_overall_health(check_all_services(self.app))
|
||||||
|
|
||||||
|
def get_instance_health(self, service_statuses):
    """
    Turns the given service statuses into the health result for this instance.

    The default behaviour applies no special rules: the statuses are passed
    unchanged to calculate_overall_health and its result (a dict of health data
    plus a response code) is returned as-is.
    """
    outcome = self.calculate_overall_health(service_statuses)
    return outcome
|
||||||
|
|
||||||
|
def calculate_overall_health(self, service_statuses, skip=None, notes=None):
    """
    Combines the given service statuses into a single health result.

    service_statuses maps a service name to a truthy (healthy) or falsy
    (unhealthy) status. Any service whose name appears in `skip` is left out of
    the health decision, and a note recording the skip is appended to `notes`.
    A non-empty `notes` list supplied by the caller is extended in place.

    Returns a tuple (data, status_code). `data` is a dict holding the unmodified
    statuses under 'services' and the notes under 'notes'. `status_code` is 200
    when every non-skipped service is healthy (including when there are no
    services at all) and 503 otherwise.
    """
    skip = skip or ()
    notes = notes or []
    is_healthy = True

    for service_name in service_statuses:
        if service_name in skip:
            notes.append('%s skipped in compute health' % service_name)
            continue

        if not service_statuses[service_name]:
            is_healthy = False

    data = {
        'services': service_statuses,
        'notes': notes,
    }

    return (data, 200 if is_healthy else 503)
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_check(cls, name, parameters):
|
def get_checker(cls, app):
|
||||||
|
name = app.config['HEALTH_CHECKER'][0]
|
||||||
|
parameters = app.config['HEALTH_CHECKER'][1] or {}
|
||||||
|
|
||||||
for subc in cls.__subclasses__():
|
for subc in cls.__subclasses__():
|
||||||
if subc.check_name() == name:
|
if subc.check_name() == name:
|
||||||
return subc(**parameters)
|
return subc(app, **parameters)
|
||||||
|
|
||||||
raise Exception('Unknown health check with name %s' % name)
|
raise Exception('Unknown health check with name %s' % name)
|
||||||
|
|
||||||
|
|
||||||
class LocalHealthCheck(HealthCheck):
|
class LocalHealthCheck(HealthCheck):
|
||||||
def __init__(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def check_name(cls):
|
def check_name(cls):
|
||||||
return 'LocalHealthCheck'
|
return 'LocalHealthCheck'
|
||||||
|
|
||||||
def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy):
|
|
||||||
data = {
|
|
||||||
'db_healthy': db_healthy,
|
|
||||||
'buildlogs_healthy': buildlogs_healthy,
|
|
||||||
'registry_healthy': registry_healthy
|
|
||||||
}
|
|
||||||
|
|
||||||
return (data, db_healthy and buildlogs_healthy)
|
|
||||||
|
|
||||||
|
|
||||||
class ProductionHealthCheck(HealthCheck):
|
class ProductionHealthCheck(HealthCheck):
|
||||||
def __init__(self, access_key, secret_key):
|
def __init__(self, app, access_key, secret_key):
|
||||||
|
super(ProductionHealthCheck, self).__init__(app)
|
||||||
self.access_key = access_key
|
self.access_key = access_key
|
||||||
self.secret_key = secret_key
|
self.secret_key = secret_key
|
||||||
|
|
||||||
|
@ -50,37 +85,38 @@ class ProductionHealthCheck(HealthCheck):
|
||||||
def check_name(cls):
|
def check_name(cls):
|
||||||
return 'ProductionHealthCheck'
|
return 'ProductionHealthCheck'
|
||||||
|
|
||||||
def conduct_healthcheck(self, db_healthy, buildlogs_healthy, registry_healthy):
|
def get_instance_health(self, service_statuses):
|
||||||
data = {
|
# Note: We skip the redis check because if redis is down, we don't want ELB taking the
|
||||||
'db_healthy': db_healthy,
|
# machines out of service. Redis is not considered a high availability-required service.
|
||||||
'buildlogs_healthy': buildlogs_healthy,
|
skip = ['redis']
|
||||||
'registry_healthy': registry_healthy
|
notes = []
|
||||||
}
|
|
||||||
|
|
||||||
# Only report unhealthy if the machine cannot connect to the DB. Redis isn't required for
|
# If the database is marked as unhealthy, check the status of RDS directly. If RDS is
|
||||||
# mission critical/high availability operations.
|
# reporting as available, then the problem is with this instance. Otherwise, the problem is
|
||||||
|
# with RDS, and so we skip the DB status so we can keep this machine as 'healthy'.
|
||||||
|
db_healthy = service_statuses['database']
|
||||||
if not db_healthy:
|
if not db_healthy:
|
||||||
# If the database is marked as unhealthy, check the status of RDS directly. If RDS is
|
rds_status = self._get_rds_status()
|
||||||
# reporting as available, then the problem is with this instance. Otherwise, the problem is
|
notes.append('DB reports unhealthy; RDS status: %s' % rds_status)
|
||||||
# with RDS, and we can keep this machine as 'healthy'.
|
|
||||||
is_rds_working = False
|
|
||||||
try:
|
|
||||||
region = boto.rds2.connect_to_region('us-east-1',
|
|
||||||
aws_access_key_id=self.access_key, aws_secret_access_key=self.secret_key)
|
|
||||||
response = region.describe_db_instances()['DescribeDBInstancesResponse']
|
|
||||||
result = response['DescribeDBInstancesResult']
|
|
||||||
instances = result['DBInstances']
|
|
||||||
status = instances[0]['DBInstanceStatus']
|
|
||||||
is_rds_working = status == 'available'
|
|
||||||
except:
|
|
||||||
logger.exception("Exception while checking RDS status")
|
|
||||||
pass
|
|
||||||
|
|
||||||
data['db_available_checked'] = True
|
# If the RDS is in any state but available, then we skip the DB check since it will
|
||||||
data['db_available_status'] = is_rds_working
|
# fail and bring down the instance.
|
||||||
|
if rds_status != 'available':
|
||||||
|
skip.append('database')
|
||||||
|
|
||||||
# If RDS is down, then we still report the machine as healthy, so that it can handle
|
return self.calculate_overall_health(service_statuses, skip=skip, notes=notes)
|
||||||
# requests once RDS comes back up.
|
|
||||||
return (data, not is_rds_working and registry_healthy)
|
|
||||||
|
|
||||||
return (data, db_healthy and registry_healthy)
|
|
||||||
|
def _get_rds_status(self):
    """
    Returns the status of the RDS instance as reported by AWS.

    Connects to RDS in the hard-coded 'us-east-1' region using this checker's
    access and secret keys, and returns the 'DBInstanceStatus' string of the
    first instance listed in the describe_db_instances response. If anything
    goes wrong while talking to AWS or reading the response, the exception is
    logged and the string 'error' is returned instead.
    """
    try:
        region = boto.rds2.connect_to_region('us-east-1',
                                             aws_access_key_id=self.access_key,
                                             aws_secret_access_key=self.secret_key)
        response = region.describe_db_instances()['DescribeDBInstancesResponse']
        result = response['DescribeDBInstancesResult']
        instances = result['DBInstances']
        return instances[0]['DBInstanceStatus']
    except Exception:
        # Deliberately best-effort: a failed lookup must not break the health endpoint.
        # Catch Exception rather than using a bare except so that SystemExit and
        # KeyboardInterrupt still propagate.
        logger.exception("Exception while checking RDS status")
        return 'error'
|
||||||
|
|
46
health/services.py
Normal file
46
health/services.py
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
import logging
|
||||||
|
from data import model
|
||||||
|
from app import build_logs
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def _check_registry_gunicorn(app):
|
||||||
|
""" Returns the status of the registry gunicorn workers. """
|
||||||
|
# Compute the URL for checking the registry endpoint. We append a port if and only if the
|
||||||
|
# hostname contains one.
|
||||||
|
client = app.config['HTTPCLIENT']
|
||||||
|
hostname_parts = app.config['SERVER_HOSTNAME'].split(':')
|
||||||
|
port = ''
|
||||||
|
if len(hostname_parts) == 2:
|
||||||
|
port = ':' + hostname_parts[1]
|
||||||
|
|
||||||
|
registry_url = '%s://localhost%s/v1/_internal_ping' % (app.config['PREFERRED_URL_SCHEME'], port)
|
||||||
|
try:
|
||||||
|
return client.get(registry_url, verify=False, timeout=2).status_code == 200
|
||||||
|
except Exception:
|
||||||
|
logger.exception('Exception when checking registry health: %s', registry_url)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _check_database(app):
    """ Reports the database status seen from this instance, as given by model.check_health. """
    db_status = model.check_health(app.config)
    return db_status
|
||||||
|
|
||||||
|
def _check_redis(app):
    """ Reports the Redis status seen from this instance, as given by build_logs.check_health. """
    # `app` is unused here; it is accepted so every service check shares one signature.
    redis_status = build_logs.check_health()
    return redis_status
|
||||||
|
|
||||||
|
|
||||||
|
# Registry of health checks: service name -> callable taking the app and returning its status.
_SERVICES = {
    'registry_gunicorn': _check_registry_gunicorn,
    'database': _check_database,
    'redis': _check_redis,
}


def check_all_services(app):
    """ Runs every check registered in _SERVICES and returns a dict of service name -> status. """
    return {name: checker(app) for name, checker in _SERVICES.items()}
|
Reference in a new issue