Add a new configurable health check, to make sure production instances are not taken down by Redis or non-local DB issues
This commit is contained in:
parent
7349d9d4cf
commit
98602a2d0c
4 changed files with 102 additions and 0 deletions
|
@ -179,6 +179,9 @@ class DefaultConfig(object):
|
|||
|
||||
DISTRIBUTED_STORAGE_PREFERENCE = ['local_us']
|
||||
|
||||
# Health checker.
|
||||
HEALTH_CHECKER = ('LocalHealthCheck', {})
|
||||
|
||||
# Userfiles
|
||||
USERFILES_LOCATION = 'local_us'
|
||||
USERFILES_PATH = 'userfiles/'
|
||||
|
|
|
@ -5,6 +5,7 @@ from flask import (abort, redirect, request, url_for, make_response, Response,
|
|||
Blueprint, send_from_directory, jsonify)
|
||||
from flask.ext.login import current_user
|
||||
from urlparse import urlparse
|
||||
from health.healthcheck import HealthCheck
|
||||
|
||||
from data import model
|
||||
from data.model.oauth import DatabaseAuthorizationProvider
|
||||
|
@ -151,6 +152,20 @@ def v1():
|
|||
return index('')
|
||||
|
||||
|
||||
@web.route('/health', methods=['GET'])
@no_cache
def health():
    """Report instance health as JSON.

    Gathers the database and build-log health flags, hands them to the
    health check selected by the ``HEALTH_CHECKER`` config entry (a
    ``(name, parameters)`` pair) and returns the check's output.

    The response body carries ``data`` and ``is_healthy``; the HTTP status is
    200 when the check reports healthy and 503 otherwise.
    """
    db_healthy = model.check_health()
    buildlogs_healthy = build_logs.check_health()

    # The config entry is a (check name, constructor kwargs) pair.
    checker_config = app.config['HEALTH_CHECKER']
    check = HealthCheck.get_check(checker_config[0], checker_config[1])
    data, is_healthy = check.conduct_healthcheck(db_healthy, buildlogs_healthy)

    response = jsonify({'data': data, 'is_healthy': is_healthy})
    if is_healthy:
        response.status_code = 200
    else:
        response.status_code = 503
    return response
|
||||
|
||||
|
||||
@web.route('/status', methods=['GET'])
|
||||
@no_cache
|
||||
def status():
|
||||
|
|
0
health/__init__.py
Normal file
0
health/__init__.py
Normal file
84
health/healthcheck.py
Normal file
84
health/healthcheck.py
Normal file
|
@ -0,0 +1,84 @@
|
|||
import boto.rds2
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class HealthCheck(object):
    """Base class for named, pluggable health checks.

    Direct subclasses are expected to provide a ``check_name`` classmethod
    (used by ``get_check`` for lookup) and to override
    ``conduct_healthcheck``.
    """

    def __init__(self):
        """The base check takes no configuration."""

    def conduct_healthcheck(self, db_healthy, buildlogs_healthy):
        """
        Conducts any custom healthcheck work, returning a dict representing the HealthCheck
        output and a boolean indicating whether the instance is healthy.
        """
        raise NotImplementedError

    @classmethod
    def get_check(cls, name, parameters):
        """Instantiate the direct subclass whose ``check_name()`` equals ``name``.

        ``parameters`` is a dict passed to the subclass constructor as keyword
        arguments. Raises ``Exception`` when no direct subclass matches.
        """
        # Lazy generator: stops calling check_name() at the first match.
        matches = (candidate for candidate in cls.__subclasses__()
                   if candidate.check_name() == name)
        found = next(matches, None)
        if found is None:
            raise Exception('Unknown health check with name %s' % name)

        return found(**parameters)
|
||||
|
||||
|
||||
class LocalHealthCheck(HealthCheck):
    """Health check that requires both the database and the build logs to be healthy."""

    def __init__(self):
        """This check takes no configuration."""

    @classmethod
    def check_name(cls):
        """Return the name under which this check is looked up."""
        return 'LocalHealthCheck'

    def conduct_healthcheck(self, db_healthy, buildlogs_healthy):
        """Return ``(data, is_healthy)`` for the given health flags.

        ``data`` echoes both flags; the instance counts as healthy only when
        both flags are truthy.
        """
        is_healthy = db_healthy and buildlogs_healthy
        report = dict(db_healthy=db_healthy, buildlogs_healthy=buildlogs_healthy)
        return (report, is_healthy)
|
||||
|
||||
|
||||
class ProductionHealthCheck(HealthCheck):
    """Health check for production instances whose database runs on Amazon RDS.

    An instance is reported unhealthy only when it cannot reach the database
    while RDS itself reports the database as available. Build-log (Redis)
    health is included in the output but never affects the verdict.
    """

    def __init__(self, access_key, secret_key):
        """Store the AWS credentials used to query RDS status."""
        self.access_key = access_key
        self.secret_key = secret_key

    @classmethod
    def check_name(cls):
        """Return the name under which this check is looked up."""
        return 'ProductionHealthCheck'

    def conduct_healthcheck(self, db_healthy, buildlogs_healthy):
        """Return ``(data, is_healthy)`` for the given health flags.

        ``data`` always contains ``db_healthy`` and ``buildlogs_healthy``. When
        the database is unhealthy it also contains ``db_available_checked``
        (always True) and ``db_available_status`` (whether RDS reported the
        first listed DB instance as ``'available'``).
        """
        data = {
            'db_healthy': db_healthy,
            'buildlogs_healthy': buildlogs_healthy,
        }

        # Only report unhealthy if the machine cannot connect to the DB. Redis isn't required for
        # mission critical/high availability operations.
        if not db_healthy:
            # If the database is marked as unhealthy, check the status of RDS directly. If RDS is
            # reporting as available, then the problem is with this instance. Otherwise, the
            # problem is with RDS, and we can keep this machine as 'healthy'.
            is_rds_working = False
            try:
                connection = boto.rds2.connect_to_region(
                    'us-east-1',
                    aws_access_key_id=self.access_key,
                    aws_secret_access_key=self.secret_key)
                response = connection.describe_db_instances()['DescribeDBInstancesResponse']
                result = response['DescribeDBInstancesResult']
                instances = result['DBInstances']
                # Only the first listed instance is inspected.
                status = instances[0]['DBInstanceStatus']
                is_rds_working = status == 'available'
            except Exception:
                # Best effort: a failed status lookup is treated as "RDS not available".
                logger.exception("Exception while checking RDS status")

            data['db_available_checked'] = True
            data['db_available_status'] = is_rds_working

            # RDS up but unreachable from here means this instance is at fault, so report
            # unhealthy. RDS down means the instance stays healthy so that it can serve
            # requests again once RDS recovers.
            return (data, not is_rds_working)

        return (data, db_healthy)
|
Reference in a new issue