Merge pull request #2636 from coreos-inc/auth-health-check

Add support to health checks for auth and make health checks more descriptive
This commit is contained in:
josephschorr 2017-07-19 12:55:37 -04:00 committed by GitHub
commit 38d3417ca7
12 changed files with 140 additions and 24 deletions

View file

@ -471,3 +471,6 @@ class DefaultConfig(ImmutableConfig):
# Feature Flag: Whether users can view and change their tag expiration. # Feature Flag: Whether users can view and change their tag expiration.
FEATURE_CHANGE_TAG_EXPIRATION = True FEATURE_CHANGE_TAG_EXPIRATION = True
# Defines a secret for enabling the health-check endpoint's debug information.
ENABLE_HEALTH_DEBUG_SECRET = None

View file

@ -120,15 +120,14 @@ class RedisBuildLogs(object):
connection = redis.StrictRedis(**args) connection = redis.StrictRedis(**args)
if not connection.ping() == True: if not connection.ping() == True:
return False return (False, 'Could not ping redis')
# Ensure we can write and read a key. # Ensure we can write and read a key.
connection.set(self._health_key(), time.time()) connection.set(self._health_key(), time.time())
connection.get(self._health_key()) connection.get(self._health_key())
return (True, None)
return True except redis.RedisError as re:
except redis.RedisError: return (False, 'Could not connect to redis: %s' % re.message)
return False
class BuildLogs(object): class BuildLogs(object):

View file

@ -11,12 +11,11 @@ def check_health(app_config):
# check). # check).
try: try:
validate_database_url(app_config['DB_URI'], {}, connect_timeout=3) validate_database_url(app_config['DB_URI'], {}, connect_timeout=3)
except Exception: except Exception as ex:
logger.exception('Could not connect to the database') return (False, 'Could not connect to the database: %s', ex.message)
return False
# We will connect to the db, check that it contains some team role kinds # We will connect to the db, check that it contains some team role kinds
try: try:
return bool(list(TeamRole.select().limit(1))) return (bool(list(TeamRole.select().limit(1))), 'Could not connect to the database')
except: except Exception as ex:
return False return (False, 'Could not connect to the database: %s', ex.message)

View file

@ -150,6 +150,10 @@ class UserAuthentication(object):
return data.get('password', encrypted) return data.get('password', encrypted)
def ping(self):
    """ Health-checks the configured authentication engine.

        Delegates to the ping() of the engine held in self.state and returns its result
        unchanged.
    """
    engine = self.state
    return engine.ping()
@property @property
def federated_service(self): def federated_service(self):
""" Returns the name of the federated service for the auth system. If none, should return None. """ Returns the name of the federated service for the auth system. If none, should return None.

View file

@ -5,6 +5,10 @@ class DatabaseUsers(object):
def federated_service(self): def federated_service(self):
return None return None
def ping(self):
    """ Reports the database-backed auth engine as healthy unconditionally.

        A broken database is expected to be caught by the other health checks, so no probe
        is performed here. Returns a (status, failure message) tuple.
    """
    status, failure = True, None
    return (status, failure)
def verify_credentials(self, username_or_email, password): def verify_credentials(self, username_or_email, password):
""" Simply delegate to the model implementation. """ """ Simply delegate to the model implementation. """
result = model.user.verify_user(username_or_email, password) result = model.user.verify_user(username_or_email, password)

View file

@ -37,6 +37,12 @@ class ExternalJWTAuthN(FederatedUsers):
with open(public_key_path) as public_key_file: with open(public_key_path) as public_key_file:
self.public_key = public_key_file.read() self.public_key = public_key_file.read()
def ping(self):
    """ Checks whether the external JWT authn endpoint is reachable.

        Issues a GET against the getuser URL with a 2 second timeout. A 4xx response counts
        as healthy (presumably because the probe carries no credentials, so a working
        endpoint is expected to reject it -- TODO confirm). Any other status, or a failure
        to complete the request at all, is reported as unhealthy.

        Returns a (status, failure message) tuple; the message is None when healthy.
    """
    unreachable = 'Could not reach JWT authn endpoint'
    try:
        result = self.client.get(self.getuser_url, timeout=2)
    except Exception as ex:
        # A refused connection or timeout must be reported as an unhealthy service rather
        # than crashing the health check endpoint itself.
        return (False, '%s: %s' % (unreachable, ex))

    if result.status_code // 100 != 4:
        return (False, result.text or unreachable)

    return (True, None)
def get_user(self, username_or_email): def get_user(self, username_or_email):
if self.getuser_url is None: if self.getuser_url is None:

View file

@ -193,6 +193,18 @@ class LDAPUsers(FederatedUsers):
email = response.get(self._email_attr, [None])[0] email = response.get(self._email_attr, [None])[0]
return (UserInformation(username=username, email=email, id=username), None) return (UserInformation(username=username, email=email, id=username), None)
def ping(self):
    """ Health-checks LDAP by opening (and immediately releasing) a connection.

        Returns (True, None) when the connection can be established, otherwise
        (False, failure message).
    """
    try:
        # Entering the context manager is the whole check; nothing is done with the
        # connection itself.
        with self._ldap.get_connection():
            pass
    except ldap.INVALID_CREDENTIALS:
        outcome = (False, 'LDAP Admin dn or password is invalid')
    except ldap.LDAPError as lde:
        logger.exception('Exception when trying to health check LDAP')
        outcome = (False, lde.message)
    else:
        outcome = (True, None)

    return outcome
def get_user(self, username_or_email): def get_user(self, username_or_email):
""" Looks up a username or email in LDAP. """ """ Looks up a username or email in LDAP. """
logger.debug('Looking up LDAP username or email %s', username_or_email) logger.debug('Looking up LDAP username or email %s', username_or_email)

View file

@ -36,6 +36,21 @@ class KeystoneV2Users(FederatedUsers):
self.debug = os.environ.get('USERS_DEBUG') == '1' self.debug = os.environ.get('USERS_DEBUG') == '1'
self.requires_email = requires_email self.requires_email = requires_email
def ping(self):
    """ Health-checks Keystone (v2) by logging in with the configured admin credentials.

        Returns (True, None) when an admin client can be created and exposes a user id,
        otherwise (False, failure message).
    """
    try:
        keystone_client = kclient.Client(username=self.admin_username,
                                         password=self.admin_password,
                                         tenant_name=self.admin_tenant,
                                         auth_url=self.auth_url,
                                         timeout=self.timeout, debug=self.debug)
        keystone_client.user_id  # Make sure we loaded a valid user.
    except KeystoneUnauthorized as kut:
        logger.exception('Keystone unauthorized admin')
        return (False, 'Keystone admin credentials are invalid: %s' % kut.message)
    except Exception as ex:
        # Bind the exception here: `kut` only exists inside the handler above, so
        # referencing it in this branch raised a NameError instead of reporting the failure.
        logger.exception('Keystone ping check failed')
        return (False, 'Keystone ping check failed: %s' % ex)

    return (True, None)
def verify_credentials(self, username_or_email, password): def verify_credentials(self, username_or_email, password):
try: try:
keystone_client = kclient.Client(username=username_or_email, password=password, keystone_client = kclient.Client(username=username_or_email, password=password,
@ -89,6 +104,18 @@ class KeystoneV3Users(FederatedUsers):
tenant_name=self.admin_tenant, auth_url=self.auth_url, tenant_name=self.admin_tenant, auth_url=self.auth_url,
timeout=self.timeout, debug=self.debug) timeout=self.timeout, debug=self.debug)
def ping(self):
    """ Health-checks Keystone (v3) by loading the admin client.

        Returns (True, None) when the admin client exposes a user id, otherwise
        (False, failure message).
    """
    try:
        self._get_admin_client().user_id  # Make sure we loaded a valid user
    except KeystoneUnauthorized as kut:
        logger.exception('Keystone unauthorized admin')
        return (False, 'Keystone admin credentials are invalid: %s' % kut.message)
    except Exception as ex:
        # Bind the exception here: `kut` only exists inside the handler above, so
        # referencing it in this branch raised a NameError instead of reporting the failure.
        logger.exception('Keystone ping check failed')
        return (False, 'Keystone ping check failed: %s' % ex)

    return (True, None)
def verify_credentials(self, username_or_email, password): def verify_credentials(self, username_or_email, password):
try: try:
keystone_client = kv3client.Client(username=username_or_email, password=password, keystone_client = kv3client.Client(username=username_or_email, password=password,

View file

@ -6,6 +6,7 @@ from data.database import model
from data.users.federated import DISABLED_MESSAGE from data.users.federated import DISABLED_MESSAGE
from test.test_ldap import mock_ldap from test.test_ldap import mock_ldap
from test.test_keystone_auth import fake_keystone from test.test_keystone_auth import fake_keystone
from test.test_external_jwt_authn import fake_jwt
from test.fixtures import * from test.fixtures import *
@ -34,3 +35,16 @@ def test_auth_createuser(auth_system_builder, user1, user2, config, app):
new_user, err = auth.verify_and_link_user(*user2) new_user, err = auth.verify_and_link_user(*user2)
assert new_user is None assert new_user is None
assert err == DISABLED_MESSAGE assert err == DISABLED_MESSAGE
@pytest.mark.parametrize('auth_system_builder,auth_kwargs', [
    (mock_ldap, {}),
    (fake_keystone, {'version': 3}),
    (fake_keystone, {'version': 2}),
    (fake_jwt, {}),
])
def test_ping(auth_system_builder, auth_kwargs, app):
    """ Every federated auth backend, built from its fake/mock, must ping as healthy. """
    with auth_system_builder(**auth_kwargs) as auth_engine:
        ping_result = auth_engine.ping()
        is_healthy, failure = ping_result
        assert is_healthy
        assert failure is None

View file

@ -6,7 +6,7 @@ from datetime import timedelta, datetime
from cachetools import lru_cache from cachetools import lru_cache
from flask import (abort, redirect, request, url_for, make_response, Response, render_template, from flask import (abort, redirect, request, url_for, make_response, Response, render_template,
Blueprint, jsonify, send_file) Blueprint, jsonify, send_file, session)
from flask_login import current_user from flask_login import current_user
import features import features
@ -260,6 +260,7 @@ def privacy():
# TODO(jschorr): Remove this mirrored endpoint once we migrate ELB. # TODO(jschorr): Remove this mirrored endpoint once we migrate ELB.
@web.route('/health', methods=['GET']) @web.route('/health', methods=['GET'])
@web.route('/health/instance', methods=['GET']) @web.route('/health/instance', methods=['GET'])
@process_auth_or_cookie
@no_cache @no_cache
def instance_health(): def instance_health():
checker = get_healthchecker(app, config_provider, instance_keys) checker = get_healthchecker(app, config_provider, instance_keys)
@ -272,6 +273,7 @@ def instance_health():
# TODO(jschorr): Remove this mirrored endpoint once we migrate pingdom. # TODO(jschorr): Remove this mirrored endpoint once we migrate pingdom.
@web.route('/status', methods=['GET']) @web.route('/status', methods=['GET'])
@web.route('/health/endtoend', methods=['GET']) @web.route('/health/endtoend', methods=['GET'])
@process_auth_or_cookie
@no_cache @no_cache
def endtoend_health(): def endtoend_health():
checker = get_healthchecker(app, config_provider, instance_keys) checker = get_healthchecker(app, config_provider, instance_keys)
@ -283,6 +285,7 @@ def endtoend_health():
@web.route('/health/dbrevision', methods=['GET']) @web.route('/health/dbrevision', methods=['GET'])
@route_show_if(features.BILLING) # Since this is only used in production. @route_show_if(features.BILLING) # Since this is only used in production.
@process_auth_or_cookie
@no_cache @no_cache
def dbrevision_health(): def dbrevision_health():
# Find the revision from the database. # Find the revision from the database.
@ -305,6 +308,23 @@ def dbrevision_health():
return response return response
@web.route('/health/enabledebug/<secret>', methods=['GET'])
@no_cache
def enable_health_debug(secret):
    """ Turns on health-check debug output for the caller's session.

        The `secret` path component must match the ENABLE_HEALTH_DEBUG_SECRET config value.
        When it does, the `health_debug` session flag is set. Every failure mode (no secret
        supplied, no secret configured, mismatch) answers 404.
    """
    # Local import keeps this block self-contained; hmac is part of the standard library.
    import hmac

    expected = app.config.get('ENABLE_HEALTH_DEBUG_SECRET')
    if not secret or not expected:
        abort(404)

    def _as_bytes(value):
        # compare_digest requires both operands to be the same string type.
        return value if isinstance(value, bytes) else value.encode('utf-8')

    try:
        # Constant-time comparison, so response timing does not leak how much of a guessed
        # secret was correct.
        matches = hmac.compare_digest(_as_bytes(expected), _as_bytes(secret))
    except (AttributeError, TypeError):
        # A non-string configured value can never equal the URL component.
        matches = False

    if not matches:
        abort(404)

    session['health_debug'] = True
    return make_response('Health check debug information enabled')
@web.route('/robots.txt', methods=['GET']) @web.route('/robots.txt', methods=['GET'])
def robots(): def robots():
robots_txt = make_response(render_template('robots.txt', baseurl=get_app_url())) robots_txt = make_response(render_template('robots.txt', baseurl=get_app_url()))

View file

@ -1,5 +1,8 @@
import boto.rds2 import boto.rds2
import logging import logging
from auth.permissions import SuperUserPermission
from flask import session
from health.services import check_all_services from health.services import check_all_services
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -46,21 +49,40 @@ class HealthCheck(object):
is_healthy = True is_healthy = True
notes = notes or [] notes = notes or []
service_statuses_bools = {}
service_status_expanded = {}
for service_name in service_statuses: for service_name in service_statuses:
status, err = service_statuses[service_name]
service_statuses_bools[service_name] = status
service_status_expanded[service_name] = {
'status': status,
}
if not status:
service_status_expanded[service_name]['failure'] = err
if skip and service_name in skip: if skip and service_name in skip:
notes.append('%s skipped in compute health' % service_name) notes.append('%s skipped in compute health' % service_name)
continue continue
is_healthy = is_healthy and service_statuses[service_name] is_healthy = is_healthy and status
data = { data = {
'services': service_statuses, 'services': service_statuses_bools,
'notes': notes,
'is_testing': self.app.config['TESTING'],
'config_provider': self.config_provider.provider_id,
'local_service_key_id': self.instance_keys.local_key_id,
} }
add_debug_information = SuperUserPermission().can() or session.get('health_debug', False)
if add_debug_information:
data.update({
'services_expanded': service_status_expanded,
'notes': notes,
'is_testing': self.app.config['TESTING'],
'config_provider': self.config_provider.provider_id,
'local_service_key_id': self.instance_keys.local_key_id,
})
return (data, 200 if is_healthy else 503) return (data, 200 if is_healthy else 503)
@classmethod @classmethod

View file

@ -1,5 +1,5 @@
import logging import logging
from app import build_logs, storage from app import build_logs, storage, authentication
from health.models_pre_oci import pre_oci_model as model from health.models_pre_oci import pre_oci_model as model
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -21,10 +21,11 @@ def _check_registry_gunicorn(app):
registry_url = '%s://localhost%s/v1/_internal_ping' % (scheme, port) registry_url = '%s://localhost%s/v1/_internal_ping' % (scheme, port)
try: try:
return client.get(registry_url, verify=False, timeout=2).status_code == 200 status_code = client.get(registry_url, verify=False, timeout=2).status_code
except Exception: return (status_code == 200, 'Got non-200 response for registry: %s' % status_code)
except Exception as ex:
logger.exception('Exception when checking registry health: %s', registry_url) logger.exception('Exception when checking registry health: %s', registry_url)
return False return (False, 'Exception when checking registry health: %s' % registry_url)
def _check_database(app): def _check_database(app):
@ -41,10 +42,14 @@ def _check_storage(app):
""" Returns the status of storage, as accessed from this instance. """ """ Returns the status of storage, as accessed from this instance. """
try: try:
storage.validate(storage.preferred_locations, app.config['HTTPCLIENT']) storage.validate(storage.preferred_locations, app.config['HTTPCLIENT'])
return True return (True, None)
except Exception as ex: except Exception as ex:
logger.exception('Storage check failed with exception %s', ex) logger.exception('Storage check failed with exception %s', ex)
return False return (False, 'Storage check failed with exception %s' % ex.message)
def _check_auth(app):
    """ Reports the status of the auth engine, as accessed from this instance, by delegating
        to the application's authentication handler.
    """
    auth_status = authentication.ping()
    return auth_status
_SERVICES = { _SERVICES = {
@ -52,6 +57,7 @@ _SERVICES = {
'database': _check_database, 'database': _check_database,
'redis': _check_redis, 'redis': _check_redis,
'storage': _check_storage, 'storage': _check_storage,
'auth': _check_auth,
} }
def check_all_services(app, skip): def check_all_services(app, skip):