Merge pull request #2636 from coreos-inc/auth-health-check

Add support to health checks for auth and make health checks more descriptive
This commit is contained in:
josephschorr 2017-07-19 12:55:37 -04:00 committed by GitHub
commit 38d3417ca7
12 changed files with 140 additions and 24 deletions

View file

@ -471,3 +471,6 @@ class DefaultConfig(ImmutableConfig):
# Feature Flag: Whether users can view and change their tag expiration.
FEATURE_CHANGE_TAG_EXPIRATION = True
# Defines a secret for enabling the health-check endpoint's debug information.
ENABLE_HEALTH_DEBUG_SECRET = None

View file

@ -120,15 +120,14 @@ class RedisBuildLogs(object):
connection = redis.StrictRedis(**args)
if not connection.ping() == True:
return False
return (False, 'Could not ping redis')
# Ensure we can write and read a key.
connection.set(self._health_key(), time.time())
connection.get(self._health_key())
return True
except redis.RedisError:
return False
return (True, None)
except redis.RedisError as re:
return (False, 'Could not connect to redis: %s' % re.message)
class BuildLogs(object):

View file

@ -11,12 +11,11 @@ def check_health(app_config):
# check).
try:
validate_database_url(app_config['DB_URI'], {}, connect_timeout=3)
except Exception:
logger.exception('Could not connect to the database')
return False
except Exception as ex:
return (False, 'Could not connect to the database: %s', ex.message)
# We will connect to the db, check that it contains some team role kinds
try:
return bool(list(TeamRole.select().limit(1)))
except:
return False
return (bool(list(TeamRole.select().limit(1))), 'Could not connect to the database')
except Exception as ex:
return (False, 'Could not connect to the database: %s', ex.message)

View file

@ -150,6 +150,10 @@ class UserAuthentication(object):
return data.get('password', encrypted)
def ping(self):
""" Returns whether the authentication engine is reachable and working. """
return self.state.ping()
@property
def federated_service(self):
""" Returns the name of the federated service for the auth system. If none, should return None.

View file

@ -5,6 +5,10 @@ class DatabaseUsers(object):
def federated_service(self):
return None
def ping(self):
""" Always assumed to be working. If the DB is broken, other checks will handle it. """
return (True, None)
def verify_credentials(self, username_or_email, password):
""" Simply delegate to the model implementation. """
result = model.user.verify_user(username_or_email, password)

View file

@ -37,6 +37,12 @@ class ExternalJWTAuthN(FederatedUsers):
with open(public_key_path) as public_key_file:
self.public_key = public_key_file.read()
def ping(self):
result = self.client.get(self.getuser_url, timeout=2)
if result.status_code // 100 != 4:
return (False, result.text or 'Could not reach JWT authn endpoint')
return (True, None)
def get_user(self, username_or_email):
if self.getuser_url is None:

View file

@ -193,6 +193,18 @@ class LDAPUsers(FederatedUsers):
email = response.get(self._email_attr, [None])[0]
return (UserInformation(username=username, email=email, id=username), None)
def ping(self):
try:
with self._ldap.get_connection():
pass
except ldap.INVALID_CREDENTIALS:
return (False, 'LDAP Admin dn or password is invalid')
except ldap.LDAPError as lde:
logger.exception('Exception when trying to health check LDAP')
return (False, lde.message)
return (True, None)
def get_user(self, username_or_email):
""" Looks up a username or email in LDAP. """
logger.debug('Looking up LDAP username or email %s', username_or_email)

View file

@ -36,6 +36,21 @@ class KeystoneV2Users(FederatedUsers):
self.debug = os.environ.get('USERS_DEBUG') == '1'
self.requires_email = requires_email
def ping(self):
try:
keystone_client = kclient.Client(username=self.admin_username, password=self.admin_password,
tenant_name=self.admin_tenant, auth_url=self.auth_url,
timeout=self.timeout, debug=self.debug)
keystone_client.user_id # Make sure we loaded a valid user.
except KeystoneUnauthorized as kut:
logger.exception('Keystone unauthorized admin')
return (False, 'Keystone admin credentials are invalid: %s' % kut.message)
except Exception:
logger.exception('Keystone unauthorized admin')
return (False, 'Keystone ping check failed: %s' % kut.message)
return (True, None)
def verify_credentials(self, username_or_email, password):
try:
keystone_client = kclient.Client(username=username_or_email, password=password,
@ -89,6 +104,18 @@ class KeystoneV3Users(FederatedUsers):
tenant_name=self.admin_tenant, auth_url=self.auth_url,
timeout=self.timeout, debug=self.debug)
def ping(self):
try:
self._get_admin_client().user_id # Make sure we loaded a valid user
except KeystoneUnauthorized as kut:
logger.exception('Keystone unauthorized admin')
return (False, 'Keystone admin credentials are invalid: %s' % kut.message)
except Exception:
logger.exception('Keystone unauthorized admin')
return (False, 'Keystone ping check failed: %s' % kut.message)
return (True, None)
def verify_credentials(self, username_or_email, password):
try:
keystone_client = kv3client.Client(username=username_or_email, password=password,

View file

@ -6,6 +6,7 @@ from data.database import model
from data.users.federated import DISABLED_MESSAGE
from test.test_ldap import mock_ldap
from test.test_keystone_auth import fake_keystone
from test.test_external_jwt_authn import fake_jwt
from test.fixtures import *
@ -34,3 +35,16 @@ def test_auth_createuser(auth_system_builder, user1, user2, config, app):
new_user, err = auth.verify_and_link_user(*user2)
assert new_user is None
assert err == DISABLED_MESSAGE
@pytest.mark.parametrize('auth_system_builder,auth_kwargs', [
(mock_ldap, {}),
(fake_keystone, {'version': 3}),
(fake_keystone, {'version': 2}),
(fake_jwt, {}),
])
def test_ping(auth_system_builder, auth_kwargs, app):
with auth_system_builder(**auth_kwargs) as auth:
status, err = auth.ping()
assert status
assert err is None

View file

@ -6,7 +6,7 @@ from datetime import timedelta, datetime
from cachetools import lru_cache
from flask import (abort, redirect, request, url_for, make_response, Response, render_template,
Blueprint, jsonify, send_file)
Blueprint, jsonify, send_file, session)
from flask_login import current_user
import features
@ -260,6 +260,7 @@ def privacy():
# TODO(jschorr): Remove this mirrored endpoint once we migrate ELB.
@web.route('/health', methods=['GET'])
@web.route('/health/instance', methods=['GET'])
@process_auth_or_cookie
@no_cache
def instance_health():
checker = get_healthchecker(app, config_provider, instance_keys)
@ -272,6 +273,7 @@ def instance_health():
# TODO(jschorr): Remove this mirrored endpoint once we migrate pingdom.
@web.route('/status', methods=['GET'])
@web.route('/health/endtoend', methods=['GET'])
@process_auth_or_cookie
@no_cache
def endtoend_health():
checker = get_healthchecker(app, config_provider, instance_keys)
@ -283,6 +285,7 @@ def endtoend_health():
@web.route('/health/dbrevision', methods=['GET'])
@route_show_if(features.BILLING) # Since this is only used in production.
@process_auth_or_cookie
@no_cache
def dbrevision_health():
# Find the revision from the database.
@ -305,6 +308,23 @@ def dbrevision_health():
return response
@web.route('/health/enabledebug/<secret>', methods=['GET'])
@no_cache
def enable_health_debug(secret):
if not secret:
abort(404)
if not app.config.get('ENABLE_HEALTH_DEBUG_SECRET'):
abort(404)
if app.config.get('ENABLE_HEALTH_DEBUG_SECRET') != secret:
abort(404)
session['health_debug'] = True
return make_response('Health check debug information enabled')
@web.route('/robots.txt', methods=['GET'])
def robots():
robots_txt = make_response(render_template('robots.txt', baseurl=get_app_url()))

View file

@ -1,5 +1,8 @@
import boto.rds2
import logging
from auth.permissions import SuperUserPermission
from flask import session
from health.services import check_all_services
logger = logging.getLogger(__name__)
@ -46,21 +49,40 @@ class HealthCheck(object):
is_healthy = True
notes = notes or []
service_statuses_bools = {}
service_status_expanded = {}
for service_name in service_statuses:
status, err = service_statuses[service_name]
service_statuses_bools[service_name] = status
service_status_expanded[service_name] = {
'status': status,
}
if not status:
service_status_expanded[service_name]['failure'] = err
if skip and service_name in skip:
notes.append('%s skipped in compute health' % service_name)
continue
is_healthy = is_healthy and service_statuses[service_name]
is_healthy = is_healthy and status
data = {
'services': service_statuses,
'notes': notes,
'is_testing': self.app.config['TESTING'],
'config_provider': self.config_provider.provider_id,
'local_service_key_id': self.instance_keys.local_key_id,
'services': service_statuses_bools,
}
add_debug_information = SuperUserPermission().can() or session.get('health_debug', False)
if add_debug_information:
data.update({
'services_expanded': service_status_expanded,
'notes': notes,
'is_testing': self.app.config['TESTING'],
'config_provider': self.config_provider.provider_id,
'local_service_key_id': self.instance_keys.local_key_id,
})
return (data, 200 if is_healthy else 503)
@classmethod

View file

@ -1,5 +1,5 @@
import logging
from app import build_logs, storage
from app import build_logs, storage, authentication
from health.models_pre_oci import pre_oci_model as model
logger = logging.getLogger(__name__)
@ -21,10 +21,11 @@ def _check_registry_gunicorn(app):
registry_url = '%s://localhost%s/v1/_internal_ping' % (scheme, port)
try:
return client.get(registry_url, verify=False, timeout=2).status_code == 200
except Exception:
status_code = client.get(registry_url, verify=False, timeout=2).status_code
return (status_code == 200, 'Got non-200 response for registry: %s' % status_code)
except Exception as ex:
logger.exception('Exception when checking registry health: %s', registry_url)
return False
return (False, 'Exception when checking registry health: %s' % registry_url)
def _check_database(app):
@ -41,10 +42,14 @@ def _check_storage(app):
""" Returns the status of storage, as accessed from this instance. """
try:
storage.validate(storage.preferred_locations, app.config['HTTPCLIENT'])
return True
return (True, None)
except Exception as ex:
logger.exception('Storage check failed with exception %s', ex)
return False
return (False, 'Storage check failed with exception %s' % ex.message)
def _check_auth(app):
""" Returns the status of the auth engine, as accessed from this instance. """
return authentication.ping()
_SERVICES = {
@ -52,6 +57,7 @@ _SERVICES = {
'database': _check_database,
'redis': _check_redis,
'storage': _check_storage,
'auth': _check_auth,
}
def check_all_services(app, skip):