Merge pull request #2636 from coreos-inc/auth-health-check
Add support to health checks for auth and make health checks more descriptive
This commit is contained in:
commit
38d3417ca7
12 changed files with 140 additions and 24 deletions
|
@ -471,3 +471,6 @@ class DefaultConfig(ImmutableConfig):
|
|||
|
||||
# Feature Flag: Whether users can view and change their tag expiration.
|
||||
FEATURE_CHANGE_TAG_EXPIRATION = True
|
||||
|
||||
# Defines a secret for enabling the health-check endpoint's debug information.
|
||||
ENABLE_HEALTH_DEBUG_SECRET = None
|
||||
|
|
|
@ -120,15 +120,14 @@ class RedisBuildLogs(object):
|
|||
|
||||
connection = redis.StrictRedis(**args)
|
||||
if not connection.ping() == True:
|
||||
return False
|
||||
return (False, 'Could not ping redis')
|
||||
|
||||
# Ensure we can write and read a key.
|
||||
connection.set(self._health_key(), time.time())
|
||||
connection.get(self._health_key())
|
||||
|
||||
return True
|
||||
except redis.RedisError:
|
||||
return False
|
||||
return (True, None)
|
||||
except redis.RedisError as re:
|
||||
return (False, 'Could not connect to redis: %s' % re.message)
|
||||
|
||||
|
||||
class BuildLogs(object):
|
||||
|
|
|
@ -11,12 +11,11 @@ def check_health(app_config):
|
|||
# check).
|
||||
try:
|
||||
validate_database_url(app_config['DB_URI'], {}, connect_timeout=3)
|
||||
except Exception:
|
||||
logger.exception('Could not connect to the database')
|
||||
return False
|
||||
except Exception as ex:
|
||||
return (False, 'Could not connect to the database: %s', ex.message)
|
||||
|
||||
# We will connect to the db, check that it contains some team role kinds
|
||||
try:
|
||||
return bool(list(TeamRole.select().limit(1)))
|
||||
except:
|
||||
return False
|
||||
return (bool(list(TeamRole.select().limit(1))), 'Could not connect to the database')
|
||||
except Exception as ex:
|
||||
return (False, 'Could not connect to the database: %s', ex.message)
|
||||
|
|
|
@ -150,6 +150,10 @@ class UserAuthentication(object):
|
|||
|
||||
return data.get('password', encrypted)
|
||||
|
||||
def ping(self):
  """ Checks whether the authentication engine is reachable and working.

      Delegates to the underlying auth state and hands back its ping() result unchanged.
  """
  engine_status = self.state.ping()
  return engine_status
|
||||
|
||||
@property
|
||||
def federated_service(self):
|
||||
""" Returns the name of the federated service for the auth system. If none, should return None.
|
||||
|
|
|
@ -5,6 +5,10 @@ class DatabaseUsers(object):
|
|||
def federated_service(self):
|
||||
return None
|
||||
|
||||
def ping(self):
  """ Reports the database-backed auth engine as healthy, unconditionally.

      No probe is performed here: if the DB is broken, other checks will handle it.
  """
  healthy, failure = True, None
  return (healthy, failure)
|
||||
|
||||
def verify_credentials(self, username_or_email, password):
|
||||
""" Simply delegate to the model implementation. """
|
||||
result = model.user.verify_user(username_or_email, password)
|
||||
|
|
|
@ -37,6 +37,12 @@ class ExternalJWTAuthN(FederatedUsers):
|
|||
with open(public_key_path) as public_key_file:
|
||||
self.public_key = public_key_file.read()
|
||||
|
||||
def ping(self):
  """ Checks whether the external JWT authn endpoint is reachable.

      Issues a GET against the getuser URL with a two second timeout. A 4xx response is treated
      as healthy; any other status code is reported as a failure.
      NOTE(review): presumably a 4xx is expected because the probe sends no credentials - confirm.

      Returns a (status, error_message) tuple; error_message is None on success.
  """
  try:
    result = self.client.get(self.getuser_url, timeout=2)
  except Exception as ex:
    # A timeout or connection error is a failed check, not a crash: report it using the same
    # (status, message) shape that every other outcome of this method uses.
    return (False, 'Could not reach JWT authn endpoint: %s' % ex)

  if result.status_code // 100 != 4:
    return (False, result.text or 'Could not reach JWT authn endpoint')

  return (True, None)
|
||||
|
||||
def get_user(self, username_or_email):
|
||||
if self.getuser_url is None:
|
||||
|
|
|
@ -193,6 +193,18 @@ class LDAPUsers(FederatedUsers):
|
|||
email = response.get(self._email_attr, [None])[0]
|
||||
return (UserInformation(username=username, email=email, id=username), None)
|
||||
|
||||
def ping(self):
  """ Verifies that a connection to the LDAP server can be established.

      Returns a (status, error_message) tuple; error_message is None on success.
  """
  try:
    # Opening (and immediately releasing) a connection is the entire check.
    with self._ldap.get_connection():
      pass
  except ldap.INVALID_CREDENTIALS:
    failure = 'LDAP Admin dn or password is invalid'
  except ldap.LDAPError as lde:
    logger.exception('Exception when trying to health check LDAP')
    failure = lde.message
  else:
    return (True, None)

  return (False, failure)
|
||||
|
||||
def get_user(self, username_or_email):
|
||||
""" Looks up a username or email in LDAP. """
|
||||
logger.debug('Looking up LDAP username or email %s', username_or_email)
|
||||
|
|
|
@ -36,6 +36,21 @@ class KeystoneV2Users(FederatedUsers):
|
|||
self.debug = os.environ.get('USERS_DEBUG') == '1'
|
||||
self.requires_email = requires_email
|
||||
|
||||
def ping(self):
  """ Checks that a Keystone client can be created with the configured admin credentials.

      Returns a (status, error_message) tuple; error_message is None on success.
  """
  try:
    keystone_client = kclient.Client(username=self.admin_username, password=self.admin_password,
                                     tenant_name=self.admin_tenant, auth_url=self.auth_url,
                                     timeout=self.timeout, debug=self.debug)
    keystone_client.user_id # Make sure we loaded a valid user.
  except KeystoneUnauthorized as kut:
    logger.exception('Keystone unauthorized admin')
    return (False, 'Keystone admin credentials are invalid: %s' % kut.message)
  except Exception as ex:
    # Format the exception caught by *this* handler; `kut` is only bound in the handler above.
    logger.exception('Keystone ping check failed')
    return (False, 'Keystone ping check failed: %s' % ex)

  return (True, None)
|
||||
|
||||
def verify_credentials(self, username_or_email, password):
|
||||
try:
|
||||
keystone_client = kclient.Client(username=username_or_email, password=password,
|
||||
|
@ -89,6 +104,18 @@ class KeystoneV3Users(FederatedUsers):
|
|||
tenant_name=self.admin_tenant, auth_url=self.auth_url,
|
||||
timeout=self.timeout, debug=self.debug)
|
||||
|
||||
def ping(self):
  """ Checks that the admin Keystone client can be obtained and exposes a user id.

      Returns a (status, error_message) tuple; error_message is None on success.
  """
  try:
    self._get_admin_client().user_id # Make sure we loaded a valid user
  except KeystoneUnauthorized as kut:
    logger.exception('Keystone unauthorized admin')
    return (False, 'Keystone admin credentials are invalid: %s' % kut.message)
  except Exception as ex:
    # Format the exception caught by *this* handler; `kut` is only bound in the handler above.
    logger.exception('Keystone ping check failed')
    return (False, 'Keystone ping check failed: %s' % ex)

  return (True, None)
|
||||
|
||||
def verify_credentials(self, username_or_email, password):
|
||||
try:
|
||||
keystone_client = kv3client.Client(username=username_or_email, password=password,
|
||||
|
|
|
@ -6,6 +6,7 @@ from data.database import model
|
|||
from data.users.federated import DISABLED_MESSAGE
|
||||
from test.test_ldap import mock_ldap
|
||||
from test.test_keystone_auth import fake_keystone
|
||||
from test.test_external_jwt_authn import fake_jwt
|
||||
|
||||
from test.fixtures import *
|
||||
|
||||
|
@ -34,3 +35,16 @@ def test_auth_createuser(auth_system_builder, user1, user2, config, app):
|
|||
new_user, err = auth.verify_and_link_user(*user2)
|
||||
assert new_user is None
|
||||
assert err == DISABLED_MESSAGE
|
||||
|
||||
|
||||
@pytest.mark.parametrize('auth_system_builder,auth_kwargs', [
  (mock_ldap, {}),
  (fake_keystone, {'version': 3}),
  (fake_keystone, {'version': 2}),
  (fake_jwt, {}),
])
def test_ping(auth_system_builder, auth_kwargs, app):
  """ Every parametrized auth system must ping successfully and report no error. """
  with auth_system_builder(**auth_kwargs) as auth_engine:
    healthy, failure = auth_engine.ping()
    assert healthy
    assert failure is None
|
||||
|
|
|
@ -6,7 +6,7 @@ from datetime import timedelta, datetime
|
|||
|
||||
from cachetools import lru_cache
|
||||
from flask import (abort, redirect, request, url_for, make_response, Response, render_template,
|
||||
Blueprint, jsonify, send_file)
|
||||
Blueprint, jsonify, send_file, session)
|
||||
from flask_login import current_user
|
||||
|
||||
import features
|
||||
|
@ -260,6 +260,7 @@ def privacy():
|
|||
# TODO(jschorr): Remove this mirrored endpoint once we migrate ELB.
|
||||
@web.route('/health', methods=['GET'])
|
||||
@web.route('/health/instance', methods=['GET'])
|
||||
@process_auth_or_cookie
|
||||
@no_cache
|
||||
def instance_health():
|
||||
checker = get_healthchecker(app, config_provider, instance_keys)
|
||||
|
@ -272,6 +273,7 @@ def instance_health():
|
|||
# TODO(jschorr): Remove this mirrored endpoint once we migrate pingdom.
|
||||
@web.route('/status', methods=['GET'])
|
||||
@web.route('/health/endtoend', methods=['GET'])
|
||||
@process_auth_or_cookie
|
||||
@no_cache
|
||||
def endtoend_health():
|
||||
checker = get_healthchecker(app, config_provider, instance_keys)
|
||||
|
@ -283,6 +285,7 @@ def endtoend_health():
|
|||
|
||||
@web.route('/health/dbrevision', methods=['GET'])
|
||||
@route_show_if(features.BILLING) # Since this is only used in production.
|
||||
@process_auth_or_cookie
|
||||
@no_cache
|
||||
def dbrevision_health():
|
||||
# Find the revision from the database.
|
||||
|
@ -305,6 +308,23 @@ def dbrevision_health():
|
|||
return response
|
||||
|
||||
|
||||
@web.route('/health/enabledebug/<secret>', methods=['GET'])
@no_cache
def enable_health_debug(secret):
  """ Turns on health-check debug information for the current session.

      Responds with a 404 unless a debug secret is configured and the secret given in the URL
      matches it.
  """
  configured_secret = app.config.get('ENABLE_HEALTH_DEBUG_SECRET')

  # Any missing or mismatching secret gets the same 404.
  if not secret or not configured_secret or configured_secret != secret:
    abort(404)

  session['health_debug'] = True
  return make_response('Health check debug information enabled')
|
||||
|
||||
|
||||
|
||||
@web.route('/robots.txt', methods=['GET'])
|
||||
def robots():
|
||||
robots_txt = make_response(render_template('robots.txt', baseurl=get_app_url()))
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
import boto.rds2
|
||||
import logging
|
||||
|
||||
from auth.permissions import SuperUserPermission
|
||||
from flask import session
|
||||
from health.services import check_all_services
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -46,21 +49,40 @@ class HealthCheck(object):
|
|||
is_healthy = True
|
||||
notes = notes or []
|
||||
|
||||
service_statuses_bools = {}
|
||||
service_status_expanded = {}
|
||||
|
||||
for service_name in service_statuses:
|
||||
status, err = service_statuses[service_name]
|
||||
|
||||
service_statuses_bools[service_name] = status
|
||||
service_status_expanded[service_name] = {
|
||||
'status': status,
|
||||
}
|
||||
|
||||
if not status:
|
||||
service_status_expanded[service_name]['failure'] = err
|
||||
|
||||
if skip and service_name in skip:
|
||||
notes.append('%s skipped in compute health' % service_name)
|
||||
continue
|
||||
|
||||
is_healthy = is_healthy and service_statuses[service_name]
|
||||
is_healthy = is_healthy and status
|
||||
|
||||
data = {
|
||||
'services': service_statuses,
|
||||
'notes': notes,
|
||||
'is_testing': self.app.config['TESTING'],
|
||||
'config_provider': self.config_provider.provider_id,
|
||||
'local_service_key_id': self.instance_keys.local_key_id,
|
||||
'services': service_statuses_bools,
|
||||
}
|
||||
|
||||
add_debug_information = SuperUserPermission().can() or session.get('health_debug', False)
|
||||
if add_debug_information:
|
||||
data.update({
|
||||
'services_expanded': service_status_expanded,
|
||||
'notes': notes,
|
||||
'is_testing': self.app.config['TESTING'],
|
||||
'config_provider': self.config_provider.provider_id,
|
||||
'local_service_key_id': self.instance_keys.local_key_id,
|
||||
})
|
||||
|
||||
return (data, 200 if is_healthy else 503)
|
||||
|
||||
@classmethod
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import logging
|
||||
from app import build_logs, storage
|
||||
from app import build_logs, storage, authentication
|
||||
from health.models_pre_oci import pre_oci_model as model
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -21,10 +21,11 @@ def _check_registry_gunicorn(app):
|
|||
|
||||
registry_url = '%s://localhost%s/v1/_internal_ping' % (scheme, port)
|
||||
try:
|
||||
return client.get(registry_url, verify=False, timeout=2).status_code == 200
|
||||
except Exception:
|
||||
status_code = client.get(registry_url, verify=False, timeout=2).status_code
|
||||
return (status_code == 200, 'Got non-200 response for registry: %s' % status_code)
|
||||
except Exception as ex:
|
||||
logger.exception('Exception when checking registry health: %s', registry_url)
|
||||
return False
|
||||
return (False, 'Exception when checking registry health: %s' % registry_url)
|
||||
|
||||
|
||||
def _check_database(app):
|
||||
|
@ -41,10 +42,14 @@ def _check_storage(app):
|
|||
""" Returns the status of storage, as accessed from this instance. """
|
||||
try:
|
||||
storage.validate(storage.preferred_locations, app.config['HTTPCLIENT'])
|
||||
return True
|
||||
return (True, None)
|
||||
except Exception as ex:
|
||||
logger.exception('Storage check failed with exception %s', ex)
|
||||
return False
|
||||
return (False, 'Storage check failed with exception %s' % ex.message)
|
||||
|
||||
def _check_auth(app):
  """ Pings the auth engine from this instance and returns the result of that ping. """
  auth_status = authentication.ping()
  return auth_status
|
||||
|
||||
|
||||
_SERVICES = {
|
||||
|
@ -52,6 +57,7 @@ _SERVICES = {
|
|||
'database': _check_database,
|
||||
'redis': _check_redis,
|
||||
'storage': _check_storage,
|
||||
'auth': _check_auth,
|
||||
}
|
||||
|
||||
def check_all_services(app, skip):
|
||||
|
|
Reference in a new issue