Merge pull request #2636 from coreos-inc/auth-health-check
Add support to health checks for auth and make health checks more descriptive
This commit is contained in:
commit
38d3417ca7
12 changed files with 140 additions and 24 deletions
|
@ -471,3 +471,6 @@ class DefaultConfig(ImmutableConfig):
|
||||||
|
|
||||||
# Feature Flag: Whether users can view and change their tag expiration.
|
# Feature Flag: Whether users can view and change their tag expiration.
|
||||||
FEATURE_CHANGE_TAG_EXPIRATION = True
|
FEATURE_CHANGE_TAG_EXPIRATION = True
|
||||||
|
|
||||||
|
# Defines a secret for enabling the health-check endpoint's debug information.
|
||||||
|
ENABLE_HEALTH_DEBUG_SECRET = None
|
||||||
|
|
|
@ -120,15 +120,14 @@ class RedisBuildLogs(object):
|
||||||
|
|
||||||
connection = redis.StrictRedis(**args)
|
connection = redis.StrictRedis(**args)
|
||||||
if not connection.ping() == True:
|
if not connection.ping() == True:
|
||||||
return False
|
return (False, 'Could not ping redis')
|
||||||
|
|
||||||
# Ensure we can write and read a key.
|
# Ensure we can write and read a key.
|
||||||
connection.set(self._health_key(), time.time())
|
connection.set(self._health_key(), time.time())
|
||||||
connection.get(self._health_key())
|
connection.get(self._health_key())
|
||||||
|
return (True, None)
|
||||||
return True
|
except redis.RedisError as re:
|
||||||
except redis.RedisError:
|
return (False, 'Could not connect to redis: %s' % re.message)
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
class BuildLogs(object):
|
class BuildLogs(object):
|
||||||
|
|
|
@ -11,12 +11,11 @@ def check_health(app_config):
|
||||||
# check).
|
# check).
|
||||||
try:
|
try:
|
||||||
validate_database_url(app_config['DB_URI'], {}, connect_timeout=3)
|
validate_database_url(app_config['DB_URI'], {}, connect_timeout=3)
|
||||||
except Exception:
|
except Exception as ex:
|
||||||
logger.exception('Could not connect to the database')
|
return (False, 'Could not connect to the database: %s', ex.message)
|
||||||
return False
|
|
||||||
|
|
||||||
# We will connect to the db, check that it contains some team role kinds
|
# We will connect to the db, check that it contains some team role kinds
|
||||||
try:
|
try:
|
||||||
return bool(list(TeamRole.select().limit(1)))
|
return (bool(list(TeamRole.select().limit(1))), 'Could not connect to the database')
|
||||||
except:
|
except Exception as ex:
|
||||||
return False
|
return (False, 'Could not connect to the database: %s', ex.message)
|
||||||
|
|
|
@ -150,6 +150,10 @@ class UserAuthentication(object):
|
||||||
|
|
||||||
return data.get('password', encrypted)
|
return data.get('password', encrypted)
|
||||||
|
|
||||||
|
def ping(self):
  """ Reports whether the authentication engine is reachable and working, by delegating to
      the engine held in `self.state` and passing its `ping()` result through unchanged.
  """
  engine = self.state
  return engine.ping()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def federated_service(self):
|
def federated_service(self):
|
||||||
""" Returns the name of the federated service for the auth system. If none, should return None.
|
""" Returns the name of the federated service for the auth system. If none, should return None.
|
||||||
|
|
|
@ -5,6 +5,10 @@ class DatabaseUsers(object):
|
||||||
def federated_service(self):
|
def federated_service(self):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def ping(self):
  """ Health check for database-backed auth. Unconditionally reports success: if the DB is
      broken, the other checks are expected to handle it.
  """
  healthy, failure = True, None
  return (healthy, failure)
|
||||||
|
|
||||||
def verify_credentials(self, username_or_email, password):
|
def verify_credentials(self, username_or_email, password):
|
||||||
""" Simply delegate to the model implementation. """
|
""" Simply delegate to the model implementation. """
|
||||||
result = model.user.verify_user(username_or_email, password)
|
result = model.user.verify_user(username_or_email, password)
|
||||||
|
|
|
@ -37,6 +37,12 @@ class ExternalJWTAuthN(FederatedUsers):
|
||||||
with open(public_key_path) as public_key_file:
|
with open(public_key_path) as public_key_file:
|
||||||
self.public_key = public_key_file.read()
|
self.public_key = public_key_file.read()
|
||||||
|
|
||||||
|
def ping(self):
  """ Checks that the JWT authentication endpoint is reachable.

      Issues a GET against `self.getuser_url` with a two second timeout and treats any 4xx
      response as healthy. Returns a `(status, error_message)` tuple; the message is None
      on success.
  """
  try:
    result = self.client.get(self.getuser_url, timeout=2)
  except Exception as ex:
    # A connection failure or timeout must be reported as an unhealthy status rather than
    # escaping and crashing the health check itself.
    return (False, 'Could not reach JWT authn endpoint: %s' % ex)

  # Only a 4xx response is accepted as proof that the endpoint is up.
  # NOTE(review): presumably because the bare request carries no credentials -- confirm.
  if result.status_code // 100 != 4:
    return (False, result.text or 'Could not reach JWT authn endpoint')

  return (True, None)
|
||||||
|
|
||||||
def get_user(self, username_or_email):
|
def get_user(self, username_or_email):
|
||||||
if self.getuser_url is None:
|
if self.getuser_url is None:
|
||||||
|
|
|
@ -193,6 +193,18 @@ class LDAPUsers(FederatedUsers):
|
||||||
email = response.get(self._email_attr, [None])[0]
|
email = response.get(self._email_attr, [None])[0]
|
||||||
return (UserInformation(username=username, email=email, id=username), None)
|
return (UserInformation(username=username, email=email, id=username), None)
|
||||||
|
|
||||||
|
def ping(self):
  """ Health check for LDAP: enters and immediately leaves the context manager returned by
      `self._ldap.get_connection()`. Returns `(True, None)` when that succeeds and
      `(False, message)` when the LDAP library raises.
  """
  outcome = (True, None)
  try:
    # Entering the context manager is the whole check; the connection itself is not used.
    with self._ldap.get_connection():
      pass
  except ldap.INVALID_CREDENTIALS:
    outcome = (False, 'LDAP Admin dn or password is invalid')
  except ldap.LDAPError as ldap_error:
    logger.exception('Exception when trying to health check LDAP')
    outcome = (False, ldap_error.message)

  return outcome
|
||||||
|
|
||||||
def get_user(self, username_or_email):
|
def get_user(self, username_or_email):
|
||||||
""" Looks up a username or email in LDAP. """
|
""" Looks up a username or email in LDAP. """
|
||||||
logger.debug('Looking up LDAP username or email %s', username_or_email)
|
logger.debug('Looking up LDAP username or email %s', username_or_email)
|
||||||
|
|
|
@ -36,6 +36,21 @@ class KeystoneV2Users(FederatedUsers):
|
||||||
self.debug = os.environ.get('USERS_DEBUG') == '1'
|
self.debug = os.environ.get('USERS_DEBUG') == '1'
|
||||||
self.requires_email = requires_email
|
self.requires_email = requires_email
|
||||||
|
|
||||||
|
def ping(self):
  """ Checks that Keystone is reachable and that the configured admin credentials work.

      Builds a client from the admin username, password and tenant and reads its `user_id`.
      Returns a `(status, error_message)` tuple; the message is None on success.
  """
  try:
    keystone_client = kclient.Client(username=self.admin_username, password=self.admin_password,
                                     tenant_name=self.admin_tenant, auth_url=self.auth_url,
                                     timeout=self.timeout, debug=self.debug)
    keystone_client.user_id  # Make sure we loaded a valid user.
  except KeystoneUnauthorized as kut:
    logger.exception('Keystone unauthorized admin')
    return (False, 'Keystone admin credentials are invalid: %s' % kut.message)
  except Exception as ex:
    # Bind the exception here: `kut` only exists in the handler above, so referencing it in
    # this branch raised a NameError instead of reporting the failure.
    logger.exception('Keystone ping check failed')
    return (False, 'Keystone ping check failed: %s' % ex)

  return (True, None)
|
||||||
|
|
||||||
def verify_credentials(self, username_or_email, password):
|
def verify_credentials(self, username_or_email, password):
|
||||||
try:
|
try:
|
||||||
keystone_client = kclient.Client(username=username_or_email, password=password,
|
keystone_client = kclient.Client(username=username_or_email, password=password,
|
||||||
|
@ -89,6 +104,18 @@ class KeystoneV3Users(FederatedUsers):
|
||||||
tenant_name=self.admin_tenant, auth_url=self.auth_url,
|
tenant_name=self.admin_tenant, auth_url=self.auth_url,
|
||||||
timeout=self.timeout, debug=self.debug)
|
timeout=self.timeout, debug=self.debug)
|
||||||
|
|
||||||
|
def ping(self):
  """ Checks that Keystone is reachable and that the configured admin credentials work.

      Obtains the admin client via `self._get_admin_client()` and reads its `user_id`.
      Returns a `(status, error_message)` tuple; the message is None on success.
  """
  try:
    self._get_admin_client().user_id  # Make sure we loaded a valid user
  except KeystoneUnauthorized as kut:
    logger.exception('Keystone unauthorized admin')
    return (False, 'Keystone admin credentials are invalid: %s' % kut.message)
  except Exception as ex:
    # Bind the exception here: `kut` only exists in the handler above, so referencing it in
    # this branch raised a NameError instead of reporting the failure.
    logger.exception('Keystone ping check failed')
    return (False, 'Keystone ping check failed: %s' % ex)

  return (True, None)
|
||||||
|
|
||||||
def verify_credentials(self, username_or_email, password):
|
def verify_credentials(self, username_or_email, password):
|
||||||
try:
|
try:
|
||||||
keystone_client = kv3client.Client(username=username_or_email, password=password,
|
keystone_client = kv3client.Client(username=username_or_email, password=password,
|
||||||
|
|
|
@ -6,6 +6,7 @@ from data.database import model
|
||||||
from data.users.federated import DISABLED_MESSAGE
|
from data.users.federated import DISABLED_MESSAGE
|
||||||
from test.test_ldap import mock_ldap
|
from test.test_ldap import mock_ldap
|
||||||
from test.test_keystone_auth import fake_keystone
|
from test.test_keystone_auth import fake_keystone
|
||||||
|
from test.test_external_jwt_authn import fake_jwt
|
||||||
|
|
||||||
from test.fixtures import *
|
from test.fixtures import *
|
||||||
|
|
||||||
|
@ -34,3 +35,16 @@ def test_auth_createuser(auth_system_builder, user1, user2, config, app):
|
||||||
new_user, err = auth.verify_and_link_user(*user2)
|
new_user, err = auth.verify_and_link_user(*user2)
|
||||||
assert new_user is None
|
assert new_user is None
|
||||||
assert err == DISABLED_MESSAGE
|
assert err == DISABLED_MESSAGE
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('auth_system_builder,auth_kwargs', [
  (mock_ldap, {}),
  (fake_keystone, {'version': 3}),
  (fake_keystone, {'version': 2}),
  (fake_jwt, {}),
])
def test_ping(auth_system_builder, auth_kwargs, app):
  """ Every parametrized auth system should report a healthy ping with no error message. """
  with auth_system_builder(**auth_kwargs) as auth_engine:
    healthy, failure = auth_engine.ping()
    assert healthy
    assert failure is None
|
||||||
|
|
|
@ -6,7 +6,7 @@ from datetime import timedelta, datetime
|
||||||
|
|
||||||
from cachetools import lru_cache
|
from cachetools import lru_cache
|
||||||
from flask import (abort, redirect, request, url_for, make_response, Response, render_template,
|
from flask import (abort, redirect, request, url_for, make_response, Response, render_template,
|
||||||
Blueprint, jsonify, send_file)
|
Blueprint, jsonify, send_file, session)
|
||||||
from flask_login import current_user
|
from flask_login import current_user
|
||||||
|
|
||||||
import features
|
import features
|
||||||
|
@ -260,6 +260,7 @@ def privacy():
|
||||||
# TODO(jschorr): Remove this mirrored endpoint once we migrate ELB.
|
# TODO(jschorr): Remove this mirrored endpoint once we migrate ELB.
|
||||||
@web.route('/health', methods=['GET'])
|
@web.route('/health', methods=['GET'])
|
||||||
@web.route('/health/instance', methods=['GET'])
|
@web.route('/health/instance', methods=['GET'])
|
||||||
|
@process_auth_or_cookie
|
||||||
@no_cache
|
@no_cache
|
||||||
def instance_health():
|
def instance_health():
|
||||||
checker = get_healthchecker(app, config_provider, instance_keys)
|
checker = get_healthchecker(app, config_provider, instance_keys)
|
||||||
|
@ -272,6 +273,7 @@ def instance_health():
|
||||||
# TODO(jschorr): Remove this mirrored endpoint once we migrate pingdom.
|
# TODO(jschorr): Remove this mirrored endpoint once we migrate pingdom.
|
||||||
@web.route('/status', methods=['GET'])
|
@web.route('/status', methods=['GET'])
|
||||||
@web.route('/health/endtoend', methods=['GET'])
|
@web.route('/health/endtoend', methods=['GET'])
|
||||||
|
@process_auth_or_cookie
|
||||||
@no_cache
|
@no_cache
|
||||||
def endtoend_health():
|
def endtoend_health():
|
||||||
checker = get_healthchecker(app, config_provider, instance_keys)
|
checker = get_healthchecker(app, config_provider, instance_keys)
|
||||||
|
@ -283,6 +285,7 @@ def endtoend_health():
|
||||||
|
|
||||||
@web.route('/health/dbrevision', methods=['GET'])
|
@web.route('/health/dbrevision', methods=['GET'])
|
||||||
@route_show_if(features.BILLING) # Since this is only used in production.
|
@route_show_if(features.BILLING) # Since this is only used in production.
|
||||||
|
@process_auth_or_cookie
|
||||||
@no_cache
|
@no_cache
|
||||||
def dbrevision_health():
|
def dbrevision_health():
|
||||||
# Find the revision from the database.
|
# Find the revision from the database.
|
||||||
|
@ -305,6 +308,23 @@ def dbrevision_health():
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
@web.route('/health/enabledebug/<secret>', methods=['GET'])
@no_cache
def enable_health_debug(secret):
  """ Turns on health check debug information for the current session when the secret in the
      URL matches the ENABLE_HEALTH_DEBUG_SECRET config value. Responds with a 404 if either
      value is empty/unset or the two differ.
  """
  expected_secret = app.config.get('ENABLE_HEALTH_DEBUG_SECRET')
  if not secret or not expected_secret or expected_secret != secret:
    abort(404)

  # The flag is stored in the session and read back when health data is assembled.
  session['health_debug'] = True
  return make_response('Health check debug information enabled')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@web.route('/robots.txt', methods=['GET'])
|
@web.route('/robots.txt', methods=['GET'])
|
||||||
def robots():
|
def robots():
|
||||||
robots_txt = make_response(render_template('robots.txt', baseurl=get_app_url()))
|
robots_txt = make_response(render_template('robots.txt', baseurl=get_app_url()))
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
import boto.rds2
|
import boto.rds2
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
from auth.permissions import SuperUserPermission
|
||||||
|
from flask import session
|
||||||
from health.services import check_all_services
|
from health.services import check_all_services
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
@ -46,21 +49,40 @@ class HealthCheck(object):
|
||||||
is_healthy = True
|
is_healthy = True
|
||||||
notes = notes or []
|
notes = notes or []
|
||||||
|
|
||||||
|
service_statuses_bools = {}
|
||||||
|
service_status_expanded = {}
|
||||||
|
|
||||||
for service_name in service_statuses:
|
for service_name in service_statuses:
|
||||||
|
status, err = service_statuses[service_name]
|
||||||
|
|
||||||
|
service_statuses_bools[service_name] = status
|
||||||
|
service_status_expanded[service_name] = {
|
||||||
|
'status': status,
|
||||||
|
}
|
||||||
|
|
||||||
|
if not status:
|
||||||
|
service_status_expanded[service_name]['failure'] = err
|
||||||
|
|
||||||
if skip and service_name in skip:
|
if skip and service_name in skip:
|
||||||
notes.append('%s skipped in compute health' % service_name)
|
notes.append('%s skipped in compute health' % service_name)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
is_healthy = is_healthy and service_statuses[service_name]
|
is_healthy = is_healthy and status
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
'services': service_statuses,
|
'services': service_statuses_bools,
|
||||||
'notes': notes,
|
|
||||||
'is_testing': self.app.config['TESTING'],
|
|
||||||
'config_provider': self.config_provider.provider_id,
|
|
||||||
'local_service_key_id': self.instance_keys.local_key_id,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
add_debug_information = SuperUserPermission().can() or session.get('health_debug', False)
|
||||||
|
if add_debug_information:
|
||||||
|
data.update({
|
||||||
|
'services_expanded': service_status_expanded,
|
||||||
|
'notes': notes,
|
||||||
|
'is_testing': self.app.config['TESTING'],
|
||||||
|
'config_provider': self.config_provider.provider_id,
|
||||||
|
'local_service_key_id': self.instance_keys.local_key_id,
|
||||||
|
})
|
||||||
|
|
||||||
return (data, 200 if is_healthy else 503)
|
return (data, 200 if is_healthy else 503)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import logging
|
import logging
|
||||||
from app import build_logs, storage
|
from app import build_logs, storage, authentication
|
||||||
from health.models_pre_oci import pre_oci_model as model
|
from health.models_pre_oci import pre_oci_model as model
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
@ -21,10 +21,11 @@ def _check_registry_gunicorn(app):
|
||||||
|
|
||||||
registry_url = '%s://localhost%s/v1/_internal_ping' % (scheme, port)
|
registry_url = '%s://localhost%s/v1/_internal_ping' % (scheme, port)
|
||||||
try:
|
try:
|
||||||
return client.get(registry_url, verify=False, timeout=2).status_code == 200
|
status_code = client.get(registry_url, verify=False, timeout=2).status_code
|
||||||
except Exception:
|
return (status_code == 200, 'Got non-200 response for registry: %s' % status_code)
|
||||||
|
except Exception as ex:
|
||||||
logger.exception('Exception when checking registry health: %s', registry_url)
|
logger.exception('Exception when checking registry health: %s', registry_url)
|
||||||
return False
|
return (False, 'Exception when checking registry health: %s' % registry_url)
|
||||||
|
|
||||||
|
|
||||||
def _check_database(app):
|
def _check_database(app):
|
||||||
|
@ -41,10 +42,14 @@ def _check_storage(app):
|
||||||
""" Returns the status of storage, as accessed from this instance. """
|
""" Returns the status of storage, as accessed from this instance. """
|
||||||
try:
|
try:
|
||||||
storage.validate(storage.preferred_locations, app.config['HTTPCLIENT'])
|
storage.validate(storage.preferred_locations, app.config['HTTPCLIENT'])
|
||||||
return True
|
return (True, None)
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.exception('Storage check failed with exception %s', ex)
|
logger.exception('Storage check failed with exception %s', ex)
|
||||||
return False
|
return (False, 'Storage check failed with exception %s' % ex.message)
|
||||||
|
|
||||||
|
def _check_auth(app):
  """ Reports the status of the auth engine, as accessed from this instance, by pinging it. """
  auth_status = authentication.ping()
  return auth_status
|
||||||
|
|
||||||
|
|
||||||
_SERVICES = {
|
_SERVICES = {
|
||||||
|
@ -52,6 +57,7 @@ _SERVICES = {
|
||||||
'database': _check_database,
|
'database': _check_database,
|
||||||
'redis': _check_redis,
|
'redis': _check_redis,
|
||||||
'storage': _check_storage,
|
'storage': _check_storage,
|
||||||
|
'auth': _check_auth,
|
||||||
}
|
}
|
||||||
|
|
||||||
def check_all_services(app, skip):
|
def check_all_services(app, skip):
|
||||||
|
|
Reference in a new issue