From 0dfb6806e3f2c76b91e99304184104ed06610953 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 10 May 2017 20:55:10 -0700 Subject: [PATCH 1/4] Add ping method to auth engines to determine if they are reachable --- data/users/__init__.py | 4 ++++ data/users/database.py | 4 ++++ data/users/externaljwt.py | 6 ++++++ data/users/externalldap.py | 12 ++++++++++++ data/users/keystone.py | 27 +++++++++++++++++++++++++++ data/users/test/test_users.py | 14 ++++++++++++++ 6 files changed, 67 insertions(+) diff --git a/data/users/__init__.py b/data/users/__init__.py index f8e47ba16..519947f8d 100644 --- a/data/users/__init__.py +++ b/data/users/__init__.py @@ -150,6 +150,10 @@ class UserAuthentication(object): return data.get('password', encrypted) + def ping(self): + """ Returns whether the authentication engine is reachable and working. """ + return self.state.ping() + @property def federated_service(self): """ Returns the name of the federated service for the auth system. If none, should return None. diff --git a/data/users/database.py b/data/users/database.py index 862065723..ce04b3c9d 100644 --- a/data/users/database.py +++ b/data/users/database.py @@ -5,6 +5,10 @@ class DatabaseUsers(object): def federated_service(self): return None + def ping(self): + """ Always assumed to be working. If the DB is broken, other checks will handle it. """ + return (True, None) + def verify_credentials(self, username_or_email, password): """ Simply delegate to the model implementation. """ result = model.user.verify_user(username_or_email, password) diff --git a/data/users/externaljwt.py b/data/users/externaljwt.py index 33e8f506b..64960a08b 100644 --- a/data/users/externaljwt.py +++ b/data/users/externaljwt.py @@ -37,6 +37,12 @@ class ExternalJWTAuthN(FederatedUsers): with open(public_key_path) as public_key_file: self.public_key = public_key_file.read() + def ping(self): + result = self.client.get(self.getuser_url, timeout=2) + if result.status_code // 100 != 4: + return (False, result.text or 'Could not reach JWT authn endpoint') + + return (True, None) def get_user(self, username_or_email): if self.getuser_url is None: diff --git a/data/users/externalldap.py b/data/users/externalldap.py index 9ed346ae4..f2c7e298f 100644 --- a/data/users/externalldap.py +++ b/data/users/externalldap.py @@ -193,6 +193,18 @@ class LDAPUsers(FederatedUsers): email = response.get(self._email_attr, [None])[0] return (UserInformation(username=username, email=email, id=username), None) + def ping(self): + try: + with self._ldap.get_connection(): + pass + except ldap.INVALID_CREDENTIALS: + return (False, 'LDAP Admin dn or password is invalid') + except ldap.LDAPError as lde: + logger.exception('Exception when trying to health check LDAP') + return (False, lde.message) + + return (True, None) + def get_user(self, username_or_email): """ Looks up a username or email in LDAP. """ logger.debug('Looking up LDAP username or email %s', username_or_email) diff --git a/data/users/keystone.py b/data/users/keystone.py index 8c46a7869..d6fa3c628 100644 --- a/data/users/keystone.py +++ b/data/users/keystone.py @@ -36,6 +36,21 @@ class KeystoneV2Users(FederatedUsers): self.debug = os.environ.get('USERS_DEBUG') == '1' self.requires_email = requires_email + def ping(self): + try: + keystone_client = kclient.Client(username=self.admin_username, password=self.admin_password, + tenant_name=self.admin_tenant, auth_url=self.auth_url, + timeout=self.timeout, debug=self.debug) + keystone_client.user_id # Make sure we loaded a valid user. + except KeystoneUnauthorized as kut: + logger.exception('Keystone unauthorized admin') + return (False, 'Keystone admin credentials are invalid: %s' % kut.message) + except Exception: + logger.exception('Keystone unauthorized admin') + return (False, 'Keystone ping check failed: %s' % kut.message) + + return (True, None) + def verify_credentials(self, username_or_email, password): try: keystone_client = kclient.Client(username=username_or_email, password=password, @@ -89,6 +104,18 @@ class KeystoneV3Users(FederatedUsers): tenant_name=self.admin_tenant, auth_url=self.auth_url, timeout=self.timeout, debug=self.debug) + def ping(self): + try: + self._get_admin_client().user_id # Make sure we loaded a valid user + except KeystoneUnauthorized as kut: + logger.exception('Keystone unauthorized admin') + return (False, 'Keystone admin credentials are invalid: %s' % kut.message) + except Exception: + logger.exception('Keystone unauthorized admin') + return (False, 'Keystone ping check failed: %s' % kut.message) + + return (True, None) + def verify_credentials(self, username_or_email, password): try: keystone_client = kv3client.Client(username=username_or_email, password=password, diff --git a/data/users/test/test_users.py b/data/users/test/test_users.py index 007332880..d3e5eb09d 100644 --- a/data/users/test/test_users.py +++ b/data/users/test/test_users.py @@ -6,6 +6,7 @@ from data.database import model from data.users.federated import DISABLED_MESSAGE from test.test_ldap import mock_ldap from test.test_keystone_auth import fake_keystone +from test.test_external_jwt_authn import fake_jwt from test.fixtures import * @@ -34,3 +35,16 @@ def test_auth_createuser(auth_system_builder, user1, user2, config, app): new_user, err = auth.verify_and_link_user(*user2) assert new_user is None assert err == DISABLED_MESSAGE + + +@pytest.mark.parametrize('auth_system_builder,auth_kwargs', [ + (mock_ldap, {}), + (fake_keystone, {'version': 3}), + (fake_keystone, {'version': 2}), + (fake_jwt, {}), +]) +def test_ping(auth_system_builder, auth_kwargs, app): + with auth_system_builder(**auth_kwargs) as auth: + status, err = auth.ping() + assert status + assert err is None From e44a503bd02c7839076233358fc1097354a142e6 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 10 May 2017 20:55:19 -0700 Subject: [PATCH 2/4] Add status check for auth endpoint --- health/services.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/health/services.py b/health/services.py index 49ba07199..95d4ba824 100644 --- a/health/services.py +++ b/health/services.py @@ -1,5 +1,5 @@ import logging -from app import build_logs, storage +from app import build_logs, storage, authentication from health.models_pre_oci import pre_oci_model as model logger = logging.getLogger(__name__) @@ -46,12 +46,18 @@ def _check_storage(app): logger.exception('Storage check failed with exception %s', ex) return False +def _check_auth(app): + """ Returns the status of the auth engine, as accessed from this instance. """ + (status, _) = authentication.ping() + return status + _SERVICES = { 'registry_gunicorn': _check_registry_gunicorn, 'database': _check_database, 'redis': _check_redis, 'storage': _check_storage, + 'auth': _check_auth, } def check_all_services(app, skip): From 4ad3682b9c59178be491a52caf4ed26554d901b8 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 10 May 2017 21:05:14 -0700 Subject: [PATCH 3/4] Make health check failures report their reasons Note that we add a new block with expanded service info, to avoid breaking compatibility with existing callers of the health endpoint --- data/buildlogs.py | 9 ++++----- data/model/health.py | 11 +++++------ health/healthcheck.py | 18 ++++++++++++++++-- health/services.py | 14 +++++++------- 4 files changed, 32 insertions(+), 20 deletions(-) diff --git a/data/buildlogs.py b/data/buildlogs.py index ba8b1ce78..f989977e3 100644 --- a/data/buildlogs.py +++ b/data/buildlogs.py @@ -120,15 +120,14 @@ class RedisBuildLogs(object): connection = redis.StrictRedis(**args) if not connection.ping() == True: - return False + return (False, 'Could not ping redis') # Ensure we can write and read a key. connection.set(self._health_key(), time.time()) connection.get(self._health_key()) - - return True - except redis.RedisError: - return False + return (True, None) + except redis.RedisError as re: + return (False, 'Could not connect to redis: %s' % re.message) class BuildLogs(object): diff --git a/data/model/health.py b/data/model/health.py index 0d177b400..ca796a48b 100644 --- a/data/model/health.py +++ b/data/model/health.py @@ -11,12 +11,11 @@ def check_health(app_config): # check). try: validate_database_url(app_config['DB_URI'], {}, connect_timeout=3) - except Exception: - logger.exception('Could not connect to the database') - return False + except Exception as ex: + return (False, 'Could not connect to the database: %s', ex.message) # We will connect to the db, check that it contains some team role kinds try: - return bool(list(TeamRole.select().limit(1))) - except: - return False + return (bool(list(TeamRole.select().limit(1))), 'Could not connect to the database') + except Exception as ex: + return (False, 'Could not connect to the database: %s', ex.message) diff --git a/health/healthcheck.py b/health/healthcheck.py index cfb427a31..632bb5920 100644 --- a/health/healthcheck.py +++ b/health/healthcheck.py @@ -46,15 +46,29 @@ class HealthCheck(object): is_healthy = True notes = notes or [] + service_statuses_bools = {} + service_status_expanded = {} + for service_name in service_statuses: + status, err = service_statuses[service_name] + + service_statuses_bools[service_name] = status + service_status_expanded[service_name] = { + 'status': status, + } + + if not status: + service_status_expanded[service_name]['failure'] = err + if skip and service_name in skip: notes.append('%s skipped in compute health' % service_name) continue - is_healthy = is_healthy and service_statuses[service_name] + is_healthy = is_healthy and status data = { - 'services': service_statuses, + 'services': service_statuses_bools, + 'services_expanded': service_status_expanded, 'notes': notes, 'is_testing': self.app.config['TESTING'], 'config_provider': self.config_provider.provider_id, diff --git a/health/services.py b/health/services.py index 95d4ba824..b1191032a 100644 --- a/health/services.py +++ b/health/services.py @@ -21,10 +21,11 @@ def _check_registry_gunicorn(app): registry_url = '%s://localhost%s/v1/_internal_ping' % (scheme, port) try: - return client.get(registry_url, verify=False, timeout=2).status_code == 200 - except Exception: + status_code = client.get(registry_url, verify=False, timeout=2).status_code + return (status_code == 200, 'Got non-200 response for registry: %s' % status_code) + except Exception as ex: logger.exception('Exception when checking registry health: %s', registry_url) - return False + return (False, 'Exception when checking registry health: %s' % registry_url) def _check_database(app): @@ -41,15 +42,14 @@ def _check_storage(app): """ Returns the status of storage, as accessed from this instance. """ try: storage.validate(storage.preferred_locations, app.config['HTTPCLIENT']) - return True + return (True, None) except Exception as ex: logger.exception('Storage check failed with exception %s', ex) - return False + return (False, 'Storage check failed with exception %s' % ex.message) def _check_auth(app): """ Returns the status of the auth engine, as accessed from this instance. """ - (status, _) = authentication.ping() - return status + return authentication.ping() _SERVICES = { From b7d6bb12fa687b1f647e524041a01f2d3ae09e01 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 24 May 2017 18:26:22 -0400 Subject: [PATCH 4/4] Hide extended health check information behind superuser permission or a session property Also adds an endpoint that (when specified with the proper secret), sets the session property --- config.py | 3 +++ endpoints/web.py | 22 +++++++++++++++++++++- health/healthcheck.py | 18 +++++++++++++----- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/config.py b/config.py index 671e174bd..795443317 100644 --- a/config.py +++ b/config.py @@ -471,3 +471,6 @@ class DefaultConfig(ImmutableConfig): # Feature Flag: Whether users can view and change their tag expiration. FEATURE_CHANGE_TAG_EXPIRATION = True + + # Defines a secret for enabling the health-check endpoint's debug information. + ENABLE_HEALTH_DEBUG_SECRET = None diff --git a/endpoints/web.py b/endpoints/web.py index 499cc181c..feefc8465 100644 --- a/endpoints/web.py +++ b/endpoints/web.py @@ -6,7 +6,7 @@ from datetime import timedelta, datetime from cachetools import lru_cache from flask import (abort, redirect, request, url_for, make_response, Response, render_template, - Blueprint, jsonify, send_file) + Blueprint, jsonify, send_file, session) from flask_login import current_user import features @@ -260,6 +260,7 @@ def privacy(): # TODO(jschorr): Remove this mirrored endpoint once we migrate ELB. @web.route('/health', methods=['GET']) @web.route('/health/instance', methods=['GET']) +@process_auth_or_cookie @no_cache def instance_health(): checker = get_healthchecker(app, config_provider, instance_keys) @@ -272,6 +273,7 @@ def instance_health(): # TODO(jschorr): Remove this mirrored endpoint once we migrate pingdom. @web.route('/status', methods=['GET']) @web.route('/health/endtoend', methods=['GET']) +@process_auth_or_cookie @no_cache def endtoend_health(): checker = get_healthchecker(app, config_provider, instance_keys) @@ -283,6 +285,7 @@ def endtoend_health(): @web.route('/health/dbrevision', methods=['GET']) @route_show_if(features.BILLING) # Since this is only used in production. +@process_auth_or_cookie @no_cache def dbrevision_health(): # Find the revision from the database. @@ -305,6 +308,23 @@ def dbrevision_health(): return response +@web.route('/health/enabledebug/', methods=['GET']) +@no_cache +def enable_health_debug(secret): + if not secret: + abort(404) + + if not app.config.get('ENABLE_HEALTH_DEBUG_SECRET'): + abort(404) + + if app.config.get('ENABLE_HEALTH_DEBUG_SECRET') != secret: + abort(404) + + session['health_debug'] = True + return make_response('Health check debug information enabled') + + + @web.route('/robots.txt', methods=['GET']) def robots(): robots_txt = make_response(render_template('robots.txt', baseurl=get_app_url())) diff --git a/health/healthcheck.py b/health/healthcheck.py index 632bb5920..f0e8a28b9 100644 --- a/health/healthcheck.py +++ b/health/healthcheck.py @@ -1,5 +1,8 @@ import boto.rds2 import logging + +from auth.permissions import SuperUserPermission +from flask import session from health.services import check_all_services logger = logging.getLogger(__name__) @@ -68,13 +71,18 @@ class HealthCheck(object): data = { 'services': service_statuses_bools, - 'services_expanded': service_status_expanded, - 'notes': notes, - 'is_testing': self.app.config['TESTING'], - 'config_provider': self.config_provider.provider_id, - 'local_service_key_id': self.instance_keys.local_key_id, } + add_debug_information = SuperUserPermission().can() or session.get('health_debug', False) + if add_debug_information: + data.update({ + 'services_expanded': service_status_expanded, + 'notes': notes, + 'is_testing': self.app.config['TESTING'], + 'config_provider': self.config_provider.provider_id, + 'local_service_key_id': self.instance_keys.local_key_id, + }) + return (data, 200 if is_healthy else 503) @classmethod