From c1cc52f58ba97f2b99462302748f48b7df98bba1 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Fri, 10 Nov 2017 13:47:19 -0500 Subject: [PATCH 1/3] Add a health check for the instance key If the key expires or disappears, the node will now go unhealthy, taking it out of service and preventing downtime --- health/services.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/health/services.py b/health/services.py index c5e6a3a51..5998292af 100644 --- a/health/services.py +++ b/health/services.py @@ -1,5 +1,5 @@ import logging -from app import build_logs, storage, authentication +from app import build_logs, storage, authentication, instance_keys from health.models_pre_oci import pre_oci_model as model logger = logging.getLogger(__name__) @@ -50,11 +50,36 @@ def _check_storage(app): logger.exception('Storage check failed with exception %s', ex) return (False, 'Storage check failed with exception %s' % ex.message) + def _check_auth(app): """ Returns the status of the auth engine, as accessed from this instance. """ return authentication.ping() +def _check_service_key(app): + """ Returns the status of the service key for this instance. If the key has disappeared or + has expired, then will return False. + """ + if not app.config.get('SETUP_COMPLETE', False): + return (True, 'Stack not fully setup') + + try: + kid = instance_keys.local_key_id + except IOError as ex: + # Key has not been created yet. + return (True, 'Stack not fully setup') + + try: + result = bool(instance_keys.get_service_key_public_key(kid)) + return (result, 'Could not find valid instance service key %s' % kid) + except Exception as ex: + logger.exception('Got exception when trying to retrieve the instance key') + + # NOTE: We return *True* here if there was an exception when retrieving the key, as it means + # the database is down, which will be handled by the database health check. + return (True, 'Failed to get instance key due to a database issue') + + _SERVICES = { 'registry_gunicorn': _check_gunicorn('v1/_internal_ping'), 'web_gunicorn': _check_gunicorn('_internal_ping'), @@ -63,6 +88,7 @@ _SERVICES = { 'redis': _check_redis, 'storage': _check_storage, 'auth': _check_auth, + 'service_key': _check_service_key, } def check_all_services(app, skip): From a927ce3e0f4640dd1bf8c9c767b678f0c8ec6aee Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Fri, 10 Nov 2017 14:56:12 -0500 Subject: [PATCH 2/3] Have boot.py verify that the existing instance's service key is valid and regenerate if it is not This prevents the scenario where a container is restarted after an outage and therefore runs with a bad key --- boot.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/boot.py b/boot.py index 545db0ed4..f494b7a88 100755 --- a/boot.py +++ b/boot.py @@ -10,7 +10,9 @@ import release import os.path from app import app +from data.model import ServiceKeyDoesNotExist from data.model.release import set_region_release +from data.model.service_keys import get_service_key from util.config.database import sync_database_with_config from util.generatepresharedkey import generate_key from _init import CONF_DIR @@ -44,8 +46,21 @@ def setup_jwt_proxy(): Creates a service key for quay to use in the jwtproxy and generates the JWT proxy configuration. """ if os.path.exists(os.path.join(CONF_DIR, 'jwtproxy_conf.yaml')): - # Proxy is already setup. - return + # Proxy is already setup. Make sure the service key is still valid. + try: + with open(app.config['INSTANCE_SERVICE_KEY_KID_LOCATION']) as f: + quay_key_id = f.read() + + try: + get_service_key(quay_key_id, approved_only=False) + except ServiceKeyDoesNotExist: + logger.exception('Could not find non-expired existing service key %s; creating a new one', + quay_key_id) + + # Found a valid service key, so exiting. + return + except IOError: + logger.exception('Could not load existing service key; creating a new one') # Generate the key for this Quay instance to use. minutes_until_expiration = app.config.get('INSTANCE_SERVICE_KEY_EXPIRATION', 120) From bbdf9e074cc91d820433255f9cb6a9e5eb01a3db Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Fri, 10 Nov 2017 15:46:09 -0500 Subject: [PATCH 3/3] Add metrics for tracking when instance key renewal succeeds and fails, as well as when instance key *lookup* fails --- auth/registry_jwt_auth.py | 5 +++-- boot.py | 6 +++++- data/buildlogs.py | 2 +- data/model/health.py | 3 ++- health/healthcheck.py | 6 ++++-- health/services.py | 16 +++++++------- test/registry_tests.py | 4 ++-- util/metrics/metricqueue.py | 12 +++++++++++ util/security/registry_jwt.py | 9 +++++--- workers/servicekeyworker/servicekeyworker.py | 22 +++++++++++++++----- 10 files changed, 61 insertions(+), 24 deletions(-) diff --git a/auth/registry_jwt_auth.py b/auth/registry_jwt_auth.py index 35126b86f..2b6c2f2c4 100644 --- a/auth/registry_jwt_auth.py +++ b/auth/registry_jwt_auth.py @@ -6,7 +6,7 @@ from jsonschema import validate, ValidationError from flask import request, url_for from flask_principal import identity_changed, Identity -from app import app, get_app_url, instance_keys +from app import app, get_app_url, instance_keys, metric_queue from auth.auth_context import (set_grant_context, get_grant_context) from auth.permissions import repository_read_grant, repository_write_grant, repository_admin_grant from util.http import abort @@ -157,7 +157,8 @@ def identity_from_bearer_token(bearer_header): logger.debug('Validating auth header: %s', bearer_header) try: - payload = decode_bearer_header(bearer_header, instance_keys, app.config) + payload = decode_bearer_header(bearer_header, instance_keys, app.config, + metric_queue=metric_queue) except InvalidBearerTokenException as bte: logger.exception('Invalid bearer token: %s', bte) raise InvalidJWTException(bte) diff --git a/boot.py b/boot.py index f494b7a88..b34d24979 100755 --- a/boot.py +++ b/boot.py @@ -6,6 +6,7 @@ from urlparse import urlunparse from jinja2 import Template from cachetools import lru_cache +import logging import release import os.path @@ -18,6 +19,9 @@ from util.generatepresharedkey import generate_key from _init import CONF_DIR +logger = logging.getLogger(__name__) + + @lru_cache(maxsize=1) def get_audience(): audience = app.config.get('JWTPROXY_AUDIENCE') @@ -53,12 +57,12 @@ def setup_jwt_proxy(): try: get_service_key(quay_key_id, approved_only=False) + return except ServiceKeyDoesNotExist: logger.exception('Could not find non-expired existing service key %s; creating a new one', quay_key_id) # Found a valid service key, so exiting. - return except IOError: logger.exception('Could not load existing service key; creating a new one') diff --git a/data/buildlogs.py b/data/buildlogs.py index f989977e3..bf38f921a 100644 --- a/data/buildlogs.py +++ b/data/buildlogs.py @@ -119,7 +119,7 @@ class RedisBuildLogs(object): args.update({'socket_connect_timeout': 1, 'socket_timeout': 1}) connection = redis.StrictRedis(**args) - if not connection.ping() == True: + if not connection.ping(): return (False, 'Could not ping redis') # Ensure we can write and read a key. diff --git a/data/model/health.py b/data/model/health.py index 80c0cf01d..b40cee025 100644 --- a/data/model/health.py +++ b/data/model/health.py @@ -16,6 +16,7 @@ def check_health(app_config): # We will connect to the db, check that it contains some team role kinds try: - return (bool(list(TeamRole.select().limit(1))), 'Could not connect to the database') + okay = bool(list(TeamRole.select().limit(1))) + return (okay, 'Could not connect to the database' if not okay else None) except Exception as ex: return (False, 'Could not connect to the database: %s' % ex.message) diff --git a/health/healthcheck.py b/health/healthcheck.py index f0e8a28b9..99671e74a 100644 --- a/health/healthcheck.py +++ b/health/healthcheck.py @@ -53,7 +53,7 @@ class HealthCheck(object): service_status_expanded = {} for service_name in service_statuses: - status, err = service_statuses[service_name] + status, message = service_statuses[service_name] service_statuses_bools[service_name] = status service_status_expanded[service_name] = { @@ -61,7 +61,9 @@ class HealthCheck(object): } if not status: - service_status_expanded[service_name]['failure'] = err + service_status_expanded[service_name]['failure'] = message + elif message: + service_status_expanded[service_name]['message'] = message if skip and service_name in skip: notes.append('%s skipped in compute health' % service_name) diff --git a/health/services.py b/health/services.py index 5998292af..1ceab38b5 100644 --- a/health/services.py +++ b/health/services.py @@ -4,7 +4,6 @@ from health.models_pre_oci import pre_oci_model as model logger = logging.getLogger(__name__) - def _check_gunicorn(endpoint): def fn(app): """ Returns the status of the gunicorn workers. """ @@ -23,7 +22,9 @@ def _check_gunicorn(endpoint): registry_url = '%s://localhost%s/%s' % (scheme, port, endpoint) try: status_code = client.get(registry_url, verify=False, timeout=2).status_code - return (status_code == 200, 'Got non-200 response for worker: %s' % status_code) + okay = status_code == 200 + message = 'Got non-200 response for worker: %s' % status_code if not okay else None + return (okay, message) except Exception as ex: logger.exception('Exception when checking worker health: %s', registry_url) return (False, 'Exception when checking worker health: %s' % registry_url) @@ -61,23 +62,24 @@ def _check_service_key(app): has expired, then will return False. """ if not app.config.get('SETUP_COMPLETE', False): - return (True, 'Stack not fully setup') + return (True, 'Stack not fully setup; skipping check') try: kid = instance_keys.local_key_id except IOError as ex: # Key has not been created yet. - return (True, 'Stack not fully setup') + return (True, 'Stack not fully setup; skipping check') try: - result = bool(instance_keys.get_service_key_public_key(kid)) - return (result, 'Could not find valid instance service key %s' % kid) + key_is_valid = bool(instance_keys.get_service_key_public_key(kid)) + message = 'Could not find valid instance service key %s' % kid if not key_is_valid else None + return (key_is_valid, message) except Exception as ex: logger.exception('Got exception when trying to retrieve the instance key') # NOTE: We return *True* here if there was an exception when retrieving the key, as it means # the database is down, which will be handled by the database health check. - return (True, 'Failed to get instance key due to a database issue') + return (True, 'Failed to get instance key due to a database issue; skipping check') _SERVICES = { diff --git a/test/registry_tests.py b/test/registry_tests.py index 72edd4d4e..47ae85256 100644 --- a/test/registry_tests.py +++ b/test/registry_tests.py @@ -30,7 +30,7 @@ from jwkest.jwk import RSAKey import endpoints.decorated # required for side effect -from app import app, storage, instance_keys, get_app_url +from app import app, storage, instance_keys, get_app_url, metric_queue from data.database import close_db_filter, configure, DerivedStorageForImage, QueueItem, Image from data import model from digest.checksums import compute_simple @@ -2491,7 +2491,7 @@ class V2LoginTests(V2RegistryLoginMixin, LoginTests, RegistryTestCaseMixin, Base encoded = response.json()['token'] header = 'Bearer ' + encoded - payload = decode_bearer_header(header, instance_keys, app.config) + payload = decode_bearer_header(header, instance_keys, app.config, metric_queue=metric_queue) self.assertIsNotNone(payload) if scope is None: diff --git a/util/metrics/metricqueue.py b/util/metrics/metricqueue.py index ec90867a4..6348ec6d2 100644 --- a/util/metrics/metricqueue.py +++ b/util/metrics/metricqueue.py @@ -102,6 +102,18 @@ class MetricQueue(object): self.org_count = prom.create_gauge('org_count', 'Number of Organizations') self.robot_count = prom.create_gauge('robot_count', 'Number of robot accounts') + self.instance_key_renewal_success = prom.create_counter('instance_key_renewal_success', + 'Instance Key Renewal Success Count', + labelnames=['key_id']) + + self.instance_key_renewal_failure = prom.create_counter('instance_key_renewal_failure', + 'Instance Key Renewal Failure Count', + labelnames=['key_id']) + + self.invalid_instance_key_count = prom.create_counter('invalid_registry_instance_key_count', + 'Invalid registry instance key count', + labelnames=['key_id']) + # Deprecated: Define an in-memory queue for reporting metrics to CloudWatch or another # provider. self._queue = None diff --git a/util/security/registry_jwt.py b/util/security/registry_jwt.py index b21ad1044..ff5cbb9a2 100644 --- a/util/security/registry_jwt.py +++ b/util/security/registry_jwt.py @@ -22,7 +22,7 @@ class InvalidBearerTokenException(Exception): pass -def decode_bearer_header(bearer_header, instance_keys, config): +def decode_bearer_header(bearer_header, instance_keys, config, metric_queue=None): """ decode_bearer_header decodes the given bearer header that contains an encoded JWT with both a Key ID as well as the signed JWT and returns the decoded and validated JWT. On any error, raises an InvalidBearerTokenException with the reason for failure. @@ -34,10 +34,10 @@ def decode_bearer_header(bearer_header, instance_keys, config): encoded_jwt = match.group(1) logger.debug('encoded JWT: %s', encoded_jwt) - return decode_bearer_token(encoded_jwt, instance_keys, config) + return decode_bearer_token(encoded_jwt, instance_keys, config, metric_queue=metric_queue) -def decode_bearer_token(bearer_token, instance_keys, config): +def decode_bearer_token(bearer_token, instance_keys, config, metric_queue=None): """ decode_bearer_token decodes the given bearer token that contains both a Key ID as well as the encoded JWT and returns the decoded and validated JWT. On any error, raises an InvalidBearerTokenException with the reason for failure. @@ -52,6 +52,9 @@ def decode_bearer_token(bearer_token, instance_keys, config): # Find the matching public key. public_key = instance_keys.get_service_key_public_key(kid) if public_key is None: + if metric_queue is not None: + metric_queue.invalid_instance_key_count.Inc(labelvalues=[kid]) + logger.error('Could not find requested service key %s with encoded JWT: %s', kid, bearer_token) raise InvalidBearerTokenException('Unknown service key') diff --git a/workers/servicekeyworker/servicekeyworker.py b/workers/servicekeyworker/servicekeyworker.py index 96457e0e8..d7eaecfa1 100644 --- a/workers/servicekeyworker/servicekeyworker.py +++ b/workers/servicekeyworker/servicekeyworker.py @@ -1,7 +1,7 @@ import logging from datetime import datetime, timedelta -from app import app, instance_keys +from app import app, instance_keys, metric_queue from workers.servicekeyworker.models_pre_oci import pre_oci_model as model from workers.worker import Worker @@ -18,10 +18,22 @@ class ServiceKeyWorker(Worker): """ Refreshes the instance's active service key so it doesn't get garbage collected. """ - expiration = timedelta(minutes=instance_keys.service_key_expiration) - logger.debug('Starting refresh of automatic service keys') - model.set_key_expiration(instance_keys.local_key_id, datetime.utcnow() + expiration) - logger.debug('Finished refresh of automatic service keys') + expiration_time = timedelta(minutes=instance_keys.service_key_expiration) + new_expiration = datetime.utcnow() + expiration_time + + logger.debug('Starting automatic refresh of service key %s to new expiration %s', + instance_keys.local_key_id, new_expiration) + try: + model.set_key_expiration(instance_keys.local_key_id, new_expiration) + except Exception as ex: + logger.exception('Failure for automatic refresh of service key %s with new expiration %s', + instance_keys.local_key_id, new_expiration) + metric_queue.instance_key_renewal_failure.Inc(labelvalues=[instance_keys.local_key_id]) + raise ex + + logger.debug('Finished automatic refresh of service key %s with new expiration %s', + instance_keys.local_key_id, new_expiration) + metric_queue.instance_key_renewal_success.Inc(labelvalues=[instance_keys.local_key_id]) if __name__ == "__main__":