From c1cc52f58ba97f2b99462302748f48b7df98bba1 Mon Sep 17 00:00:00 2001
From: Joseph Schorr <josephschorr@users.noreply.github.com>
Date: Fri, 10 Nov 2017 13:47:19 -0500
Subject: [PATCH 1/3] Add a health check for the instance key

If the key expires or disappears, the node will now go unhealthy, taking it out of service and preventing downtime
---
 health/services.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/health/services.py b/health/services.py
index c5e6a3a51..5998292af 100644
--- a/health/services.py
+++ b/health/services.py
@@ -1,5 +1,5 @@
 import logging
-from app import build_logs, storage, authentication
+from app import build_logs, storage, authentication, instance_keys
 from health.models_pre_oci import pre_oci_model as model
 
 logger = logging.getLogger(__name__)
@@ -50,11 +50,36 @@ def _check_storage(app):
     logger.exception('Storage check failed with exception %s', ex)
     return (False, 'Storage check failed with exception %s' % ex.message)
 
+
 def _check_auth(app):
   """ Returns the status of the auth engine, as accessed from this instance. """
   return authentication.ping()
 
 
+def _check_service_key(app):
+  """ Returns the status of the service key for this instance. If the key has disappeared or
+      has expired, then will return False.
+  """
+  if not app.config.get('SETUP_COMPLETE', False):
+    return (True, 'Stack not fully setup')
+
+  try:
+    kid = instance_keys.local_key_id
+  except IOError as ex:
+    # Key has not been created yet.
+    return (True, 'Stack not fully setup')
+
+  try:
+    result = bool(instance_keys.get_service_key_public_key(kid))
+    return (result, 'Could not find valid instance service key %s' % kid)
+  except Exception as ex:
+    logger.exception('Got exception when trying to retrieve the instance key')
+
+    # NOTE: We return *True* here if there was an exception when retrieving the key, as it means
+    # the database is down, which will be handled by the database health check.
+    return (True, 'Failed to get instance key due to a database issue')
+
+
 _SERVICES = {
   'registry_gunicorn': _check_gunicorn('v1/_internal_ping'),
   'web_gunicorn': _check_gunicorn('_internal_ping'),
@@ -63,6 +88,7 @@ _SERVICES = {
   'redis': _check_redis,
   'storage': _check_storage,
   'auth': _check_auth,
+  'service_key': _check_service_key,
 }
 
 def check_all_services(app, skip):

From a927ce3e0f4640dd1bf8c9c767b678f0c8ec6aee Mon Sep 17 00:00:00 2001
From: Joseph Schorr <josephschorr@users.noreply.github.com>
Date: Fri, 10 Nov 2017 14:56:12 -0500
Subject: [PATCH 2/3] Have boot.py verify that the existing instance's service
 key is valid and regenerate if it is not

This prevents the scenario where a container is restarted after an outage and therefore runs with a bad key
---
 boot.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/boot.py b/boot.py
index 545db0ed4..f494b7a88 100755
--- a/boot.py
+++ b/boot.py
@@ -10,7 +10,9 @@ import release
 import os.path
 
 from app import app
+from data.model import ServiceKeyDoesNotExist
 from data.model.release import set_region_release
+from data.model.service_keys import get_service_key
 from util.config.database import sync_database_with_config
 from util.generatepresharedkey import generate_key
 from  _init import CONF_DIR
@@ -44,8 +46,21 @@ def setup_jwt_proxy():
   Creates a service key for quay to use in the jwtproxy and generates the JWT proxy configuration.
   """
   if os.path.exists(os.path.join(CONF_DIR, 'jwtproxy_conf.yaml')):
-    # Proxy is already setup.
-    return
+    # Proxy is already setup. Make sure the service key is still valid.
+    try:
+      with open(app.config['INSTANCE_SERVICE_KEY_KID_LOCATION']) as f:
+        quay_key_id = f.read()
+
+      try:
+        get_service_key(quay_key_id, approved_only=False)
+      except ServiceKeyDoesNotExist:
+        logger.exception('Could not find non-expired existing service key %s; creating a new one',
+                         quay_key_id)
+
+      # Found a valid service key, so exiting.
+      return
+    except IOError:
+      logger.exception('Could not load existing service key; creating a new one')
 
   # Generate the key for this Quay instance to use.
   minutes_until_expiration = app.config.get('INSTANCE_SERVICE_KEY_EXPIRATION', 120)

From bbdf9e074cc91d820433255f9cb6a9e5eb01a3db Mon Sep 17 00:00:00 2001
From: Joseph Schorr <josephschorr@users.noreply.github.com>
Date: Fri, 10 Nov 2017 15:46:09 -0500
Subject: [PATCH 3/3] Add metrics for tracking when instance key renewal
 succeeds and fails, as well as when instance key *lookup* fails

---
 auth/registry_jwt_auth.py                    |  5 +++--
 boot.py                                      |  6 +++++-
 data/buildlogs.py                            |  2 +-
 data/model/health.py                         |  3 ++-
 health/healthcheck.py                        |  6 ++++--
 health/services.py                           | 16 +++++++-------
 test/registry_tests.py                       |  4 ++--
 util/metrics/metricqueue.py                  | 12 +++++++++++
 util/security/registry_jwt.py                |  9 +++++---
 workers/servicekeyworker/servicekeyworker.py | 22 +++++++++++++++-----
 10 files changed, 61 insertions(+), 24 deletions(-)

diff --git a/auth/registry_jwt_auth.py b/auth/registry_jwt_auth.py
index 35126b86f..2b6c2f2c4 100644
--- a/auth/registry_jwt_auth.py
+++ b/auth/registry_jwt_auth.py
@@ -6,7 +6,7 @@ from jsonschema import validate, ValidationError
 from flask import request, url_for
 from flask_principal import identity_changed, Identity
 
-from app import app, get_app_url, instance_keys
+from app import app, get_app_url, instance_keys, metric_queue
 from auth.auth_context import (set_grant_context, get_grant_context)
 from auth.permissions import repository_read_grant, repository_write_grant, repository_admin_grant
 from util.http import abort
@@ -157,7 +157,8 @@ def identity_from_bearer_token(bearer_header):
   logger.debug('Validating auth header: %s', bearer_header)
 
   try:
-    payload = decode_bearer_header(bearer_header, instance_keys, app.config)
+    payload = decode_bearer_header(bearer_header, instance_keys, app.config,
+                                   metric_queue=metric_queue)
   except InvalidBearerTokenException as bte:
     logger.exception('Invalid bearer token: %s', bte)
     raise InvalidJWTException(bte)
diff --git a/boot.py b/boot.py
index f494b7a88..b34d24979 100755
--- a/boot.py
+++ b/boot.py
@@ -6,6 +6,7 @@ from urlparse import urlunparse
 from jinja2 import Template
 from cachetools import lru_cache
 
+import logging
 import release
 import os.path
 
@@ -18,6 +19,9 @@ from util.generatepresharedkey import generate_key
 from  _init import CONF_DIR
 
 
+logger = logging.getLogger(__name__)
+
+
 @lru_cache(maxsize=1)
 def get_audience():
   audience = app.config.get('JWTPROXY_AUDIENCE')
@@ -53,12 +57,12 @@ def setup_jwt_proxy():
 
       try:
         get_service_key(quay_key_id, approved_only=False)
+        return
       except ServiceKeyDoesNotExist:
         logger.exception('Could not find non-expired existing service key %s; creating a new one',
                          quay_key_id)
 
       # Found a valid service key, so exiting.
-      return
     except IOError:
       logger.exception('Could not load existing service key; creating a new one')
 
diff --git a/data/buildlogs.py b/data/buildlogs.py
index f989977e3..bf38f921a 100644
--- a/data/buildlogs.py
+++ b/data/buildlogs.py
@@ -119,7 +119,7 @@ class RedisBuildLogs(object):
       args.update({'socket_connect_timeout': 1, 'socket_timeout': 1})
 
       connection = redis.StrictRedis(**args)
-      if not connection.ping() == True:
+      if not connection.ping():
         return (False, 'Could not ping redis')
 
       # Ensure we can write and read a key.
diff --git a/data/model/health.py b/data/model/health.py
index 80c0cf01d..b40cee025 100644
--- a/data/model/health.py
+++ b/data/model/health.py
@@ -16,6 +16,7 @@ def check_health(app_config):
 
   # We will connect to the db, check that it contains some team role kinds
   try:
-    return (bool(list(TeamRole.select().limit(1))), 'Could not connect to the database')
+    okay = bool(list(TeamRole.select().limit(1)))
+    return (okay, 'Could not connect to the database' if not okay else None)
   except Exception as ex:
     return (False, 'Could not connect to the database: %s' % ex.message)
diff --git a/health/healthcheck.py b/health/healthcheck.py
index f0e8a28b9..99671e74a 100644
--- a/health/healthcheck.py
+++ b/health/healthcheck.py
@@ -53,7 +53,7 @@ class HealthCheck(object):
     service_status_expanded = {}
 
     for service_name in service_statuses:
-      status, err = service_statuses[service_name]
+      status, message = service_statuses[service_name]
 
       service_statuses_bools[service_name] = status
       service_status_expanded[service_name] = {
@@ -61,7 +61,9 @@ class HealthCheck(object):
       }
 
       if not status:
-        service_status_expanded[service_name]['failure'] = err
+        service_status_expanded[service_name]['failure'] = message
+      elif message:
+        service_status_expanded[service_name]['message'] = message
 
       if skip and service_name in skip:
         notes.append('%s skipped in compute health' % service_name)
diff --git a/health/services.py b/health/services.py
index 5998292af..1ceab38b5 100644
--- a/health/services.py
+++ b/health/services.py
@@ -4,7 +4,6 @@ from health.models_pre_oci import pre_oci_model as model
 
 logger = logging.getLogger(__name__)
 
-
 def _check_gunicorn(endpoint):
   def fn(app):
     """ Returns the status of the gunicorn workers. """
@@ -23,7 +22,9 @@ def _check_gunicorn(endpoint):
     registry_url = '%s://localhost%s/%s' % (scheme, port, endpoint)
     try:
       status_code = client.get(registry_url, verify=False, timeout=2).status_code
-      return (status_code == 200, 'Got non-200 response for worker: %s' % status_code)
+      okay = status_code == 200
+      message = 'Got non-200 response for worker: %s' % status_code if not okay else None
+      return (okay, message)
     except Exception as ex:
       logger.exception('Exception when checking worker health: %s', registry_url)
       return (False, 'Exception when checking worker health: %s' % registry_url)
@@ -61,23 +62,24 @@ def _check_service_key(app):
       has expired, then will return False.
   """
   if not app.config.get('SETUP_COMPLETE', False):
-    return (True, 'Stack not fully setup')
+    return (True, 'Stack not fully setup; skipping check')
 
   try:
     kid = instance_keys.local_key_id
   except IOError as ex:
     # Key has not been created yet.
-    return (True, 'Stack not fully setup')
+    return (True, 'Stack not fully setup; skipping check')
 
   try:
-    result = bool(instance_keys.get_service_key_public_key(kid))
-    return (result, 'Could not find valid instance service key %s' % kid)
+    key_is_valid = bool(instance_keys.get_service_key_public_key(kid))
+    message = 'Could not find valid instance service key %s' % kid if not key_is_valid else None
+    return (key_is_valid, message)
   except Exception as ex:
     logger.exception('Got exception when trying to retrieve the instance key')
 
     # NOTE: We return *True* here if there was an exception when retrieving the key, as it means
     # the database is down, which will be handled by the database health check.
-    return (True, 'Failed to get instance key due to a database issue')
+    return (True, 'Failed to get instance key due to a database issue; skipping check')
 
 
 _SERVICES = {
diff --git a/test/registry_tests.py b/test/registry_tests.py
index 72edd4d4e..47ae85256 100644
--- a/test/registry_tests.py
+++ b/test/registry_tests.py
@@ -30,7 +30,7 @@ from jwkest.jwk import RSAKey
 
 import endpoints.decorated # required for side effect
 
-from app import app, storage, instance_keys, get_app_url
+from app import app, storage, instance_keys, get_app_url, metric_queue
 from data.database import close_db_filter, configure, DerivedStorageForImage, QueueItem, Image
 from data import model
 from digest.checksums import compute_simple
@@ -2491,7 +2491,7 @@ class V2LoginTests(V2RegistryLoginMixin, LoginTests, RegistryTestCaseMixin, Base
     encoded = response.json()['token']
     header = 'Bearer ' + encoded
 
-    payload = decode_bearer_header(header, instance_keys, app.config)
+    payload = decode_bearer_header(header, instance_keys, app.config, metric_queue=metric_queue)
     self.assertIsNotNone(payload)
 
     if scope is None:
diff --git a/util/metrics/metricqueue.py b/util/metrics/metricqueue.py
index ec90867a4..6348ec6d2 100644
--- a/util/metrics/metricqueue.py
+++ b/util/metrics/metricqueue.py
@@ -102,6 +102,18 @@ class MetricQueue(object):
     self.org_count = prom.create_gauge('org_count', 'Number of Organizations')
     self.robot_count = prom.create_gauge('robot_count', 'Number of robot accounts')
 
+    self.instance_key_renewal_success = prom.create_counter('instance_key_renewal_success',
+                                                            'Instance Key Renewal Success Count',
+                                                            labelnames=['key_id'])
+
+    self.instance_key_renewal_failure = prom.create_counter('instance_key_renewal_failure',
+                                                            'Instance Key Renewal Failure Count',
+                                                            labelnames=['key_id'])
+
+    self.invalid_instance_key_count = prom.create_counter('invalid_registry_instance_key_count',
+                                                          'Invalid registry instance key count',
+                                                          labelnames=['key_id'])
+
     # Deprecated: Define an in-memory queue for reporting metrics to CloudWatch or another
     # provider.
     self._queue = None
diff --git a/util/security/registry_jwt.py b/util/security/registry_jwt.py
index b21ad1044..ff5cbb9a2 100644
--- a/util/security/registry_jwt.py
+++ b/util/security/registry_jwt.py
@@ -22,7 +22,7 @@ class InvalidBearerTokenException(Exception):
   pass
 
 
-def decode_bearer_header(bearer_header, instance_keys, config):
+def decode_bearer_header(bearer_header, instance_keys, config, metric_queue=None):
   """ decode_bearer_header decodes the given bearer header that contains an encoded JWT with both
       a Key ID as well as the signed JWT and returns the decoded and validated JWT. On any error,
       raises an InvalidBearerTokenException with the reason for failure.
@@ -34,10 +34,10 @@ def decode_bearer_header(bearer_header, instance_keys, config):
 
   encoded_jwt = match.group(1)
   logger.debug('encoded JWT: %s', encoded_jwt)
-  return decode_bearer_token(encoded_jwt, instance_keys, config)
+  return decode_bearer_token(encoded_jwt, instance_keys, config, metric_queue=metric_queue)
 
 
-def decode_bearer_token(bearer_token, instance_keys, config):
+def decode_bearer_token(bearer_token, instance_keys, config, metric_queue=None):
   """ decode_bearer_token decodes the given bearer token that contains both a Key ID as well as the
       encoded JWT and returns the decoded and validated JWT. On any error, raises an
       InvalidBearerTokenException with the reason for failure.
@@ -52,6 +52,9 @@ def decode_bearer_token(bearer_token, instance_keys, config):
   # Find the matching public key.
   public_key = instance_keys.get_service_key_public_key(kid)
   if public_key is None:
+    if metric_queue is not None:
+      metric_queue.invalid_instance_key_count.Inc(labelvalues=[kid])
+
     logger.error('Could not find requested service key %s with encoded JWT: %s', kid, bearer_token)
     raise InvalidBearerTokenException('Unknown service key')
 
diff --git a/workers/servicekeyworker/servicekeyworker.py b/workers/servicekeyworker/servicekeyworker.py
index 96457e0e8..d7eaecfa1 100644
--- a/workers/servicekeyworker/servicekeyworker.py
+++ b/workers/servicekeyworker/servicekeyworker.py
@@ -1,7 +1,7 @@
 import logging
 from datetime import datetime, timedelta
 
-from app import app, instance_keys
+from app import app, instance_keys, metric_queue
 from workers.servicekeyworker.models_pre_oci import pre_oci_model as model
 from workers.worker import Worker
 
@@ -18,10 +18,22 @@ class ServiceKeyWorker(Worker):
     """
     Refreshes the instance's active service key so it doesn't get garbage collected.
     """
-    expiration = timedelta(minutes=instance_keys.service_key_expiration)
-    logger.debug('Starting refresh of automatic service keys')
-    model.set_key_expiration(instance_keys.local_key_id, datetime.utcnow() + expiration)
-    logger.debug('Finished refresh of automatic service keys')
+    expiration_time = timedelta(minutes=instance_keys.service_key_expiration)
+    new_expiration = datetime.utcnow() + expiration_time
+
+    logger.debug('Starting automatic refresh of service key %s to new expiration %s',
+                 instance_keys.local_key_id, new_expiration)
+    try:
+      model.set_key_expiration(instance_keys.local_key_id, new_expiration)
+    except Exception as ex:
+      logger.exception('Failure for automatic refresh of service key %s with new expiration %s',
+                       instance_keys.local_key_id, new_expiration)
+      metric_queue.instance_key_renewal_failure.Inc(labelvalues=[instance_keys.local_key_id])
+      raise ex
+
+    logger.debug('Finished automatic refresh of service key %s with new expiration %s',
+                 instance_keys.local_key_id, new_expiration)
+    metric_queue.instance_key_renewal_success.Inc(labelvalues=[instance_keys.local_key_id])
 
 
 if __name__ == "__main__":