Add metrics for tracking when instance key renewal succeeds and fails, as well as when instance key *lookup* fails
This commit is contained in:
parent
a927ce3e0f
commit
bbdf9e074c
10 changed files with 61 additions and 24 deletions
|
@ -6,7 +6,7 @@ from jsonschema import validate, ValidationError
|
||||||
from flask import request, url_for
|
from flask import request, url_for
|
||||||
from flask_principal import identity_changed, Identity
|
from flask_principal import identity_changed, Identity
|
||||||
|
|
||||||
from app import app, get_app_url, instance_keys
|
from app import app, get_app_url, instance_keys, metric_queue
|
||||||
from auth.auth_context import (set_grant_context, get_grant_context)
|
from auth.auth_context import (set_grant_context, get_grant_context)
|
||||||
from auth.permissions import repository_read_grant, repository_write_grant, repository_admin_grant
|
from auth.permissions import repository_read_grant, repository_write_grant, repository_admin_grant
|
||||||
from util.http import abort
|
from util.http import abort
|
||||||
|
@ -157,7 +157,8 @@ def identity_from_bearer_token(bearer_header):
|
||||||
logger.debug('Validating auth header: %s', bearer_header)
|
logger.debug('Validating auth header: %s', bearer_header)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
payload = decode_bearer_header(bearer_header, instance_keys, app.config)
|
payload = decode_bearer_header(bearer_header, instance_keys, app.config,
|
||||||
|
metric_queue=metric_queue)
|
||||||
except InvalidBearerTokenException as bte:
|
except InvalidBearerTokenException as bte:
|
||||||
logger.exception('Invalid bearer token: %s', bte)
|
logger.exception('Invalid bearer token: %s', bte)
|
||||||
raise InvalidJWTException(bte)
|
raise InvalidJWTException(bte)
|
||||||
|
|
6
boot.py
6
boot.py
|
@ -6,6 +6,7 @@ from urlparse import urlunparse
|
||||||
from jinja2 import Template
|
from jinja2 import Template
|
||||||
from cachetools import lru_cache
|
from cachetools import lru_cache
|
||||||
|
|
||||||
|
import logging
|
||||||
import release
|
import release
|
||||||
import os.path
|
import os.path
|
||||||
|
|
||||||
|
@ -18,6 +19,9 @@ from util.generatepresharedkey import generate_key
|
||||||
from _init import CONF_DIR
|
from _init import CONF_DIR
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=1)
|
@lru_cache(maxsize=1)
|
||||||
def get_audience():
|
def get_audience():
|
||||||
audience = app.config.get('JWTPROXY_AUDIENCE')
|
audience = app.config.get('JWTPROXY_AUDIENCE')
|
||||||
|
@ -53,12 +57,12 @@ def setup_jwt_proxy():
|
||||||
|
|
||||||
try:
|
try:
|
||||||
get_service_key(quay_key_id, approved_only=False)
|
get_service_key(quay_key_id, approved_only=False)
|
||||||
|
return
|
||||||
except ServiceKeyDoesNotExist:
|
except ServiceKeyDoesNotExist:
|
||||||
logger.exception('Could not find non-expired existing service key %s; creating a new one',
|
logger.exception('Could not find non-expired existing service key %s; creating a new one',
|
||||||
quay_key_id)
|
quay_key_id)
|
||||||
|
|
||||||
# Found a valid service key, so exiting.
|
# Found a valid service key, so exiting.
|
||||||
return
|
|
||||||
except IOError:
|
except IOError:
|
||||||
logger.exception('Could not load existing service key; creating a new one')
|
logger.exception('Could not load existing service key; creating a new one')
|
||||||
|
|
||||||
|
|
|
@ -119,7 +119,7 @@ class RedisBuildLogs(object):
|
||||||
args.update({'socket_connect_timeout': 1, 'socket_timeout': 1})
|
args.update({'socket_connect_timeout': 1, 'socket_timeout': 1})
|
||||||
|
|
||||||
connection = redis.StrictRedis(**args)
|
connection = redis.StrictRedis(**args)
|
||||||
if not connection.ping() == True:
|
if not connection.ping():
|
||||||
return (False, 'Could not ping redis')
|
return (False, 'Could not ping redis')
|
||||||
|
|
||||||
# Ensure we can write and read a key.
|
# Ensure we can write and read a key.
|
||||||
|
|
|
@ -16,6 +16,7 @@ def check_health(app_config):
|
||||||
|
|
||||||
# We will connect to the db, check that it contains some team role kinds
|
# We will connect to the db, check that it contains some team role kinds
|
||||||
try:
|
try:
|
||||||
return (bool(list(TeamRole.select().limit(1))), 'Could not connect to the database')
|
okay = bool(list(TeamRole.select().limit(1)))
|
||||||
|
return (okay, 'Could not connect to the database' if not okay else None)
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
return (False, 'Could not connect to the database: %s' % ex.message)
|
return (False, 'Could not connect to the database: %s' % ex.message)
|
||||||
|
|
|
@ -53,7 +53,7 @@ class HealthCheck(object):
|
||||||
service_status_expanded = {}
|
service_status_expanded = {}
|
||||||
|
|
||||||
for service_name in service_statuses:
|
for service_name in service_statuses:
|
||||||
status, err = service_statuses[service_name]
|
status, message = service_statuses[service_name]
|
||||||
|
|
||||||
service_statuses_bools[service_name] = status
|
service_statuses_bools[service_name] = status
|
||||||
service_status_expanded[service_name] = {
|
service_status_expanded[service_name] = {
|
||||||
|
@ -61,7 +61,9 @@ class HealthCheck(object):
|
||||||
}
|
}
|
||||||
|
|
||||||
if not status:
|
if not status:
|
||||||
service_status_expanded[service_name]['failure'] = err
|
service_status_expanded[service_name]['failure'] = message
|
||||||
|
elif message:
|
||||||
|
service_status_expanded[service_name]['message'] = message
|
||||||
|
|
||||||
if skip and service_name in skip:
|
if skip and service_name in skip:
|
||||||
notes.append('%s skipped in compute health' % service_name)
|
notes.append('%s skipped in compute health' % service_name)
|
||||||
|
|
|
@ -4,7 +4,6 @@ from health.models_pre_oci import pre_oci_model as model
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def _check_gunicorn(endpoint):
|
def _check_gunicorn(endpoint):
|
||||||
def fn(app):
|
def fn(app):
|
||||||
""" Returns the status of the gunicorn workers. """
|
""" Returns the status of the gunicorn workers. """
|
||||||
|
@ -23,7 +22,9 @@ def _check_gunicorn(endpoint):
|
||||||
registry_url = '%s://localhost%s/%s' % (scheme, port, endpoint)
|
registry_url = '%s://localhost%s/%s' % (scheme, port, endpoint)
|
||||||
try:
|
try:
|
||||||
status_code = client.get(registry_url, verify=False, timeout=2).status_code
|
status_code = client.get(registry_url, verify=False, timeout=2).status_code
|
||||||
return (status_code == 200, 'Got non-200 response for worker: %s' % status_code)
|
okay = status_code == 200
|
||||||
|
message = 'Got non-200 response for worker: %s' % status_code if not okay else None
|
||||||
|
return (okay, message)
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.exception('Exception when checking worker health: %s', registry_url)
|
logger.exception('Exception when checking worker health: %s', registry_url)
|
||||||
return (False, 'Exception when checking worker health: %s' % registry_url)
|
return (False, 'Exception when checking worker health: %s' % registry_url)
|
||||||
|
@ -61,23 +62,24 @@ def _check_service_key(app):
|
||||||
has expired, then will return False.
|
has expired, then will return False.
|
||||||
"""
|
"""
|
||||||
if not app.config.get('SETUP_COMPLETE', False):
|
if not app.config.get('SETUP_COMPLETE', False):
|
||||||
return (True, 'Stack not fully setup')
|
return (True, 'Stack not fully setup; skipping check')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
kid = instance_keys.local_key_id
|
kid = instance_keys.local_key_id
|
||||||
except IOError as ex:
|
except IOError as ex:
|
||||||
# Key has not been created yet.
|
# Key has not been created yet.
|
||||||
return (True, 'Stack not fully setup')
|
return (True, 'Stack not fully setup; skipping check')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = bool(instance_keys.get_service_key_public_key(kid))
|
key_is_valid = bool(instance_keys.get_service_key_public_key(kid))
|
||||||
return (result, 'Could not find valid instance service key %s' % kid)
|
message = 'Could not find valid instance service key %s' % kid if not key_is_valid else None
|
||||||
|
return (key_is_valid, message)
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.exception('Got exception when trying to retrieve the instance key')
|
logger.exception('Got exception when trying to retrieve the instance key')
|
||||||
|
|
||||||
# NOTE: We return *True* here if there was an exception when retrieving the key, as it means
|
# NOTE: We return *True* here if there was an exception when retrieving the key, as it means
|
||||||
# the database is down, which will be handled by the database health check.
|
# the database is down, which will be handled by the database health check.
|
||||||
return (True, 'Failed to get instance key due to a database issue')
|
return (True, 'Failed to get instance key due to a database issue; skipping check')
|
||||||
|
|
||||||
|
|
||||||
_SERVICES = {
|
_SERVICES = {
|
||||||
|
|
|
@ -30,7 +30,7 @@ from jwkest.jwk import RSAKey
|
||||||
|
|
||||||
import endpoints.decorated # required for side effect
|
import endpoints.decorated # required for side effect
|
||||||
|
|
||||||
from app import app, storage, instance_keys, get_app_url
|
from app import app, storage, instance_keys, get_app_url, metric_queue
|
||||||
from data.database import close_db_filter, configure, DerivedStorageForImage, QueueItem, Image
|
from data.database import close_db_filter, configure, DerivedStorageForImage, QueueItem, Image
|
||||||
from data import model
|
from data import model
|
||||||
from digest.checksums import compute_simple
|
from digest.checksums import compute_simple
|
||||||
|
@ -2491,7 +2491,7 @@ class V2LoginTests(V2RegistryLoginMixin, LoginTests, RegistryTestCaseMixin, Base
|
||||||
encoded = response.json()['token']
|
encoded = response.json()['token']
|
||||||
header = 'Bearer ' + encoded
|
header = 'Bearer ' + encoded
|
||||||
|
|
||||||
payload = decode_bearer_header(header, instance_keys, app.config)
|
payload = decode_bearer_header(header, instance_keys, app.config, metric_queue=metric_queue)
|
||||||
self.assertIsNotNone(payload)
|
self.assertIsNotNone(payload)
|
||||||
|
|
||||||
if scope is None:
|
if scope is None:
|
||||||
|
|
|
@ -102,6 +102,18 @@ class MetricQueue(object):
|
||||||
self.org_count = prom.create_gauge('org_count', 'Number of Organizations')
|
self.org_count = prom.create_gauge('org_count', 'Number of Organizations')
|
||||||
self.robot_count = prom.create_gauge('robot_count', 'Number of robot accounts')
|
self.robot_count = prom.create_gauge('robot_count', 'Number of robot accounts')
|
||||||
|
|
||||||
|
self.instance_key_renewal_success = prom.create_counter('instance_key_renewal_success',
|
||||||
|
'Instance Key Renewal Success Count',
|
||||||
|
labelnames=['key_id'])
|
||||||
|
|
||||||
|
self.instance_key_renewal_failure = prom.create_counter('instance_key_renewal_failure',
|
||||||
|
'Instance Key Renewal Failure Count',
|
||||||
|
labelnames=['key_id'])
|
||||||
|
|
||||||
|
self.invalid_instance_key_count = prom.create_counter('invalid_registry_instance_key_count',
|
||||||
|
'Invalid registry instance key count',
|
||||||
|
labelnames=['key_id'])
|
||||||
|
|
||||||
# Deprecated: Define an in-memory queue for reporting metrics to CloudWatch or another
|
# Deprecated: Define an in-memory queue for reporting metrics to CloudWatch or another
|
||||||
# provider.
|
# provider.
|
||||||
self._queue = None
|
self._queue = None
|
||||||
|
|
|
@ -22,7 +22,7 @@ class InvalidBearerTokenException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def decode_bearer_header(bearer_header, instance_keys, config):
|
def decode_bearer_header(bearer_header, instance_keys, config, metric_queue=None):
|
||||||
""" decode_bearer_header decodes the given bearer header that contains an encoded JWT with both
|
""" decode_bearer_header decodes the given bearer header that contains an encoded JWT with both
|
||||||
a Key ID as well as the signed JWT and returns the decoded and validated JWT. On any error,
|
a Key ID as well as the signed JWT and returns the decoded and validated JWT. On any error,
|
||||||
raises an InvalidBearerTokenException with the reason for failure.
|
raises an InvalidBearerTokenException with the reason for failure.
|
||||||
|
@ -34,10 +34,10 @@ def decode_bearer_header(bearer_header, instance_keys, config):
|
||||||
|
|
||||||
encoded_jwt = match.group(1)
|
encoded_jwt = match.group(1)
|
||||||
logger.debug('encoded JWT: %s', encoded_jwt)
|
logger.debug('encoded JWT: %s', encoded_jwt)
|
||||||
return decode_bearer_token(encoded_jwt, instance_keys, config)
|
return decode_bearer_token(encoded_jwt, instance_keys, config, metric_queue=metric_queue)
|
||||||
|
|
||||||
|
|
||||||
def decode_bearer_token(bearer_token, instance_keys, config):
|
def decode_bearer_token(bearer_token, instance_keys, config, metric_queue=None):
|
||||||
""" decode_bearer_token decodes the given bearer token that contains both a Key ID as well as the
|
""" decode_bearer_token decodes the given bearer token that contains both a Key ID as well as the
|
||||||
encoded JWT and returns the decoded and validated JWT. On any error, raises an
|
encoded JWT and returns the decoded and validated JWT. On any error, raises an
|
||||||
InvalidBearerTokenException with the reason for failure.
|
InvalidBearerTokenException with the reason for failure.
|
||||||
|
@ -52,6 +52,9 @@ def decode_bearer_token(bearer_token, instance_keys, config):
|
||||||
# Find the matching public key.
|
# Find the matching public key.
|
||||||
public_key = instance_keys.get_service_key_public_key(kid)
|
public_key = instance_keys.get_service_key_public_key(kid)
|
||||||
if public_key is None:
|
if public_key is None:
|
||||||
|
if metric_queue is not None:
|
||||||
|
metric_queue.invalid_instance_key_count.Inc(labelvalues=[kid])
|
||||||
|
|
||||||
logger.error('Could not find requested service key %s with encoded JWT: %s', kid, bearer_token)
|
logger.error('Could not find requested service key %s with encoded JWT: %s', kid, bearer_token)
|
||||||
raise InvalidBearerTokenException('Unknown service key')
|
raise InvalidBearerTokenException('Unknown service key')
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
from app import app, instance_keys
|
from app import app, instance_keys, metric_queue
|
||||||
from workers.servicekeyworker.models_pre_oci import pre_oci_model as model
|
from workers.servicekeyworker.models_pre_oci import pre_oci_model as model
|
||||||
from workers.worker import Worker
|
from workers.worker import Worker
|
||||||
|
|
||||||
|
@ -18,10 +18,22 @@ class ServiceKeyWorker(Worker):
|
||||||
"""
|
"""
|
||||||
Refreshes the instance's active service key so it doesn't get garbage collected.
|
Refreshes the instance's active service key so it doesn't get garbage collected.
|
||||||
"""
|
"""
|
||||||
expiration = timedelta(minutes=instance_keys.service_key_expiration)
|
expiration_time = timedelta(minutes=instance_keys.service_key_expiration)
|
||||||
logger.debug('Starting refresh of automatic service keys')
|
new_expiration = datetime.utcnow() + expiration_time
|
||||||
model.set_key_expiration(instance_keys.local_key_id, datetime.utcnow() + expiration)
|
|
||||||
logger.debug('Finished refresh of automatic service keys')
|
logger.debug('Starting automatic refresh of service key %s to new expiration %s',
|
||||||
|
instance_keys.local_key_id, new_expiration)
|
||||||
|
try:
|
||||||
|
model.set_key_expiration(instance_keys.local_key_id, new_expiration)
|
||||||
|
except Exception as ex:
|
||||||
|
logger.exception('Failure for automatic refresh of service key %s with new expiration %s',
|
||||||
|
instance_keys.local_key_id, new_expiration)
|
||||||
|
metric_queue.instance_key_renewal_failure.Inc(labelvalues=[instance_keys.local_key_id])
|
||||||
|
raise ex
|
||||||
|
|
||||||
|
logger.debug('Finished automatic refresh of service key %s with new expiration %s',
|
||||||
|
instance_keys.local_key_id, new_expiration)
|
||||||
|
metric_queue.instance_key_renewal_success.Inc(labelvalues=[instance_keys.local_key_id])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Reference in a new issue