util/metrics: remove metricqueue abstraction

This change replaces the metricqueue library with a native Prometheus
client implementation, with the intention of aggregating results with the
Prometheus PushGateway.
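As a rough sketch of the aggregation pattern (illustrative only, not code from this commit; the gateway address, job name, and metric below are placeholders), each worker process keeps its own registry and periodically pushes its samples to the PushGateway, which Prometheus then scrapes as a single target:

    from prometheus_client import CollectorRegistry, Counter, push_to_gateway

    # Per-process registry; the metric is a placeholder for illustration.
    registry = CollectorRegistry()
    requests_handled = Counter('quay_worker_requests_handled_total',
                               'number of requests handled by this worker',
                               registry=registry)

    requests_handled.inc()

    # Push this process's samples to the PushGateway so a single Prometheus
    # scrape target aggregates results from many short-lived workers.
    push_to_gateway('localhost:9091', job='quay_worker', registry=registry)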

This change also adds instrumentation for greenlet context switches.
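A minimal sketch of how such instrumentation can be wired up (an assumption for illustration; the metric name and callback may differ from the commit's util/greenlet_tracing module):

    import greenlet
    from prometheus_client import Counter

    greenlet_switches = Counter('greenlet_switches_total',
                                'number of greenlet context switches')

    def _greenlet_tracer(event, args):
      # greenlet invokes the trace callback with ('switch', (origin, target))
      # or ('throw', (origin, target)) on every context switch.
      if event in ('switch', 'throw'):
        greenlet_switches.inc()

    def enable_tracing():
      greenlet.settrace(_greenlet_tracer)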
Jimmy Zelinskie 2019-11-13 14:50:33 -05:00
parent 23c5120790
commit 4bf4ce33c9
57 changed files with 526 additions and 690 deletions

app.py

@@ -19,7 +19,6 @@ import features
from _init import (config_provider, CONF_DIR, IS_KUBERNETES, IS_TESTING, OVERRIDE_CONFIG_DIRECTORY,
                   IS_BUILDING)
-from auth.auth_context import get_authenticated_user
from avatars.avatars import Avatar
from buildman.manager.buildcanceller import BuildCanceller
from data import database
@@ -30,7 +29,7 @@ from data.billing import Billing
from data.buildlogs import BuildLogs
from data.cache import get_model_cache
from data.model.user import LoginWrappedDBUser
-from data.queue import WorkQueue, BuildMetricQueueReporter
+from data.queue import WorkQueue
from data.userevent import UserEventsBuilderModule
from data.userfiles import Userfiles
from data.users import UserAuthentication
@@ -52,14 +51,13 @@ from util.names import urn_generator
from util.config.configutil import generate_secret_key
from util.config.superusermanager import SuperUserManager
from util.label_validator import LabelValidator
-from util.metrics.metricqueue import MetricQueue
from util.metrics.prometheus import PrometheusPlugin
-from util.saas.cloudwatch import start_cloudwatch_sender
from util.secscan.api import SecurityScannerAPI
from util.repomirror.api import RepoMirrorAPI
from util.tufmetadata.api import TUFMetadataAPI
from util.security.instancekeys import InstanceKeys
from util.security.signing import Signer
+from util.greenlet_tracing import enable_tracing

OVERRIDE_CONFIG_YAML_FILENAME = os.path.join(CONF_DIR, 'stack/config.yaml')
@@ -205,6 +203,9 @@ def _request_end(resp):
  return resp

+if app.config.get('GREENLET_TRACING', True):
+  enable_tracing()

root_logger = logging.getLogger()
@@ -224,11 +225,10 @@ avatar = Avatar(app)
login_manager = LoginManager(app)
mail = Mail(app)
prometheus = PrometheusPlugin(app)
-metric_queue = MetricQueue(prometheus)
-chunk_cleanup_queue = WorkQueue(app.config['CHUNK_CLEANUP_QUEUE_NAME'], tf, metric_queue=metric_queue)
+chunk_cleanup_queue = WorkQueue(app.config['CHUNK_CLEANUP_QUEUE_NAME'], tf)
instance_keys = InstanceKeys(app)
ip_resolver = IPResolver(app)
-storage = Storage(app, metric_queue, chunk_cleanup_queue, instance_keys, config_provider, ip_resolver)
+storage = Storage(app, chunk_cleanup_queue, instance_keys, config_provider, ip_resolver)
userfiles = Userfiles(app, storage)
log_archive = LogArchive(app, storage)
analytics = Analytics(app)
@@ -244,8 +244,6 @@ instance_keys = InstanceKeys(app)
label_validator = LabelValidator(app)
build_canceller = BuildCanceller(app)

-start_cloudwatch_sender(metric_queue, app)
-
github_trigger = GithubOAuthService(app.config, 'GITHUB_TRIGGER_CONFIG')
gitlab_trigger = GitLabOAuthService(app.config, 'GITLAB_TRIGGER_CONFIG')
@@ -253,29 +251,24 @@ oauth_login = OAuthLoginManager(app.config)
oauth_apps = [github_trigger, gitlab_trigger]

image_replication_queue = WorkQueue(app.config['REPLICATION_QUEUE_NAME'], tf,
-                                    has_namespace=False, metric_queue=metric_queue)
+                                    has_namespace=False)
dockerfile_build_queue = WorkQueue(app.config['DOCKERFILE_BUILD_QUEUE_NAME'], tf,
-                                   metric_queue=metric_queue,
-                                   reporter=BuildMetricQueueReporter(metric_queue),
                                   has_namespace=True)
-notification_queue = WorkQueue(app.config['NOTIFICATION_QUEUE_NAME'], tf, has_namespace=True,
-                               metric_queue=metric_queue)
+notification_queue = WorkQueue(app.config['NOTIFICATION_QUEUE_NAME'], tf, has_namespace=True)
secscan_notification_queue = WorkQueue(app.config['SECSCAN_NOTIFICATION_QUEUE_NAME'], tf,
-                                       has_namespace=False,
-                                       metric_queue=metric_queue)
+                                       has_namespace=False)
export_action_logs_queue = WorkQueue(app.config['EXPORT_ACTION_LOGS_QUEUE_NAME'], tf,
-                                     has_namespace=True,
-                                     metric_queue=metric_queue)
+                                     has_namespace=True)

# Note: We set `has_namespace` to `False` here, as we explicitly want this queue to not be emptied
# when a namespace is marked for deletion.
-namespace_gc_queue = WorkQueue(app.config['NAMESPACE_GC_QUEUE_NAME'], tf, has_namespace=False,
-                               metric_queue=metric_queue)
+namespace_gc_queue = WorkQueue(app.config['NAMESPACE_GC_QUEUE_NAME'], tf, has_namespace=False)

all_queues = [image_replication_queue, dockerfile_build_queue, notification_queue,
              secscan_notification_queue, chunk_cleanup_queue, namespace_gc_queue]

-url_scheme_and_hostname = URLSchemeAndHostname(app.config['PREFERRED_URL_SCHEME'], app.config['SERVER_HOSTNAME'])
+url_scheme_and_hostname = URLSchemeAndHostname(app.config['PREFERRED_URL_SCHEME'],
+                                               app.config['SERVER_HOSTNAME'])

secscan_api = SecurityScannerAPI(app.config, storage, app.config['SERVER_HOSTNAME'], app.config['HTTPCLIENT'],
                                 uri_creator=get_blob_download_uri_getter(app.test_request_context('/'), url_scheme_and_hostname),
                                 instance_keys=instance_keys)
@@ -296,6 +289,7 @@ else:
if app.config.get('DATABASE_SECRET_KEY') is None and app.config.get('SETUP_COMPLETE', False):
  raise Exception('Missing DATABASE_SECRET_KEY in config; did you perhaps forget to add it?')

database.configure(app.config)
model.config.app_config = app.config


@@ -2,8 +2,8 @@ import logging
from functools import wraps

from flask import request, session
+from prometheus_client import Counter

-from app import metric_queue
from auth.basic import validate_basic_auth
from auth.oauth import validate_bearer_auth
from auth.cookie import validate_session_cookie
@@ -14,6 +14,12 @@ from util.http import abort
logger = logging.getLogger(__name__)

+authentication_count = Counter('quay_authentication_attempts_total',
+                               'number of authentication attempts accross the registry and API',
+                               labelnames=['auth_kind', 'success'])

def _auth_decorator(pass_result=False, handlers=None):
  """ Builds an auth decorator that runs the given handlers and, if any return successfully,
      sets up the auth context. The wrapped function will be invoked *regardless of success or
@@ -39,13 +45,13 @@ def _auth_decorator(pass_result=False, handlers=None):
        result.apply_to_context()

        # Log the metric.
-        metric_queue.authentication_count.Inc(labelvalues=[result.kind, True])
+        authentication_count.labels(result.kind, True).inc()
        break

      # Otherwise, report the error.
      if result.error_message is not None:
        # Log the failure.
-        metric_queue.authentication_count.Inc(labelvalues=[result.kind, False])
+        authentication_count.labels(result.kind, False).inc()
        break

    if pass_result:
@@ -72,10 +78,10 @@ def require_session_login(func):
    result = validate_session_cookie()
    if result.has_nonrobot_user:
      result.apply_to_context()
-      metric_queue.authentication_count.Inc(labelvalues=[result.kind, True])
+      authentication_count.labels(result.kind, True).inc()
      return func(*args, **kwargs)
    elif not result.missing:
-      metric_queue.authentication_count.Inc(labelvalues=[result.kind, False])
+      authentication_count.labels(result.kind, False).inc()

    abort(401, message='Method requires login and no valid login could be loaded.')
  return wrapper


@@ -6,7 +6,7 @@ from jsonschema import validate, ValidationError
from flask import request, url_for
from flask_principal import identity_changed, Identity

-from app import app, get_app_url, instance_keys, metric_queue
+from app import app, get_app_url, instance_keys
from auth.auth_context import set_authenticated_context
from auth.auth_context_type import SignedAuthContext
from auth.permissions import repository_read_grant, repository_write_grant, repository_admin_grant
@@ -89,8 +89,7 @@ def identity_from_bearer_token(bearer_header):
  logger.debug('Validating auth header: %s', bearer_header)

  try:
-    payload = decode_bearer_header(bearer_header, instance_keys, app.config,
-                                   metric_queue=metric_queue)
+    payload = decode_bearer_header(bearer_header, instance_keys, app.config)
  except InvalidBearerTokenException as bte:
    logger.exception('Invalid bearer token: %s', bte)
    raise InvalidJWTException(bte)


@@ -9,6 +9,9 @@ class AuthKind(Enum):
  signed_grant = 'signed_grant'
  credentials = 'credentials'

+  def __str__(self):
+    return '%s' % self.value

class ValidateResult(object):
  """ A result of validating auth in one form or another. """


@@ -8,9 +8,9 @@ from collections import namedtuple
from datetime import datetime, timedelta
from six import iteritems

+from prometheus_client import Counter, Histogram
from trollius import From, coroutine, Return, async, sleep

-from app import metric_queue
from buildman.orchestrator import (orchestrator_from_config, KeyEvent,
                                   OrchestratorError, OrchestratorConnectionError,
                                   ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
@@ -26,6 +26,17 @@ from util.morecollections import AttrDict
logger = logging.getLogger(__name__)

+build_fallback = Counter('quay_build_fallback_total',
+                         'number of times a build has been retried',
+                         labelnames=['executor'])
+build_ack_duration = Histogram('quay_build_ack_duration_seconds',
+                               'seconds taken for the builder to acknowledge a queued build',
+                               labelnames=['executor'])
+build_duration = Histogram('quay_build_duration_seconds',
+                           "seconds taken for a build's execution",
+                           labelnames=['executor', 'job_status'])

JOB_PREFIX = 'building/'
LOCK_PREFIX = 'lock/'
REALM_PREFIX = 'realm/'
@@ -428,7 +439,7 @@ class EphemeralBuilderManager(BaseManager):
      # Check if we can use this executor based on the retries remaining.
      if executor.minimum_retry_threshold > build_job.retries_remaining:
-        metric_queue.builder_fallback.Inc()
+        build_fallback.labels(executor.name).inc()
        logger.debug('Job %s cannot use executor %s as it is below retry threshold %s (retry #%s)',
                     build_uuid, executor.name, executor.minimum_retry_threshold,
                     build_job.retries_remaining)
@@ -440,28 +451,9 @@
      try:
        execution_id = yield From(executor.start_builder(realm, token, build_uuid))
      except:
-        try:
-          metric_queue.build_start_failure.Inc(labelvalues=[executor.name])
-          metric_queue.put_deprecated(('ExecutorFailure-%s' % executor.name), 1, unit='Count')
-        except:
-          logger.exception('Exception when writing failure metric for execution %s for job %s',
-                           execution_id, build_uuid)
        logger.exception('Exception when starting builder for job: %s', build_uuid)
        continue

-      try:
-        metric_queue.build_start_success.Inc(labelvalues=[executor.name])
-      except:
-        logger.exception('Exception when writing success metric for execution %s for job %s',
-                         execution_id, build_uuid)
-
-      try:
-        metric_queue.ephemeral_build_workers.Inc()
-      except:
-        logger.exception('Exception when writing start metrics for execution %s for job %s',
-                         execution_id, build_uuid)

      started_with_executor = executor

      # Break out of the loop now that we've started a builder successfully.
@@ -542,8 +534,7 @@
                 job.build_uuid, build_component.builder_realm)
    yield From(build_component.start_build(job))

-    yield From(self._write_duration_metric(metric_queue.builder_time_to_build,
-                                           build_component.builder_realm))
+    yield From(self._write_duration_metric(build_ack_duration, build_component.builder_realm))

    # Clean up the bookkeeping for allowing any manager to take the job.
    try:
@@ -560,7 +551,8 @@
    logger.debug('Calling job_completed for job %s with status: %s',
                 build_job.build_uuid, job_status)

-    yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))
+    yield From(self._write_duration_metric(build_duration, build_component.builder_realm,
+                                           job_status=job_status))

    # Mark the job as completed. Since this is being invoked from the component, we don't need
    # to ask for the phase to be updated as well.
@@ -660,18 +652,16 @@
      yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))

  @coroutine
-  def _write_duration_metric(self, metric, realm):
-    """
-    :returns: True if the metric was written, otherwise False
-    :rtype: bool
-    """
+  def _write_duration_metric(self, metric, realm, job_status=None):
    try:
      metric_data = yield From(self._orchestrator.get_key(self._metric_key(realm)))
      parsed_metric_data = json.loads(metric_data)
      start_time = parsed_metric_data['start_time']
-      metric.Observe(time.time() - start_time,
-                     labelvalues=[parsed_metric_data.get('executor_name',
-                                                         'unknown')])
+      executor = parsed_metric_data.get('executor_name', 'unknown')
+      if job_status:
+        metric.labels(executor, job_status).observe(time.time() - start_time)
+      else:
+        metric.labels(executor).observe(time.time() - start_time)
    except Exception:
      logger.exception("Could not write metric for realm %s", realm)


@@ -5,25 +5,25 @@ import os
import socket
import subprocess
import threading
+import time
import uuid

-from functools import partial
+from functools import partial, wraps

import boto.ec2
import cachetools.func
import requests
-import trollius

from container_cloud_config import CloudConfigContext
from jinja2 import FileSystemLoader, Environment
-from trollius import coroutine, From, Return, get_event_loop
+from trollius import coroutine, sleep, From, Return, get_event_loop
+from prometheus_client import Histogram

import release

-from buildman.asyncutil import AsyncWrapper
-from app import metric_queue, app
-from util.metrics.metricqueue import duration_collector_async
from _init import ROOT_DIR
+from app import app
+from buildman.asyncutil import AsyncWrapper

logger = logging.getLogger(__name__)
@@ -38,6 +38,28 @@ ENV = Environment(loader=FileSystemLoader(os.path.join(ROOT_DIR, "buildman/templ
TEMPLATE = ENV.get_template('cloudconfig.yaml')
CloudConfigContext().populate_jinja_environment(ENV)

+build_start_duration = Histogram('quay_build_start_duration_seconds',
+                                 'seconds taken for a executor to start executing a queued build',
+                                 labelnames=['executor'],
+                                 buckets=[.5, 1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 180.0, 240.0, 300.0, 600.0])
+
+def async_observe(metric, *labels):
+  def decorator(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+      trigger_time = time.time()
+      try:
+        rv = func(*args, **kwargs)
+      except Return as e:
+        metric.labels(*labels).observe(time.time() - trigger_time)
+        raise e
+      return rv
+    return wrapper
+  return decorator

class ExecutorException(Exception):
  """ Exception raised when there is a problem starting or stopping a builder.
  """
@@ -160,7 +182,7 @@ class EC2Executor(BuilderExecutor):
    return stack_amis[ec2_region]

  @coroutine
-  @duration_collector_async(metric_queue.builder_time_to_start, ['ec2'])
+  @async_observe(build_start_duration, 'ec2')
  def start_builder(self, realm, token, build_uuid):
    region = self.executor_config['EC2_REGION']
    channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
@@ -204,7 +226,6 @@
      ))
    except boto.exception.EC2ResponseError as ec2e:
      logger.exception('Unable to spawn builder instance')
-      metric_queue.ephemeral_build_worker_failure.Inc()
      raise ec2e

    if not reservation.instances:
@@ -215,7 +236,7 @@
    launched = AsyncWrapper(reservation.instances[0])

    # Sleep a few seconds to wait for AWS to spawn the instance.
-    yield From(trollius.sleep(_TAG_RETRY_SLEEP))
+    yield From(sleep(_TAG_RETRY_SLEEP))

    # Tag the instance with its metadata.
    for i in range(0, _TAG_RETRY_COUNT):
@@ -231,7 +252,7 @@
        if i < _TAG_RETRY_COUNT - 1:
          logger.warning('Failed to write EC2 tags for instance %s for build %s (attempt #%s)',
                         launched.id, build_uuid, i)
-          yield From(trollius.sleep(_TAG_RETRY_SLEEP))
+          yield From(sleep(_TAG_RETRY_SLEEP))
          continue

        raise ExecutorException('Unable to find builder instance.')
@@ -269,7 +290,7 @@ class PopenExecutor(BuilderExecutor):
  """ Executor which uses Popen to fork a quay-builder process.
  """
  @coroutine
-  @duration_collector_async(metric_queue.builder_time_to_start, ['fork'])
+  @async_observe(build_start_duration, 'fork')
  def start_builder(self, realm, token, build_uuid):
    # Now start a machine for this job, adding the machine id to the etcd information
    logger.debug('Forking process for build')
@@ -491,7 +512,7 @@ class KubernetesExecutor(BuilderExecutor):
    return job_resource

  @coroutine
-  @duration_collector_async(metric_queue.builder_time_to_start, ['k8s'])
+  @async_observe(build_start_duration, 'k8s')
  def start_builder(self, realm, token, build_uuid):
    # generate resource
    channel = self.executor_config.get('COREOS_CHANNEL', 'stable')


@@ -1,21 +1,23 @@
import logging
import json

from datetime import timedelta
from threading import Event

import trollius

from aiowsgi import create_server as create_wsgi_server
from autobahn.asyncio.wamp import RouterFactory, RouterSessionFactory
from autobahn.asyncio.websocket import WampWebSocketServerFactory
from autobahn.wamp import types
from flask import Flask
from trollius.coroutines import From

-from app import app, metric_queue
+from app import app
from buildman.enums import BuildJobResult, BuildServerStatus, RESULT_PHASES
from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException
from buildman.jobutil.buildstatus import StatusHandler
from data import database, model

logger = logging.getLogger(__name__)
@@ -67,11 +69,10 @@ class BuilderServer(object):
    @controller_app.route('/status')
    def status():
-      metrics = server._queue.get_metrics()
-      (running_count, available_not_running_count, available_count) = metrics
+      (running_count, _available_not_running_count, available_count) = server._queue.get_metrics()

      workers = [component for component in server._current_components
                 if component.kind() == 'builder']

      data = {
        'status': server._current_status,
@@ -167,8 +168,6 @@
    if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
      self._shutdown_event.set()

-    _report_completion_status(build_job, job_status, executor_name)

  @trollius.coroutine
  def _work_checker(self):
    logger.debug('Initializing work checker')
@@ -249,18 +248,3 @@
    # Initialize the work queue checker.
    yield From(self._work_checker())

-def _report_completion_status(build_job, status, executor_name):
-  metric_queue.build_counter.Inc(labelvalues=[status])
-  metric_queue.repository_build_completed.Inc(labelvalues=[build_job.namespace, build_job.repo_name,
-                                                           status, executor_name or 'executor'])
-
-  if status == BuildJobResult.COMPLETE:
-    status_name = 'CompleteBuilds'
-  elif status == BuildJobResult.ERROR:
-    status_name = 'FailedBuilds'
-  elif status == BuildJobResult.INCOMPLETE:
-    status_name = 'IncompletedBuilds'
-  else:
-    return
-
-  metric_queue.put_deprecated(status_name, 1, unit='Count')


@@ -6,7 +6,6 @@ from mock import Mock, ANY
from six import iteritems
from trollius import coroutine, get_event_loop, From, Future, Return

-from app import metric_queue
from buildman.asyncutil import AsyncWrapper
from buildman.component.buildcomponent import BuildComponent
from buildman.manager.ephemeral import (EphemeralBuilderManager, REALM_PREFIX,
@@ -15,7 +14,6 @@ from buildman.manager.executor import BuilderExecutor, ExecutorException
from buildman.orchestrator import KeyEvent, KeyChange
from buildman.server import BuildJobResult
from util import slash_join
-from util.metrics.metricqueue import duration_collector_async

BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead'
@@ -36,7 +34,6 @@ class TestExecutor(BuilderExecutor):
  job_stopped = None

  @coroutine
-  @duration_collector_async(metric_queue.builder_time_to_start, labelvalues=["testlabel"])
  def start_builder(self, realm, token, build_uuid):
    self.job_started = str(uuid.uuid4())
    raise Return(self.job_started)
@@ -48,7 +45,6 @@
class BadExecutor(BuilderExecutor):
  @coroutine
-  @duration_collector_async(metric_queue.builder_time_to_start, labelvalues=["testlabel"])
  def start_builder(self, realm, token, build_uuid):
    raise ExecutorException('raised on purpose!')


@@ -607,3 +607,6 @@ class DefaultConfig(ImmutableConfig):
  # Feature Flag: Whether garbage collection is enabled.
  FEATURE_GARBAGE_COLLECTION = True

+  # When enabled, sets a tracing callback to report greenlet metrics.
+  GREENLET_TRACING = True


@@ -3,36 +3,41 @@ import uuid
from datetime import datetime, timedelta
from contextlib import contextmanager

+from prometheus_client import Counter, Gauge

from data.database import QueueItem, db, db_for_update, db_random_func
from util.morecollections import AttrDict

+queue_item_puts = Counter('quay_queue_item_puts_total',
+                          'number of items that have been added to the queue',
+                          labelnames=['queue_name'])
+queue_item_gets = Counter('quay_queue_item_gets_total',
+                          'number of times get() has been called on queue',
+                          labelnames=['queue_name', 'availability'])
+queue_item_deletes = Counter('quay_queue_item_deletes_total',
+                             'number of expired queue items that have been deleted')
+
+queue_items_locked = Gauge('quay_queue_items_locked',
+                           'number of queue items that have been acquired',
+                           labelnames=['queue_name'])
+queue_items_available = Gauge('quay_queue_items_available',
+                              'number of queue items that have not expired',
+                              labelnames=['queue_name'])
+queue_items_available_unlocked = Gauge('quay_queue_items_available_unlocked',
+                                       'number of queue items that have not expired and are not locked',
+                                       labelnames=['queue_name'])

MINIMUM_EXTENSION = timedelta(seconds=20)
DEFAULT_BATCH_SIZE = 1000

-class BuildMetricQueueReporter(object):
-  """ Metric queue reporter for the build system. """
-  def __init__(self, metric_queue):
-    self._metric_queue = metric_queue
-
-  def __call__(self, currently_processing, running_count, total_count):
-    need_capacity_count = total_count - running_count
-    self._metric_queue.put_deprecated('BuildCapacityShortage', need_capacity_count, unit='Count')
-    self._metric_queue.build_capacity_shortage.Set(need_capacity_count)
-
-    building_percent = 100 if currently_processing else 0
-    self._metric_queue.percent_building.Set(building_percent)

class WorkQueue(object):
  """ Work queue defines methods for interacting with a queue backed by the database. """
  def __init__(self, queue_name, transaction_factory,
-               canonical_name_match_list=None, reporter=None, metric_queue=None,
-               has_namespace=False):
+               canonical_name_match_list=None, has_namespace=False):
    self._queue_name = queue_name
-    self._reporter = reporter
-    self._metric_queue = metric_queue
    self._transaction_factory = transaction_factory
    self._currently_processing = False
    self._has_namespaced_items = has_namespace
@@ -129,21 +134,10 @@
    return (running_count, available_not_running_count, available_count)

  def update_metrics(self):
-    if self._reporter is None and self._metric_queue is None:
-      return
-
    (running_count, available_not_running_count, available_count) = self.get_metrics()
-
-    if self._metric_queue:
-      self._metric_queue.work_queue_running.Set(running_count, labelvalues=[self._queue_name])
-      self._metric_queue.work_queue_available.Set(available_count, labelvalues=[self._queue_name])
-      self._metric_queue.work_queue_available_not_running.Set(available_not_running_count,
-                                                              labelvalues=[self._queue_name])
-
-    if self._reporter:
-      self._reporter(self._currently_processing, running_count,
-                     running_count + available_not_running_count)
+    queue_items_locked.labels(self._queue_name).set(running_count)
+    queue_items_available.labels(self._queue_name).set(available_count)
+    queue_items_available_unlocked.labels(self._queue_name).set(available_not_running_count)

  def has_retries_remaining(self, item_id):
    """ Returns whether the queue item with the given id has any retries remaining. If the
@@ -204,7 +198,9 @@
    # Chunk the inserted items into batch_size chunks and insert_many
    remaining = list(items_to_insert)
    while remaining:
-      QueueItem.insert_many(remaining[0:batch_size]).execute()
+      current_batch = remaining[0:batch_size]
+      QueueItem.insert_many(current_batch).execute()
+      queue_item_puts.labels(self._queue_name).inc(current_batch)
      remaining = remaining[batch_size:]

  def put(self, canonical_name_list, message, available_after=0, retries_remaining=5):
@@ -214,6 +210,7 @@
    """
    item = QueueItem.create(**self._queue_dict(canonical_name_list, message, available_after,
                                               retries_remaining))
+    queue_item_puts.labels(self._queue_name).inc()
    return str(item.id)

  def _select_available_item(self, ordering_required, now):
@@ -289,15 +286,18 @@
      db_item = self._select_available_item(ordering_required, now)
      if db_item is None:
        self._currently_processing = False
+        queue_item_gets.labels(self._queue_name, 'nonexistant').inc()
        return None

      # Attempt to claim the item for this instance.
      was_claimed = self._attempt_to_claim_item(db_item, now, processing_time)
      if not was_claimed:
        self._currently_processing = False
+        queue_item_gets.labels(self._queue_name, 'claimed').inc()
        return None

    self._currently_processing = True
+    queue_item_gets.labels(self._queue_name, 'acquired').inc()

    # Return a view of the queue item rather than an active db object
    return AttrDict({
@@ -307,8 +307,8 @@
    })

  def cancel(self, item_id):
-    """ Attempts to cancel the queue item with the given ID from the queue. Returns true on success
-        and false if the queue item could not be canceled.
+    """ Attempts to cancel the queue item with the given ID from the queue.
+        Returns true on success and false if the queue item could not be canceled.
    """
    count_removed = QueueItem.delete().where(QueueItem.id == item_id).execute()
    return count_removed > 0
@@ -375,4 +375,5 @@ def delete_expired(expiration_threshold, deletion_threshold, batch_size):
    return 0

  QueueItem.delete().where(QueueItem.id << to_delete).execute()
+  queue_item_deletes.inc(to_delete)
  return len(to_delete)


@@ -7,6 +7,8 @@ from collections import namedtuple
import bitmath
import resumablehashlib

+from prometheus_client import Counter, Histogram

from data.registry_model import registry_model
from data.database import CloseForLongOperation, db_transaction
from digest import digest_tools
@@ -18,6 +20,13 @@ from util.registry.torrent import PieceHasher
logger = logging.getLogger(__name__)

+chunk_upload_duration = Histogram('quay_chunk_upload_duration_seconds',
+                                  'number of seconds for a chunk to be uploaded to the registry',
+                                  labelnames=['region'])
+pushed_bytes_total = Counter('quay_registry_pushed_bytes_total',
+                             'number of bytes pushed to the registry')

BLOB_CONTENT_TYPE = 'application/octet-stream'
@@ -125,13 +134,10 @@ class _BlobUploadManager(object):
    """ Returns the unique ID for the blob upload. """
    return self.blob_upload.upload_id

-  def upload_chunk(self, app_config, input_fp, start_offset=0, length=-1, metric_queue=None):
+  def upload_chunk(self, app_config, input_fp, start_offset=0, length=-1):
    """ Uploads a chunk of data found in the given input file-like interface. start_offset and
        length are optional and should match a range header if any was given.

-        If metric_queue is given, the upload time and chunk size are written into the metrics in
-        the queue.
-
        Returns the total number of bytes uploaded after this upload has completed. Raises
        a BlobUploadException if the upload failed.
    """
@@ -207,11 +213,8 @@
      raise BlobUploadException(upload_error)

    # Update the chunk upload time and push bytes metrics.
-    if metric_queue is not None:
-      metric_queue.chunk_upload_time.Observe(time.time() - start_time, labelvalues=[
-        length_written, list(location_set)[0]])
-      metric_queue.push_byte_count.Inc(length_written)
+    chunk_upload_duration.labels(list(location_set)[0]).observe(time.time() - start_time)
+    pushed_bytes_total.inc(length_written)

    # Ensure we have not gone beyond the max layer size.
    new_blob_bytes = self.blob_upload.byte_count + length_written


@@ -10,7 +10,7 @@ from flask_restful import Resource, abort, Api, reqparse
from flask_restful.utils.cors import crossdomain
from jsonschema import validate, ValidationError

-from app import app, metric_queue, authentication
+from app import app, authentication
from auth.permissions import (ReadRepositoryPermission, ModifyRepositoryPermission,
                              AdministerRepositoryPermission, UserReadPermission,
                              UserAdminPermission)
@@ -25,7 +25,7 @@ from endpoints.csrf import csrf_protect
from endpoints.exception import (Unauthorized, InvalidRequest, InvalidResponse,
                                 FreshLoginRequired, NotFound)
from endpoints.decorators import check_anon_protection, require_xhr_from_browser, check_readonly
-from util.metrics.metricqueue import time_decorator
+from util.metrics.prometheus import timed_blueprint
from util.names import parse_namespace_repository
from util.pagination import encrypt_page_token, decrypt_page_token
from util.request import get_request_ip
@@ -33,7 +33,7 @@ from __init__models_pre_oci import pre_oci_model as model

logger = logging.getLogger(__name__)

-api_bp = Blueprint('api', __name__)
+api_bp = timed_blueprint(Blueprint('api', __name__))

CROSS_DOMAIN_HEADERS = ['Authorization', 'Content-Type', 'X-Requested-With']
@@ -46,10 +46,8 @@ class ApiExceptionHandlingApi(Api):
api = ApiExceptionHandlingApi()
api.init_app(api_bp)
-api.decorators = [csrf_protect(),
-                  crossdomain(origin='*', headers=CROSS_DOMAIN_HEADERS),
-                  process_oauth, time_decorator(api_bp.name, metric_queue),
-                  require_xhr_from_browser]
+api.decorators = [csrf_protect(), crossdomain(origin='*', headers=CROSS_DOMAIN_HEADERS),
+                  process_oauth, require_xhr_from_browser]

def resource(*urls, **kwargs):
@@ -342,7 +340,7 @@ def max_json_size(max_size):
    def wrapped(self, *args, **kwargs):
      if request.is_json and len(request.get_data()) > max_size:
        raise InvalidRequest()

      return func(self, *args, **kwargs)
    return wrapped
  return wrapper


@@ -5,15 +5,13 @@ from functools import wraps
from cnr.exception import Forbidden
from flask import Blueprint

-from app import metric_queue
from auth.permissions import (AdministerRepositoryPermission, ReadRepositoryPermission,
                              ModifyRepositoryPermission)
from endpoints.appr.decorators import require_repo_permission
-from util.metrics.metricqueue import time_blueprint
+from util.metrics.prometheus import timed_blueprint

-appr_bp = Blueprint('appr', __name__)
-time_blueprint(appr_bp, metric_queue)
+appr_bp = timed_blueprint(Blueprint('appr', __name__))
logger = logging.getLogger(__name__)


@@ -5,7 +5,7 @@ from datetime import datetime, timedelta
from flask import request

-from app import app, dockerfile_build_queue, metric_queue
+from app import app, dockerfile_build_queue
from data import model
from data.logs_model import logs_model
from data.database import db, RepositoryState
@@ -55,7 +55,7 @@ def start_build(repository, prepared_build, pull_robot_name=None):
    logger.debug('Prevented queueing of build under namespace %s due to reaching max: %s',
                 repository.namespace_user.username,
                 repository.namespace_user.maximum_queued_builds_count)
    raise MaximumBuildsQueuedException()

  host = app.config['SERVER_HOSTNAME']
  repo_path = '%s/%s/%s' % (host, repository.namespace_user.username, repository.name)
@@ -97,10 +97,6 @@
  build_request.queue_id = queue_id
  build_request.save()

-  # Add the queueing of the build to the metrics queue.
-  metric_queue.repository_build_queued.Inc(labelvalues=[repository.namespace_user.username,
-                                                        repository.name])
-
  # Add the build to the repo's log and spawn the build_queued notification.
  event_log_metadata = {
    'build_id': build_request.uuid,

endpoints/metrics.py (new file)

@@ -0,0 +1,13 @@
from prometheus_client import Counter

image_pulls = Counter('quay_registry_image_pulls_total',
                      'number of images that have been downloaded via the registry',
                      labelnames=['protocol', 'ref', 'status'])

image_pushes = Counter('quay_registry_image_pushes_total',
                       'number of images that have been uploaded via the registry',
                       labelnames=['protocol', 'status'])

image_pulled_bytes = Counter('quay_registry_image_pulled_bytes_total',
                             'number of bytes that have been downloaded via the registry',
                             labelnames=['protocol'])


@@ -6,14 +6,13 @@ from flask import Blueprint, make_response, jsonify
import features

-from app import metric_queue, app
+from app import app
from data.readreplica import ReadOnlyModeException
from endpoints.decorators import anon_protect, anon_allowed
-from util.metrics.metricqueue import time_blueprint
from util.http import abort
+from util.metrics.prometheus import timed_blueprint

-v1_bp = Blueprint('v1', __name__)
-time_blueprint(v1_bp, metric_queue)
+v1_bp = timed_blueprint(Blueprint('v1', __name__))

logger = logging.getLogger(__name__)


@@ -6,7 +6,7 @@ from functools import wraps
from flask import request, make_response, jsonify, session

-from app import userevents, metric_queue, storage, docker_v2_signing_key
+from app import userevents, storage, docker_v2_signing_key
from auth.auth_context import get_authenticated_context, get_authenticated_user
from auth.credentials import validate_credentials, CredentialKind
from auth.decorators import process_auth
@@ -19,6 +19,7 @@ from data.registry_model import registry_model
from data.registry_model.manifestbuilder import create_manifest_builder, lookup_manifest_builder
from endpoints.decorators import (anon_protect, anon_allowed, parse_repository_name,
                                  check_repository_state, check_readonly)
+from endpoints.metrics import image_pulls, image_pushes
from endpoints.v1 import v1_bp, check_v1_push_enabled
from notifications import spawn_notification
from util.audit import track_and_log
@@ -250,11 +251,13 @@ def update_images(namespace_name, repo_name):
                                                    kind_filter='image')
    if repository_ref is None:
      # Make sure the repo actually exists.
+      image_pushes.labels('v1', 404).inc()
      abort(404, message='Unknown repository', issue='unknown-repo')

    builder = lookup_manifest_builder(repository_ref, session.get('manifest_builder'), storage,
                                      docker_v2_signing_key)
    if builder is None:
+      image_pushes.labels('v1', 400).inc()
      abort(400)

    # Generate a job for each notification that has been added to this repo
@@ -267,9 +270,10 @@
    track_and_log('push_repo', repository_ref)
    spawn_notification(repository_ref, 'repo_push', event_data)

-    metric_queue.repository_push.Inc(labelvalues=[namespace_name, repo_name, 'v1', True])
+    image_pushes.labels('v1', 204).inc()
    return make_response('Updated', 204)

+  image_pushes.labels('v1', 403).inc()
  abort(403)
@@ -287,6 +291,7 @@ def get_repository_images(namespace_name, repo_name):
  if permission.can() or (repository_ref and repository_ref.is_public):
    # We can't rely on permissions to tell us if a repo exists anymore
    if repository_ref is None:
+      image_pulls.labels('v1', 'tag', 404).inc()
      abort(404, message='Unknown repository', issue='unknown-repo')

    logger.debug('Building repository image response')
@@ -296,9 +301,10 @@
    track_and_log('pull_repo', repository_ref,
                  analytics_name='pull_repo_100x',
                  analytics_sample=0.01)
-    metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'v1', True])
+    image_pulls.labels('v1', 'tag', 200).inc()
    return resp

+  image_pulls.labels('v1', 'tag', 403).inc()
  abort(403)


@@ -7,7 +7,7 @@ from time import time
from flask import make_response, request, session, Response, redirect, abort as flask_abort

-from app import storage as store, app, docker_v2_signing_key, metric_queue
+from app import storage as store, app, docker_v2_signing_key
from auth.auth_context import get_authenticated_user
from auth.decorators import extract_namespace_repo_from_session, process_auth
from auth.permissions import (ReadRepositoryPermission, ModifyRepositoryPermission)
@@ -16,14 +16,16 @@ from data.registry_model import registry_model
from data.registry_model.blobuploader import upload_blob, BlobUploadSettings, BlobUploadException
from data.registry_model.manifestbuilder import lookup_manifest_builder
from digest import checksums
+from endpoints.metrics import image_pulled_bytes
from endpoints.v1 import v1_bp, check_v1_push_enabled
from endpoints.v1.index import ensure_namespace_enabled
from endpoints.decorators import (anon_protect, check_region_blacklisted, check_repository_state,
                                  check_readonly)
-from util.http import abort, exact_abort
+from util.http import abort
from util.registry.replication import queue_storage_replication
from util.request import get_request_ip

logger = logging.getLogger(__name__)
@@ -127,7 +129,7 @@ def get_image_layer(namespace, repository, image_id, headers):
    abort(404, 'Image %(image_id)s not found', issue='unknown-image', image_id=image_id)

  path = legacy_image.blob.storage_path
-  metric_queue.pull_byte_count.Inc(legacy_image.blob.compressed_size, labelvalues=['v1'])
+  image_pulled_bytes.labels('v1').inc(legacy_image.blob.compressed_size)

  try:
    logger.debug('Looking up the direct download URL for path: %s', path)


@@ -10,10 +10,10 @@ from semantic_version import Spec
import features

-from app import app, metric_queue, get_app_url
+from app import app, get_app_url
from auth.auth_context import get_authenticated_context
-from auth.permissions import (
-    ReadRepositoryPermission, ModifyRepositoryPermission, AdministerRepositoryPermission)
+from auth.permissions import (ReadRepositoryPermission, ModifyRepositoryPermission,
+                              AdministerRepositoryPermission)
from auth.registry_jwt_auth import process_registry_jwt_auth, get_auth_headers
from data.registry_model import registry_model
from data.readreplica import ReadOnlyModeException
@@ -21,14 +21,15 @@ from endpoints.decorators import anon_protect, anon_allowed, route_show_if
from endpoints.v2.errors import (V2RegistryException, Unauthorized, Unsupported, NameUnknown,
                                 ReadOnlyMode)
from util.http import abort
-from util.metrics.metricqueue import time_blueprint
+from util.metrics.prometheus import timed_blueprint
from util.registry.dockerver import docker_version
from util.pagination import encrypt_page_token, decrypt_page_token

logger = logging.getLogger(__name__)

-v2_bp = Blueprint('v2', __name__)
-time_blueprint(v2_bp, metric_queue)
+v2_bp = timed_blueprint(Blueprint('v2', __name__))

@v2_bp.app_errorhandler(V2RegistryException)


@ -3,7 +3,7 @@ import re
from flask import url_for, request, redirect, Response, abort as flask_abort from flask import url_for, request, redirect, Response, abort as flask_abort
from app import storage, app, get_app_url, metric_queue, model_cache from app import storage, app, get_app_url, model_cache
from auth.registry_jwt_auth import process_registry_jwt_auth from auth.registry_jwt_auth import process_registry_jwt_auth
from auth.permissions import ReadRepositoryPermission from auth.permissions import ReadRepositoryPermission
from data import database from data import database
@ -15,6 +15,7 @@ from data.registry_model.blobuploader import (create_blob_upload, retrieve_blob_
from digest import digest_tools from digest import digest_tools
from endpoints.decorators import (anon_protect, anon_allowed, parse_repository_name, from endpoints.decorators import (anon_protect, anon_allowed, parse_repository_name,
check_region_blacklisted, check_readonly) check_region_blacklisted, check_readonly)
from endpoints.metrics import image_pulled_bytes
from endpoints.v2 import v2_bp, require_repo_read, require_repo_write, get_input_stream from endpoints.v2 import v2_bp, require_repo_read, require_repo_write, get_input_stream
from endpoints.v2.errors import ( from endpoints.v2.errors import (
BlobUnknown, BlobUploadInvalid, BlobUploadUnknown, Unsupported, NameUnknown, LayerTooLarge, BlobUnknown, BlobUploadInvalid, BlobUploadUnknown, Unsupported, NameUnknown, LayerTooLarge,
@ -83,7 +84,7 @@ def download_blob(namespace_name, repo_name, digest):
if storage.get_supports_resumable_downloads(blob.placements): if storage.get_supports_resumable_downloads(blob.placements):
headers['Accept-Ranges'] = 'bytes' headers['Accept-Ranges'] = 'bytes'
metric_queue.pull_byte_count.Inc(blob.compressed_size, labelvalues=['v2']) image_pulled_bytes.labels('v2').inc(blob.compressed_size)
# Short-circuit by redirecting if the storage supports it. # Short-circuit by redirecting if the storage supports it.
path = blob.storage_path path = blob.storage_path
@ -434,7 +435,7 @@ def _upload_chunk(blob_uploader, commit_digest=None):
try: try:
# Upload the data received. # Upload the data received.
blob_uploader.upload_chunk(app.config, input_fp, start_offset, length, metric_queue) blob_uploader.upload_chunk(app.config, input_fp, start_offset, length)
if commit_digest is not None: if commit_digest is not None:
# Commit the upload to a blob. # Commit the upload to a blob.
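The endpoints.metrics module that image_pulled_bytes is imported from (and, in the following files, image_pulls and image_pushes) is not shown in this diff. A minimal sketch of what it presumably defines, with metric names invented here and label names inferred from the call sites in these hunks:

# Hypothetical reconstruction of endpoints/metrics.py -- only the label order is
# taken from the call sites in this change; the metric names are placeholders.
from prometheus_client import Counter

image_pulls = Counter('quay_registry_image_pulls_total',
                      'number of images pulled from the registry',
                      labelnames=['protocol', 'ref', 'status'])
image_pushes = Counter('quay_registry_image_pushes_total',
                       'number of images pushed to the registry',
                       labelnames=['protocol', 'status'])
image_pulled_bytes = Counter('quay_registry_image_pulled_estimated_bytes_total',
                             'estimated number of bytes pulled through the registry',
                             labelnames=['protocol'])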

View file

@ -6,12 +6,13 @@ from flask import request, url_for, Response
import features import features
from app import app, metric_queue, storage from app import app, storage
from auth.registry_jwt_auth import process_registry_jwt_auth from auth.registry_jwt_auth import process_registry_jwt_auth
from digest import digest_tools from digest import digest_tools
from data.registry_model import registry_model from data.registry_model import registry_model
from data.model.oci.manifest import CreateManifestException from data.model.oci.manifest import CreateManifestException
from endpoints.decorators import anon_protect, parse_repository_name, check_readonly from endpoints.decorators import anon_protect, parse_repository_name, check_readonly
from endpoints.metrics import image_pulls, image_pushes
from endpoints.v2 import v2_bp, require_repo_read, require_repo_write from endpoints.v2 import v2_bp, require_repo_read, require_repo_write
from endpoints.v2.errors import (ManifestInvalid, ManifestUnknown, NameInvalid, TagExpired, from endpoints.v2.errors import (ManifestInvalid, ManifestUnknown, NameInvalid, TagExpired,
NameUnknown) NameUnknown)
@ -41,6 +42,7 @@ MANIFEST_TAGNAME_ROUTE = BASE_MANIFEST_ROUTE.format(VALID_TAG_PATTERN)
def fetch_manifest_by_tagname(namespace_name, repo_name, manifest_ref): def fetch_manifest_by_tagname(namespace_name, repo_name, manifest_ref):
repository_ref = registry_model.lookup_repository(namespace_name, repo_name) repository_ref = registry_model.lookup_repository(namespace_name, repo_name)
if repository_ref is None: if repository_ref is None:
image_pulls.labels('v2_1', 'tag', 404).inc()
raise NameUnknown() raise NameUnknown()
tag = registry_model.get_repo_tag(repository_ref, manifest_ref) tag = registry_model.get_repo_tag(repository_ref, manifest_ref)
@ -49,23 +51,27 @@ def fetch_manifest_by_tagname(namespace_name, repo_name, manifest_ref):
logger.debug('Found expired tag %s for repository %s/%s', manifest_ref, namespace_name, logger.debug('Found expired tag %s for repository %s/%s', manifest_ref, namespace_name,
repo_name) repo_name)
msg = 'Tag %s was deleted or has expired. To pull, revive via time machine' % manifest_ref msg = 'Tag %s was deleted or has expired. To pull, revive via time machine' % manifest_ref
image_pulls.labels('v2_1', 'tag', 404).inc()
raise TagExpired(msg) raise TagExpired(msg)
image_pulls.labels('v2_1', 'tag', 404).inc()
raise ManifestUnknown() raise ManifestUnknown()
manifest = registry_model.get_manifest_for_tag(tag, backfill_if_necessary=True) manifest = registry_model.get_manifest_for_tag(tag, backfill_if_necessary=True)
if manifest is None: if manifest is None:
# Something went wrong. # Something went wrong.
image_pulls.labels('v2_1', 'tag', 400).inc()
raise ManifestInvalid() raise ManifestInvalid()
manifest_bytes, manifest_digest, manifest_media_type = _rewrite_schema_if_necessary( manifest_bytes, manifest_digest, manifest_media_type = _rewrite_schema_if_necessary(
namespace_name, repo_name, manifest_ref, manifest) namespace_name, repo_name, manifest_ref, manifest)
if manifest_bytes is None: if manifest_bytes is None:
image_pulls.labels('v2_1', 'tag', 404).inc()
raise ManifestUnknown() raise ManifestUnknown()
track_and_log('pull_repo', repository_ref, analytics_name='pull_repo_100x', analytics_sample=0.01, track_and_log('pull_repo', repository_ref, analytics_name='pull_repo_100x', analytics_sample=0.01,
tag=manifest_ref) tag=manifest_ref)
metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'v2', True]) image_pulls.labels('v2_1', 'tag', 200).inc()
return Response( return Response(
manifest_bytes.as_unicode(), manifest_bytes.as_unicode(),
@ -85,19 +91,22 @@ def fetch_manifest_by_tagname(namespace_name, repo_name, manifest_ref):
def fetch_manifest_by_digest(namespace_name, repo_name, manifest_ref): def fetch_manifest_by_digest(namespace_name, repo_name, manifest_ref):
repository_ref = registry_model.lookup_repository(namespace_name, repo_name) repository_ref = registry_model.lookup_repository(namespace_name, repo_name)
if repository_ref is None: if repository_ref is None:
image_pulls.labels('v2_1', 'manifest', 404).inc()
raise NameUnknown() raise NameUnknown()
manifest = registry_model.lookup_manifest_by_digest(repository_ref, manifest_ref) manifest = registry_model.lookup_manifest_by_digest(repository_ref, manifest_ref)
if manifest is None: if manifest is None:
image_pulls.labels('v2_1', 'manifest', 404).inc()
raise ManifestUnknown() raise ManifestUnknown()
manifest_bytes, manifest_digest, manifest_media_type = _rewrite_schema_if_necessary( manifest_bytes, manifest_digest, manifest_media_type = _rewrite_schema_if_necessary(
namespace_name, repo_name, '$digest', manifest) namespace_name, repo_name, '$digest', manifest)
if manifest_digest is None: if manifest_digest is None:
image_pulls.labels('v2_1', 'manifest', 404).inc()
raise ManifestUnknown() raise ManifestUnknown()
track_and_log('pull_repo', repository_ref, manifest_digest=manifest_ref) track_and_log('pull_repo', repository_ref, manifest_digest=manifest_ref)
metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'v2', True]) image_pulls.labels('v2_1', 'manifest', 200).inc()
return Response(manifest_bytes.as_unicode(), status=200, headers={ return Response(manifest_bytes.as_unicode(), status=200, headers={
'Content-Type': manifest_media_type, 'Content-Type': manifest_media_type,
@ -180,6 +189,7 @@ def write_manifest_by_tagname(namespace_name, repo_name, manifest_ref):
def write_manifest_by_digest(namespace_name, repo_name, manifest_ref): def write_manifest_by_digest(namespace_name, repo_name, manifest_ref):
parsed = _parse_manifest() parsed = _parse_manifest()
if parsed.digest != manifest_ref: if parsed.digest != manifest_ref:
image_pushes.labels('v2_invalid', 400).inc()
raise ManifestInvalid(detail={'message': 'manifest digest mismatch'}) raise ManifestInvalid(detail={'message': 'manifest digest mismatch'})
if parsed.schema_version != 2: if parsed.schema_version != 2:
@ -190,14 +200,17 @@ def write_manifest_by_digest(namespace_name, repo_name, manifest_ref):
# manifest with a temporary tag, as it is being pushed as part of a call for a manifest list. # manifest with a temporary tag, as it is being pushed as part of a call for a manifest list.
repository_ref = registry_model.lookup_repository(namespace_name, repo_name) repository_ref = registry_model.lookup_repository(namespace_name, repo_name)
if repository_ref is None: if repository_ref is None:
image_pushes.labels('v2_2', 404).inc()
raise NameUnknown() raise NameUnknown()
expiration_sec = app.config['PUSH_TEMP_TAG_EXPIRATION_SEC'] expiration_sec = app.config['PUSH_TEMP_TAG_EXPIRATION_SEC']
manifest = registry_model.create_manifest_with_temp_tag(repository_ref, parsed, expiration_sec, manifest = registry_model.create_manifest_with_temp_tag(repository_ref, parsed, expiration_sec,
storage) storage)
if manifest is None: if manifest is None:
image_pushes.labels('v2_2', 400).inc()
raise ManifestInvalid() raise ManifestInvalid()
image_pushes.labels('v2_2', 202).inc()
return Response( return Response(
'OK', 'OK',
status=202, status=202,
@ -271,7 +284,7 @@ def _write_manifest_and_log(namespace_name, repo_name, tag_name, manifest_impl):
track_and_log('push_repo', repository_ref, tag=tag_name) track_and_log('push_repo', repository_ref, tag=tag_name)
spawn_notification(repository_ref, 'repo_push', {'updated_tags': [tag_name]}) spawn_notification(repository_ref, 'repo_push', {'updated_tags': [tag_name]})
metric_queue.repository_push.Inc(labelvalues=[namespace_name, repo_name, 'v2', True]) image_pushes.labels('v2_1', 202).inc()
return Response( return Response(
'OK', 'OK',

View file

@ -3,11 +3,14 @@ import json
import logging import logging
import uuid import uuid
from functools import wraps
from flask import redirect, Blueprint, abort, send_file, make_response, request from flask import redirect, Blueprint, abort, send_file, make_response, request
from prometheus_client import Counter
import features import features
from app import app, signer, storage, metric_queue, config_provider, ip_resolver, instance_keys from app import app, signer, storage, config_provider, ip_resolver, instance_keys
from auth.auth_context import get_authenticated_user from auth.auth_context import get_authenticated_user
from auth.decorators import process_auth from auth.decorators import process_auth
from auth.permissions import ReadRepositoryPermission from auth.permissions import ReadRepositoryPermission
@ -16,6 +19,7 @@ from data import model
from data.registry_model import registry_model from data.registry_model import registry_model
from endpoints.decorators import (anon_protect, anon_allowed, route_show_if, parse_repository_name, from endpoints.decorators import (anon_protect, anon_allowed, route_show_if, parse_repository_name,
check_region_blacklisted) check_region_blacklisted)
from endpoints.metrics import image_pulls, image_pulled_bytes
from endpoints.v2.blob import BLOB_DIGEST_ROUTE from endpoints.v2.blob import BLOB_DIGEST_ROUTE
from image.appc import AppCImageFormatter from image.appc import AppCImageFormatter
from image.docker import ManifestException from image.docker import ManifestException
@ -32,6 +36,10 @@ from util.registry.torrent import (make_torrent, per_user_torrent_filename, publ
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
verb_stream_passes = Counter('quay_verb_stream_passes_total',
'number of passes over a tar stream used by verb requests',
labelnames=['kind'])
verbs = Blueprint('verbs', __name__) verbs = Blueprint('verbs', __name__)
LAYER_MIMETYPE = 'binary/octet-stream' LAYER_MIMETYPE = 'binary/octet-stream'
@ -42,7 +50,7 @@ class VerbReporter(TarLayerFormatterReporter):
self.kind = kind self.kind = kind
def report_pass(self, pass_count): def report_pass(self, pass_count):
metric_queue.verb_action_passes.Inc(labelvalues=[self.kind, pass_count]) verb_stream_passes.labels(self.kind).inc(pass_count)
def _open_stream(formatter, tag, schema1_manifest, derived_image_id, handlers, reporter): def _open_stream(formatter, tag, schema1_manifest, derived_image_id, handlers, reporter):
@ -65,7 +73,7 @@ def _open_stream(formatter, tag, schema1_manifest, derived_image_id, handlers, r
def tar_stream_getter_iterator(): def tar_stream_getter_iterator():
# Re-Initialize the storage engine because some may not respond well to forking (e.g. S3) # Re-Initialize the storage engine because some may not respond well to forking (e.g. S3)
store = Storage(app, metric_queue, config_provider=config_provider, ip_resolver=ip_resolver) store = Storage(app, config_provider=config_provider, ip_resolver=ip_resolver)
# Note: We reverse because we have to start at the leaf layer and move upward, # Note: We reverse because we have to start at the leaf layer and move upward,
# as per the spec for the formatters. # as per the spec for the formatters.
@ -112,7 +120,7 @@ def _write_derived_image_to_storage(verb, derived_image, queue_file):
queue_file.add_exception_handler(handle_exception) queue_file.add_exception_handler(handle_exception)
# Re-Initialize the storage engine because some may not respond well to forking (e.g. S3) # Re-Initialize the storage engine because some may not respond well to forking (e.g. S3)
store = Storage(app, metric_queue, config_provider=config_provider, ip_resolver=ip_resolver) store = Storage(app, config_provider=config_provider, ip_resolver=ip_resolver)
try: try:
store.stream_write(derived_image.blob.placements, derived_image.blob.storage_path, queue_file) store.stream_write(derived_image.blob.placements, derived_image.blob.storage_path, queue_file)
@ -293,12 +301,10 @@ def _repo_verb(namespace, repository, tag_name, verb, formatter, sign=False, che
# Check for torrent. If found, we return a torrent for the repo verb image (if the derived # Check for torrent. If found, we return a torrent for the repo verb image (if the derived
# image already exists). # image already exists).
if request.accept_mimetypes.best == 'application/x-bittorrent': if request.accept_mimetypes.best == 'application/x-bittorrent':
metric_queue.repository_pull.Inc(labelvalues=[namespace, repository, verb + '+torrent', True])
return _torrent_repo_verb(repo, tag, manifest, verb, **kwargs) return _torrent_repo_verb(repo, tag, manifest, verb, **kwargs)
# Log the action. # Log the action.
track_and_log('repo_verb', wrap_repository(repo), tag=tag.name, verb=verb, **kwargs) track_and_log('repo_verb', wrap_repository(repo), tag=tag.name, verb=verb, **kwargs)
metric_queue.repository_pull.Inc(labelvalues=[namespace, repository, verb, True])
is_readonly = app.config.get('REGISTRY_STATE', 'normal') == 'readonly' is_readonly = app.config.get('REGISTRY_STATE', 'normal') == 'readonly'
@ -321,7 +327,7 @@ def _repo_verb(namespace, repository, tag_name, verb, formatter, sign=False, che
logger.debug('Derived %s image %s exists in storage', verb, derived_image) logger.debug('Derived %s image %s exists in storage', verb, derived_image)
is_head_request = request.method == 'HEAD' is_head_request = request.method == 'HEAD'
metric_queue.pull_byte_count.Inc(derived_image.blob.compressed_size, labelvalues=[verb]) image_pulled_bytes.labels('bittorrent').inc(derived_image.blob.compressed_size)
download_url = storage.get_direct_download_url(derived_image.blob.placements, download_url = storage.get_direct_download_url(derived_image.blob.placements,
derived_image.blob.storage_path, derived_image.blob.storage_path,
@ -435,10 +441,25 @@ def os_arch_checker(os, arch):
return checker return checker
def observe_route(protocol):
"""
Decorates a verb route handler to record the image_pulls metric in Prometheus.
"""
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
rv = func(*args, **kwargs)
image_pulls.labels(protocol, 'tag', rv.status_code).inc()
return rv
return wrapper
return decorator
@route_show_if(features.ACI_CONVERSION) @route_show_if(features.ACI_CONVERSION)
@anon_protect @anon_protect
@verbs.route('/aci/<server>/<namespace>/<repository>/<tag>/sig/<os>/<arch>/', methods=['GET']) @verbs.route('/aci/<server>/<namespace>/<repository>/<tag>/sig/<os>/<arch>/', methods=['GET'])
@verbs.route('/aci/<server>/<namespace>/<repository>/<tag>/aci.asc/<os>/<arch>/', methods=['GET']) @verbs.route('/aci/<server>/<namespace>/<repository>/<tag>/aci.asc/<os>/<arch>/', methods=['GET'])
@observe_route('aci')
@process_auth @process_auth
def get_aci_signature(server, namespace, repository, tag, os, arch): def get_aci_signature(server, namespace, repository, tag, os, arch):
return _repo_verb_signature(namespace, repository, tag, 'aci', checker=os_arch_checker(os, arch), return _repo_verb_signature(namespace, repository, tag, 'aci', checker=os_arch_checker(os, arch),
@ -449,6 +470,7 @@ def get_aci_signature(server, namespace, repository, tag, os, arch):
@anon_protect @anon_protect
@verbs.route('/aci/<server>/<namespace>/<repository>/<tag>/aci/<os>/<arch>/', methods=[ @verbs.route('/aci/<server>/<namespace>/<repository>/<tag>/aci/<os>/<arch>/', methods=[
'GET', 'HEAD']) 'GET', 'HEAD'])
@observe_route('aci')
@process_auth @process_auth
def get_aci_image(server, namespace, repository, tag, os, arch): def get_aci_image(server, namespace, repository, tag, os, arch):
return _repo_verb(namespace, repository, tag, 'aci', return _repo_verb(namespace, repository, tag, 'aci',
@ -458,6 +480,7 @@ def get_aci_image(server, namespace, repository, tag, os, arch):
@anon_protect @anon_protect
@verbs.route('/squash/<namespace>/<repository>/<tag>', methods=['GET']) @verbs.route('/squash/<namespace>/<repository>/<tag>', methods=['GET'])
@observe_route('squash')
@process_auth @process_auth
def get_squashed_tag(namespace, repository, tag): def get_squashed_tag(namespace, repository, tag):
return _repo_verb(namespace, repository, tag, 'squash', SquashedDockerImageFormatter()) return _repo_verb(namespace, repository, tag, 'squash', SquashedDockerImageFormatter())
@ -466,6 +489,7 @@ def get_squashed_tag(namespace, repository, tag):
@route_show_if(features.BITTORRENT) @route_show_if(features.BITTORRENT)
@anon_protect @anon_protect
@verbs.route('/torrent{0}'.format(BLOB_DIGEST_ROUTE), methods=['GET']) @verbs.route('/torrent{0}'.format(BLOB_DIGEST_ROUTE), methods=['GET'])
@observe_route('bittorrent')
@process_auth @process_auth
@parse_repository_name() @parse_repository_name()
@check_region_blacklisted(namespace_name_kwarg='namespace_name') @check_region_blacklisted(namespace_name_kwarg='namespace_name')
@ -493,7 +517,6 @@ def get_tag_torrent(namespace_name, repo_name, digest):
if blob is None: if blob is None:
abort(404) abort(404)
metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'torrent', True])
return _torrent_for_blob(blob, repo_is_public) return _torrent_for_blob(blob, repo_is_public)
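A self-contained sketch of the observe_route pattern defined above, outside Flask, using a stand-in response type and a throwaway counter (both invented for illustration) to show that each decorated call increments exactly one labeled time series:

from collections import namedtuple
from functools import wraps

from prometheus_client import REGISTRY, Counter

demo_pulls = Counter('demo_image_pulls_total', 'stand-in for the image_pulls counter',
                     labelnames=['protocol', 'ref', 'status'])
FakeResponse = namedtuple('FakeResponse', ['status_code'])

def observe_route(protocol):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            rv = func(*args, **kwargs)
            demo_pulls.labels(protocol, 'tag', rv.status_code).inc()
            return rv
        return wrapper
    return decorator

@observe_route('squash')
def fake_view():
    return FakeResponse(status_code=200)

fake_view()
assert REGISTRY.get_sample_value('demo_image_pulls_total',
                                 {'protocol': 'squash', 'ref': 'tag', 'status': '200'}) == 1.0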

View file

@ -16,7 +16,7 @@ disable=missing-docstring,invalid-name,too-many-locals,too-few-public-methods,to
# List of module names for which member attributes should not be checked # List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime # (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis # and thus existing member attributes cannot be deduced by static analysis
ignored-modules=features ignored-modules=features,greenlet
# List of members which are set dynamically and missed by pylint inference # List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E0201 when accessed. Python regular # system, and so shouldn't trigger E0201 when accessed. Python regular

View file

@ -22,42 +22,41 @@ STORAGE_DRIVER_CLASSES = {
} }
def get_storage_driver(location, metric_queue, chunk_cleanup_queue, config_provider, ip_resolver, def get_storage_driver(location, chunk_cleanup_queue, config_provider, ip_resolver,
storage_params): storage_params):
""" Returns a storage driver class for the given storage configuration """ Returns a storage driver class for the given storage configuration
(a pair of string name and a dict of parameters). """ (a pair of string name and a dict of parameters). """
driver = storage_params[0] driver = storage_params[0]
parameters = storage_params[1] parameters = storage_params[1]
driver_class = STORAGE_DRIVER_CLASSES.get(driver, FakeStorage) driver_class = STORAGE_DRIVER_CLASSES.get(driver, FakeStorage)
context = StorageContext(location, metric_queue, chunk_cleanup_queue, config_provider, context = StorageContext(location, chunk_cleanup_queue, config_provider,
ip_resolver) ip_resolver)
return driver_class(context, **parameters) return driver_class(context, **parameters)
class StorageContext(object): class StorageContext(object):
def __init__(self, location, metric_queue, chunk_cleanup_queue, config_provider, ip_resolver): def __init__(self, location, chunk_cleanup_queue, config_provider, ip_resolver):
self.location = location self.location = location
self.metric_queue = metric_queue
self.chunk_cleanup_queue = chunk_cleanup_queue self.chunk_cleanup_queue = chunk_cleanup_queue
self.config_provider = config_provider self.config_provider = config_provider
self.ip_resolver = ip_resolver or NoopIPResolver() self.ip_resolver = ip_resolver or NoopIPResolver()
class Storage(object): class Storage(object):
def __init__(self, app=None, metric_queue=None, chunk_cleanup_queue=None, instance_keys=None, def __init__(self, app=None, chunk_cleanup_queue=None, instance_keys=None,
config_provider=None, ip_resolver=None): config_provider=None, ip_resolver=None):
self.app = app self.app = app
if app is not None: if app is not None:
self.state = self.init_app(app, metric_queue, chunk_cleanup_queue, instance_keys, self.state = self.init_app(app, chunk_cleanup_queue, instance_keys,
config_provider, ip_resolver) config_provider, ip_resolver)
else: else:
self.state = None self.state = None
def init_app(self, app, metric_queue, chunk_cleanup_queue, instance_keys, config_provider, def init_app(self, app, chunk_cleanup_queue, instance_keys, config_provider,
ip_resolver): ip_resolver):
storages = {} storages = {}
for location, storage_params in app.config.get('DISTRIBUTED_STORAGE_CONFIG').items(): for location, storage_params in app.config.get('DISTRIBUTED_STORAGE_CONFIG').items():
storages[location] = get_storage_driver(location, metric_queue, chunk_cleanup_queue, storages[location] = get_storage_driver(location, chunk_cleanup_queue,
config_provider, ip_resolver, storage_params) config_provider, ip_resolver, storage_params)
preference = app.config.get('DISTRIBUTED_STORAGE_PREFERENCE', None) preference = app.config.get('DISTRIBUTED_STORAGE_PREFERENCE', None)
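StorageContext now takes four arguments instead of five; a minimal sketch of the new constructor shape, matching how the test fixtures further down build it:

from storage import StorageContext

# Positional parameters are now: location, chunk_cleanup_queue, config_provider, ip_resolver.
# Passing None for ip_resolver falls back to NoopIPResolver, per __init__ above.
context = StorageContext('nyc', None, None, None)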

View file

@ -3,27 +3,26 @@ import os
import logging import logging
import copy import copy
from cryptography.hazmat.backends import default_backend from collections import namedtuple
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import padding
from cachetools.func import lru_cache
from itertools import chain
from datetime import datetime, timedelta from datetime import datetime, timedelta
from io import BufferedIOBase
from itertools import chain
from uuid import uuid4
from botocore.signers import CloudFrontSigner
from boto.exception import S3ResponseError
import boto.s3.connection import boto.s3.connection
import boto.s3.multipart import boto.s3.multipart
import boto.gs.connection import boto.gs.connection
import boto.s3.key import boto.s3.key
import boto.gs.key import boto.gs.key
from io import BufferedIOBase from boto.exception import S3ResponseError
from uuid import uuid4 from botocore.signers import CloudFrontSigner
from collections import namedtuple from cachetools.func import lru_cache
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import padding
from prometheus_client import Counter
from util.registry import filelike from util.registry import filelike
from storage.basestorage import BaseStorageV2 from storage.basestorage import BaseStorageV2
@ -31,6 +30,13 @@ from storage.basestorage import BaseStorageV2
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
multipart_uploads_started = Counter('quay_multipart_uploads_started_total',
'number of multipart uploads to Quay storage that started')
multipart_uploads_completed = Counter('quay_multipart_uploads_completed_total',
'number of multipart uploads to Quay storage that completed')
_PartUploadMetadata = namedtuple('_PartUploadMetadata', ['path', 'offset', 'length']) _PartUploadMetadata = namedtuple('_PartUploadMetadata', ['path', 'offset', 'length'])
_CHUNKS_KEY = 'chunks' _CHUNKS_KEY = 'chunks'
@ -181,8 +187,7 @@ class _CloudStorage(BaseStorageV2):
if content_encoding is not None: if content_encoding is not None:
metadata['Content-Encoding'] = content_encoding metadata['Content-Encoding'] = content_encoding
if self._context.metric_queue is not None: multipart_uploads_started.inc()
self._context.metric_queue.multipart_upload_start.Inc()
return self._cloud_bucket.initiate_multipart_upload(path, metadata=metadata, return self._cloud_bucket.initiate_multipart_upload(path, metadata=metadata,
**self._upload_params) **self._upload_params)
@ -237,8 +242,7 @@ class _CloudStorage(BaseStorageV2):
logger.warn('Error when writing to stream in stream_write_internal at path %s: %s', path, e) logger.warn('Error when writing to stream in stream_write_internal at path %s: %s', path, e)
write_error = e write_error = e
if self._context.metric_queue is not None: multipart_uploads_completed.inc()
self._context.metric_queue.multipart_upload_end.Inc(labelvalues=['failure'])
if cancel_on_error: if cancel_on_error:
try: try:
@ -251,8 +255,7 @@ class _CloudStorage(BaseStorageV2):
break break
if total_bytes_written > 0: if total_bytes_written > 0:
if self._context.metric_queue is not None: multipart_uploads_completed.inc()
self._context.metric_queue.multipart_upload_end.Inc(labelvalues=['success'])
self._perform_action_with_retry(mp.complete_upload) self._perform_action_with_retry(mp.complete_upload)

View file

@ -18,7 +18,7 @@ _TEST_BUCKET = 'some_bucket'
_TEST_USER = 'someuser' _TEST_USER = 'someuser'
_TEST_PASSWORD = 'somepassword' _TEST_PASSWORD = 'somepassword'
_TEST_PATH = 'some/cool/path' _TEST_PATH = 'some/cool/path'
_TEST_CONTEXT = StorageContext('nyc', None, None, None, None) _TEST_CONTEXT = StorageContext('nyc', None, None, None)
@pytest.fixture(scope='function') @pytest.fixture(scope='function')

View file

@ -44,7 +44,7 @@ def test_direct_download(test_aws_ip, test_empty_ip_range_cache, test_ip_range_c
if ipranges_populated: if ipranges_populated:
ipresolver.sync_token = test_ip_range_cache['sync_token'] if ipranges_populated else test_empty_ip_range_cache['sync_token'] ipresolver.sync_token = test_ip_range_cache['sync_token'] if ipranges_populated else test_empty_ip_range_cache['sync_token']
ipresolver.amazon_ranges = test_ip_range_cache['all_amazon'] if ipranges_populated else test_empty_ip_range_cache['all_amazon'] ipresolver.amazon_ranges = test_ip_range_cache['all_amazon'] if ipranges_populated else test_empty_ip_range_cache['all_amazon']
context = StorageContext('nyc', None, None, config_provider, ipresolver) context = StorageContext('nyc', None, config_provider, ipresolver)
# Create a test bucket and put some test content. # Create a test bucket and put some test content.
boto.connect_s3().create_bucket(_TEST_BUCKET) boto.connect_s3().create_bucket(_TEST_BUCKET)
@ -68,7 +68,7 @@ def test_direct_download(test_aws_ip, test_empty_ip_range_cache, test_ip_range_c
@mock_s3 @mock_s3
def test_direct_download_no_ip(test_aws_ip, aws_ip_range_data, ipranges_populated, app): def test_direct_download_no_ip(test_aws_ip, aws_ip_range_data, ipranges_populated, app):
ipresolver = IPResolver(app) ipresolver = IPResolver(app)
context = StorageContext('nyc', None, None, config_provider, ipresolver) context = StorageContext('nyc', None, config_provider, ipresolver)
# Create a test bucket and put some test content. # Create a test bucket and put some test content.
boto.connect_s3().create_bucket(_TEST_BUCKET) boto.connect_s3().create_bucket(_TEST_BUCKET)

View file

@ -11,7 +11,7 @@ from storage.swift import SwiftStorage, _EMPTY_SEGMENTS_KEY
from swiftclient.client import ClientException from swiftclient.client import ClientException
base_args = { base_args = {
'context': StorageContext('nyc', None, None, None, None), 'context': StorageContext('nyc', None, None, None),
'swift_container': 'container-name', 'swift_container': 'container-name',
'storage_path': '/basepath', 'storage_path': '/basepath',
'auth_url': 'https://auth.com', 'auth_url': 'https://auth.com',
@ -265,7 +265,7 @@ def test_cancel_chunked_upload():
chunk_cleanup_queue = FakeQueue() chunk_cleanup_queue = FakeQueue()
args = dict(base_args) args = dict(base_args)
args['context'] = StorageContext('nyc', None, chunk_cleanup_queue, None, None) args['context'] = StorageContext('nyc', chunk_cleanup_queue, None, None)
swift = FakeSwiftStorage(**args) swift = FakeSwiftStorage(**args)
uuid, metadata = swift.initiate_chunked_upload() uuid, metadata = swift.initiate_chunked_upload()
@ -288,7 +288,7 @@ def test_cancel_chunked_upload():
def test_empty_chunks_queued_for_deletion(): def test_empty_chunks_queued_for_deletion():
chunk_cleanup_queue = FakeQueue() chunk_cleanup_queue = FakeQueue()
args = dict(base_args) args = dict(base_args)
args['context'] = StorageContext('nyc', None, chunk_cleanup_queue, None, None) args['context'] = StorageContext('nyc', chunk_cleanup_queue, None, None)
swift = FakeSwiftStorage(**args) swift = FakeSwiftStorage(**args)
uuid, metadata = swift.initiate_chunked_upload() uuid, metadata = swift.initiate_chunked_upload()

View file

@ -31,7 +31,7 @@ from jwkest.jwk import RSAKey
import endpoints.decorated # required for side effect import endpoints.decorated # required for side effect
from app import app, storage, instance_keys, get_app_url, metric_queue from app import app, storage, instance_keys, get_app_url
from data.database import close_db_filter, configure, DerivedStorageForImage, QueueItem, Image from data.database import close_db_filter, configure, DerivedStorageForImage, QueueItem, Image
from data import model from data import model
from digest.checksums import compute_simple from digest.checksums import compute_simple
@ -2228,7 +2228,7 @@ class V2LoginTests(V2RegistryLoginMixin, LoginTests, RegistryTestCaseMixin, Base
encoded = response.json()['token'] encoded = response.json()['token']
header = 'Bearer ' + encoded header = 'Bearer ' + encoded
payload = decode_bearer_header(header, instance_keys, app.config, metric_queue=metric_queue) payload = decode_bearer_header(header, instance_keys, app.config)
self.assertIsNotNone(payload) self.assertIsNotNone(payload)
if scope is None: if scope is None:

64
util/greenlet_tracing.py Normal file
View file

@ -0,0 +1,64 @@
from time import time
from gevent.hub import get_hub
from greenlet import settrace
from prometheus_client import Counter, Histogram
greenlet_switch = Counter('greenlet_switch_total', 'number of greenlet context switches')
greenlet_throw = Counter('greenlet_throw_total', 'number of greenlet throws')
greenlet_duration = Histogram('greenlet_duration_seconds',
'seconds in which a particular greenlet is executing',
buckets=[.01, .025, .05, .1, .25, .5, 1.0, 2.5, 5.0])
_latest_switch = None
def enable_tracing():
settrace(greenlet_callback)
def greenlet_callback(event, args):
"""
This is a callback that is executed by greenlet on all events.
"""
if event in ('switch', 'throw'):
# It's only safe to unpack args under these two events.
(origin, _target) = args
if origin is get_hub():
# This is the hub greenlet, which runs the event loop itself; nothing to record.
return
if event == 'switch':
switch_callback(args)
return
if event == 'throw':
throw_callback(args)
return
def switch_callback(_args):
"""
This is a callback that is executed specifically on greenlet switches.
"""
global _latest_switch
greenlet_switch.inc()
if _latest_switch is None:
# This is the first switch.
_latest_switch = time()
return
now = time()
greenlet_duration.observe(now - _latest_switch)
_latest_switch = now
def throw_callback(_args):
"""
This is a callback that is executed on exceptions thrown from origin to target.
This callback runs in the context of the target greenlet, and any exception it raises will
replace the original, as if target.throw() had been used with the replacement exception.
"""
greenlet_throw.inc()
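A hedged sketch of turning the tracer on in a standalone gevent process (the HTTP exporter and port here are arbitrary; the service itself pushes metrics to a gateway instead):

# Assumes a gevent-patched process; enable_tracing comes from the new module above.
from gevent import monkey; monkey.patch_all()

import gevent
from prometheus_client import start_http_server

from util.greenlet_tracing import enable_tracing

start_http_server(9090)  # exposes greenlet_switch_total, greenlet_throw_total, greenlet_duration_seconds
enable_tracing()

def busy():
    for _ in range(5):
        gevent.sleep(0.1)  # each sleep yields to the hub, so a switch is recorded

gevent.joinall([gevent.spawn(busy) for _ in range(3)])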

View file

@ -1,210 +0,0 @@
import datetime
import logging
import time
from functools import wraps
from Queue import Queue, Full
from flask import g, request
from trollius import Return
logger = logging.getLogger(__name__)
# Buckets for the API response times.
API_RESPONSE_TIME_BUCKETS = [.01, .025, .05, .1, .25, .5, 1.0, 2.5, 5.0]
# Buckets for the builder start times.
BUILDER_START_TIME_BUCKETS = [.5, 1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 180.0, 240.0, 300.0, 600.0]
class MetricQueue(object):
""" Object to which various metrics are written, for distribution to metrics collection
system(s) such as Prometheus.
"""
def __init__(self, prom):
# Define the various exported metrics.
self.resp_time = prom.create_histogram('response_time', 'HTTP response time in seconds',
labelnames=['endpoint'],
buckets=API_RESPONSE_TIME_BUCKETS)
self.resp_code = prom.create_counter('response_code', 'HTTP response code',
labelnames=['endpoint', 'code'])
self.non_200 = prom.create_counter('response_non200', 'Non-200 HTTP response codes',
labelnames=['endpoint'])
self.error_500 = prom.create_counter('response_500', '5XX HTTP response codes',
labelnames=['endpoint'])
self.multipart_upload_start = prom.create_counter('multipart_upload_start',
'Multipart upload started')
self.multipart_upload_end = prom.create_counter('multipart_upload_end',
'Multipart upload ends.', labelnames=['type'])
self.build_capacity_shortage = prom.create_gauge('build_capacity_shortage',
'Build capacity shortage.')
self.builder_time_to_start = prom.create_histogram('builder_tts',
'Time from triggering to starting a builder.',
labelnames=['builder_type'],
buckets=BUILDER_START_TIME_BUCKETS)
self.builder_time_to_build = prom.create_histogram('builder_ttb',
'Time from triggering to actually starting a build',
labelnames=['builder_type'],
buckets=BUILDER_START_TIME_BUCKETS)
self.build_time = prom.create_histogram('build_time', 'Time spent building', labelnames=['builder_type'])
self.builder_fallback = prom.create_counter('builder_fallback', 'Builder fell back to secondary executor')
self.build_start_success = prom.create_counter('build_start_success', 'Executor succeeded in starting a build', labelnames=['builder_type'])
self.build_start_failure = prom.create_counter('build_start_failure', 'Executor failed to start a build', labelnames=['builder_type'])
self.percent_building = prom.create_gauge('build_percent_building', 'Percent building.')
self.build_counter = prom.create_counter('builds', 'Number of builds', labelnames=['name'])
self.ephemeral_build_workers = prom.create_counter('ephemeral_build_workers',
'Number of started ephemeral build workers')
self.ephemeral_build_worker_failure = prom.create_counter('ephemeral_build_worker_failure',
'Number of failed-to-start ephemeral build workers')
self.work_queue_running = prom.create_gauge('work_queue_running', 'Running items in a queue',
labelnames=['queue_name'])
self.work_queue_available = prom.create_gauge('work_queue_available',
'Available items in a queue',
labelnames=['queue_name'])
self.work_queue_available_not_running = prom.create_gauge('work_queue_available_not_running',
'Available items that are not yet running',
labelnames=['queue_name'])
self.repository_pull = prom.create_counter('repository_pull', 'Repository Pull Count',
labelnames=['namespace', 'repo_name', 'protocol',
'status'])
self.repository_push = prom.create_counter('repository_push', 'Repository Push Count',
labelnames=['namespace', 'repo_name', 'protocol',
'status'])
self.repository_build_queued = prom.create_counter('repository_build_queued',
'Repository Build Queued Count',
labelnames=['namespace', 'repo_name'])
self.repository_build_completed = prom.create_counter('repository_build_completed',
'Repository Build Complete Count',
labelnames=['namespace', 'repo_name',
'status', 'executor'])
self.chunk_size = prom.create_histogram('chunk_size',
'Registry blob chunk size',
labelnames=['storage_region'])
self.chunk_upload_time = prom.create_histogram('chunk_upload_time',
'Registry blob chunk upload time',
labelnames=['storage_region'])
self.authentication_count = prom.create_counter('authentication_count',
'Authentication count',
labelnames=['kind', 'status'])
self.repository_count = prom.create_gauge('repository_count', 'Number of repositories')
self.user_count = prom.create_gauge('user_count', 'Number of users')
self.org_count = prom.create_gauge('org_count', 'Number of Organizations')
self.robot_count = prom.create_gauge('robot_count', 'Number of robot accounts')
self.instance_key_renewal_success = prom.create_counter('instance_key_renewal_success',
'Instance Key Renewal Success Count',
labelnames=['key_id'])
self.instance_key_renewal_failure = prom.create_counter('instance_key_renewal_failure',
'Instance Key Renewal Failure Count',
labelnames=['key_id'])
self.invalid_instance_key_count = prom.create_counter('invalid_registry_instance_key_count',
'Invalid registry instance key count',
labelnames=['key_id'])
self.verb_action_passes = prom.create_counter('verb_action_passes', 'Verb Pass Count',
labelnames=['kind', 'pass_count'])
self.push_byte_count = prom.create_counter('registry_push_byte_count',
'Number of bytes pushed to the registry')
self.pull_byte_count = prom.create_counter('estimated_registry_pull_byte_count',
'Number of (estimated) bytes pulled from the registry',
labelnames=['protocol_version'])
# Deprecated: Define an in-memory queue for reporting metrics to CloudWatch or another
# provider.
self._queue = None
def enable_deprecated(self, maxsize=10000):
self._queue = Queue(maxsize)
def put_deprecated(self, name, value, **kwargs):
if self._queue is None:
logger.debug('No metric queue %s %s %s', name, value, kwargs)
return
try:
kwargs.setdefault('timestamp', datetime.datetime.now())
kwargs.setdefault('dimensions', {})
self._queue.put_nowait((name, value, kwargs))
except Full:
logger.error('Metric queue full')
def get_deprecated(self):
return self._queue.get()
def get_nowait_deprecated(self):
return self._queue.get_nowait()
def duration_collector_async(metric, labelvalues):
""" Decorates a method to have its duration time logged to the metric. """
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
trigger_time = time.time()
try:
rv = func(*args, **kwargs)
except Return as e:
metric.Observe(time.time() - trigger_time, labelvalues=labelvalues)
raise e
return rv
return wrapper
return decorator
def time_decorator(name, metric_queue):
""" Decorates an endpoint method to have its request time logged to the metrics queue. """
after = _time_after_request(name, metric_queue)
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
_time_before_request()
rv = func(*args, **kwargs)
after(rv)
return rv
return wrapper
return decorator
def time_blueprint(bp, metric_queue):
""" Decorates a blueprint to have its request time logged to the metrics queue. """
bp.before_request(_time_before_request)
bp.after_request(_time_after_request(bp.name, metric_queue))
def _time_before_request():
g._request_start_time = time.time()
def _time_after_request(name, metric_queue):
def f(r):
start = getattr(g, '_request_start_time', None)
if start is None:
return r
dur = time.time() - start
metric_queue.resp_time.Observe(dur, labelvalues=[request.endpoint])
metric_queue.resp_code.Inc(labelvalues=[request.endpoint, r.status_code])
if r.status_code >= 500:
metric_queue.error_500.Inc(labelvalues=[request.endpoint])
elif r.status_code < 200 or r.status_code >= 300:
metric_queue.non_200.Inc(labelvalues=[request.endpoint])
return r
return f

View file

@ -1,18 +1,23 @@
import datetime
import json
import logging import logging
import time
import threading
from Queue import Queue, Full, Empty from flask import g, request
from threading import Thread from prometheus_client import push_to_gateway, REGISTRY, Histogram
import requests
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
QUEUE_MAX = 1000
MAX_BATCH_SIZE = 100 request_duration = Histogram('quay_request_duration_seconds',
REGISTER_WAIT = datetime.timedelta(hours=1) 'seconds taken to process a request',
labelnames=['method', 'endpoint', 'status'],
buckets=[.01, .025, .05, .1, .25, .5, 1.0, 2.5, 5.0])
PROMETHEUS_PUSH_INTERVAL_SECONDS = 30
ONE_DAY_IN_SECONDS = 60 * 60 * 24
class PrometheusPlugin(object): class PrometheusPlugin(object):
""" Application plugin for reporting metrics to Prometheus. """ """ Application plugin for reporting metrics to Prometheus. """
@ -24,145 +29,54 @@ class PrometheusPlugin(object):
self.state = None self.state = None
def init_app(self, app): def init_app(self, app):
prom_url = app.config.get('PROMETHEUS_AGGREGATOR_URL') pusher = ThreadPusher(app)
prom_namespace = app.config.get('PROMETHEUS_NAMESPACE') pusher.start()
logger.debug('Initializing prometheus with aggregator url: %s', prom_url)
prometheus = Prometheus(prom_url, prom_namespace)
# register extension with app # register extension with app
app.extensions = getattr(app, 'extensions', {}) app.extensions = getattr(app, 'extensions', {})
app.extensions['prometheus'] = prometheus app.extensions['prometheus'] = pusher
return prometheus return pusher
def __getattr__(self, name): def __getattr__(self, name):
return getattr(self.state, name, None) return getattr(self.state, name, None)
class Prometheus(object): class ThreadPusher(threading.Thread):
""" Aggregator for collecting stats that are reported to Prometheus. """ def __init__(self, app):
def __init__(self, url=None, namespace=None): super(ThreadPusher, self).__init__()
self._metric_collectors = []
self._url = url
self._namespace = namespace or ''
if url is not None:
self._queue = Queue(QUEUE_MAX)
self._sender = _QueueSender(self._queue, url, self._metric_collectors)
self._sender.start()
logger.debug('Prometheus aggregator sending to %s', url)
else:
self._queue = None
logger.debug('Prometheus aggregator disabled')
def enqueue(self, call, data):
if not self._queue:
return
v = json.dumps({
'Call': call,
'Data': data,
})
if call == 'register':
self._metric_collectors.append(v)
return
try:
self._queue.put_nowait(v)
except Full:
# If the queue is full, it is because 1) no aggregator was enabled or 2)
# the aggregator is taking a long time to respond to requests. In the case
# of 1, it's probably enterprise mode and we don't care. In the case of 2,
# the response timeout error is printed inside the queue handler. In either case,
# we don't need to print an error here.
pass
def create_gauge(self, *args, **kwargs):
return self._create_collector('Gauge', args, kwargs)
def create_counter(self, *args, **kwargs):
return self._create_collector('Counter', args, kwargs)
def create_summary(self, *args, **kwargs):
return self._create_collector('Summary', args, kwargs)
def create_histogram(self, *args, **kwargs):
return self._create_collector('Histogram', args, kwargs)
def create_untyped(self, *args, **kwargs):
return self._create_collector('Untyped', args, kwargs)
def _create_collector(self, collector_type, args, kwargs):
kwargs['namespace'] = kwargs.get('namespace', self._namespace)
return _Collector(self.enqueue, collector_type, *args, **kwargs)
class _QueueSender(Thread):
""" Helper class which uses a thread to asynchronously send metrics to the local Prometheus
aggregator. """
def __init__(self, queue, url, metric_collectors):
Thread.__init__(self)
self.daemon = True self.daemon = True
self.next_register = datetime.datetime.now() self._app = app
self._queue = queue
self._url = url
self._metric_collectors = metric_collectors
def run(self): def run(self):
agg_url = self._app.config.get('PROMETHEUS_AGGREGATOR_URL')
while True: while True:
reqs = [] if agg_url is None:
reqs.append(self._queue.get()) # Practically disable this worker, if there is no aggregator.
time.sleep(ONE_DAY_IN_SECONDS)
continue
while len(reqs) < MAX_BATCH_SIZE: time.sleep(PROMETHEUS_PUSH_INTERVAL_SECONDS)
try: push_to_gateway(agg_url, job=self._app.config.get('PROMETHEUS_NAMESPACE', 'quay'),
req = self._queue.get_nowait() registry=REGISTRY)
reqs.append(req)
except Empty:
break
try:
resp = requests.post(self._url + '/call', '\n'.join(reqs))
if resp.status_code == 500 and self.next_register <= datetime.datetime.now():
resp = requests.post(self._url + '/call', '\n'.join(self._metric_collectors))
self.next_register = datetime.datetime.now() + REGISTER_WAIT
logger.debug('Register returned %s for %s metrics; setting next to %s', resp.status_code,
len(self._metric_collectors), self.next_register)
elif resp.status_code != 200:
logger.debug('Failed sending to prometheus: %s: %s: %s', resp.status_code, resp.text,
', '.join(reqs))
else:
logger.debug('Sent %d prometheus metrics', len(reqs))
except:
logger.exception('Failed to write to prometheus aggregator: %s', reqs)
class _Collector(object): def timed_blueprint(bp):
""" Collector for a Prometheus metric. """ """
def __init__(self, enqueue_method, collector_type, collector_name, collector_help, Decorates a blueprint to have its request duration tracked by Prometheus.
namespace='', subsystem='', **kwargs): """
self._enqueue_method = enqueue_method def _time_before_request():
self._base_args = { g._request_start_time = time.time()
'Name': collector_name, bp.before_request(_time_before_request)
'Namespace': namespace,
'Subsystem': subsystem,
'Type': collector_type,
}
registration_params = dict(kwargs)
registration_params.update(self._base_args)
registration_params['Help'] = collector_help
self._enqueue_method('register', registration_params)
def __getattr__(self, method):
def f(value=0, labelvalues=()):
data = dict(self._base_args)
data.update({
'Value': value,
'LabelValues': [str(i) for i in labelvalues],
'Method': method,
})
self._enqueue_method('put', data)
def _time_after_request():
def f(r):
start = getattr(g, '_request_start_time', None)
if start is None:
return r
dur = time.time() - start
request_duration.labels(request.method, request.endpoint, r.status_code).observe(dur)
return r
return f return f
bp.after_request(_time_after_request())
return bp
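A hedged sketch of how the rewritten plugin and timed_blueprint fit together in a minimal Flask app, wired up in the usual Flask-extension style (the gateway address is an example; the config keys are the ones read in run() above):

from flask import Blueprint, Flask, jsonify

from util.metrics.prometheus import PrometheusPlugin, timed_blueprint

app = Flask(__name__)
app.config['PROMETHEUS_AGGREGATOR_URL'] = 'localhost:9091'  # PushGateway address; None disables pushing
app.config['PROMETHEUS_NAMESPACE'] = 'quay'                 # used as the push job name

prometheus = PrometheusPlugin(app)  # starts the daemon ThreadPusher

bp = timed_blueprint(Blueprint('v2', __name__))  # requests land in quay_request_duration_seconds

@bp.route('/_ping')
def ping():
    return jsonify(ok=True)

app.register_blueprint(bp, url_prefix='/v2')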

View file

@ -1,58 +0,0 @@
import time
import pytest
from mock import Mock
from trollius import coroutine, Return, get_event_loop, From
from util.metrics.metricqueue import duration_collector_async
mock_histogram = Mock()
class NonReturn(Exception):
pass
@coroutine
@duration_collector_async(mock_histogram, labelvalues=["testlabel"])
def duration_decorated():
time.sleep(1)
raise Return("fin")
@coroutine
@duration_collector_async(mock_histogram, labelvalues=["testlabel"])
def duration_decorated_error():
raise NonReturn("not a Return error")
@coroutine
def calls_decorated():
yield From(duration_decorated())
def test_duration_decorator():
loop = get_event_loop()
loop.run_until_complete(duration_decorated())
assert mock_histogram.Observe.called
assert 1 - mock_histogram.Observe.call_args[0][0] < 1 # duration should be close to 1s
assert mock_histogram.Observe.call_args[1]["labelvalues"] == ["testlabel"]
def test_duration_decorator_error():
loop = get_event_loop()
mock_histogram.reset_mock()
with pytest.raises(NonReturn):
loop.run_until_complete(duration_decorated_error())
assert not mock_histogram.Observe.called
def test_duration_decorator_caller():
mock_histogram.reset_mock()
loop = get_event_loop()
loop.run_until_complete(calls_decorated())
assert mock_histogram.Observe.called
assert 1 - mock_histogram.Observe.call_args[0][0] < 1 # duration should be close to 1s
assert mock_histogram.Observe.call_args[1]["labelvalues"] == ["testlabel"]

View file

@ -7,9 +7,11 @@ import time
import sys import sys
import traceback import traceback
logger = multiprocessing.log_to_stderr() logger = multiprocessing.log_to_stderr()
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
class QueueProcess(object): class QueueProcess(object):
""" Helper class which invokes a worker in a process to produce """ Helper class which invokes a worker in a process to produce
data for one (or more) queues. data for one (or more) queues.

View file

@ -1,20 +1,23 @@
import logging import logging
import boto
import time import time
import random import random
from Queue import Empty from Queue import Empty
from threading import Thread from threading import Thread
import boto
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
MAX_BATCH_METRICS = 20 MAX_BATCH_METRICS = 20
# Sleep for this much time between failed send requests. # Sleep for this much time between failed send requests.
# This prevents hammering cloudwatch when it's not available. # This prevents hammering cloudwatch when it's not available.
FAILED_SEND_SLEEP_SECS = 15 FAILED_SEND_SLEEP_SECS = 15
def start_cloudwatch_sender(metrics, app): def start_cloudwatch_sender(metrics, app):
""" """
Starts sending from metrics to a new CloudWatchSender. Starts sending from metrics to a new CloudWatchSender.
@ -76,10 +79,9 @@ class CloudWatchSender(Thread):
except: except:
for i in range(len(metrics['name'])): for i in range(len(metrics['name'])):
self._metrics.put_deprecated(metrics['name'][i], metrics['value'][i], self._metrics.put_deprecated(metrics['name'][i], metrics['value'][i],
unit=metrics['unit'][i], unit=metrics['unit'][i],
dimensions=metrics['dimensions'][i], dimensions=metrics['dimensions'][i],
timestamp=metrics['timestamp'][i], timestamp=metrics['timestamp'][i])
)
logger.exception('Failed to write to CloudWatch: %s', metrics) logger.exception('Failed to write to CloudWatch: %s', metrics)
logger.debug('Attempted to requeue %d metrics.', len(metrics['name'])) logger.debug('Attempted to requeue %d metrics.', len(metrics['name']))

View file

@ -1,11 +1,23 @@
import time
import jwt
import logging import logging
import time
from functools import wraps
import jwt
from prometheus_client import Counter
from util.security import jwtutil from util.security import jwtutil
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
bearer_token_decoded = Counter('bearer_token_decoded_total',
'number of times a bearer token has been validated',
labelnames=['success'])
ANONYMOUS_SUB = '(anonymous)' ANONYMOUS_SUB = '(anonymous)'
ALGORITHM = 'RS256' ALGORITHM = 'RS256'
CLAIM_TUF_ROOTS = 'com.apostille.roots' CLAIM_TUF_ROOTS = 'com.apostille.roots'
@ -23,7 +35,7 @@ class InvalidBearerTokenException(Exception):
pass pass
def decode_bearer_header(bearer_header, instance_keys, config, metric_queue=None): def decode_bearer_header(bearer_header, instance_keys, config):
""" decode_bearer_header decodes the given bearer header that contains an encoded JWT with both """ decode_bearer_header decodes the given bearer header that contains an encoded JWT with both
a Key ID as well as the signed JWT and returns the decoded and validated JWT. On any error, a Key ID as well as the signed JWT and returns the decoded and validated JWT. On any error,
raises an InvalidBearerTokenException with the reason for failure. raises an InvalidBearerTokenException with the reason for failure.
@ -35,10 +47,30 @@ def decode_bearer_header(bearer_header, instance_keys, config, metric_queue=None
encoded_jwt = match.group(1) encoded_jwt = match.group(1)
logger.debug('encoded JWT: %s', encoded_jwt) logger.debug('encoded JWT: %s', encoded_jwt)
return decode_bearer_token(encoded_jwt, instance_keys, config, metric_queue=metric_queue) return decode_bearer_token(encoded_jwt, instance_keys, config)
def decode_bearer_token(bearer_token, instance_keys, config, metric_queue=None): def observe_decode():
"""
Decorates `decode_bearer_token` to record a metric in Prometheus: any exception raised is
recorded as a failure, and the return of a payload is recorded as a success.
"""
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
try:
rv = func(*args, **kwargs)
except Exception as e:
bearer_token_decoded.labels(False).inc()
raise e
bearer_token_decoded.labels(True).inc()
return rv
return wrapper
return decorator
@observe_decode()
def decode_bearer_token(bearer_token, instance_keys, config):
""" decode_bearer_token decodes the given bearer token that contains both a Key ID as well as the """ decode_bearer_token decodes the given bearer token that contains both a Key ID as well as the
encoded JWT and returns the decoded and validated JWT. On any error, raises an encoded JWT and returns the decoded and validated JWT. On any error, raises an
InvalidBearerTokenException with the reason for failure. InvalidBearerTokenException with the reason for failure.
@ -58,9 +90,6 @@ def decode_bearer_token(bearer_token, instance_keys, config, metric_queue=None):
# Find the matching public key. # Find the matching public key.
public_key = instance_keys.get_service_key_public_key(kid) public_key = instance_keys.get_service_key_public_key(kid)
if public_key is None: if public_key is None:
if metric_queue is not None:
metric_queue.invalid_instance_key_count.Inc(labelvalues=[kid])
logger.error('Could not find requested service key %s with encoded JWT: %s', kid, bearer_token) logger.error('Could not find requested service key %s with encoded JWT: %s', kid, bearer_token)
raise InvalidBearerTokenException('Unknown service key') raise InvalidBearerTokenException('Unknown service key')
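A hedged sketch of asserting on the new counter from a test; the module path and the instance_keys/app_config fixtures are assumptions, and only the metric name and label come from the code above:

import pytest
from prometheus_client import REGISTRY

from util.security.registry_jwt import decode_bearer_token  # assumed module path

def test_failed_decode_counts_as_failure(instance_keys, app_config):
    before = REGISTRY.get_sample_value('bearer_token_decoded_total', {'success': 'False'}) or 0.0
    # The observe_decode decorator records any raised exception as a failure.
    with pytest.raises(Exception):
        decode_bearer_token('not.a.jwt', instance_keys, app_config)
    after = REGISTRY.get_sample_value('bearer_token_decoded_total', {'success': 'False'})
    assert after == before + 1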

View file

@ -2,8 +2,8 @@ from datetime import datetime, timedelta
from data import model from data import model
from data.database import BlobUpload as BlobUploadTable from data.database import BlobUpload as BlobUploadTable
from workers.blobuploadcleanupworker.models_interface import ( from workers.blobuploadcleanupworker.models_interface import (BlobUpload,
BlobUpload, BlobUploadCleanupWorkerDataInterface) BlobUploadCleanupWorkerDataInterface)
class PreOCIModel(BlobUploadCleanupWorkerDataInterface): class PreOCIModel(BlobUploadCleanupWorkerDataInterface):

View file

@ -10,12 +10,14 @@ from util.streamingjsonencoder import StreamingJSONEncoder
from workers.buildlogsarchiver.models_pre_oci import pre_oci_model as model from workers.buildlogsarchiver.models_pre_oci import pre_oci_model as model
from workers.worker import Worker from workers.worker import Worker
POLL_PERIOD_SECONDS = 30
MEMORY_TEMPFILE_SIZE = 64 * 1024 # Large enough to handle approximately 99% of builds in memory
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
POLL_PERIOD_SECONDS = 30
MEMORY_TEMPFILE_SIZE = 64 * 1024 # Large enough to handle approximately 99% of builds in memory
class ArchiveBuildLogsWorker(Worker): class ArchiveBuildLogsWorker(Worker):
def __init__(self): def __init__(self):
super(ArchiveBuildLogsWorker, self).__init__() super(ArchiveBuildLogsWorker, self).__init__()

View file

@ -9,10 +9,13 @@ from workers.worker import Worker
from util.log import logfile_path from util.log import logfile_path
from util.timedeltastring import convert_to_timedelta from util.timedeltastring import convert_to_timedelta
POLL_PERIOD_SECONDS = 60 * 60 # 1 hour
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
POLL_PERIOD_SECONDS = 60 * 60 # 1 hour
class ExpiredAppSpecificTokenWorker(Worker): class ExpiredAppSpecificTokenWorker(Worker):
def __init__(self): def __init__(self):
super(ExpiredAppSpecificTokenWorker, self).__init__() super(ExpiredAppSpecificTokenWorker, self).__init__()
@ -38,7 +41,7 @@ if __name__ == "__main__":
logger.debug('App specific tokens disabled; skipping') logger.debug('App specific tokens disabled; skipping')
while True: while True:
time.sleep(100000) time.sleep(100000)
if app.config.get('EXPIRED_APP_SPECIFIC_TOKEN_GC') is None: if app.config.get('EXPIRED_APP_SPECIFIC_TOKEN_GC') is None:
logger.debug('GC of App specific tokens is disabled; skipping') logger.debug('GC of App specific tokens is disabled; skipping')
while True: while True:

View file

@ -15,10 +15,11 @@ from app import app, export_action_logs_queue, storage as app_storage, get_app_u
from endpoints.api import format_date from endpoints.api import format_date
from data.logs_model import logs_model from data.logs_model import logs_model
from data.logs_model.interface import LogsIterationTimeout from data.logs_model.interface import LogsIterationTimeout
from workers.queueworker import QueueWorker, JobException from workers.queueworker import QueueWorker
from util.log import logfile_path from util.log import logfile_path
from util.useremails import send_logs_exported_email from util.useremails import send_logs_exported_email
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)

View file

@ -9,8 +9,10 @@ from data.model.repository import find_repository_with_garbage, get_random_gc_po
from data.model.gc import garbage_collect_repo from data.model.gc import garbage_collect_repo
from workers.worker import Worker from workers.worker import Worker
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class GarbageCollectionWorker(Worker): class GarbageCollectionWorker(Worker):
def __init__(self): def __init__(self):
super(GarbageCollectionWorker, self).__init__() super(GarbageCollectionWorker, self).__init__()

View file

@@ -1,15 +1,25 @@
 import logging
 import time
-from app import app, metric_queue
+from prometheus_client import Gauge
+from app import app
 from data.database import UseThenDisconnect
-from workers.globalpromstats.models_pre_oci import pre_oci_model as model
 from util.locking import GlobalLock, LockNotAcquiredException
 from util.log import logfile_path
+from workers.globalpromstats.models_pre_oci import pre_oci_model as model
 from workers.worker import Worker
 logger = logging.getLogger(__name__)
+repository_rows = Gauge('quay_repository_rows', 'number of repositories in the database')
+user_rows = Gauge('quay_user_rows', 'number of users in the database')
+org_rows = Gauge('quay_org_rows', 'number of organizations in the database')
+robot_rows = Gauge('quay_robot_rows', 'number of robot accounts in the database')
 WORKER_FREQUENCY = app.config.get('GLOBAL_PROMETHEUS_STATS_FREQUENCY', 60 * 60)
@@ -33,13 +43,10 @@ class GlobalPrometheusStatsWorker(Worker):
   def _report_stats(self):
     logger.debug('Reporting global stats')
     with UseThenDisconnect(app.config):
-      # Repository count.
-      metric_queue.repository_count.Set(model.get_repository_count())
-      # User counts.
-      metric_queue.user_count.Set(model.get_active_user_count())
-      metric_queue.org_count.Set(model.get_active_org_count())
-      metric_queue.robot_count.Set(model.get_robot_count())
+      repository_rows.set(model.get_repository_count())
+      user_rows.set(model.get_active_user_count())
+      org_rows.set(model.get_active_org_count())
+      robot_rows.set(model.get_robot_count())
 def main():
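For readers unfamiliar with the prometheus_client API that replaces the metric_queue calls above: a Gauge is created once at module scope and overwritten with .set() on every report. A minimal, self-contained sketch of that pattern (metric and function names here are illustrative, not taken from Quay):

from prometheus_client import Gauge, generate_latest

# Created at import time and registered with the default registry.
example_rows = Gauge('example_table_rows', 'number of rows in an example table')

def report_stats(count_rows):
  # set() records an absolute value; unlike a Counter, a Gauge can go down.
  example_rows.set(count_rows())

if __name__ == '__main__':
  report_stats(lambda: 42)
  # Render the default registry in the Prometheus text exposition format.
  print(generate_latest().decode('utf-8'))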


@@ -1,15 +0,0 @@
-from mock import patch, Mock
-from workers.globalpromstats.globalpromstats import GlobalPrometheusStatsWorker
-from test.fixtures import *
-def test_reportstats(initialized_db):
-  mock = Mock()
-  with patch('workers.globalpromstats.globalpromstats.metric_queue', mock):
-    worker = GlobalPrometheusStatsWorker()
-    worker._report_stats()
-  mock.repository_count.Set.assert_called_once()
-  mock.org_count.Set.assert_called_once()
-  mock.robot_count.Set.assert_called_once()
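The deleted test worked by patching the metric_queue module object. If equivalent coverage were wanted for the Gauge-based worker, one possible sketch (untested, not part of this commit) reads the values back from the default registry:

from prometheus_client import REGISTRY

from workers.globalpromstats.globalpromstats import GlobalPrometheusStatsWorker
from test.fixtures import *

def test_reportstats(initialized_db):
  worker = GlobalPrometheusStatsWorker()
  worker._report_stats()

  # get_sample_value returns None if the metric was never set.
  assert REGISTRY.get_sample_value('quay_repository_rows') is not None
  assert REGISTRY.get_sample_value('quay_user_rows') is not None
  assert REGISTRY.get_sample_value('quay_robot_rows') is not None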


@@ -12,10 +12,13 @@ from workers.worker import Worker
 from util.log import logfile_path
 from util.migrate.allocator import yield_random_entries
 logger = logging.getLogger(__name__)
 WORKER_TIMEOUT = 600
 class LabelBackfillWorker(Worker):
   def __init__(self):
     super(LabelBackfillWorker, self).__init__()


@@ -16,8 +16,10 @@ from util.streamingjsonencoder import StreamingJSONEncoder
 from util.timedeltastring import convert_to_timedelta
 from workers.worker import Worker
 logger = logging.getLogger(__name__)
 JSON_MIMETYPE = 'application/json'
 MIN_LOGS_PER_ROTATION = 5000
 MEMORY_TEMPFILE_SIZE = 12 * 1024 * 1024


@@ -14,6 +14,7 @@ from test.fixtures import *
 from workers.notificationworker.models_pre_oci import pre_oci_model as model
 def test_basic_notification_endtoend(initialized_db):
   # Ensure the public user doesn't have any notifications.
   assert not model.user_has_local_notifications('public')


@@ -7,8 +7,10 @@ from app import app
 from data.database import CloseForLongOperation
 from workers.worker import Worker
 logger = logging.getLogger(__name__)
 class JobException(Exception):
   """ A job exception is an exception that is caused by something being malformed in the job. When
       a worker raises this exception the job will be terminated and the retry will not be returned


@@ -4,9 +4,11 @@ import traceback
 import fnmatch
 import logging.config
+from prometheus_client import Gauge
 import features
-from app import app, prometheus
+from app import app
 from data import database
 from data.model.repo_mirror import claim_mirror, release_mirror
 from data.logs_model import logs_model
@@ -16,12 +18,15 @@ from data.model.oci.tag import delete_tag, retarget_tag, lookup_alive_tags_shall
 from notifications import spawn_notification
 from util.audit import wrap_repository
 from workers.repomirrorworker.repo_mirror_model import repo_mirror_model as model
 logger = logging.getLogger(__name__)
-unmirrored_repositories_gauge = prometheus.create_gauge('unmirrored_repositories',
-                                                        'Number of repositories that need to be scanned.')
+unmirrored_repositories = Gauge('quay_repository_rows_unmirrored',
+                                'number of repositories in the database that have not yet been mirrored')
 class PreemptedException(Exception):
   """ Exception raised if another worker analyzed the image before this worker was able to do so.
@@ -61,7 +66,7 @@ def process_mirrors(skopeo, token=None):
     logger.exception('Repository Mirror service unavailable')
     return None
-  unmirrored_repositories_gauge.Set(num_remaining)
+  unmirrored_repositories.set(num_remaining)
   return next_token
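With the prometheus.create_gauge wrapper gone, unmirrored_repositories is a plain prometheus_client metric that registers itself in the default registry at import time. How the value is ultimately collected is outside this hunk; purely as an illustration (the port, job name, and metric name below are invented), prometheus_client can either serve the registry for scraping or push it to a gateway:

from prometheus_client import Gauge, REGISTRY, push_to_gateway, start_http_server

backlog = Gauge('example_unmirrored_repositories', 'repositories still waiting to be mirrored')

def poll_once(count_remaining):
  backlog.set(count_remaining())

if __name__ == '__main__':
  # Option 1: expose /metrics on this process for a Prometheus server to scrape.
  start_http_server(8000)

  # Option 2: push the current registry to a PushGateway after each poll.
  poll_once(lambda: 7)
  push_to_gateway('localhost:9091', job='example_worker', registry=REGISTRY)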


@@ -7,10 +7,13 @@ from data import model
 from data.logs_model import logs_model
 from workers.worker import Worker, with_exponential_backoff
-POLL_PERIOD_SECONDS = 10
 logger = logging.getLogger(__name__)
+POLL_PERIOD_SECONDS = 10
 class RepositoryActionCountWorker(Worker):
   def __init__(self):
     super(RepositoryActionCountWorker, self).__init__()


@@ -1,14 +1,19 @@
 import logging.config
+from prometheus_client import Gauge
 from app import app, prometheus
 from data.database import UseThenDisconnect
 from workers.securityworker.models_pre_oci import pre_oci_model as model
 from util.secscan.api import APIRequestFailure
 from util.secscan.analyzer import PreemptedException
 logger = logging.getLogger(__name__)
-unscanned_images_gauge = prometheus.create_gauge('unscanned_images',
-                                                 'Number of images that clair needs to scan.')
+unscanned_images = Gauge('quay_security_scanning_unscanned_images_remaining',
+                         'number of images that are not scanned by the latest security scanner')
 def index_images(target_version, analyzer, token=None):
@@ -31,6 +36,6 @@ def index_images(target_version, analyzer, token=None):
     logger.exception('Security scanner service unavailable')
     return
-  unscanned_images_gauge.Set(num_remaining)
+  unscanned_images.set(num_remaining)
   return next_token


@@ -11,8 +11,10 @@ from util.secscan.analyzer import LayerAnalyzer
 from util.log import logfile_path
 from endpoints.v2 import v2_bp
 logger = logging.getLogger(__name__)
 DEFAULT_INDEXING_INTERVAL = 30


@@ -1,13 +1,21 @@
 import logging
 from datetime import datetime, timedelta
-from app import app, instance_keys, metric_queue
+from prometheus_client import Counter
+from app import app, instance_keys
 from workers.servicekeyworker.models_pre_oci import pre_oci_model as model
 from workers.worker import Worker
 logger = logging.getLogger(__name__)
+instance_key_renewal_self = Counter('quay_instance_key_renewal_self_total',
+                                    'number of times a Quay instance renews its own key',
+                                    labelnames=['success'])
 class ServiceKeyWorker(Worker):
   def __init__(self):
     super(ServiceKeyWorker, self).__init__()
@@ -28,12 +36,12 @@ class ServiceKeyWorker(Worker):
     except Exception as ex:
       logger.exception('Failure for automatic refresh of service key %s with new expiration %s',
                        instance_keys.local_key_id, new_expiration)
-      metric_queue.instance_key_renewal_failure.Inc(labelvalues=[instance_keys.local_key_id])
+      instance_key_renewal_self.labels(False).inc()
       raise ex
     logger.debug('Finished automatic refresh of service key %s with new expiration %s',
                  instance_keys.local_key_id, new_expiration)
-    metric_queue.instance_key_renewal_success.Inc(labelvalues=[instance_keys.local_key_id])
+    instance_key_renewal_self.labels(True).inc()
 if __name__ == "__main__":
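The separate success/failure metric_queue counters collapse into one Counter with a 'success' label; .labels(False) and .labels(True) select (and lazily create) the two time series, with label values coerced to strings. A short sketch of the pattern (the metric name and helper below are invented for illustration):

from prometheus_client import Counter

renewals = Counter('example_key_renewal_total',
                   'number of attempted key renewals',
                   labelnames=['success'])

def renew(do_renew):
  try:
    do_renew()
  except Exception:
    # Becomes the series example_key_renewal_total{success="False"}.
    renewals.labels(False).inc()
    raise
  # Becomes the series example_key_renewal_total{success="True"}.
  renewals.labels(True).inc()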


@@ -9,11 +9,14 @@ from data import model
 from workers.queueworker import QueueWorker, WorkerUnhealthyException, JobException
 from util.log import logfile_path
 logger = logging.getLogger(__name__)
 POLL_PERIOD_SECONDS = 10
 RESERVATION_SECONDS = app.config.get('STORAGE_REPLICATION_PROCESSING_SECONDS', 60*20)
 class StorageReplicationWorker(QueueWorker):
   def process_queue_item(self, job_details):
     storage_uuid = job_details['storage_id']


@@ -26,8 +26,10 @@ from util.bytes import Bytes
 from util.log import logfile_path
 from util.migrate.allocator import yield_random_entries
 logger = logging.getLogger(__name__)
 WORKER_TIMEOUT = app.config.get('BACKFILL_TAGS_TIMEOUT', 6000)


@@ -9,11 +9,14 @@ from workers.worker import Worker
 from util.timedeltastring import convert_to_timedelta
 from util.log import logfile_path
 logger = logging.getLogger(__name__)
 WORKER_FREQUENCY = app.config.get('TEAM_SYNC_WORKER_FREQUENCY', 60)
 STALE_CUTOFF = convert_to_timedelta(app.config.get('TEAM_RESYNC_STALE_TIME', '30m'))
 class TeamSynchronizationWorker(Worker):
   """ Worker which synchronizes teams with their backing groups in LDAP/Keystone/etc.
   """


@@ -16,12 +16,13 @@ from workers.exportactionlogsworker import ExportActionLogsWorker, POLL_PERIOD_S
 from test.fixtures import *
 _TEST_CONTENT = os.urandom(1024)
 _TEST_BUCKET = 'some_bucket'
 _TEST_USER = 'someuser'
 _TEST_PASSWORD = 'somepassword'
 _TEST_PATH = 'some/cool/path'
-_TEST_CONTEXT = StorageContext('nyc', None, None, None, None)
+_TEST_CONTEXT = StorageContext('nyc', None, None, None)
 @pytest.fixture(params=['test', 'mock_s3'])


@@ -1,4 +1,5 @@
 import os.path
 from datetime import datetime, timedelta
 from app import storage