Add duration metric collector decorator (#1885)
Track time-to-start for builders
Track time-to-build for builders
Track ec2 builder fallbacks
Track build time
parent 85d611e2fb
commit 832ee89923
5 changed files with 110 additions and 5 deletions
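
Before the file-by-file diff, a quick orientation sketch: the new duration_collector_async decorator (added to util/metrics/metricqueue.py below) wraps an executor's start_builder coroutine and reports its elapsed time to a Prometheus histogram when the coroutine completes by raising trollius's Return. A minimal sketch of the pattern using names from this diff; the ExampleExecutor class and the 'example' label value are illustrative only, not part of the change:

from trollius import coroutine, Return
from app import metric_queue
from util.metrics.metricqueue import duration_collector_async

class ExampleExecutor(object):
  @coroutine
  @duration_collector_async(metric_queue.builder_time_to_start, ['example'])
  def start_builder(self, realm, token, build_uuid):
    # Provisioning work would go here; raising Return ends the coroutine
    # and triggers a single Observe() call with the elapsed time.
    raise Return('builder-id')
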
@@ -4,6 +4,7 @@ import uuid
import calendar
import os.path
import json
import time

from collections import namedtuple
from datetime import datetime, timedelta

@@ -385,7 +386,7 @@ class EphemeralBuilderManager(BaseManager):
    build_uuid = build_job.job_details['build_uuid']
    logger.debug('Calling schedule with job: %s', build_uuid)

    # Check if there are worker slots avialable by checking the number of jobs in etcd
    # Check if there are worker slots available by checking the number of jobs in etcd
    allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
    try:
      active_jobs = yield From(self._etcd_client.read(self._etcd_job_prefix, recursive=True))

@@ -450,6 +451,7 @@ class EphemeralBuilderManager(BaseManager):

      # Check if we can use this executor based on the retries remaining.
      if executor.minimum_retry_threshold > build_job.retries_remaining:
        metric_queue.builder_fallback.Inc()
        logger.debug('Job %s cannot use executor %s as it is below retry threshold %s (retry #%s)',
                     build_uuid, executor.name, executor.minimum_retry_threshold,
                     build_job.retries_remaining)

@@ -499,6 +501,7 @@ class EphemeralBuilderManager(BaseManager):
      'execution_id': execution_id,
      'executor_name': started_with_executor.name,
      'job_queue_item': build_job.job_item,
      'start_time': time.time(),
    })

    try:

@@ -534,6 +537,14 @@ class EphemeralBuilderManager(BaseManager):
    logger.debug('Sending build %s to newly ready component on realm %s',
                 job.build_uuid, build_component.builder_realm)
    yield From(build_component.start_build(job))

    try:
      # log start time to prometheus
      realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
      start_time = json.loads(realm_data.value)['start_time']
      metric_queue.builder_time_to_build(time.time() - start_time, labelvalues=[realm_data.executor_name])
    except (KeyError, etcd.EtcdKeyError):
      logger.warning('Could not read realm key %s', build_component.builder_realm)

    try:
      # Clean up the bookkeeping for allowing any manager to take the job.

@@ -556,8 +567,16 @@ class EphemeralBuilderManager(BaseManager):

    self.job_complete_callback(build_job, job_status, executor_name)

    # Kill the ephmeral builder.
    # Kill the ephemeral builder.
    yield From(self.kill_builder_executor(build_job.build_uuid))

    try:
      # log build time to prometheus
      realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
      start_time = json.loads(realm_data.value)['start_time']
      metric_queue.build_time(time.time() - start_time, labelvalues=[realm_data.executor_name])
    except (KeyError, etcd.EtcdKeyError):
      logger.warning('Could not read realm key %s', build_component.builder_realm)

    # Delete the build job from etcd.
    job_key = self._etcd_job_key(build_job)

@@ -17,6 +17,7 @@ from functools import partial
from buildman.asyncutil import AsyncWrapper
from container_cloud_config import CloudConfigContext
from app import metric_queue, app
from util.metrics.metricqueue import duration_collector_async


logger = logging.getLogger(__name__)

@@ -131,6 +132,7 @@ class EC2Executor(BuilderExecutor):
    return stack_amis[ec2_region]

  @coroutine
  @duration_collector_async(metric_queue.builder_time_to_start, ['ec2'])
  def start_builder(self, realm, token, build_uuid):
    region = self.executor_config['EC2_REGION']
    channel = self.executor_config.get('COREOS_CHANNEL', 'stable')

@@ -240,6 +242,7 @@ class PopenExecutor(BuilderExecutor):
  """ Executor which uses Popen to fork a quay-builder process.
  """
  @coroutine
  @duration_collector_async(metric_queue.builder_time_to_start, ['fork'])
  def start_builder(self, realm, token, build_uuid):
    # Now start a machine for this job, adding the machine id to the etcd information
    logger.debug('Forking process for build')

@@ -391,6 +394,7 @@ class KubernetesExecutor(BuilderExecutor):
    }

  @coroutine
  @duration_collector_async(metric_queue.builder_time_to_start, ['k8s'])
  def start_builder(self, realm, token, build_uuid):
    # generate resource
    channel = self.executor_config.get('COREOS_CHANNEL', 'stable')

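The literal label values passed to the decorator above ('ec2', 'fork', 'k8s') fill the builder_type label declared on the builder_tts histogram later in this diff (util/metrics/metricqueue.py). As a rough sketch of the net effect once a decorated start_builder completes via Return, assuming an illustrative elapsed time of 42.0 seconds:

from app import metric_queue

# Equivalent effect of @duration_collector_async(metric_queue.builder_time_to_start, ['ec2'])
# after start_builder finishes; the 42.0 value stands in for the measured duration.
metric_queue.builder_time_to_start.Observe(42.0, labelvalues=['ec2'])
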
@@ -13,7 +13,8 @@ from buildman.manager.ephemeral import (EphemeralBuilderManager, EtcdAction,
                                        ETCD_MAX_WATCH_TIMEOUT)
from buildman.component.buildcomponent import BuildComponent
from buildman.server import BuildJobResult

from util.metrics.metricqueue import duration_collector_async
from app import metric_queue

BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead'
REALM_ID = '1234-realm'

@@ -33,6 +34,7 @@ class TestExecutor(BuilderExecutor):
  job_stopped = None

  @coroutine
  @duration_collector_async(metric_queue.builder_time_to_start, labelvalues=["testlabel"])
  def start_builder(self, realm, token, build_uuid):
    self.job_started = str(uuid.uuid4())
    raise Return(self.job_started)

@@ -45,6 +47,7 @@ class TestExecutor(BuilderExecutor):

class BadExecutor(BuilderExecutor):
  @coroutine
  @duration_collector_async(metric_queue.builder_time_to_start, labelvalues=["testlabel"])
  def start_builder(self, realm, token, build_uuid):
    raise ExecutorException('raised on purpose!')


@@ -210,6 +213,7 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
    # Take the job ourselves
    yield From(self.manager.build_component_ready(test_component))

    self.etcd_client_mock.read.assert_called_with(os.path.join('realm/', REALM_ID))
    self.etcd_client_mock.delete.assert_called_once_with(os.path.join('realm/', REALM_ID))
    self.etcd_client_mock.delete.reset_mock()


@@ -743,4 +747,3 @@ class TestEphemeral(EphemeralBuilderTestCase):

if __name__ == '__main__':
  unittest.main()

test/test_metricqueue.py (new file, 50 lines)
@@ -0,0 +1,50 @@
import time
import unittest

from mock import Mock
from trollius import coroutine, Return, get_event_loop

from util.metrics.metricqueue import duration_collector_async


mock_histogram = Mock()

class NonReturn(Exception):
  pass


@coroutine
@duration_collector_async(mock_histogram, labelvalues=["testlabel"])
def duration_decorated():
  time.sleep(1)
  raise Return("fin")


@coroutine
@duration_collector_async(mock_histogram, labelvalues=["testlabel"])
def duration_decorated_error():
  raise NonReturn("not a Return error")


class DurationDecoratorTestCase(unittest.TestCase):
  def __init__(self, *args, **kwargs):
    self.etcd_client_mock = None
    self.loop = get_event_loop()
    super(DurationDecoratorTestCase, self).__init__(*args, **kwargs)

  def test_duration_decorator(self):
    self.loop.run_until_complete(duration_decorated())
    assert mock_histogram.Observe.called
    assert 1 - mock_histogram.Observe.call_args[0][0] < 1  # duration should be close to 1s
    assert mock_histogram.Observe.call_args[1]["labelvalues"] == ["testlabel"]

  def test_duration_decorator_error(self):
    mock_histogram.reset_mock()

    with self.assertRaises(NonReturn):
      self.loop.run_until_complete(duration_decorated_error())
    assert not mock_histogram.Observe.called


if __name__ == '__main__':
  unittest.main()

@@ -6,6 +6,7 @@ from functools import wraps
from Queue import Queue, Full

from flask import g, request
from trollius import Return


logger = logging.getLogger(__name__)

@@ -13,6 +14,9 @@ logger = logging.getLogger(__name__)
# Buckets for the API response times.
API_RESPONSE_TIME_BUCKETS = [.01, .025, .05, .1, .25, .5, 1.0, 2.5, 5.0]

# Buckets for the builder start times.
BUILDER_START_TIME_BUCKETS = [.5, 1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 180.0, 240.0, 300.0, 600.0]


class MetricQueue(object):
  """ Object to which various metrics are written, for distribution to metrics collection

@@ -33,6 +37,16 @@ class MetricQueue(object):
                                                    'Multipart upload ends.', labelnames=['type'])
    self.build_capacity_shortage = prom.create_gauge('build_capacity_shortage',
                                                     'Build capacity shortage.')
    self.builder_time_to_start = prom.create_histogram('builder_tts',
                                                       'Time from triggering to starting a builder.',
                                                       labelnames=['builder_type'],
                                                       buckets=BUILDER_START_TIME_BUCKETS)
    self.builder_time_to_build = prom.create_histogram('builder_ttb',
                                                       'Time from triggering to actually starting a build',
                                                       labelnames=['builder_type'],
                                                       buckets=BUILDER_START_TIME_BUCKETS)
    self.build_time = prom.create_histogram('build_time', 'Time spent buildig', labelnames=['builder_type'])
    self.builder_fallback = prom.create_counter('builder_fallback', 'Builder fell back to ec2 executor')
    self.percent_building = prom.create_gauge('build_percent_building', 'Percent building.')
    self.build_counter = prom.create_counter('builds', 'Number of builds', labelnames=['name'])
    self.ephemeral_build_workers = prom.create_counter('ephemeral_build_workers',

@@ -88,6 +102,22 @@ class MetricQueue(object):
    return self._queue.get_nowait()


def duration_collector_async(metric, labelvalues):
  """ Decorates a method to have its duration time logged to the metric. """
  def decorator(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
      trigger_time = time.time()
      try:
        rv = func(*args, **kwargs)
      except Return as e:
        metric.Observe(time.time() - trigger_time, labelvalues=labelvalues)
        raise e
      return rv
    return wrapper
  return decorator


def time_decorator(name, metric_queue):
  """ Decorates an endpoint method to have its request time logged to the metrics queue. """
  after = _time_after_request(name, metric_queue)

@@ -135,4 +165,3 @@ def _time_after_request(name, metric_queue):

    return r
  return f

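A behavior note on duration_collector_async, as implemented above and exercised by test/test_metricqueue.py: a histogram sample is recorded only when the wrapped callable finishes by raising trollius's Return; any other exception propagates without an Observe() call. A minimal hand-rolled sketch of the same timing logic (the timed_call helper is illustrative, not part of this change):

import time
from trollius import Return

def timed_call(histogram, labelvalues, work):
  # Mirrors the wrapper in duration_collector_async: time the call and record
  # a sample only when the callable signals completion by raising Return.
  trigger_time = time.time()
  try:
    return work()
  except Return:
    histogram.Observe(time.time() - trigger_time, labelvalues=labelvalues)
    raise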