Merge pull request #2915 from coreos-inc/joseph.schorr/QS-41/build-man-alarms

Add additional metrics on executor start and failure
This commit is contained in:
josephschorr 2017-11-27 18:14:19 +02:00 committed by GitHub
commit 773ea9fc65
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 17 additions and 2 deletions

View file

@ -496,9 +496,22 @@ class EphemeralBuilderManager(BaseManager):
try:
execution_id = yield From(executor.start_builder(realm, token, build_uuid))
except:
try:
metric_queue.build_start_failure.Inc(labelvalues=[executor.name])
metric_queue.put_deprecated(('ExecutorFailure-%s' % executor.name), 1, unit='Count')
except:
logger.exception('Exception when writing failure metric for execution %s for job %s',
execution_id, build_uuid)
logger.exception('Exception when starting builder for job: %s', build_uuid)
continue
try:
metric_queue.build_start_success.Inc(labelvalues=[executor.name])
except:
logger.exception('Exception when writing success metric for execution %s for job %s',
execution_id, build_uuid)
try:
metric_queue.ephemeral_build_workers.Inc()
except:

View file

@ -47,8 +47,10 @@ class MetricQueue(object):
'Time from triggering to actually starting a build',
labelnames=['builder_type'],
buckets=BUILDER_START_TIME_BUCKETS)
self.build_time = prom.create_histogram('build_time', 'Time spent buildig', labelnames=['builder_type'])
self.builder_fallback = prom.create_counter('builder_fallback', 'Builder fell back to ec2 executor')
self.build_time = prom.create_histogram('build_time', 'Time spent building', labelnames=['builder_type'])
self.builder_fallback = prom.create_counter('builder_fallback', 'Builder fell back to secondary executor')
self.build_start_success = prom.create_counter('build_start_success', 'Executor succeeded in starting a build', labelnames=['builder_type'])
self.build_start_failure = prom.create_counter('build_start_failure', 'Executor failed to start a build', labelnames=['builder_type'])
self.percent_building = prom.create_gauge('build_percent_building', 'Percent building.')
self.build_counter = prom.create_counter('builds', 'Number of builds', labelnames=['name'])
self.ephemeral_build_workers = prom.create_counter('ephemeral_build_workers',