From 400a5db719d4e7e3343aebebf9b43dabdc34b4c3 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Mon, 27 Nov 2017 11:52:37 +0200 Subject: [PATCH] Add additional metrics on executor start and failure This will allow us to register a pager if one of the executors starts failing consistently --- buildman/manager/ephemeral.py | 13 +++++++++++++ util/metrics/metricqueue.py | 6 ++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 295ff06dd..f5e74706d 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -496,9 +496,22 @@ class EphemeralBuilderManager(BaseManager): try: execution_id = yield From(executor.start_builder(realm, token, build_uuid)) except: + try: + metric_queue.build_start_failure.Inc(labelvalues=[executor.name]) + metric_queue.put_deprecated(('ExecutorFailure-%s' % executor.name), 1, unit='Count') + except: + logger.exception('Exception when writing failure metric for execution %s for job %s', + execution_id, build_uuid) + logger.exception('Exception when starting builder for job: %s', build_uuid) continue + try: + metric_queue.build_start_success.Inc(labelvalues=[executor.name]) + except: + logger.exception('Exception when writing success metric for execution %s for job %s', + execution_id, build_uuid) + try: metric_queue.ephemeral_build_workers.Inc() except: diff --git a/util/metrics/metricqueue.py b/util/metrics/metricqueue.py index ca11c256b..ec90867a4 100644 --- a/util/metrics/metricqueue.py +++ b/util/metrics/metricqueue.py @@ -47,8 +47,10 @@ class MetricQueue(object): 'Time from triggering to actually starting a build', labelnames=['builder_type'], buckets=BUILDER_START_TIME_BUCKETS) - self.build_time = prom.create_histogram('build_time', 'Time spent buildig', labelnames=['builder_type']) - self.builder_fallback = prom.create_counter('builder_fallback', 'Builder fell back to ec2 executor') + self.build_time = prom.create_histogram('build_time', 'Time spent building', labelnames=['builder_type']) + self.builder_fallback = prom.create_counter('builder_fallback', 'Builder fell back to secondary executor') + self.build_start_success = prom.create_counter('build_start_success', 'Executor succeeded in starting a build', labelnames=['builder_type']) + self.build_start_failure = prom.create_counter('build_start_failure', 'Executor failed to start a build', labelnames=['builder_type']) self.percent_building = prom.create_gauge('build_percent_building', 'Percent building.') self.build_counter = prom.create_counter('builds', 'Number of builds', labelnames=['name']) self.ephemeral_build_workers = prom.create_counter('ephemeral_build_workers',