From b407f88a265bc153c7c2de9a125d05f5f0a56393 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 1 Feb 2017 13:08:21 -0500 Subject: [PATCH 1/2] Remove unnecessary CloudWatch metrics They are spamming the API and costing us a lot of money --- buildman/manager/ephemeral.py | 1 - buildman/manager/executor.py | 1 - data/queue.py | 14 -------------- storage/cloud.py | 3 --- util/metrics/metricqueue.py | 8 +++----- 5 files changed, 3 insertions(+), 24 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 3892e4bba..b5b8cf608 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -474,7 +474,6 @@ class EphemeralBuilderManager(BaseManager): continue try: - metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count') metric_queue.ephemeral_build_workers.Inc() except: logger.exception('Exception when writing start metrics for execution %s for job %s', diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index f09ce8d69..202cf0921 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -196,7 +196,6 @@ class EC2Executor(BuilderExecutor): )) except boto.exception.EC2ResponseError as ec2e: logger.exception('Unable to spawn builder instance') - metric_queue.put_deprecated('EC2BuildStartFailure', 1, unit='Count') metric_queue.ephemeral_build_worker_failure.Inc() raise ec2e diff --git a/data/queue.py b/data/queue.py index af83fb806..4e84c6738 100644 --- a/data/queue.py +++ b/data/queue.py @@ -23,7 +23,6 @@ class BuildMetricQueueReporter(object): self._metric_queue.build_capacity_shortage.Set(need_capacity_count) building_percent = 100 if currently_processing else 0 - self._metric_queue.put_deprecated('PercentBuilding', building_percent, unit='Percent') self._metric_queue.percent_building.Set(building_percent) @@ -123,11 +122,6 @@ class WorkQueue(object): if self._metric_queue: dim = {'queue': self._queue_name} - self._metric_queue.put_deprecated('Running', running_count, dimensions=dim) - self._metric_queue.put_deprecated('AvailableNotRunning', available_not_running_count, - dimensions=dim) - self._metric_queue.put_deprecated('Available', available_count, dimensions=dim) - self._metric_queue.work_queue_running.Set(running_count, labelvalues=[self._queue_name]) self._metric_queue.work_queue_available.Set(available_count, labelvalues=[self._queue_name]) @@ -197,10 +191,6 @@ class WorkQueue(object): QueueItem.insert_many(remaining[0:batch_size]).execute() remaining = remaining[batch_size:] - if self._metric_queue: - self._metric_queue.put_deprecated('Added', len(items_to_insert), - dimensions={'queue': self._queue_name}) - def put(self, canonical_name_list, message, available_after=0, retries_remaining=5): """ Put an item, if it shouldn't be processed for some number of seconds, @@ -208,10 +198,6 @@ class WorkQueue(object): """ item = QueueItem.create(**self._queue_dict(canonical_name_list, message, available_after, retries_remaining)) - - if self._metric_queue: - self._metric_queue.put_deprecated('Added', 1, dimensions={'queue': self._queue_name}) - return str(item.id) def _select_available_item(self, ordering_required, now): diff --git a/storage/cloud.py b/storage/cloud.py index 075d76f9a..9d0ae01d5 100644 --- a/storage/cloud.py +++ b/storage/cloud.py @@ -167,7 +167,6 @@ class _CloudStorage(BaseStorageV2): metadata['Content-Encoding'] = content_encoding if self._context.metric_queue is not None: - self._context.metric_queue.put_deprecated('MultipartUploadStart', 1) self._context.metric_queue.multipart_upload_start.Inc() return self._cloud_bucket.initiate_multipart_upload(path, metadata=metadata, @@ -208,7 +207,6 @@ class _CloudStorage(BaseStorageV2): write_error = e if self._context.metric_queue is not None: - self._context.metric_queue.put_deprecated('MultipartUploadFailure', 1) self._context.metric_queue.multipart_upload_end.Inc(labelvalues=['failure']) if cancel_on_error: @@ -219,7 +217,6 @@ class _CloudStorage(BaseStorageV2): if total_bytes_written > 0: if self._context.metric_queue is not None: - self._context.metric_queue.put_deprecated('MultipartUploadSuccess', 1) self._context.metric_queue.multipart_upload_end.Inc(labelvalues=['success']) self._perform_action_with_retry(mp.complete_upload) diff --git a/util/metrics/metricqueue.py b/util/metrics/metricqueue.py index 5dba0ea97..8ed541a0d 100644 --- a/util/metrics/metricqueue.py +++ b/util/metrics/metricqueue.py @@ -31,6 +31,8 @@ class MetricQueue(object): labelnames=['endpoint', 'code']) self.non_200 = prom.create_counter('response_non200', 'Non-200 HTTP response codes', labelnames=['endpoint']) + self.error_500 = prom.create_counter('response_500', '5XX HTTP response codes', + labelnames=['endpoint']) self.multipart_upload_start = prom.create_counter('multipart_upload_start', 'Multipart upload started') self.multipart_upload_end = prom.create_counter('multipart_upload_end', @@ -165,16 +167,12 @@ def _time_after_request(name, metric_queue): dur = time.time() - start dims = {'endpoint': request.endpoint} - metric_queue.put_deprecated('ResponseTime', dur, dimensions=dims, unit='Seconds') - metric_queue.put_deprecated('ResponseCode', r.status_code, dimensions=dims) - metric_queue.resp_time.Observe(dur, labelvalues=[request.endpoint]) metric_queue.resp_code.Inc(labelvalues=[request.endpoint, r.status_code]) if r.status_code >= 500: - metric_queue.put_deprecated('5XXResponse', 1, dimensions={'name': name}) + metric_queue.error_500.Inc(labelvalues=[request.endpoint]) elif r.status_code < 200 or r.status_code >= 300: - metric_queue.put_deprecated('Non200Response', 1, dimensions={'name': name}) metric_queue.non_200.Inc(labelvalues=[request.endpoint]) return r From c9bb1323390501ddf17208b694368fa5336daac0 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 1 Feb 2017 13:09:00 -0500 Subject: [PATCH 2/2] Increase cloudwatch send timeout to reduce how often we hit the API --- util/saas/cloudwatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/saas/cloudwatch.py b/util/saas/cloudwatch.py index 78e4abc11..c38df655a 100644 --- a/util/saas/cloudwatch.py +++ b/util/saas/cloudwatch.py @@ -13,7 +13,7 @@ MAX_BATCH_METRICS = 20 # Sleep for this much time between failed send requests. # This prevents hammering cloudwatch when it's not available. -FAILED_SEND_SLEEP_SECS = 5 +FAILED_SEND_SLEEP_SECS = 15 def start_cloudwatch_sender(metrics, app): """