Merge pull request #2325 from coreos-inc/reduce-cloudwatch
Reduce cloudwatch costs and error messaging
This commit is contained in:
commit
3bb4946e63
6 changed files with 4 additions and 25 deletions
|
@@ -474,7 +474,6 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count')
|
|
||||||
metric_queue.ephemeral_build_workers.Inc()
|
metric_queue.ephemeral_build_workers.Inc()
|
||||||
except:
|
except:
|
||||||
logger.exception('Exception when writing start metrics for execution %s for job %s',
|
logger.exception('Exception when writing start metrics for execution %s for job %s',
|
||||||
|
|
|
@@ -196,7 +196,6 @@ class EC2Executor(BuilderExecutor):
|
||||||
))
|
))
|
||||||
except boto.exception.EC2ResponseError as ec2e:
|
except boto.exception.EC2ResponseError as ec2e:
|
||||||
logger.exception('Unable to spawn builder instance')
|
logger.exception('Unable to spawn builder instance')
|
||||||
metric_queue.put_deprecated('EC2BuildStartFailure', 1, unit='Count')
|
|
||||||
metric_queue.ephemeral_build_worker_failure.Inc()
|
metric_queue.ephemeral_build_worker_failure.Inc()
|
||||||
raise ec2e
|
raise ec2e
|
||||||
|
|
||||||
|
|
|
@@ -23,7 +23,6 @@ class BuildMetricQueueReporter(object):
|
||||||
self._metric_queue.build_capacity_shortage.Set(need_capacity_count)
|
self._metric_queue.build_capacity_shortage.Set(need_capacity_count)
|
||||||
|
|
||||||
building_percent = 100 if currently_processing else 0
|
building_percent = 100 if currently_processing else 0
|
||||||
self._metric_queue.put_deprecated('PercentBuilding', building_percent, unit='Percent')
|
|
||||||
self._metric_queue.percent_building.Set(building_percent)
|
self._metric_queue.percent_building.Set(building_percent)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -123,11 +122,6 @@ class WorkQueue(object):
|
||||||
|
|
||||||
if self._metric_queue:
|
if self._metric_queue:
|
||||||
dim = {'queue': self._queue_name}
|
dim = {'queue': self._queue_name}
|
||||||
self._metric_queue.put_deprecated('Running', running_count, dimensions=dim)
|
|
||||||
self._metric_queue.put_deprecated('AvailableNotRunning', available_not_running_count,
|
|
||||||
dimensions=dim)
|
|
||||||
self._metric_queue.put_deprecated('Available', available_count, dimensions=dim)
|
|
||||||
|
|
||||||
self._metric_queue.work_queue_running.Set(running_count, labelvalues=[self._queue_name])
|
self._metric_queue.work_queue_running.Set(running_count, labelvalues=[self._queue_name])
|
||||||
self._metric_queue.work_queue_available.Set(available_count, labelvalues=[self._queue_name])
|
self._metric_queue.work_queue_available.Set(available_count, labelvalues=[self._queue_name])
|
||||||
|
|
||||||
|
@@ -197,10 +191,6 @@ class WorkQueue(object):
|
||||||
QueueItem.insert_many(remaining[0:batch_size]).execute()
|
QueueItem.insert_many(remaining[0:batch_size]).execute()
|
||||||
remaining = remaining[batch_size:]
|
remaining = remaining[batch_size:]
|
||||||
|
|
||||||
if self._metric_queue:
|
|
||||||
self._metric_queue.put_deprecated('Added', len(items_to_insert),
|
|
||||||
dimensions={'queue': self._queue_name})
|
|
||||||
|
|
||||||
def put(self, canonical_name_list, message, available_after=0, retries_remaining=5):
|
def put(self, canonical_name_list, message, available_after=0, retries_remaining=5):
|
||||||
"""
|
"""
|
||||||
Put an item, if it shouldn't be processed for some number of seconds,
|
Put an item, if it shouldn't be processed for some number of seconds,
|
||||||
|
@@ -208,10 +198,6 @@ class WorkQueue(object):
|
||||||
"""
|
"""
|
||||||
item = QueueItem.create(**self._queue_dict(canonical_name_list, message, available_after,
|
item = QueueItem.create(**self._queue_dict(canonical_name_list, message, available_after,
|
||||||
retries_remaining))
|
retries_remaining))
|
||||||
|
|
||||||
if self._metric_queue:
|
|
||||||
self._metric_queue.put_deprecated('Added', 1, dimensions={'queue': self._queue_name})
|
|
||||||
|
|
||||||
return str(item.id)
|
return str(item.id)
|
||||||
|
|
||||||
def _select_available_item(self, ordering_required, now):
|
def _select_available_item(self, ordering_required, now):
|
||||||
|
|
|
@@ -167,7 +167,6 @@ class _CloudStorage(BaseStorageV2):
|
||||||
metadata['Content-Encoding'] = content_encoding
|
metadata['Content-Encoding'] = content_encoding
|
||||||
|
|
||||||
if self._context.metric_queue is not None:
|
if self._context.metric_queue is not None:
|
||||||
self._context.metric_queue.put_deprecated('MultipartUploadStart', 1)
|
|
||||||
self._context.metric_queue.multipart_upload_start.Inc()
|
self._context.metric_queue.multipart_upload_start.Inc()
|
||||||
|
|
||||||
return self._cloud_bucket.initiate_multipart_upload(path, metadata=metadata,
|
return self._cloud_bucket.initiate_multipart_upload(path, metadata=metadata,
|
||||||
|
@@ -208,7 +207,6 @@ class _CloudStorage(BaseStorageV2):
|
||||||
write_error = e
|
write_error = e
|
||||||
|
|
||||||
if self._context.metric_queue is not None:
|
if self._context.metric_queue is not None:
|
||||||
self._context.metric_queue.put_deprecated('MultipartUploadFailure', 1)
|
|
||||||
self._context.metric_queue.multipart_upload_end.Inc(labelvalues=['failure'])
|
self._context.metric_queue.multipart_upload_end.Inc(labelvalues=['failure'])
|
||||||
|
|
||||||
if cancel_on_error:
|
if cancel_on_error:
|
||||||
|
@@ -219,7 +217,6 @@ class _CloudStorage(BaseStorageV2):
|
||||||
|
|
||||||
if total_bytes_written > 0:
|
if total_bytes_written > 0:
|
||||||
if self._context.metric_queue is not None:
|
if self._context.metric_queue is not None:
|
||||||
self._context.metric_queue.put_deprecated('MultipartUploadSuccess', 1)
|
|
||||||
self._context.metric_queue.multipart_upload_end.Inc(labelvalues=['success'])
|
self._context.metric_queue.multipart_upload_end.Inc(labelvalues=['success'])
|
||||||
|
|
||||||
self._perform_action_with_retry(mp.complete_upload)
|
self._perform_action_with_retry(mp.complete_upload)
|
||||||
|
|
|
@@ -31,6 +31,8 @@ class MetricQueue(object):
|
||||||
labelnames=['endpoint', 'code'])
|
labelnames=['endpoint', 'code'])
|
||||||
self.non_200 = prom.create_counter('response_non200', 'Non-200 HTTP response codes',
|
self.non_200 = prom.create_counter('response_non200', 'Non-200 HTTP response codes',
|
||||||
labelnames=['endpoint'])
|
labelnames=['endpoint'])
|
||||||
|
self.error_500 = prom.create_counter('response_500', '5XX HTTP response codes',
|
||||||
|
labelnames=['endpoint'])
|
||||||
self.multipart_upload_start = prom.create_counter('multipart_upload_start',
|
self.multipart_upload_start = prom.create_counter('multipart_upload_start',
|
||||||
'Multipart upload started')
|
'Multipart upload started')
|
||||||
self.multipart_upload_end = prom.create_counter('multipart_upload_end',
|
self.multipart_upload_end = prom.create_counter('multipart_upload_end',
|
||||||
|
@@ -165,16 +167,12 @@ def _time_after_request(name, metric_queue):
|
||||||
dur = time.time() - start
|
dur = time.time() - start
|
||||||
dims = {'endpoint': request.endpoint}
|
dims = {'endpoint': request.endpoint}
|
||||||
|
|
||||||
metric_queue.put_deprecated('ResponseTime', dur, dimensions=dims, unit='Seconds')
|
|
||||||
metric_queue.put_deprecated('ResponseCode', r.status_code, dimensions=dims)
|
|
||||||
|
|
||||||
metric_queue.resp_time.Observe(dur, labelvalues=[request.endpoint])
|
metric_queue.resp_time.Observe(dur, labelvalues=[request.endpoint])
|
||||||
metric_queue.resp_code.Inc(labelvalues=[request.endpoint, r.status_code])
|
metric_queue.resp_code.Inc(labelvalues=[request.endpoint, r.status_code])
|
||||||
|
|
||||||
if r.status_code >= 500:
|
if r.status_code >= 500:
|
||||||
metric_queue.put_deprecated('5XXResponse', 1, dimensions={'name': name})
|
metric_queue.error_500.Inc(labelvalues=[request.endpoint])
|
||||||
elif r.status_code < 200 or r.status_code >= 300:
|
elif r.status_code < 200 or r.status_code >= 300:
|
||||||
metric_queue.put_deprecated('Non200Response', 1, dimensions={'name': name})
|
|
||||||
metric_queue.non_200.Inc(labelvalues=[request.endpoint])
|
metric_queue.non_200.Inc(labelvalues=[request.endpoint])
|
||||||
|
|
||||||
return r
|
return r
|
||||||
|
|
|
@@ -13,7 +13,7 @@ MAX_BATCH_METRICS = 20
|
||||||
|
|
||||||
# Sleep for this much time between failed send requests.
|
# Sleep for this much time between failed send requests.
|
||||||
# This prevents hammering cloudwatch when it's not available.
|
# This prevents hammering cloudwatch when it's not available.
|
||||||
FAILED_SEND_SLEEP_SECS = 5
|
FAILED_SEND_SLEEP_SECS = 15
|
||||||
|
|
||||||
def start_cloudwatch_sender(metrics, app):
|
def start_cloudwatch_sender(metrics, app):
|
||||||
"""
|
"""
|
||||||
|
|
Reference in a new issue