import datetime
import logging
import time

from functools import wraps
from Queue import Queue, Full

from flask import g, request
from trollius import Return


logger = logging.getLogger(__name__)


# Histogram bucket boundaries used for the API response time metric.
API_RESPONSE_TIME_BUCKETS = [.01, .025, .05, .1, .25, .5, 1.0, 2.5, 5.0]

# Histogram bucket boundaries used for the builder start time metrics.
BUILDER_START_TIME_BUCKETS = [.5, 1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 180.0, 240.0, 300.0, 600.0]


class MetricQueue(object):
    """ Holder for every exported metric, for distribution to metrics collection system(s)
        such as Prometheus.

        Each metric is created through the `prom` object handed to the constructor and is
        exposed as an attribute of this instance. A deprecated in-memory queue is also
        available for the legacy reporting path; it stays disabled until `enable_deprecated`
        is called.
    """

    def __init__(self, prom):
        """ Registers all metrics on `prom`, which must provide `create_histogram`,
            `create_counter` and `create_gauge`.
        """
        # --- HTTP request metrics ---
        self.resp_time = prom.create_histogram(
            'response_time',
            'HTTP response time in seconds',
            labelnames=['endpoint'],
            buckets=API_RESPONSE_TIME_BUCKETS)
        self.resp_code = prom.create_counter(
            'response_code',
            'HTTP response code',
            labelnames=['endpoint', 'code'])
        self.non_200 = prom.create_counter(
            'response_non200',
            'Non-200 HTTP response codes',
            labelnames=['endpoint'])
        self.error_500 = prom.create_counter(
            'response_500',
            '5XX HTTP response codes',
            labelnames=['endpoint'])

        # --- Multipart upload metrics ---
        self.multipart_upload_start = prom.create_counter(
            'multipart_upload_start',
            'Multipart upload started')
        self.multipart_upload_end = prom.create_counter(
            'multipart_upload_end',
            'Multipart upload ends.',
            labelnames=['type'])

        # --- Build and builder metrics ---
        self.build_capacity_shortage = prom.create_gauge(
            'build_capacity_shortage',
            'Build capacity shortage.')
        self.builder_time_to_start = prom.create_histogram(
            'builder_tts',
            'Time from triggering to starting a builder.',
            labelnames=['builder_type'],
            buckets=BUILDER_START_TIME_BUCKETS)
        self.builder_time_to_build = prom.create_histogram(
            'builder_ttb',
            'Time from triggering to actually starting a build',
            labelnames=['builder_type'],
            buckets=BUILDER_START_TIME_BUCKETS)
        self.build_time = prom.create_histogram(
            'build_time',
            'Time spent building',
            labelnames=['builder_type'])
        self.builder_fallback = prom.create_counter(
            'builder_fallback',
            'Builder fell back to secondary executor')
        self.build_start_success = prom.create_counter(
            'build_start_success',
            'Executor succeeded in starting a build',
            labelnames=['builder_type'])
        self.build_start_failure = prom.create_counter(
            'build_start_failure',
            'Executor failed to start a build',
            labelnames=['builder_type'])
        self.percent_building = prom.create_gauge(
            'build_percent_building',
            'Percent building.')
        self.build_counter = prom.create_counter(
            'builds',
            'Number of builds',
            labelnames=['name'])
        self.ephemeral_build_workers = prom.create_counter(
            'ephemeral_build_workers',
            'Number of started ephemeral build workers')
        self.ephemeral_build_worker_failure = prom.create_counter(
            'ephemeral_build_worker_failure',
            'Number of failed-to-start ephemeral build workers')

        # --- Work queue metrics ---
        self.work_queue_running = prom.create_gauge(
            'work_queue_running',
            'Running items in a queue',
            labelnames=['queue_name'])
        self.work_queue_available = prom.create_gauge(
            'work_queue_available',
            'Available items in a queue',
            labelnames=['queue_name'])
        self.work_queue_available_not_running = prom.create_gauge(
            'work_queue_available_not_running',
            'Available items that are not yet running',
            labelnames=['queue_name'])

        # --- Repository metrics ---
        self.repository_pull = prom.create_counter(
            'repository_pull',
            'Repository Pull Count',
            labelnames=['namespace', 'repo_name', 'protocol', 'status'])
        self.repository_push = prom.create_counter(
            'repository_push',
            'Repository Push Count',
            labelnames=['namespace', 'repo_name', 'protocol', 'status'])
        self.repository_build_queued = prom.create_counter(
            'repository_build_queued',
            'Repository Build Queued Count',
            labelnames=['namespace', 'repo_name'])
        self.repository_build_completed = prom.create_counter(
            'repository_build_completed',
            'Repository Build Complete Count',
            labelnames=['namespace', 'repo_name', 'status', 'executor'])

        # --- Registry storage metrics ---
        self.chunk_size = prom.create_histogram(
            'chunk_size',
            'Registry blob chunk size',
            labelnames=['storage_region'])
        self.chunk_upload_time = prom.create_histogram(
            'chunk_upload_time',
            'Registry blob chunk upload time',
            labelnames=['storage_region'])

        # --- Authentication and entity count metrics ---
        self.authentication_count = prom.create_counter(
            'authentication_count',
            'Authentication count',
            labelnames=['kind', 'status'])
        self.repository_count = prom.create_gauge(
            'repository_count',
            'Number of repositories')
        self.user_count = prom.create_gauge(
            'user_count',
            'Number of users')
        self.org_count = prom.create_gauge(
            'org_count',
            'Number of Organizations')
        self.robot_count = prom.create_gauge(
            'robot_count',
            'Number of robot accounts')

        # Deprecated: in-memory queue for reporting metrics to CloudWatch or another
        # provider. Remains None until enable_deprecated() is called.
        self._queue = None

    def enable_deprecated(self, maxsize=10000):
        """ Turns on the deprecated in-memory queue, bounded to `maxsize` entries. """
        self._queue = Queue(maxsize)

    def put_deprecated(self, name, value, **kwargs):
        """ Enqueues a `(name, value, kwargs)` tuple on the deprecated queue without blocking.

            `kwargs` gains a 'timestamp' (current local time) and an empty 'dimensions' dict
            when the caller did not supply them. If the queue was never enabled the metric is
            only logged at debug level; if the queue is full the metric is dropped and an
            error is logged.
        """
        if self._queue is None:
            logger.debug('No metric queue %s %s %s', name, value, kwargs)
            return

        kwargs.setdefault('timestamp', datetime.datetime.now())
        kwargs.setdefault('dimensions', {})

        try:
            self._queue.put_nowait((name, value, kwargs))
        except Full:
            logger.error('Metric queue full')

    def get_deprecated(self):
        """ Returns the next entry of the deprecated queue via `Queue.get()`.

            Fails with AttributeError when the queue has not been enabled, because `_queue`
            is still None.
        """
        return self._queue.get()

    def get_nowait_deprecated(self):
        """ Returns the next entry of the deprecated queue via `Queue.get_nowait()`.

            Fails with AttributeError when the queue has not been enabled, because `_queue`
            is still None.
        """
        return self._queue.get_nowait()


def duration_collector_async(metric, labelvalues):
    """ Builds a decorator which reports the duration of the wrapped callable to `metric`.

        The elapsed time is passed to `metric.Observe` (together with `labelvalues`) only
        when the wrapped callable raises trollius' `Return`; the exception is then re-raised.
        A plain return value is handed back without recording anything.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            started_at = time.time()
            try:
                return func(*args, **kwargs)
            except Return as e:
                # NOTE(review): only the `raise Return(...)` exit path is timed; presumably
                # this targets trollius coroutines -- confirm that normal returns are
                # intentionally left unmeasured.
                metric.Observe(time.time() - started_at, labelvalues=labelvalues)
                raise e

        return wrapper

    return decorator


def time_decorator(name, metric_queue):
    """ Builds a decorator which reports the request time of an endpoint method to the
        metrics queue.

        The start time is stamped before the endpoint runs and the metrics are recorded from
        its return value afterwards. If the endpoint raises, nothing is recorded.
    """
    record_metrics = _time_after_request(name, metric_queue)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            _time_before_request()
            response = func(*args, **kwargs)
            record_metrics(response)
            return response

        return wrapper

    return decorator


def time_blueprint(bp, metric_queue):
    """ Hooks the timing callbacks into blueprint `bp` so that its request time is logged to
        the metrics queue.
    """
    bp.before_request(_time_before_request)
    bp.after_request(_time_after_request(bp.name, metric_queue))


def _time_before_request():
    """ Stores the current time on flask's `g` as the start of the request. """
    g._request_start_time = time.time()


def _time_after_request(name, metric_queue):
    """ Returns a callback taking a response, recording its metrics and returning it unchanged.

        The callback does nothing when no start time was stored on `g`. Otherwise it observes
        the elapsed time, counts the status code, and bumps the 5XX counter for codes >= 500
        or the non-200 counter for any other code outside [200, 300). All metrics are
        labelled with `request.endpoint`; `name` is accepted but not used.
    """
    def record(response):
        started_at = getattr(g, '_request_start_time', None)
        if started_at is not None:
            elapsed = time.time() - started_at
            endpoint = request.endpoint
            status = response.status_code

            metric_queue.resp_time.Observe(elapsed, labelvalues=[endpoint])
            metric_queue.resp_code.Inc(labelvalues=[endpoint, status])

            if status >= 500:
                metric_queue.error_500.Inc(labelvalues=[endpoint])
            elif not 200 <= status < 300:
                metric_queue.non_200.Inc(labelvalues=[endpoint])

        return response

    return record