Merge pull request #351 from mjibson/more-metrics

More metrics
Jake Moshenko 2015-08-17 13:09:08 -04:00
commit 2fd1d5969e
10 changed files with 166 additions and 157 deletions

app.py

@@ -26,11 +26,11 @@ from util.saas.exceptionlog import Sentry
from util.names import urn_generator
from util.config.oauth import GoogleOAuthConfig, GithubOAuthConfig, GitLabOAuthConfig
from util.security.signing import Signer
from util.saas.queuemetrics import QueueMetrics
from util.saas.cloudwatch import start_cloudwatch_sender
from util.saas.metricqueue import MetricQueue
from util.config.provider import FileConfigProvider, TestConfigProvider
from util.config.configutil import generate_secret_key
from util.config.superusermanager import SuperUserManager
from buildman.jobutil.buildreporter import BuildMetrics
OVERRIDE_CONFIG_DIRECTORY = 'conf/stack/'
OVERRIDE_CONFIG_YAML_FILENAME = 'conf/stack/config.yaml'
@@ -124,8 +124,8 @@ authentication = UserAuthentication(app, OVERRIDE_CONFIG_DIRECTORY)
userevents = UserEventsBuilderModule(app)
superusers = SuperUserManager(app)
signer = Signer(app, OVERRIDE_CONFIG_DIRECTORY)
queue_metrics = QueueMetrics(app)
build_metrics = BuildMetrics(app)
metric_queue = MetricQueue()
start_cloudwatch_sender(metric_queue, app)
tf = app.config['DB_TRANSACTION_FACTORY']
@@ -137,7 +137,7 @@ oauth_apps = [github_login, github_trigger, gitlab_trigger, google_login]
image_diff_queue = WorkQueue(app.config['DIFFS_QUEUE_NAME'], tf)
dockerfile_build_queue = WorkQueue(app.config['DOCKERFILE_BUILD_QUEUE_NAME'], tf,
reporter=queue_metrics.report)
metric_queue=metric_queue)
notification_queue = WorkQueue(app.config['NOTIFICATION_QUEUE_NAME'], tf)
database.configure(app.config)
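
Taken together, the app.py hunks swap the two per-feature reporters (QueueMetrics, BuildMetrics) for one shared MetricQueue. A minimal sketch of the new wiring outside of Quay, assuming only what the diff shows (the metric name below is illustrative, not one the code emits):

from flask import Flask
from util.saas.metricqueue import MetricQueue
from util.saas.cloudwatch import start_cloudwatch_sender

app = Flask(__name__)
metric_queue = MetricQueue()                 # put() is a debug-logged no-op until enable() is called
start_cloudwatch_sender(metric_queue, app)   # returns without starting a thread if CLOUDWATCH_* config is absent
metric_queue.put('SomeCounter', 1, unit='Count')   # illustrative metric name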

buildman/jobutil/buildreporter.py (deleted)

@@ -1,70 +0,0 @@
from buildman.enums import BuildJobResult
from util.saas.cloudwatch import get_queue
class BuildReporter(object):
"""
Base class for reporting build statuses to a metrics service.
"""
def report_completion_status(self, status):
"""
Method to invoke the recording of build's completion status to a metric service.
"""
raise NotImplementedError
class NullReporter(BuildReporter):
"""
The /dev/null of BuildReporters.
"""
def report_completion_status(self, *args):
pass
class CloudWatchBuildReporter(BuildReporter):
"""
Implements a BuildReporter for Amazon's CloudWatch.
"""
def __init__(self, queue, namespace_name, completed_name, failed_name, incompleted_name):
self._queue = queue
self._namespace_name = namespace_name
self._completed_name = completed_name
self._failed_name = failed_name
self._incompleted_name = incompleted_name
def _send_to_queue(self, *args, **kwargs):
self._queue.put((args, kwargs))
def report_completion_status(self, status):
if status == BuildJobResult.COMPLETE:
status_name = self._completed_name
elif status == BuildJobResult.ERROR:
status_name = self._failed_name
elif status == BuildJobResult.INCOMPLETE:
status_name = self._incompleted_name
else:
return
self._send_to_queue(self._namespace_name, status_name, 1, unit='Count')
class BuildMetrics(object):
"""
BuildMetrics initializes a reporter for recording the status of build completions.
"""
def __init__(self, app=None):
self._app = app
self._reporter = NullReporter()
if app is not None:
reporter_type = app.config.get('BUILD_METRICS_TYPE', 'Null')
if reporter_type == 'CloudWatch':
namespace = app.config['BUILD_METRICS_NAMESPACE']
completed_name = app.config['BUILD_METRICS_COMPLETED_NAME']
failed_name = app.config['BUILD_METRICS_FAILED_NAME']
incompleted_name = app.config['BUILD_METRICS_INCOMPLETED_NAME']
request_queue = get_queue(app)
self._reporter = CloudWatchBuildReporter(request_queue, namespace, completed_name,
failed_name, incompleted_name)
def __getattr__(self, name):
return getattr(self._reporter, name, None)

buildman/server.py

@@ -16,7 +16,7 @@ from buildman.enums import BuildJobResult, BuildServerStatus
from buildman.jobutil.buildstatus import StatusHandler
from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException
from data import database
from app import app, build_metrics
from app import app, metric_queue
logger = logging.getLogger(__name__)
@@ -151,7 +151,7 @@ class BuilderServer(object):
if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
self._shutdown_event.set()
build_metrics.report_completion_status(job_status)
report_completion_status(job_status)
@trollius.coroutine
def _work_checker(self):
@@ -225,3 +225,15 @@ class BuilderServer(object):
# Initialize the work queue checker.
yield From(self._work_checker())
def report_completion_status(status):
if status == BuildJobResult.COMPLETE:
status_name = 'CompleteBuilds'
elif status == BuildJobResult.ERROR:
status_name = 'FailedBuilds'
elif status == BuildJobResult.INCOMPLETE:
status_name = 'IncompletedBuilds'
else:
return
metric_queue.put(status_name, 1, unit='Count')
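
The module-level report_completion_status() helper added above replaces the old build_metrics reporter: each terminal build status maps to a fixed metric name, and anything else is silently ignored. A small sketch of what it does for a failed build:

from buildman.enums import BuildJobResult
from app import metric_queue

report_completion_status(BuildJobResult.ERROR)     # defined in the builder server module above
# ...is equivalent to:
metric_queue.put('FailedBuilds', 1, unit='Count')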

data/queue.py

@@ -15,9 +15,9 @@ class NoopWith:
class WorkQueue(object):
def __init__(self, queue_name, transaction_factory,
canonical_name_match_list=None, reporter=None):
canonical_name_match_list=None, metric_queue=None):
self._queue_name = queue_name
self._reporter = reporter
self._metric_queue = metric_queue
self._transaction_factory = transaction_factory
self._currently_processing = False
@@ -75,12 +75,14 @@ class WorkQueue(object):
return (running_count, available_not_running_count, available_count)
def update_metrics(self):
if self._reporter is None:
if self._metric_queue is None:
return
(running_count, available_not_running_count, available_count) = self.get_metrics()
self._reporter(self._currently_processing, running_count,
running_count + available_not_running_count)
self._metric_queue.put('BuildCapacityShortage', available_not_running_count, unit='Count')
building_percent = 100 if self._currently_processing else 0
self._metric_queue.put('PercentBuilding', building_percent, unit='Percent')
def has_retries_remaining(self, item_id):
""" Returns whether the queue item with the given id has any retries remaining. If the

endpoints/api/__init__.py

@@ -1,7 +1,7 @@
import logging
import datetime
from app import app
from app import app, metric_queue
from flask import Blueprint, request, make_response, jsonify, session
from flask.ext.restful import Resource, abort, Api, reqparse
from flask.ext.restful.utils.cors import crossdomain
@@ -20,6 +20,7 @@ from auth.auth_context import get_authenticated_user, get_validated_oauth_token
from auth.auth import process_oauth
from endpoints.csrf import csrf_protect
from endpoints.decorators import check_anon_protection
from util.saas.metricqueue import time_decorator
logger = logging.getLogger(__name__)
@@ -28,7 +29,7 @@ api = Api()
api.init_app(api_bp)
api.decorators = [csrf_protect,
crossdomain(origin='*', headers=['Authorization', 'Content-Type']),
process_oauth]
process_oauth, time_decorator(api_bp.name, metric_queue)]
class ApiException(Exception):
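
time_decorator() gives flask-restful resources the same per-request timing that time_blueprint() gives ordinary blueprints, which is why it is appended to api.decorators above. A minimal sketch of decorating a plain view; the view and label names are illustrative, and the hooks need an active Flask request context because they use flask.g and request.endpoint:

from flask import make_response
from app import metric_queue
from util.saas.metricqueue import time_decorator

timed = time_decorator('api', metric_queue)   # the label only feeds the Non200Response 'name' dimension

@timed
def get_repository():
    return make_response('OK', 200)           # status_code feeds ResponseCode / Non200Response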

endpoints/v1/__init__.py

@@ -1,10 +1,13 @@
from flask import Blueprint, make_response
from app import metric_queue
from endpoints.decorators import anon_protect, anon_allowed
from util.saas.metricqueue import time_blueprint
v1_bp = Blueprint('v1', __name__)
time_blueprint(v1_bp, metric_queue)
# Note: This is *not* part of the Docker index spec. This is here for our own health check,
# since we have nginx handle the _ping below.

endpoints/v2/__init__.py

@@ -7,6 +7,7 @@ from flask import Blueprint, make_response, url_for, request
from functools import wraps
from urlparse import urlparse
from app import metric_queue
from endpoints.decorators import anon_protect, anon_allowed
from auth.jwt_auth import process_jwt_auth
from auth.auth_context import get_grant_user_context
@@ -14,12 +15,13 @@ from auth.permissions import (ReadRepositoryPermission, ModifyRepositoryPermissi
AdministerRepositoryPermission)
from data import model
from util.http import abort
from util.saas.metricqueue import time_blueprint
logger = logging.getLogger(__name__)
v2_bp = Blueprint('v2', __name__)
time_blueprint(v2_bp, metric_queue)
def _require_repo_permission(permission_class, allow_public=False):
def wrapper(func):

util/saas/cloudwatch.py

@@ -1,35 +1,46 @@
import logging
import boto
import time
from Queue import Queue
from Queue import Empty
from threading import Thread
logger = logging.getLogger(__name__)
def get_queue(app):
"""
Returns a queue to a new CloudWatchSender.
"""
access_key = app.config['CLOUDWATCH_AWS_ACCESS_KEY']
secret_key = app.config['CLOUDWATCH_AWS_SECRET_KEY']
MAX_BATCH_METRICS = 100
queue = Queue()
sender = CloudWatchSender(queue, access_key, secret_key)
# Sleep for this much time between failed send requests.
# This prevents hammering cloudwatch when it's not available.
FAILED_SEND_SLEEP_SECS = 5
def start_cloudwatch_sender(metrics, app):
"""
Starts a new CloudWatchSender that sends metrics from the given queue to CloudWatch.
"""
try:
access_key = app.config['CLOUDWATCH_AWS_ACCESS_KEY']
secret_key = app.config['CLOUDWATCH_AWS_SECRET_KEY']
namespace = app.config['CLOUDWATCH_NAMESPACE']
except KeyError:
logger.debug('CloudWatch not configured')
return
sender = CloudWatchSender(metrics, access_key, secret_key, namespace)
sender.start()
return queue
class CloudWatchSender(Thread):
"""
CloudWatchSender loops indefinitely, pulling metrics off of a queue and sending them to CloudWatch.
"""
def __init__(self, request_queue, aws_access_key, aws_secret_key):
def __init__(self, metrics, aws_access_key, aws_secret_key, namespace):
Thread.__init__(self)
self.daemon = True
self._aws_access_key = aws_access_key
self._aws_secret_key = aws_secret_key
self._put_metrics_queue = request_queue
self._metrics = metrics
self._namespace = namespace
def run(self):
try:
@@ -37,11 +48,46 @@ class CloudWatchSender(Thread):
connection = boto.connect_cloudwatch(self._aws_access_key, self._aws_secret_key)
except:
logger.exception('Failed to connect to CloudWatch.')
self._metrics.enable()
while True:
put_metric_args, kwargs = self._put_metrics_queue.get()
logger.debug('Got queued put metrics request.')
metrics = {
'name': [],
'value': [],
'unit': [],
'timestamp': [],
'dimensions': [],
}
metric = self._metrics.get()
append_metric(metrics, metric)
while len(metrics['name']) < MAX_BATCH_METRICS:
try:
metric = self._metrics.get_nowait()
append_metric(metrics, metric)
except Empty:
break
try:
connection.put_metric_data(*put_metric_args, **kwargs)
connection.put_metric_data(self._namespace, **metrics)
logger.debug('Sent %d CloudWatch metrics', len(metrics['name']))
except:
logger.exception('Failed to write to CloudWatch')
for i in range(len(metrics['name'])):
self._metrics.put(metrics['name'][i], metrics['value'][i],
unit=metrics['unit'][i],
dimensions=metrics['dimensions'][i],
timestamp=metrics['timestamp'][i],
)
logger.exception('Failed to write to CloudWatch: %s', metrics)
logger.debug('Attempted to requeue %d metrics.', len(metrics['name']))
time.sleep(FAILED_SEND_SLEEP_SECS)
def append_metric(metrics, m):
name, value, kwargs = m
metrics['name'].append(name)
metrics['value'].append(value)
metrics['unit'].append(kwargs.get('unit'))
metrics['dimensions'].append(kwargs.get('dimensions'))
metrics['timestamp'].append(kwargs.get('timestamp'))
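
start_cloudwatch_sender() only spins up the sender thread when all three CloudWatch settings are present; otherwise it logs 'CloudWatch not configured', the MetricQueue is never enabled, and put() stays a cheap no-op. A sketch of the expected keys, continuing the app.py wiring above (values are placeholders; in a real deployment they would come from the stack config rather than code):

app.config.update(
    CLOUDWATCH_AWS_ACCESS_KEY='AKIA...placeholder...',
    CLOUDWATCH_AWS_SECRET_KEY='placeholder-secret-key',
    CLOUDWATCH_NAMESPACE='Quay',   # example namespace, not mandated by the code
)
start_cloudwatch_sender(metric_queue, app)

Each send drains up to MAX_BATCH_METRICS (100) queued metrics into one put_metric_data call; when that call fails, the batch is pushed back onto the MetricQueue and the thread sleeps FAILED_SEND_SLEEP_SECS before continuing, so a CloudWatch outage backs off instead of spinning.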

util/saas/metricqueue.py (new file)

@@ -0,0 +1,69 @@
import datetime
import logging
import time
from functools import wraps
from Queue import Queue, Full
from flask import g, request
logger = logging.getLogger(__name__)
class MetricQueue(object):
def __init__(self):
self._queue = None
def enable(self, maxsize=10000):
self._queue = Queue(maxsize)
def put(self, name, value, **kwargs):
if self._queue is None:
logging.debug('No metric queue: %s %s %s', name, value, kwargs)
return
try:
kwargs.setdefault('timestamp', datetime.datetime.now())
kwargs.setdefault('dimensions', {})
self._queue.put_nowait((name, value, kwargs))
except Full:
logger.error('Metric queue full')
def get(self):
return self._queue.get()
def get_nowait(self):
return self._queue.get_nowait()
def time_blueprint(bp, metric_queue):
bp.before_request(time_before_request)
bp.after_request(time_after_request(bp.name, metric_queue))
def time_before_request():
g._request_start_time = time.time()
def time_after_request(name, metric_queue):
def f(r):
start = getattr(g, '_request_start_time', None)
if start is None:
return r
dur = time.time() - start
dims = {'endpoint': request.endpoint}
metric_queue.put('ResponseTime', dur, dimensions=dims, unit='Seconds')
metric_queue.put('ResponseCode', r.status_code, dimensions=dims)
if r.status_code < 200 or r.status_code >= 300:
metric_queue.put('Non200Response', 1, dimensions={'name': name})
return r
return f
def time_decorator(name, metric_queue):
after = time_after_request(name, metric_queue)
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
time_before_request()
rv = func(*args, **kwargs)
after(rv)
return rv
return wrapper
return decorator
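
MetricQueue itself is a thin wrapper around a bounded Queue.Queue: until enable() is called every put() is just logged and dropped, and each stored entry is the (name, value, kwargs) triple that append_metric() in the CloudWatch sender unpacks. A quick sketch, with illustrative metric names and dimensions:

from util.saas.metricqueue import MetricQueue

mq = MetricQueue()
mq.put('ResponseTime', 0.42, unit='Seconds')        # dropped: queue not enabled yet

mq.enable()                                         # normally done by CloudWatchSender.run()
mq.put('ResponseTime', 0.42, unit='Seconds',
       dimensions={'endpoint': 'api.repository'})   # timestamp/dimensions default if omitted
name, value, kwargs = mq.get()                      # ('ResponseTime', 0.42, {...})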

util/saas/queuemetrics.py (deleted)

@@ -1,56 +0,0 @@
import logging
from util.saas.cloudwatch import get_queue
logger = logging.getLogger(__name__)
class NullReporter(object):
def report(self, *args):
pass
class CloudWatchReporter(object):
""" CloudWatchReporter reports work queue metrics to CloudWatch """
def __init__(self, request_queue, namespace, need_capacity_name, build_percent_name):
self._namespace = namespace
self._need_capacity_name = need_capacity_name
self._build_percent_name = build_percent_name
self._put_metrics_queue = request_queue
def _send_to_queue(self, *args, **kwargs):
self._put_metrics_queue.put((args, kwargs))
def report(self, currently_processing, running_count, total_count):
logger.debug('Worker indicated %s running count and %s total count', running_count,
total_count)
need_capacity_count = total_count - running_count
self._send_to_queue(self._namespace, self._need_capacity_name, need_capacity_count,
unit='Count')
building_percent = 100 if currently_processing else 0
self._send_to_queue(self._namespace, self._build_percent_name, building_percent,
unit='Percent')
class QueueMetrics(object):
"""
QueueMetrics initializes a reporter for recording metrics of work queues.
"""
def __init__(self, app=None):
self._app = app
self._reporter = NullReporter()
if app is not None:
reporter_type = app.config.get('QUEUE_METRICS_TYPE', 'Null')
if reporter_type == 'CloudWatch':
namespace = app.config['QUEUE_METRICS_NAMESPACE']
req_capacity_name = app.config['QUEUE_METRICS_CAPACITY_SHORTAGE_NAME']
build_percent_name = app.config['QUEUE_METRICS_BUILD_PERCENT_NAME']
request_queue = get_queue(app)
self._reporter = CloudWatchReporter(request_queue, namespace, req_capacity_name,
build_percent_name)
def __getattr__(self, name):
return getattr(self._reporter, name, None)