Merge pull request #351 from mjibson/more-metrics

More metrics
This commit is contained in:
Jake Moshenko 2015-08-17 13:09:08 -04:00
commit 2fd1d5969e
10 changed files with 166 additions and 157 deletions

10
app.py
View file

@ -26,11 +26,11 @@ from util.saas.exceptionlog import Sentry
from util.names import urn_generator from util.names import urn_generator
from util.config.oauth import GoogleOAuthConfig, GithubOAuthConfig, GitLabOAuthConfig from util.config.oauth import GoogleOAuthConfig, GithubOAuthConfig, GitLabOAuthConfig
from util.security.signing import Signer from util.security.signing import Signer
from util.saas.queuemetrics import QueueMetrics from util.saas.cloudwatch import start_cloudwatch_sender
from util.saas.metricqueue import MetricQueue
from util.config.provider import FileConfigProvider, TestConfigProvider from util.config.provider import FileConfigProvider, TestConfigProvider
from util.config.configutil import generate_secret_key from util.config.configutil import generate_secret_key
from util.config.superusermanager import SuperUserManager from util.config.superusermanager import SuperUserManager
from buildman.jobutil.buildreporter import BuildMetrics
OVERRIDE_CONFIG_DIRECTORY = 'conf/stack/' OVERRIDE_CONFIG_DIRECTORY = 'conf/stack/'
OVERRIDE_CONFIG_YAML_FILENAME = 'conf/stack/config.yaml' OVERRIDE_CONFIG_YAML_FILENAME = 'conf/stack/config.yaml'
@ -124,8 +124,8 @@ authentication = UserAuthentication(app, OVERRIDE_CONFIG_DIRECTORY)
userevents = UserEventsBuilderModule(app) userevents = UserEventsBuilderModule(app)
superusers = SuperUserManager(app) superusers = SuperUserManager(app)
signer = Signer(app, OVERRIDE_CONFIG_DIRECTORY) signer = Signer(app, OVERRIDE_CONFIG_DIRECTORY)
queue_metrics = QueueMetrics(app) metric_queue = MetricQueue()
build_metrics = BuildMetrics(app) start_cloudwatch_sender(metric_queue, app)
tf = app.config['DB_TRANSACTION_FACTORY'] tf = app.config['DB_TRANSACTION_FACTORY']
@ -137,7 +137,7 @@ oauth_apps = [github_login, github_trigger, gitlab_trigger, google_login]
image_diff_queue = WorkQueue(app.config['DIFFS_QUEUE_NAME'], tf) image_diff_queue = WorkQueue(app.config['DIFFS_QUEUE_NAME'], tf)
dockerfile_build_queue = WorkQueue(app.config['DOCKERFILE_BUILD_QUEUE_NAME'], tf, dockerfile_build_queue = WorkQueue(app.config['DOCKERFILE_BUILD_QUEUE_NAME'], tf,
reporter=queue_metrics.report) metric_queue=metric_queue)
notification_queue = WorkQueue(app.config['NOTIFICATION_QUEUE_NAME'], tf) notification_queue = WorkQueue(app.config['NOTIFICATION_QUEUE_NAME'], tf)
database.configure(app.config) database.configure(app.config)

View file

@ -1,70 +0,0 @@
from buildman.enums import BuildJobResult
from util.saas.cloudwatch import get_queue
class BuildReporter(object):
    """
    Abstract interface for reporting build completion statuses to a metrics
    service. Concrete backends subclass this and implement the single hook.
    """

    def report_completion_status(self, status):
        """
        Record a build's completion status with the metrics backend.

        Subclasses must override this; the base class provides no behavior.
        """
        raise NotImplementedError
class NullReporter(BuildReporter):
    """
    The /dev/null of BuildReporters: accepts any status report and discards it.
    Used when no metrics backend is configured.
    """

    def report_completion_status(self, *args):
        # Intentionally a no-op.
        pass
class CloudWatchBuildReporter(BuildReporter):
    """
    BuildReporter implementation that enqueues build-status counts for
    asynchronous delivery to Amazon CloudWatch.
    """

    def __init__(self, queue, namespace_name, completed_name, failed_name, incompleted_name):
        self._queue = queue
        self._namespace_name = namespace_name
        self._completed_name = completed_name
        self._failed_name = failed_name
        self._incompleted_name = incompleted_name

    def _send_to_queue(self, *args, **kwargs):
        # Hand the metric off to the background CloudWatch sender as
        # an (args, kwargs) pair.
        self._queue.put((args, kwargs))

    def report_completion_status(self, status):
        """
        Queue a one-count metric for the given BuildJobResult status.
        Unrecognized statuses are silently ignored.
        """
        metric_names = {
            BuildJobResult.COMPLETE: self._completed_name,
            BuildJobResult.ERROR: self._failed_name,
            BuildJobResult.INCOMPLETE: self._incompleted_name,
        }
        if status not in metric_names:
            return

        self._send_to_queue(self._namespace_name, metric_names[status], 1, unit='Count')
class BuildMetrics(object):
    """
    Selects and wraps a BuildReporter for recording build completion statuses,
    based on the app's BUILD_METRICS_* configuration. All attribute access is
    delegated to the chosen reporter.
    """

    def __init__(self, app=None):
        self._app = app
        self._reporter = NullReporter()

        use_cloudwatch = (app is not None and
                          app.config.get('BUILD_METRICS_TYPE', 'Null') == 'CloudWatch')
        if use_cloudwatch:
            self._reporter = CloudWatchBuildReporter(
                get_queue(app),
                app.config['BUILD_METRICS_NAMESPACE'],
                app.config['BUILD_METRICS_COMPLETED_NAME'],
                app.config['BUILD_METRICS_FAILED_NAME'],
                app.config['BUILD_METRICS_INCOMPLETED_NAME'],
            )

    def __getattr__(self, name):
        # Delegate to the active reporter; missing attributes resolve to None
        # rather than raising, by design of the null-object pattern here.
        return getattr(self._reporter, name, None)

View file

@ -16,7 +16,7 @@ from buildman.enums import BuildJobResult, BuildServerStatus
from buildman.jobutil.buildstatus import StatusHandler from buildman.jobutil.buildstatus import StatusHandler
from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException
from data import database from data import database
from app import app, build_metrics from app import app, metric_queue
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -151,7 +151,7 @@ class BuilderServer(object):
if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count: if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
self._shutdown_event.set() self._shutdown_event.set()
build_metrics.report_completion_status(job_status) report_completion_status(job_status)
@trollius.coroutine @trollius.coroutine
def _work_checker(self): def _work_checker(self):
@ -225,3 +225,15 @@ class BuilderServer(object):
# Initialize the work queue checker. # Initialize the work queue checker.
yield From(self._work_checker()) yield From(self._work_checker())
def report_completion_status(status):
    """
    Push a one-count metric recording the build's final status onto the
    module-level metric_queue. Statuses outside the known BuildJobResult
    values are silently dropped.
    """
    metric_names = {
        BuildJobResult.COMPLETE: 'CompleteBuilds',
        BuildJobResult.ERROR: 'FailedBuilds',
        BuildJobResult.INCOMPLETE: 'IncompletedBuilds',
    }
    if status in metric_names:
        metric_queue.put(metric_names[status], 1, unit='Count')

View file

@ -15,9 +15,9 @@ class NoopWith:
class WorkQueue(object): class WorkQueue(object):
def __init__(self, queue_name, transaction_factory, def __init__(self, queue_name, transaction_factory,
canonical_name_match_list=None, reporter=None): canonical_name_match_list=None, metric_queue=None):
self._queue_name = queue_name self._queue_name = queue_name
self._reporter = reporter self._metric_queue = metric_queue
self._transaction_factory = transaction_factory self._transaction_factory = transaction_factory
self._currently_processing = False self._currently_processing = False
@ -75,12 +75,14 @@ class WorkQueue(object):
return (running_count, available_not_running_count, available_count) return (running_count, available_not_running_count, available_count)
def update_metrics(self): def update_metrics(self):
if self._reporter is None: if self._metric_queue is None:
return return
(running_count, available_not_running_count, available_count) = self.get_metrics() (running_count, available_not_running_count, available_count) = self.get_metrics()
self._reporter(self._currently_processing, running_count, self._metric_queue.put('BuildCapacityShortage', available_not_running_count, unit='Count')
running_count + available_not_running_count)
building_percent = 100 if self._currently_processing else 0
self._metric_queue.put('PercentBuilding', building_percent, unit='Percent')
def has_retries_remaining(self, item_id): def has_retries_remaining(self, item_id):
""" Returns whether the queue item with the given id has any retries remaining. If the """ Returns whether the queue item with the given id has any retries remaining. If the

View file

@ -1,7 +1,7 @@
import logging import logging
import datetime import datetime
from app import app from app import app, metric_queue
from flask import Blueprint, request, make_response, jsonify, session from flask import Blueprint, request, make_response, jsonify, session
from flask.ext.restful import Resource, abort, Api, reqparse from flask.ext.restful import Resource, abort, Api, reqparse
from flask.ext.restful.utils.cors import crossdomain from flask.ext.restful.utils.cors import crossdomain
@ -20,6 +20,7 @@ from auth.auth_context import get_authenticated_user, get_validated_oauth_token
from auth.auth import process_oauth from auth.auth import process_oauth
from endpoints.csrf import csrf_protect from endpoints.csrf import csrf_protect
from endpoints.decorators import check_anon_protection from endpoints.decorators import check_anon_protection
from util.saas.metricqueue import time_decorator
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -28,7 +29,7 @@ api = Api()
api.init_app(api_bp) api.init_app(api_bp)
api.decorators = [csrf_protect, api.decorators = [csrf_protect,
crossdomain(origin='*', headers=['Authorization', 'Content-Type']), crossdomain(origin='*', headers=['Authorization', 'Content-Type']),
process_oauth] process_oauth, time_decorator(api_bp.name, metric_queue)]
class ApiException(Exception): class ApiException(Exception):

View file

@ -1,10 +1,13 @@
from flask import Blueprint, make_response from flask import Blueprint, make_response
from app import metric_queue
from endpoints.decorators import anon_protect, anon_allowed from endpoints.decorators import anon_protect, anon_allowed
from util.saas.metricqueue import time_blueprint
v1_bp = Blueprint('v1', __name__) v1_bp = Blueprint('v1', __name__)
time_blueprint(v1_bp, metric_queue)
# Note: This is *not* part of the Docker index spec. This is here for our own health check, # Note: This is *not* part of the Docker index spec. This is here for our own health check,
# since we have nginx handle the _ping below. # since we have nginx handle the _ping below.

View file

@ -7,6 +7,7 @@ from flask import Blueprint, make_response, url_for, request
from functools import wraps from functools import wraps
from urlparse import urlparse from urlparse import urlparse
from app import metric_queue
from endpoints.decorators import anon_protect, anon_allowed from endpoints.decorators import anon_protect, anon_allowed
from auth.jwt_auth import process_jwt_auth from auth.jwt_auth import process_jwt_auth
from auth.auth_context import get_grant_user_context from auth.auth_context import get_grant_user_context
@ -14,12 +15,13 @@ from auth.permissions import (ReadRepositoryPermission, ModifyRepositoryPermissi
AdministerRepositoryPermission) AdministerRepositoryPermission)
from data import model from data import model
from util.http import abort from util.http import abort
from util.saas.metricqueue import time_blueprint
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
v2_bp = Blueprint('v2', __name__) v2_bp = Blueprint('v2', __name__)
time_blueprint(v2_bp, metric_queue)
def _require_repo_permission(permission_class, allow_public=False): def _require_repo_permission(permission_class, allow_public=False):
def wrapper(func): def wrapper(func):

View file

@ -1,35 +1,46 @@
import logging import logging
import boto import boto
import time
from Queue import Queue from Queue import Empty
from threading import Thread from threading import Thread
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def get_queue(app): MAX_BATCH_METRICS = 100
"""
Returns a queue to a new CloudWatchSender.
"""
access_key = app.config['CLOUDWATCH_AWS_ACCESS_KEY']
secret_key = app.config['CLOUDWATCH_AWS_SECRET_KEY']
queue = Queue() # Sleep for this much time between failed send requests.
sender = CloudWatchSender(queue, access_key, secret_key) # This prevents hammering cloudwatch when it's not available.
FAILED_SEND_SLEEP_SECS = 5
def start_cloudwatch_sender(metrics, app):
"""
Starts sending from metrics to a new CloudWatchSender.
"""
try:
access_key = app.config['CLOUDWATCH_AWS_ACCESS_KEY']
secret_key = app.config['CLOUDWATCH_AWS_SECRET_KEY']
namespace = app.config['CLOUDWATCH_NAMESPACE']
except KeyError:
logger.debug('CloudWatch not configured')
return
sender = CloudWatchSender(metrics, access_key, secret_key, namespace)
sender.start() sender.start()
return queue
class CloudWatchSender(Thread): class CloudWatchSender(Thread):
""" """
CloudWatchSender loops indefinitely and pulls metrics off of a queue then sends it to CloudWatch. CloudWatchSender loops indefinitely and pulls metrics off of a queue then sends it to CloudWatch.
""" """
def __init__(self, request_queue, aws_access_key, aws_secret_key): def __init__(self, metrics, aws_access_key, aws_secret_key, namespace):
Thread.__init__(self) Thread.__init__(self)
self.daemon = True self.daemon = True
self._aws_access_key = aws_access_key self._aws_access_key = aws_access_key
self._aws_secret_key = aws_secret_key self._aws_secret_key = aws_secret_key
self._put_metrics_queue = request_queue self._metrics = metrics
self._namespace = namespace
def run(self): def run(self):
try: try:
@ -37,11 +48,46 @@ class CloudWatchSender(Thread):
connection = boto.connect_cloudwatch(self._aws_access_key, self._aws_secret_key) connection = boto.connect_cloudwatch(self._aws_access_key, self._aws_secret_key)
except: except:
logger.exception('Failed to connect to CloudWatch.') logger.exception('Failed to connect to CloudWatch.')
self._metrics.enable()
while True: while True:
put_metric_args, kwargs = self._put_metrics_queue.get() metrics = {
logger.debug('Got queued put metrics request.') 'name': [],
'value': [],
'unit': [],
'timestamp': [],
'dimensions': [],
}
metric = self._metrics.get()
append_metric(metrics, metric)
while len(metrics['name']) < MAX_BATCH_METRICS:
try:
metric = self._metrics.get_nowait()
append_metric(metrics, metric)
except Empty:
break
try: try:
connection.put_metric_data(*put_metric_args, **kwargs) connection.put_metric_data(self._namespace, **metrics)
logger.debug('Sent %d CloudWatch metrics', len(metrics['name']))
except: except:
logger.exception('Failed to write to CloudWatch') for i in range(len(metrics['name'])):
self._metrics.put(metrics['name'][i], metrics['value'][i],
unit=metrics['unit'][i],
dimensions=metrics['dimensions'][i],
timestamp=metrics['timestamp'][i],
)
logger.exception('Failed to write to CloudWatch: %s', metrics)
logger.debug('Attempted to requeue %d metrics.', len(metrics['name']))
time.sleep(FAILED_SEND_SLEEP_SECS)
def append_metric(metrics, m):
    """
    Unpack one (name, value, kwargs) metric tuple into the column-oriented
    batch dict that boto's put_metric_data expects. Missing optional fields
    (unit, dimensions, timestamp) are appended as None to keep columns aligned.
    """
    name, value, kwargs = m
    metrics['name'].append(name)
    metrics['value'].append(value)
    for field in ('unit', 'dimensions', 'timestamp'):
        metrics[field].append(kwargs.get(field))

69
util/saas/metricqueue.py Normal file
View file

@ -0,0 +1,69 @@
import datetime
import logging
import time
from functools import wraps
from Queue import Queue, Full
from flask import g, request
logger = logging.getLogger(__name__)
class MetricQueue(object):
    """
    Buffers (name, value, kwargs) metric tuples for a background sender thread.

    The queue starts disabled; metrics are dropped (with a debug log) until a
    configured sender calls enable().
    """
    def __init__(self):
        self._queue = None

    def enable(self, maxsize=10000):
        """ Begin accepting metrics, buffering at most `maxsize` entries. """
        self._queue = Queue(maxsize)

    def put(self, name, value, **kwargs):
        """
        Enqueue a metric, defaulting its timestamp to now and its dimensions to
        an empty dict. Metrics are dropped when the queue is disabled or full.
        """
        if self._queue is None:
            # Use the module-level logger (was logging.debug, which targets the
            # root logger and is inconsistent with the Full handler below).
            logger.debug('No metric queue: %s %s %s', name, value, kwargs)
            return

        try:
            kwargs.setdefault('timestamp', datetime.datetime.now())
            kwargs.setdefault('dimensions', {})
            self._queue.put_nowait((name, value, kwargs))
        except Full:
            logger.error('Metric queue full')

    def get(self):
        # Blocking read used by the sender thread's main loop.
        return self._queue.get()

    def get_nowait(self):
        # Non-blocking read; raises Queue.Empty when drained (used for batching).
        return self._queue.get_nowait()
def time_blueprint(bp, metric_queue):
    """
    Install request-timing hooks on a blueprint so every request handled by it
    reports response-time and response-code metrics to metric_queue.
    """
    timing_handler = time_after_request(bp.name, metric_queue)
    bp.before_request(time_before_request)
    bp.after_request(timing_handler)
def time_before_request():
    """ Stash the wall-clock start time of the current request on flask.g. """
    setattr(g, '_request_start_time', time.time())
def time_after_request(name, metric_queue):
    """
    Build a flask after_request handler that reports ResponseTime,
    ResponseCode, and (for non-2xx codes) Non200Response metrics under the
    given handler name. The response object is returned unchanged.
    """
    def handler(r):
        start = getattr(g, '_request_start_time', None)
        if start is None:
            # No matching before_request ran; nothing to measure.
            return r

        dims = {'endpoint': request.endpoint}
        metric_queue.put('ResponseTime', time.time() - start, dimensions=dims, unit='Seconds')
        metric_queue.put('ResponseCode', r.status_code, dimensions=dims)

        if not 200 <= r.status_code < 300:
            metric_queue.put('Non200Response', 1, dimensions={'name': name})

        return r
    return handler
def time_decorator(name, metric_queue):
    """
    Decorator factory: wrap a view function so each call is timed and reported
    through the same before/after handlers used for blueprint requests.
    """
    report = time_after_request(name, metric_queue)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            time_before_request()
            rv = func(*args, **kwargs)
            report(rv)
            return rv
        return wrapper
    return decorator

View file

@ -1,56 +0,0 @@
import logging
from util.saas.cloudwatch import get_queue
logger = logging.getLogger(__name__)
class NullReporter(object):
    """ Reporter that silently ignores all work-queue metrics. """

    def report(self, *args):
        # Deliberately do nothing.
        pass
class CloudWatchReporter(object):
    """ CloudWatchReporter reports work queue metrics to CloudWatch """

    def __init__(self, request_queue, namespace, need_capacity_name, build_percent_name):
        self._namespace = namespace
        self._need_capacity_name = need_capacity_name
        self._build_percent_name = build_percent_name
        self._put_metrics_queue = request_queue

    def _send_to_queue(self, *args, **kwargs):
        # Hand the metric to the background sender as an (args, kwargs) pair.
        self._put_metrics_queue.put((args, kwargs))

    def report(self, currently_processing, running_count, total_count):
        """
        Queue a capacity-shortage count and a 0/100 percent-building gauge
        derived from the work queue's current counts.
        """
        logger.debug('Worker indicated %s running count and %s total count', running_count,
                     total_count)

        shortage = total_count - running_count
        self._send_to_queue(self._namespace, self._need_capacity_name, shortage,
                            unit='Count')

        percent_building = 100 if currently_processing else 0
        self._send_to_queue(self._namespace, self._build_percent_name, percent_building,
                            unit='Percent')
class QueueMetrics(object):
    """
    QueueMetrics initializes a reporter for recording metrics of work queues.
    Attribute access delegates to the configured reporter (NullReporter by
    default, CloudWatchReporter when QUEUE_METRICS_TYPE is 'CloudWatch').
    """

    def __init__(self, app=None):
        self._app = app
        self._reporter = NullReporter()

        use_cloudwatch = (app is not None and
                          app.config.get('QUEUE_METRICS_TYPE', 'Null') == 'CloudWatch')
        if use_cloudwatch:
            self._reporter = CloudWatchReporter(
                get_queue(app),
                app.config['QUEUE_METRICS_NAMESPACE'],
                app.config['QUEUE_METRICS_CAPACITY_SHORTAGE_NAME'],
                app.config['QUEUE_METRICS_BUILD_PERCENT_NAME'],
            )

    def __getattr__(self, name):
        # Delegate to the active reporter; missing attributes resolve to None
        # rather than raising (null-object pattern).
        return getattr(self._reporter, name, None)