commit 2fd1d5969e
10 changed files with 166 additions and 157 deletions

app.py (10 changed lines)

@@ -26,11 +26,11 @@ from util.saas.exceptionlog import Sentry
 from util.names import urn_generator
 from util.config.oauth import GoogleOAuthConfig, GithubOAuthConfig, GitLabOAuthConfig
 from util.security.signing import Signer
-from util.saas.queuemetrics import QueueMetrics
+from util.saas.cloudwatch import start_cloudwatch_sender
+from util.saas.metricqueue import MetricQueue
 from util.config.provider import FileConfigProvider, TestConfigProvider
 from util.config.configutil import generate_secret_key
 from util.config.superusermanager import SuperUserManager
-from buildman.jobutil.buildreporter import BuildMetrics

 OVERRIDE_CONFIG_DIRECTORY = 'conf/stack/'
 OVERRIDE_CONFIG_YAML_FILENAME = 'conf/stack/config.yaml'
@@ -124,8 +124,8 @@ authentication = UserAuthentication(app, OVERRIDE_CONFIG_DIRECTORY)
 userevents = UserEventsBuilderModule(app)
 superusers = SuperUserManager(app)
 signer = Signer(app, OVERRIDE_CONFIG_DIRECTORY)
-queue_metrics = QueueMetrics(app)
-build_metrics = BuildMetrics(app)
+metric_queue = MetricQueue()
+start_cloudwatch_sender(metric_queue, app)

 tf = app.config['DB_TRANSACTION_FACTORY']

@@ -137,7 +137,7 @@ oauth_apps = [github_login, github_trigger, gitlab_trigger, google_login]

 image_diff_queue = WorkQueue(app.config['DIFFS_QUEUE_NAME'], tf)
 dockerfile_build_queue = WorkQueue(app.config['DOCKERFILE_BUILD_QUEUE_NAME'], tf,
-                                   reporter=queue_metrics.report)
+                                   metric_queue=metric_queue)
 notification_queue = WorkQueue(app.config['NOTIFICATION_QUEUE_NAME'], tf)

 database.configure(app.config)
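
The two lines added to app.py above replace the per-service reporters with a single process-wide MetricQueue that a CloudWatch sender thread drains. The standalone sketch below is not part of the commit; it mirrors that wiring using a hypothetical stand-in for the Flask app and placeholder config values (the key names come from the CloudWatch sender changes later in this diff).

from util.saas.metricqueue import MetricQueue
from util.saas.cloudwatch import start_cloudwatch_sender

class FakeApp(object):
  # Hypothetical stand-in for the Flask app; the sender only reads .config.
  config = {
    'CLOUDWATCH_AWS_ACCESS_KEY': 'placeholder-access-key',
    'CLOUDWATCH_AWS_SECRET_KEY': 'placeholder-secret-key',
    'CLOUDWATCH_NAMESPACE': 'placeholder-namespace',
  }

metric_queue = MetricQueue()
start_cloudwatch_sender(metric_queue, FakeApp())

# Producers just put named values; until the sender thread enables the queue,
# put() only logs the metric at debug level and drops it.
metric_queue.put('ExampleMetric', 1, unit='Count')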

@@ -1,70 +0,0 @@
-from buildman.enums import BuildJobResult
-from util.saas.cloudwatch import get_queue
-
-
-class BuildReporter(object):
-  """
-  Base class for reporting build statuses to a metrics service.
-  """
-  def report_completion_status(self, status):
-    """
-    Method to invoke the recording of build's completion status to a metric service.
-    """
-    raise NotImplementedError
-
-
-class NullReporter(BuildReporter):
-  """
-  The /dev/null of BuildReporters.
-  """
-  def report_completion_status(self, *args):
-    pass
-
-
-class CloudWatchBuildReporter(BuildReporter):
-  """
-  Implements a BuildReporter for Amazon's CloudWatch.
-  """
-  def __init__(self, queue, namespace_name, completed_name, failed_name, incompleted_name):
-    self._queue = queue
-    self._namespace_name = namespace_name
-    self._completed_name = completed_name
-    self._failed_name = failed_name
-    self._incompleted_name = incompleted_name
-
-  def _send_to_queue(self, *args, **kwargs):
-    self._queue.put((args, kwargs))
-
-  def report_completion_status(self, status):
-    if status == BuildJobResult.COMPLETE:
-      status_name = self._completed_name
-    elif status == BuildJobResult.ERROR:
-      status_name = self._failed_name
-    elif status == BuildJobResult.INCOMPLETE:
-      status_name = self._incompleted_name
-    else:
-      return
-
-    self._send_to_queue(self._namespace_name, status_name, 1, unit='Count')
-
-
-class BuildMetrics(object):
-  """
-  BuildMetrics initializes a reporter for recording the status of build completions.
-  """
-  def __init__(self, app=None):
-    self._app = app
-    self._reporter = NullReporter()
-    if app is not None:
-      reporter_type = app.config.get('BUILD_METRICS_TYPE', 'Null')
-      if reporter_type == 'CloudWatch':
-        namespace = app.config['BUILD_METRICS_NAMESPACE']
-        completed_name = app.config['BUILD_METRICS_COMPLETED_NAME']
-        failed_name = app.config['BUILD_METRICS_FAILED_NAME']
-        incompleted_name = app.config['BUILD_METRICS_INCOMPLETED_NAME']
-        request_queue = get_queue(app)
-        self._reporter = CloudWatchBuildReporter(request_queue, namespace, completed_name,
-                                                 failed_name, incompleted_name)
-
-  def __getattr__(self, name):
-    return getattr(self._reporter, name, None)

@@ -16,7 +16,7 @@ from buildman.enums import BuildJobResult, BuildServerStatus
 from buildman.jobutil.buildstatus import StatusHandler
 from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException
 from data import database
-from app import app, build_metrics
+from app import app, metric_queue

 logger = logging.getLogger(__name__)

@@ -151,7 +151,7 @@ class BuilderServer(object):
     if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
       self._shutdown_event.set()

-    build_metrics.report_completion_status(job_status)
+    report_completion_status(job_status)

   @trollius.coroutine
   def _work_checker(self):
@@ -225,3 +225,15 @@ class BuilderServer(object):

     # Initialize the work queue checker.
     yield From(self._work_checker())
+
+def report_completion_status(status):
+  if status == BuildJobResult.COMPLETE:
+    status_name = 'CompleteBuilds'
+  elif status == BuildJobResult.ERROR:
+    status_name = 'FailedBuilds'
+  elif status == BuildJobResult.INCOMPLETE:
+    status_name = 'IncompletedBuilds'
+  else:
+    return
+
+  metric_queue.put(status_name, 1, unit='Count')

@@ -15,9 +15,9 @@ class NoopWith:

 class WorkQueue(object):
   def __init__(self, queue_name, transaction_factory,
-               canonical_name_match_list=None, reporter=None):
+               canonical_name_match_list=None, metric_queue=None):
     self._queue_name = queue_name
-    self._reporter = reporter
+    self._metric_queue = metric_queue
     self._transaction_factory = transaction_factory
     self._currently_processing = False

@@ -75,12 +75,14 @@ class WorkQueue(object):
     return (running_count, available_not_running_count, available_count)

   def update_metrics(self):
-    if self._reporter is None:
+    if self._metric_queue is None:
       return

     (running_count, available_not_running_count, available_count) = self.get_metrics()
-    self._reporter(self._currently_processing, running_count,
-                   running_count + available_not_running_count)
+    self._metric_queue.put('BuildCapacityShortage', available_not_running_count, unit='Count')
+
+    building_percent = 100 if self._currently_processing else 0
+    self._metric_queue.put('PercentBuilding', building_percent, unit='Percent')

   def has_retries_remaining(self, item_id):
     """ Returns whether the queue item with the given id has any retries remaining. If the

@@ -1,7 +1,7 @@
 import logging
 import datetime

-from app import app
+from app import app, metric_queue
 from flask import Blueprint, request, make_response, jsonify, session
 from flask.ext.restful import Resource, abort, Api, reqparse
 from flask.ext.restful.utils.cors import crossdomain
@@ -20,6 +20,7 @@ from auth.auth_context import get_authenticated_user, get_validated_oauth_token
 from auth.auth import process_oauth
 from endpoints.csrf import csrf_protect
 from endpoints.decorators import check_anon_protection
+from util.saas.metricqueue import time_decorator


 logger = logging.getLogger(__name__)
@@ -28,7 +29,7 @@ api = Api()
 api.init_app(api_bp)
 api.decorators = [csrf_protect,
                   crossdomain(origin='*', headers=['Authorization', 'Content-Type']),
-                  process_oauth]
+                  process_oauth, time_decorator(api_bp.name, metric_queue)]


 class ApiException(Exception):
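
The API blueprint above now times every resource by appending time_decorator(api_bp.name, metric_queue) to api.decorators. As a rough illustration only (not from the commit), the sketch below applies the same decorator to a plain Flask view; the app, route, and metric name are hypothetical, and the view returns a full response object so the after-request helper can read its status code.

from flask import Flask, make_response

from util.saas.metricqueue import MetricQueue, time_decorator

metric_queue = MetricQueue()
metric_queue.enable()

app = Flask(__name__)

@app.route('/example')                 # hypothetical route for the demo
@time_decorator('api', metric_queue)   # same helper the API blueprint now uses
def example_view():
  return make_response('ok')

with app.test_client() as client:
  client.get('/example')

# The wrapped view queued a ResponseTime (Seconds) and a ResponseCode metric.
print(metric_queue.get_nowait())
print(metric_queue.get_nowait())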

@@ -1,10 +1,13 @@
 from flask import Blueprint, make_response

+from app import metric_queue
 from endpoints.decorators import anon_protect, anon_allowed
+from util.saas.metricqueue import time_blueprint


 v1_bp = Blueprint('v1', __name__)

+time_blueprint(v1_bp, metric_queue)

 # Note: This is *not* part of the Docker index spec. This is here for our own health check,
 # since we have nginx handle the _ping below.

@@ -7,6 +7,7 @@ from flask import Blueprint, make_response, url_for, request
 from functools import wraps
 from urlparse import urlparse

+from app import metric_queue
 from endpoints.decorators import anon_protect, anon_allowed
 from auth.jwt_auth import process_jwt_auth
 from auth.auth_context import get_grant_user_context
@@ -14,12 +15,13 @@ from auth.permissions import (ReadRepositoryPermission, ModifyRepositoryPermissi
                               AdministerRepositoryPermission)
 from data import model
 from util.http import abort
+from util.saas.metricqueue import time_blueprint


 logger = logging.getLogger(__name__)
 v2_bp = Blueprint('v2', __name__)

+time_blueprint(v2_bp, metric_queue)
+
 def _require_repo_permission(permission_class, allow_public=False):
   def wrapper(func):

@@ -1,35 +1,46 @@
 import logging
 import boto
+import time

 from Queue import Queue
+from Queue import Empty
 from threading import Thread


 logger = logging.getLogger(__name__)

-def get_queue(app):
-  """
-  Returns a queue to a new CloudWatchSender.
-  """
-  access_key = app.config['CLOUDWATCH_AWS_ACCESS_KEY']
-  secret_key = app.config['CLOUDWATCH_AWS_SECRET_KEY']
+MAX_BATCH_METRICS = 100

-  queue = Queue()
-  sender = CloudWatchSender(queue, access_key, secret_key)
+# Sleep for this much time between failed send requests.
+# This prevents hammering cloudwatch when it's not available.
+FAILED_SEND_SLEEP_SECS = 5
+
+def start_cloudwatch_sender(metrics, app):
+  """
+  Starts sending from metrics to a new CloudWatchSender.
+  """
+  try:
+    access_key = app.config['CLOUDWATCH_AWS_ACCESS_KEY']
+    secret_key = app.config['CLOUDWATCH_AWS_SECRET_KEY']
+    namespace = app.config['CLOUDWATCH_NAMESPACE']
+  except KeyError:
+    logger.debug('CloudWatch not configured')
+    return
+
+  sender = CloudWatchSender(metrics, access_key, secret_key, namespace)
   sender.start()
-  return queue

 class CloudWatchSender(Thread):
   """
   CloudWatchSender loops indefinitely and pulls metrics off of a queue then sends it to CloudWatch.
   """
-  def __init__(self, request_queue, aws_access_key, aws_secret_key):
+  def __init__(self, metrics, aws_access_key, aws_secret_key, namespace):
     Thread.__init__(self)
     self.daemon = True

     self._aws_access_key = aws_access_key
     self._aws_secret_key = aws_secret_key
-    self._put_metrics_queue = request_queue
+    self._metrics = metrics
+    self._namespace = namespace

   def run(self):
     try:
@@ -37,11 +48,46 @@ class CloudWatchSender(Thread):
       connection = boto.connect_cloudwatch(self._aws_access_key, self._aws_secret_key)
     except:
       logger.exception('Failed to connect to CloudWatch.')
+    self._metrics.enable()

     while True:
-      put_metric_args, kwargs = self._put_metrics_queue.get()
-      logger.debug('Got queued put metrics request.')
+      metrics = {
+        'name': [],
+        'value': [],
+        'unit': [],
+        'timestamp': [],
+        'dimensions': [],
+      }
+
+      metric = self._metrics.get()
+      append_metric(metrics, metric)
+
+      while len(metrics['name']) < MAX_BATCH_METRICS:
+        try:
+          metric = self._metrics.get_nowait()
+          append_metric(metrics, metric)
+        except Empty:
+          break
+
       try:
-        connection.put_metric_data(*put_metric_args, **kwargs)
+        connection.put_metric_data(self._namespace, **metrics)
+        logger.debug('Sent %d CloudWatch metrics', len(metrics['name']))
       except:
-        logger.exception('Failed to write to CloudWatch')
+        for i in range(len(metrics['name'])):
+          self._metrics.put(metrics['name'][i], metrics['value'][i],
+                            unit=metrics['unit'][i],
+                            dimensions=metrics['dimensions'][i],
+                            timestamp=metrics['timestamp'][i],
+                            )
+
+        logger.exception('Failed to write to CloudWatch: %s', metrics)
+        logger.debug('Attempted to requeue %d metrics.', len(metrics['name']))
+        time.sleep(FAILED_SEND_SLEEP_SECS)
+
+
+def append_metric(metrics, m):
+  name, value, kwargs = m
+  metrics['name'].append(name)
+  metrics['value'].append(value)
+  metrics['unit'].append(kwargs.get('unit'))
+  metrics['dimensions'].append(kwargs.get('dimensions'))
+  metrics['timestamp'].append(kwargs.get('timestamp'))
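
The rewritten sender batches up to MAX_BATCH_METRICS queued metrics per put_metric_data call: it blocks for the first metric, then drains whatever else is immediately available. Below is a standalone sketch of that drain pattern, standard library only and not taken from the commit.

from Queue import Queue, Empty

MAX_BATCH_METRICS = 100  # mirrors the constant introduced above

def drain_batch(q):
  # Block until at least one metric arrives, then pull more without blocking.
  batch = [q.get()]
  while len(batch) < MAX_BATCH_METRICS:
    try:
      batch.append(q.get_nowait())
    except Empty:
      break
  return batch

q = Queue()
for i in range(5):
  q.put(('ExampleMetric', i, {'unit': 'Count'}))
print(len(drain_batch(q)))  # 5: one blocking get plus four non-blocking gets

On a failed send, the real loop re-queues every metric in the batch and sleeps for FAILED_SEND_SLEEP_SECS before retrying.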

util/saas/metricqueue.py (new file, 69 lines)

@@ -0,0 +1,69 @@
+import datetime
+import logging
+import time
+
+from functools import wraps
+from Queue import Queue, Full
+
+from flask import g, request
+
+
+logger = logging.getLogger(__name__)
+
+class MetricQueue(object):
+  def __init__(self):
+    self._queue = None
+
+  def enable(self, maxsize=10000):
+    self._queue = Queue(maxsize)
+
+  def put(self, name, value, **kwargs):
+    if self._queue is None:
+      logging.debug('No metric queue: %s %s %s', name, value, kwargs)
+      return
+
+    try:
+      kwargs.setdefault('timestamp', datetime.datetime.now())
+      kwargs.setdefault('dimensions', {})
+      self._queue.put_nowait((name, value, kwargs))
+    except Full:
+      logger.error('Metric queue full')
+
+  def get(self):
+    return self._queue.get()
+
+  def get_nowait(self):
+    return self._queue.get_nowait()
+
+def time_blueprint(bp, metric_queue):
+  bp.before_request(time_before_request)
+  bp.after_request(time_after_request(bp.name, metric_queue))
+
+def time_before_request():
+  g._request_start_time = time.time()
+
+def time_after_request(name, metric_queue):
+  def f(r):
+    start = getattr(g, '_request_start_time', None)
+    if start is None:
+      return r
+    dur = time.time() - start
+    dims = {'endpoint': request.endpoint}
+    metric_queue.put('ResponseTime', dur, dimensions=dims, unit='Seconds')
+    metric_queue.put('ResponseCode', r.status_code, dimensions=dims)
+    if r.status_code < 200 or r.status_code >= 300:
+      metric_queue.put('Non200Response', 1, dimensions={'name': name})
+    return r
+  return f
+
+def time_decorator(name, metric_queue):
+  after = time_after_request(name, metric_queue)
+  def decorator(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+      time_before_request()
+      rv = func(*args, **kwargs)
+      after(rv)
+      return rv
+    return wrapper
+  return decorator
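
To show how the new helpers fit together, here is a small sketch (not part of the commit) that wires time_blueprint onto a throwaway Flask blueprint; the blueprint name and route are hypothetical.

from flask import Flask, Blueprint

from util.saas.metricqueue import MetricQueue, time_blueprint

metric_queue = MetricQueue()
metric_queue.enable()

demo_bp = Blueprint('demo', __name__)   # hypothetical blueprint
time_blueprint(demo_bp, metric_queue)   # installs the before/after request hooks

@demo_bp.route('/ping')
def ping():
  return 'pong'

app = Flask(__name__)
app.register_blueprint(demo_bp)

with app.test_client() as client:
  client.get('/ping')

# Each request queues ResponseTime (Seconds) and ResponseCode, plus Non200Response
# for anything outside 2xx.
print(metric_queue.get_nowait())
print(metric_queue.get_nowait())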

@@ -1,56 +0,0 @@
-import logging
-
-from util.saas.cloudwatch import get_queue
-
-
-logger = logging.getLogger(__name__)
-
-
-class NullReporter(object):
-  def report(self, *args):
-    pass
-
-
-class CloudWatchReporter(object):
-  """ CloudWatchReporter reports work queue metrics to CloudWatch """
-  def __init__(self, request_queue, namespace, need_capacity_name, build_percent_name):
-    self._namespace = namespace
-    self._need_capacity_name = need_capacity_name
-    self._build_percent_name = build_percent_name
-    self._put_metrics_queue = request_queue
-
-  def _send_to_queue(self, *args, **kwargs):
-    self._put_metrics_queue.put((args, kwargs))
-
-  def report(self, currently_processing, running_count, total_count):
-    logger.debug('Worker indicated %s running count and %s total count', running_count,
-                 total_count)
-
-    need_capacity_count = total_count - running_count
-    self._send_to_queue(self._namespace, self._need_capacity_name, need_capacity_count,
-                        unit='Count')
-
-    building_percent = 100 if currently_processing else 0
-    self._send_to_queue(self._namespace, self._build_percent_name, building_percent,
-                        unit='Percent')
-
-class QueueMetrics(object):
-  """
-  QueueMetrics initializes a reporter for recording metrics of work queues.
-  """
-  def __init__(self, app=None):
-    self._app = app
-    self._reporter = NullReporter()
-    if app is not None:
-      reporter_type = app.config.get('QUEUE_METRICS_TYPE', 'Null')
-      if reporter_type == 'CloudWatch':
-        namespace = app.config['QUEUE_METRICS_NAMESPACE']
-        req_capacity_name = app.config['QUEUE_METRICS_CAPACITY_SHORTAGE_NAME']
-        build_percent_name = app.config['QUEUE_METRICS_BUILD_PERCENT_NAME']
-
-        request_queue = get_queue(app)
-        self._reporter = CloudWatchReporter(request_queue, namespace, req_capacity_name,
-                                            build_percent_name)
-
-  def __getattr__(self, name):
-    return getattr(self._reporter, name, None)