Merge pull request #11 from coreos-inc/nimbus

CloudWatch for build job status
This commit is contained in:
Jimmy Zelinskie 2015-02-18 17:17:28 -05:00
commit 47f8cb77c4
8 changed files with 174 additions and 86 deletions

12
buildman/enums.py Normal file
View file

@ -0,0 +1,12 @@
class BuildJobResult(object):
    """Namespace of string constants naming the possible outcomes of a build job.

    The members are plain strings, so they compare equal to their literal
    values (for example ``BuildJobResult.ERROR == 'error'``).
    """

    # The job stopped without reaching a final outcome.
    INCOMPLETE = 'incomplete'

    # The job ran to completion.
    COMPLETE = 'complete'

    # The job ended in an error.
    ERROR = 'error'
class BuildServerStatus(object):
    """Namespace of string constants naming the lifecycle states of the build server.

    The members are plain strings, so they compare equal to their literal
    values.
    """

    # The server has been created but is not yet serving.
    STARTING = 'starting'

    # The server is up and doing work.
    RUNNING = 'running'

    # NOTE: the constant is named SHUTDOWN but its value is 'shutting_down'.
    SHUTDOWN = 'shutting_down'

View file

@ -0,0 +1,72 @@
from trollius import From
from buildman.enums import BuildJobResult
from util.cloudwatch import get_queue
class BuildReporter(object):
    """Abstract interface for publishing build outcomes to a metrics service.

    Concrete reporters override ``report_completion_status``; the version
    defined here always raises.
    """

    def report_completion_status(self, status):
        """Record the completion status of one build with the metrics service.

        Args:
            status: the outcome of the build being reported.

        Raises:
            NotImplementedError: always; subclasses supply the behaviour.
        """
        raise NotImplementedError()
class NullReporter(BuildReporter):
    """A BuildReporter that discards everything it is asked to report."""

    def report_completion_status(self, *args):
        """Accept any positional arguments and do nothing with them."""
        return None
class CloudWatchBuildReporter(BuildReporter):
    """BuildReporter that queues one count per finished build for Amazon CloudWatch.

    Nothing is sent directly from this class: each report is placed on the
    queue object handed to the constructor.
    """

    def __init__(self, queue, namespace_name, completed_name, failed_name, incompleted_name):
        """Store the destination queue and the metric names to report under.

        Args:
            queue: object with a ``put(item)`` method that receives each report.
            namespace_name: first positional value of every queued report.
            completed_name: metric name used for ``BuildJobResult.COMPLETE``.
            failed_name: metric name used for ``BuildJobResult.ERROR``.
            incompleted_name: metric name used for ``BuildJobResult.INCOMPLETE``.
        """
        self._queue = queue
        self._namespace_name = namespace_name
        self._completed_name = completed_name
        self._failed_name = failed_name
        self._incompleted_name = incompleted_name

    def _send_to_queue(self, *args, **kwargs):
        """Put the call's arguments on the queue as a single ``(args, kwargs)`` tuple."""
        request = (args, kwargs)
        self._queue.put(request)

    def report_completion_status(self, status):
        """Queue a count of 1 under the metric name that matches ``status``.

        Statuses other than the three ``BuildJobResult`` values are ignored.
        """
        # Checked in this order with ==, first match wins.
        metric_names = (
            (BuildJobResult.COMPLETE, self._completed_name),
            (BuildJobResult.ERROR, self._failed_name),
            (BuildJobResult.INCOMPLETE, self._incompleted_name),
        )
        for known_status, metric_name in metric_names:
            if status == known_status:
                self._send_to_queue(self._namespace_name, metric_name, 1, unit='Count')
                return
class BuildMetrics(object):
    """Select a build reporter from app configuration and proxy calls to it.

    With no app, or with any ``BUILD_METRICS_TYPE`` other than
    ``'CloudWatch'``, a ``NullReporter`` is used. Attributes that are not
    found on this object are looked up on the chosen reporter.
    """

    def __init__(self, app=None):
        """Build the reporter described by ``app.config``.

        Args:
            app: object exposing a dict-like ``config``, or None to keep the
                default ``NullReporter``.

        Raises:
            KeyError: if the type is ``'CloudWatch'`` and one of the
                ``BUILD_METRICS_*`` name settings is missing.
        """
        self._app = app
        self._reporter = NullReporter()
        if app is None:
            return

        reporter_type = app.config.get('BUILD_METRICS_TYPE', 'Null')
        if reporter_type == 'CloudWatch':
            namespace = app.config['BUILD_METRICS_NAMESPACE']
            completed_name = app.config['BUILD_METRICS_COMPLETED_NAME']
            failed_name = app.config['BUILD_METRICS_FAILED_NAME']
            incompleted_name = app.config['BUILD_METRICS_INCOMPLETED_NAME']

            request_queue = get_queue(app)
            self._reporter = CloudWatchBuildReporter(request_queue, namespace, completed_name,
                                                     failed_name, incompleted_name)

    def __getattr__(self, name):
        """Delegate a failed attribute lookup to the underlying reporter.

        Returns:
            The reporter's attribute, or None if the reporter lacks it.

        Raises:
            AttributeError: for dunder names, and for any name while
                ``_reporter`` has not been set on this instance.
        """
        # Special-method probes (hasattr(obj, '__setstate__') and friends) must
        # see a missing attribute, not a None stand-in that looks callable.
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)

        # Read from the instance dict directly: going through self._reporter
        # would re-enter __getattr__ and recurse forever on an instance that
        # was created without running __init__ (e.g. by copy or pickle).
        try:
            reporter = self.__dict__['_reporter']
        except KeyError:
            raise AttributeError(name)

        return getattr(reporter, name, None)

View file

@ -9,14 +9,15 @@ from autobahn.wamp import types
from aiowsgi import create_server as create_wsgi_server
from flask import Flask
from threading import Event
from trollius.tasks import Task
from trollius.coroutines import From
from datetime import timedelta
from buildman.enums import BuildJobResult, BuildServerStatus
from buildman.jobutil.buildstatus import StatusHandler
from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException
from data import database
from data.queue import WorkQueue
from app import app
from app import app, build_metrics
logger = logging.getLogger(__name__)
@ -27,12 +28,6 @@ MINIMUM_JOB_EXTENSION = timedelta(minutes=2)
HEARTBEAT_PERIOD_SEC = 30
class BuildJobResult(object):
    """Namespace of string constants naming the possible outcomes of a build job.

    The members are plain strings, so they compare equal to their literal
    values.
    """

    # The job stopped without reaching a final outcome.
    INCOMPLETE = 'incomplete'

    # The job ran to completion.
    COMPLETE = 'complete'

    # The job ended in an error.
    ERROR = 'error'
class BuilderServer(object):
""" Server which handles both HTTP and WAMP requests, managing the full state of the build
controller.
@ -40,7 +35,7 @@ class BuilderServer(object):
def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass,
lifecycle_manager_config, manager_hostname):
self._loop = None
self._current_status = 'starting'
self._current_status = BuildServerStatus.STARTING
self._current_components = []
self._job_count = 0
@ -60,7 +55,7 @@ class BuilderServer(object):
self._lifecycle_manager_config = lifecycle_manager_config
self._shutdown_event = Event()
self._current_status = 'running'
self._current_status = BuildServerStatus.RUNNING
self._register_controller()
@ -97,8 +92,14 @@ class BuilderServer(object):
logger.debug('Starting server on port %s, with controller on port %s', websocket_port,
controller_port)
TASKS = [
Task(self._initialize(loop, host, websocket_port, controller_port, ssl)),
Task(self._queue_metrics_updater()),
]
try:
loop.run_until_complete(self._initialize(loop, host, websocket_port, controller_port, ssl))
loop.run_until_complete(trollius.wait(TASKS))
except KeyboardInterrupt:
pass
finally:
@ -106,7 +107,7 @@ class BuilderServer(object):
def close(self):
logger.debug('Requested server shutdown')
self._current_status = 'shutting_down'
self._current_status = BuildServerStatus.SHUTDOWN
self._lifecycle_manager.shutdown()
self._shutdown_event.wait()
logger.debug('Shutting down server')
@ -147,12 +148,14 @@ class BuilderServer(object):
self._job_count = self._job_count - 1
if self._current_status == 'shutting_down' and not self._job_count:
if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
self._shutdown_event.set()
build_metrics.report_completion_status(job_status)
@trollius.coroutine
def _work_checker(self):
while self._current_status == 'running':
while self._current_status == BuildServerStatus.RUNNING:
with database.CloseForLongOperation(app.config):
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
@ -183,7 +186,11 @@ class BuilderServer(object):
logger.debug('All workers are busy. Requeuing.')
self._queue.incomplete(job_item, restore_retry=True, retry_after=0)
@trollius.coroutine
def _queue_metrics_updater(self):
    """Coroutine that refreshes the queue's metrics while the server is running.

    Each pass sleeps for 30 seconds and then calls ``update_metrics()`` on
    the queue. The loop ends once the server status is no longer RUNNING;
    the status is only re-checked after each update.
    """
    while True:
        if self._current_status != BuildServerStatus.RUNNING:
            break
        # Sleep first, so the first update happens 30 seconds after start.
        yield From(trollius.sleep(30))
        self._queue.update_metrics()
@trollius.coroutine
def _initialize(self, loop, host, websocket_port, controller_port, ssl=None):