Add support to the build system for tracking if/when the build manager crashes and make sure builds are restarted within a few minutes

This commit is contained in:
Joseph Schorr 2014-11-21 14:27:06 -05:00
parent 25b5062bb6
commit b8e873b00b
13 changed files with 73 additions and 15 deletions

View file

@ -9,14 +9,17 @@ from aiowsgi import create_server as create_wsgi_server
from flask import Flask
from threading import Event
from trollius.coroutines import From
from datetime import datetime, timedelta
from buildman.buildjob import BuildJob, BuildJobLoadException
from data.queue import WorkQueue
LOGGER = logging.getLogger(__name__)
WORK_CHECK_TIMEOUT = 10
TIMEOUT_PERIOD_MINUTES = 20
RESERVATION_SECONDS = (TIMEOUT_PERIOD_MINUTES + 5) * 60
JOB_TIMEOUT_SECONDS = 300
MINIMUM_JOB_EXTENSION = timedelta(minutes=2)
class BuildJobResult(object):
""" Build job result enum """
@ -42,6 +45,7 @@ class BuilderServer(object):
self._lifecycle_manager = lifecycle_manager_klass(
self._register_component,
self._unregister_component,
self._job_heartbeat,
self._job_complete
)
@ -107,6 +111,10 @@ class BuilderServer(object):
self._current_components.remove(component)
self._session_factory.remove(component)
def _job_heartbeat(self, build_job):
WorkQueue.extend_processing(build_job.job_item(), seconds_from_now=JOB_TIMEOUT_SECONDS,
retry_count=1, minimum_extension=MINIMUM_JOB_EXTENSION)
def _job_complete(self, build_job, job_status):
if job_status == BuildJobResult.INCOMPLETE:
self._queue.incomplete(build_job.job_item(), restore_retry=True, retry_after=30)
@ -126,7 +134,7 @@ class BuilderServer(object):
def _work_checker(self):
while self._current_status == 'running':
LOGGER.debug('Checking for more work')
job_item = self._queue.get(processing_time=RESERVATION_SECONDS)
job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time())
if job_item is None:
LOGGER.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))