Add support to the build system for tracking if/when the build manager crashes and make sure builds are restarted within a few minutes
This commit is contained in:
parent
25b5062bb6
commit
b8e873b00b
13 changed files with 73 additions and 15 deletions
|
@ -9,14 +9,17 @@ from aiowsgi import create_server as create_wsgi_server
|
|||
from flask import Flask
|
||||
from threading import Event
|
||||
from trollius.coroutines import From
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from buildman.buildjob import BuildJob, BuildJobLoadException
|
||||
from data.queue import WorkQueue
|
||||
|
||||
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
# Seconds the work checker sleeps when the queue returns no job item.
WORK_CHECK_TIMEOUT = 10
|
||||
# Minutes; in the visible code this is only used to derive RESERVATION_SECONDS.
TIMEOUT_PERIOD_MINUTES = 20
|
||||
# (20 + 5) * 60 = 1500 seconds; passed as processing_time to self._queue.get()
# in one of the two _work_checker calls shown on this page.
# NOTE(review): this page is a diff without +/- markers; that call looks like
# the side replaced by self._lifecycle_manager.setup_time() - confirm whether
# this constant is still used.
RESERVATION_SECONDS = (TIMEOUT_PERIOD_MINUTES + 5) * 60
|
||||
# Passed as seconds_from_now to WorkQueue.extend_processing() in _job_heartbeat.
JOB_TIMEOUT_SECONDS = 300
|
||||
# Passed as minimum_extension to WorkQueue.extend_processing() in _job_heartbeat.
MINIMUM_JOB_EXTENSION = timedelta(minutes=2)
|
||||
|
||||
class BuildJobResult(object):
|
||||
""" Build job result enum """
|
||||
|
@ -42,6 +45,7 @@ class BuilderServer(object):
|
|||
self._lifecycle_manager = lifecycle_manager_klass(
|
||||
self._register_component,
|
||||
self._unregister_component,
|
||||
self._job_heartbeat,
|
||||
self._job_complete
|
||||
)
|
||||
|
||||
|
@ -107,6 +111,10 @@ class BuilderServer(object):
|
|||
self._current_components.remove(component)
|
||||
self._session_factory.remove(component)
|
||||
|
||||
def _job_heartbeat(self, build_job):
    """ Extend the queue processing window for the given build job.

    This method is handed to the lifecycle manager as a callback when the
    server is constructed. It asks the work queue to extend processing of the
    job's queue item by JOB_TIMEOUT_SECONDS from now, with retry_count=1 and
    MINIMUM_JOB_EXTENSION as the minimum extension.
    """
    extend_processing = WorkQueue.extend_processing
    extend_processing(
        build_job.job_item(),
        seconds_from_now=JOB_TIMEOUT_SECONDS,
        retry_count=1,
        minimum_extension=MINIMUM_JOB_EXTENSION
    )
|
||||
|
||||
def _job_complete(self, build_job, job_status):
|
||||
if job_status == BuildJobResult.INCOMPLETE:
|
||||
self._queue.incomplete(build_job.job_item(), restore_retry=True, retry_after=30)
|
||||
|
@ -126,7 +134,7 @@ class BuilderServer(object):
|
|||
def _work_checker(self):
|
||||
while self._current_status == 'running':
|
||||
LOGGER.debug('Checking for more work')
|
||||
job_item = self._queue.get(processing_time=RESERVATION_SECONDS)
|
||||
job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time())
|
||||
if job_item is None:
|
||||
LOGGER.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
|
||||
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
|
||||
|
|
Reference in a new issue