Add support to the build system for tracking if/when the build manager crashes and make sure builds are restarted within a few minutes

This commit is contained in:
Joseph Schorr 2014-11-21 14:27:06 -05:00
parent 25b5062bb6
commit b8e873b00b
13 changed files with 73 additions and 15 deletions

View file

@ -258,7 +258,8 @@ class BuildComponent(BaseComponent):
worker_error = WorkerError(aex.error, aex.kwargs.get('base_error')) worker_error = WorkerError(aex.error, aex.kwargs.get('base_error'))
# Write the error to the log. # Write the error to the log.
self._build_status.set_error(worker_error.public_message(), worker_error.extra_data()) self._build_status.set_error(worker_error.public_message(), worker_error.extra_data(),
internal_error=worker_error.is_internal_error())
# Mark the build as completed. # Mark the build as completed.
if worker_error.is_internal_error(): if worker_error.is_internal_error():
@ -326,6 +327,12 @@ class BuildComponent(BaseComponent):
with build_status as status_dict: with build_status as status_dict:
status_dict['heartbeat'] = int(time.time()) status_dict['heartbeat'] = int(time.time())
# Mark the build item.
current_job = self._current_job
if current_job is not None:
self.parent_manager.job_heartbeat(current_job)
# Check the heartbeat from the worker. # Check the heartbeat from the worker.
LOGGER.debug('Checking heartbeat on realm %s', self.builder_realm) LOGGER.debug('Checking heartbeat on realm %s', self.builder_realm)
if not self._last_heartbeat: if not self._last_heartbeat:
@ -348,7 +355,7 @@ class BuildComponent(BaseComponent):
# manager. # manager.
if self._current_job is not None: if self._current_job is not None:
if timed_out: if timed_out:
self._build_status.set_error('Build worker timed out. Build has been requeued.') self._build_status.set_error('Build worker timed out', internal_error=True)
self.parent_manager.job_completed(self._current_job, BuildJobResult.INCOMPLETE, self) self.parent_manager.job_completed(self._current_job, BuildJobResult.INCOMPLETE, self)
self._build_status = None self._build_status = None

View file

@ -28,8 +28,11 @@ class StatusHandler(object):
def set_command(self, command, extra_data=None): def set_command(self, command, extra_data=None):
self._append_log_message(command, self._build_logs.COMMAND, extra_data) self._append_log_message(command, self._build_logs.COMMAND, extra_data)
def set_error(self, error_message, extra_data=None): def set_error(self, error_message, extra_data=None, internal_error=False):
self.set_phase(BUILD_PHASE.ERROR) self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error else BUILD_PHASE.ERROR)
extra_data = extra_data or {}
extra_data['internal_error'] = internal_error
self._append_log_message(error_message, self._build_logs.ERROR, extra_data) self._append_log_message(error_message, self._build_logs.ERROR, extra_data)
def set_phase(self, phase, extra_data=None): def set_phase(self, phase, extra_data=None):

View file

@ -1,10 +1,23 @@
class BaseManager(object): class BaseManager(object):
""" Base for all worker managers. """ """ Base for all worker managers. """
def __init__(self, register_component, unregister_component, job_complete_callback): def __init__(self, register_component, unregister_component, job_heartbeat_callback,
job_complete_callback):
self.register_component = register_component self.register_component = register_component
self.unregister_component = unregister_component self.unregister_component = unregister_component
self.job_heartbeat_callback = job_heartbeat_callback
self.job_complete_callback = job_complete_callback self.job_complete_callback = job_complete_callback
def job_heartbeat(self, build_job):
""" Method invoked to tell the manager that a job is still running. This method will be called
every few minutes. """
self.job_heartbeat_callback(build_job)
def setup_time(self):
""" Returns the number of seconds that the build system should wait before allowing the job
to be picked up again after called 'schedule'.
"""
raise NotImplementedError
def shutdown(self): def shutdown(self):
""" Indicates that the build controller server is in a shutdown state and that no new jobs """ Indicates that the build controller server is in a shutdown state and that no new jobs
or workers should be performed. Existing workers should be cleaned up once their jobs or workers should be performed. Existing workers should be cleaned up once their jobs

View file

@ -36,6 +36,11 @@ class EnterpriseManager(BaseManager):
# production, build workers in enterprise are long-lived and register dynamically. # production, build workers in enterprise are long-lived and register dynamically.
self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent) self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
def setup_time(self):
# Builders are already registered, so the setup time should be essentially instant. We therefore
# only return a minute here.
return 60
def add_build_component(self): def add_build_component(self):
""" Adds a new build component for an Enterprise Registry. """ """ Adds a new build component for an Enterprise Registry. """
# Generate a new unique realm ID for the build worker. # Generate a new unique realm ID for the build worker.

View file

@ -9,14 +9,17 @@ from aiowsgi import create_server as create_wsgi_server
from flask import Flask from flask import Flask
from threading import Event from threading import Event
from trollius.coroutines import From from trollius.coroutines import From
from datetime import datetime, timedelta
from buildman.buildjob import BuildJob, BuildJobLoadException from buildman.buildjob import BuildJob, BuildJobLoadException
from data.queue import WorkQueue
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
WORK_CHECK_TIMEOUT = 10 WORK_CHECK_TIMEOUT = 10
TIMEOUT_PERIOD_MINUTES = 20 TIMEOUT_PERIOD_MINUTES = 20
RESERVATION_SECONDS = (TIMEOUT_PERIOD_MINUTES + 5) * 60 JOB_TIMEOUT_SECONDS = 300
MINIMUM_JOB_EXTENSION = timedelta(minutes=2)
class BuildJobResult(object): class BuildJobResult(object):
""" Build job result enum """ """ Build job result enum """
@ -42,6 +45,7 @@ class BuilderServer(object):
self._lifecycle_manager = lifecycle_manager_klass( self._lifecycle_manager = lifecycle_manager_klass(
self._register_component, self._register_component,
self._unregister_component, self._unregister_component,
self._job_heartbeat,
self._job_complete self._job_complete
) )
@ -107,6 +111,10 @@ class BuilderServer(object):
self._current_components.remove(component) self._current_components.remove(component)
self._session_factory.remove(component) self._session_factory.remove(component)
def _job_heartbeat(self, build_job):
WorkQueue.extend_processing(build_job.job_item(), seconds_from_now=JOB_TIMEOUT_SECONDS,
retry_count=1, minimum_extension=MINIMUM_JOB_EXTENSION)
def _job_complete(self, build_job, job_status): def _job_complete(self, build_job, job_status):
if job_status == BuildJobResult.INCOMPLETE: if job_status == BuildJobResult.INCOMPLETE:
self._queue.incomplete(build_job.job_item(), restore_retry=True, retry_after=30) self._queue.incomplete(build_job.job_item(), restore_retry=True, retry_after=30)
@ -126,7 +134,7 @@ class BuilderServer(object):
def _work_checker(self): def _work_checker(self):
while self._current_status == 'running': while self._current_status == 'running':
LOGGER.debug('Checking for more work') LOGGER.debug('Checking for more work')
job_item = self._queue.get(processing_time=RESERVATION_SECONDS) job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time())
if job_item is None: if job_item is None:
LOGGER.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT) LOGGER.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
yield From(trollius.sleep(WORK_CHECK_TIMEOUT)) yield From(trollius.sleep(WORK_CHECK_TIMEOUT))

View file

@ -73,9 +73,6 @@ class WorkerError(object):
if handler.get('show_base_error', False) and self._base_message: if handler.get('show_base_error', False) and self._base_message:
message = message + ': ' + self._base_message message = message + ': ' + self._base_message
if handler.get('is_internal', False):
message = message + '\nThe build will be retried shortly'
return message return message
def extra_data(self): def extra_data(self):

View file

@ -370,6 +370,7 @@ class RepositoryTag(BaseModel):
class BUILD_PHASE(object): class BUILD_PHASE(object):
""" Build phases enum """ """ Build phases enum """
ERROR = 'error' ERROR = 'error'
INTERNAL_ERROR = 'internalerror'
UNPACKING = 'unpacking' UNPACKING = 'unpacking'
PULLING = 'pulling' PULLING = 'pulling'
BUILDING = 'building' BUILDING = 'building'

View file

@ -120,10 +120,14 @@ class WorkQueue(object):
self._currently_processing = False self._currently_processing = False
@staticmethod @staticmethod
def extend_processing(queue_item, seconds_from_now): def extend_processing(queue_item, seconds_from_now, retry_count=None,
minimum_extension=MINIMUM_EXTENSION):
new_expiration = datetime.utcnow() + timedelta(seconds=seconds_from_now) new_expiration = datetime.utcnow() + timedelta(seconds=seconds_from_now)
# Only actually write the new expiration to the db if it moves the expiration some minimum # Only actually write the new expiration to the db if it moves the expiration some minimum
if new_expiration - queue_item.processing_expires > MINIMUM_EXTENSION: if new_expiration - queue_item.processing_expires > minimum_extension:
if retry_count is not None:
queue_item.retries_remaining = retry_count
queue_item.processing_expires = new_expiration queue_item.processing_expires = new_expiration
queue_item.save() queue_item.save()

View file

@ -1,5 +1,7 @@
import logging import logging
import json import json
import time
import datetime
from flask import request, redirect from flask import request, redirect
@ -9,7 +11,7 @@ from endpoints.api import (RepositoryParamResource, parse_args, query_param, nic
ApiResource, internal_only, format_date, api, Unauthorized, NotFound) ApiResource, internal_only, format_date, api, Unauthorized, NotFound)
from endpoints.common import start_build from endpoints.common import start_build
from endpoints.trigger import BuildTrigger from endpoints.trigger import BuildTrigger
from data import model from data import model, database
from auth.auth_context import get_authenticated_user from auth.auth_context import get_authenticated_user
from auth.permissions import ModifyRepositoryPermission, AdministerOrganizationPermission from auth.permissions import ModifyRepositoryPermission, AdministerOrganizationPermission
from data.buildlogs import BuildStatusRetrievalError from data.buildlogs import BuildStatusRetrievalError
@ -65,6 +67,13 @@ def build_status_view(build_obj, can_write=False):
status = {} status = {}
phase = 'cannot_load' phase = 'cannot_load'
# If the status contains a heartbeat, then check to see if has been written in the last few
# minutes. If not, then the build timed out.
if status is not None and 'heartbeat' in status and status['heartbeat']:
heartbeat = datetime.datetime.fromtimestamp(status['heartbeat'])
if datetime.datetime.now() - heartbeat > datetime.timedelta(minutes=1):
phase = database.BUILD_PHASE.INTERNAL_ERROR
logger.debug('Can write: %s job_config: %s', can_write, build_obj.job_config) logger.debug('Can write: %s job_config: %s', can_write, build_obj.job_config)
resp = { resp = {
'id': build_obj.uuid, 'id': build_obj.uuid,

View file

@ -864,6 +864,10 @@ i.toggle-icon:hover {
background-color: red; background-color: red;
} }
.phase-icon.internalerror {
background-color: #DFFF00;
}
.phase-icon.waiting, .phase-icon.unpacking, .phase-icon.starting, .phase-icon.initializing { .phase-icon.waiting, .phase-icon.unpacking, .phase-icon.starting, .phase-icon.initializing {
background-color: #ddd; background-color: #ddd;
} }

View file

@ -9,7 +9,6 @@
</span> </span>
</span> </span>
<div class="alert alert-danger" ng-if="error.message == 'HTTP code: 403' && getLocalPullInfo().isLocal"> <div class="alert alert-danger" ng-if="error.message == 'HTTP code: 403' && getLocalPullInfo().isLocal">
<div ng-if="getLocalPullInfo().login"> <div ng-if="getLocalPullInfo().login">
Note: The credentials <b>{{ getLocalPullInfo().login.username }}</b> for registry <b>{{ getLocalPullInfo().login.registry }}</b> cannot Note: The credentials <b>{{ getLocalPullInfo().login.username }}</b> for registry <b>{{ getLocalPullInfo().login.registry }}</b> cannot

View file

@ -4746,6 +4746,11 @@ quayApp.directive('buildLogError', function () {
'entries': '=entries' 'entries': '=entries'
}, },
controller: function($scope, $element, Config) { controller: function($scope, $element, Config) {
$scope.isInternalError = function() {
var entry = $scope.entries[$scope.entries.length - 1];
return entry && entry.data && entry.data['internal_error'];
};
$scope.getLocalPullInfo = function() { $scope.getLocalPullInfo = function() {
if ($scope.entries.__localpull !== undefined) { if ($scope.entries.__localpull !== undefined) {
return $scope.entries.__localpull; return $scope.entries.__localpull;
@ -5690,6 +5695,9 @@ quayApp.directive('buildMessage', function () {
case 'error': case 'error':
return 'Dockerfile build failed'; return 'Dockerfile build failed';
case 'internalerror':
return 'A system error occurred while building - Will be retried shortly';
} }
}; };
} }