Add support to the build system for tracking if/when the build manager crashes and make sure builds are restarted within a few minutes

This commit is contained in:
Joseph Schorr 2014-11-21 14:27:06 -05:00
parent 25b5062bb6
commit b8e873b00b
13 changed files with 73 additions and 15 deletions

View file

@ -258,7 +258,8 @@ class BuildComponent(BaseComponent):
worker_error = WorkerError(aex.error, aex.kwargs.get('base_error'))
# Write the error to the log.
self._build_status.set_error(worker_error.public_message(), worker_error.extra_data())
self._build_status.set_error(worker_error.public_message(), worker_error.extra_data(),
internal_error=worker_error.is_internal_error())
# Mark the build as completed.
if worker_error.is_internal_error():
@ -326,6 +327,12 @@ class BuildComponent(BaseComponent):
with build_status as status_dict:
status_dict['heartbeat'] = int(time.time())
# Mark the build item.
current_job = self._current_job
if current_job is not None:
self.parent_manager.job_heartbeat(current_job)
# Check the heartbeat from the worker.
LOGGER.debug('Checking heartbeat on realm %s', self.builder_realm)
if not self._last_heartbeat:
@ -348,7 +355,7 @@ class BuildComponent(BaseComponent):
# manager.
if self._current_job is not None:
if timed_out:
self._build_status.set_error('Build worker timed out. Build has been requeued.')
self._build_status.set_error('Build worker timed out', internal_error=True)
self.parent_manager.job_completed(self._current_job, BuildJobResult.INCOMPLETE, self)
self._build_status = None

View file

@ -28,8 +28,11 @@ class StatusHandler(object):
def set_command(self, command, extra_data=None):
self._append_log_message(command, self._build_logs.COMMAND, extra_data)
def set_error(self, error_message, extra_data=None):
self.set_phase(BUILD_PHASE.ERROR)
def set_error(self, error_message, extra_data=None, internal_error=False):
self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error else BUILD_PHASE.ERROR)
extra_data = extra_data or {}
extra_data['internal_error'] = internal_error
self._append_log_message(error_message, self._build_logs.ERROR, extra_data)
def set_phase(self, phase, extra_data=None):

View file

@ -1,10 +1,23 @@
class BaseManager(object):
""" Base for all worker managers. """
def __init__(self, register_component, unregister_component, job_complete_callback):
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
job_complete_callback):
self.register_component = register_component
self.unregister_component = unregister_component
self.job_heartbeat_callback = job_heartbeat_callback
self.job_complete_callback = job_complete_callback
def job_heartbeat(self, build_job):
""" Method invoked to tell the manager that a job is still running. This method will be called
every few minutes. """
self.job_heartbeat_callback(build_job)
def setup_time(self):
""" Returns the number of seconds that the build system should wait before allowing the job
to be picked up again after called 'schedule'.
"""
raise NotImplementedError
def shutdown(self):
""" Indicates that the build controller server is in a shutdown state and that no new jobs
or workers should be performed. Existing workers should be cleaned up once their jobs

View file

@ -36,6 +36,11 @@ class EnterpriseManager(BaseManager):
# production, build workers in enterprise are long-lived and register dynamically.
self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
def setup_time(self):
# Builders are already registered, so the setup time should be essentially instant. We therefore
# only return a minute here.
return 60
def add_build_component(self):
""" Adds a new build component for an Enterprise Registry. """
# Generate a new unique realm ID for the build worker.

View file

@ -9,14 +9,17 @@ from aiowsgi import create_server as create_wsgi_server
from flask import Flask
from threading import Event
from trollius.coroutines import From
from datetime import datetime, timedelta
from buildman.buildjob import BuildJob, BuildJobLoadException
from data.queue import WorkQueue
LOGGER = logging.getLogger(__name__)
WORK_CHECK_TIMEOUT = 10
TIMEOUT_PERIOD_MINUTES = 20
RESERVATION_SECONDS = (TIMEOUT_PERIOD_MINUTES + 5) * 60
JOB_TIMEOUT_SECONDS = 300
MINIMUM_JOB_EXTENSION = timedelta(minutes=2)
class BuildJobResult(object):
""" Build job result enum """
@ -42,6 +45,7 @@ class BuilderServer(object):
self._lifecycle_manager = lifecycle_manager_klass(
self._register_component,
self._unregister_component,
self._job_heartbeat,
self._job_complete
)
@ -107,6 +111,10 @@ class BuilderServer(object):
self._current_components.remove(component)
self._session_factory.remove(component)
def _job_heartbeat(self, build_job):
WorkQueue.extend_processing(build_job.job_item(), seconds_from_now=JOB_TIMEOUT_SECONDS,
retry_count=1, minimum_extension=MINIMUM_JOB_EXTENSION)
def _job_complete(self, build_job, job_status):
if job_status == BuildJobResult.INCOMPLETE:
self._queue.incomplete(build_job.job_item(), restore_retry=True, retry_after=30)
@ -126,7 +134,7 @@ class BuilderServer(object):
def _work_checker(self):
while self._current_status == 'running':
LOGGER.debug('Checking for more work')
job_item = self._queue.get(processing_time=RESERVATION_SECONDS)
job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time())
if job_item is None:
LOGGER.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))

View file

@ -73,9 +73,6 @@ class WorkerError(object):
if handler.get('show_base_error', False) and self._base_message:
message = message + ': ' + self._base_message
if handler.get('is_internal', False):
message = message + '\nThe build will be retried shortly'
return message
def extra_data(self):

View file

@ -370,6 +370,7 @@ class RepositoryTag(BaseModel):
class BUILD_PHASE(object):
""" Build phases enum """
ERROR = 'error'
INTERNAL_ERROR = 'internalerror'
UNPACKING = 'unpacking'
PULLING = 'pulling'
BUILDING = 'building'

View file

@ -120,10 +120,14 @@ class WorkQueue(object):
self._currently_processing = False
@staticmethod
def extend_processing(queue_item, seconds_from_now):
def extend_processing(queue_item, seconds_from_now, retry_count=None,
minimum_extension=MINIMUM_EXTENSION):
new_expiration = datetime.utcnow() + timedelta(seconds=seconds_from_now)
# Only actually write the new expiration to the db if it moves the expiration some minimum
if new_expiration - queue_item.processing_expires > MINIMUM_EXTENSION:
if new_expiration - queue_item.processing_expires > minimum_extension:
if retry_count is not None:
queue_item.retries_remaining = retry_count
queue_item.processing_expires = new_expiration
queue_item.save()

View file

@ -1,5 +1,7 @@
import logging
import json
import time
import datetime
from flask import request, redirect
@ -9,7 +11,7 @@ from endpoints.api import (RepositoryParamResource, parse_args, query_param, nic
ApiResource, internal_only, format_date, api, Unauthorized, NotFound)
from endpoints.common import start_build
from endpoints.trigger import BuildTrigger
from data import model
from data import model, database
from auth.auth_context import get_authenticated_user
from auth.permissions import ModifyRepositoryPermission, AdministerOrganizationPermission
from data.buildlogs import BuildStatusRetrievalError
@ -65,6 +67,13 @@ def build_status_view(build_obj, can_write=False):
status = {}
phase = 'cannot_load'
# If the status contains a heartbeat, then check to see if has been written in the last few
# minutes. If not, then the build timed out.
if status is not None and 'heartbeat' in status and status['heartbeat']:
heartbeat = datetime.datetime.fromtimestamp(status['heartbeat'])
if datetime.datetime.now() - heartbeat > datetime.timedelta(minutes=1):
phase = database.BUILD_PHASE.INTERNAL_ERROR
logger.debug('Can write: %s job_config: %s', can_write, build_obj.job_config)
resp = {
'id': build_obj.uuid,

View file

@ -864,6 +864,10 @@ i.toggle-icon:hover {
background-color: red;
}
.phase-icon.internalerror {
background-color: #DFFF00;
}
.phase-icon.waiting, .phase-icon.unpacking, .phase-icon.starting, .phase-icon.initializing {
background-color: #ddd;
}

View file

@ -9,7 +9,6 @@
</span>
</span>
<div class="alert alert-danger" ng-if="error.message == 'HTTP code: 403' && getLocalPullInfo().isLocal">
<div ng-if="getLocalPullInfo().login">
Note: The credentials <b>{{ getLocalPullInfo().login.username }}</b> for registry <b>{{ getLocalPullInfo().login.registry }}</b> cannot

View file

@ -4746,6 +4746,11 @@ quayApp.directive('buildLogError', function () {
'entries': '=entries'
},
controller: function($scope, $element, Config) {
$scope.isInternalError = function() {
var entry = $scope.entries[$scope.entries.length - 1];
return entry && entry.data && entry.data['internal_error'];
};
$scope.getLocalPullInfo = function() {
if ($scope.entries.__localpull !== undefined) {
return $scope.entries.__localpull;
@ -5690,6 +5695,9 @@ quayApp.directive('buildMessage', function () {
case 'error':
return 'Dockerfile build failed';
case 'internalerror':
return 'A system error occurred while building - Will be retried shortly';
}
};
}

View file

@ -1238,7 +1238,7 @@ function RepoBuildCtrl($scope, Restangular, ApiService, $routeParams, $rootScope
ApiService.getRepoBuildStatus(null, params, true).then(function(resp) {
if (build != $scope.currentBuild) { callback(false); return; }
// Note: We use extend here rather than replacing as Angular is depending on the
// root build object to remain the same object.
var matchingBuilds = $.grep($scope.builds, function(elem) {