diff --git a/buildman/buildcomponent.py b/buildman/buildcomponent.py index f47de4022..9b29ad5b6 100644 --- a/buildman/buildcomponent.py +++ b/buildman/buildcomponent.py @@ -258,7 +258,8 @@ class BuildComponent(BaseComponent): worker_error = WorkerError(aex.error, aex.kwargs.get('base_error')) # Write the error to the log. - self._build_status.set_error(worker_error.public_message(), worker_error.extra_data()) + self._build_status.set_error(worker_error.public_message(), worker_error.extra_data(), + internal_error=worker_error.is_internal_error()) # Mark the build as completed. if worker_error.is_internal_error(): @@ -326,6 +327,12 @@ class BuildComponent(BaseComponent): with build_status as status_dict: status_dict['heartbeat'] = int(time.time()) + + # Mark the build item. + current_job = self._current_job + if current_job is not None: + self.parent_manager.job_heartbeat(current_job) + # Check the heartbeat from the worker. LOGGER.debug('Checking heartbeat on realm %s', self.builder_realm) if not self._last_heartbeat: @@ -348,7 +355,7 @@ class BuildComponent(BaseComponent): # manager. if self._current_job is not None: if timed_out: - self._build_status.set_error('Build worker timed out. 
Build has been requeued.') + self._build_status.set_error('Build worker timed out', internal_error=True) self.parent_manager.job_completed(self._current_job, BuildJobResult.INCOMPLETE, self) self._build_status = None diff --git a/buildman/buildstatus.py b/buildman/buildstatus.py index 0ce509343..68b8cd5e3 100644 --- a/buildman/buildstatus.py +++ b/buildman/buildstatus.py @@ -28,8 +28,11 @@ class StatusHandler(object): def set_command(self, command, extra_data=None): self._append_log_message(command, self._build_logs.COMMAND, extra_data) - def set_error(self, error_message, extra_data=None): - self.set_phase(BUILD_PHASE.ERROR) + def set_error(self, error_message, extra_data=None, internal_error=False): + self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error else BUILD_PHASE.ERROR) + + extra_data = extra_data or {} + extra_data['internal_error'] = internal_error self._append_log_message(error_message, self._build_logs.ERROR, extra_data) def set_phase(self, phase, extra_data=None): diff --git a/buildman/manager/basemanager.py b/buildman/manager/basemanager.py index 1de6e24df..f66054c45 100644 --- a/buildman/manager/basemanager.py +++ b/buildman/manager/basemanager.py @@ -1,10 +1,23 @@ class BaseManager(object): """ Base for all worker managers. """ - def __init__(self, register_component, unregister_component, job_complete_callback): + def __init__(self, register_component, unregister_component, job_heartbeat_callback, + job_complete_callback): self.register_component = register_component self.unregister_component = unregister_component + self.job_heartbeat_callback = job_heartbeat_callback self.job_complete_callback = job_complete_callback + def job_heartbeat(self, build_job): + """ Method invoked to tell the manager that a job is still running. This method will be called + every few minutes. 
""" + self.job_heartbeat_callback(build_job) + + def setup_time(self): + """ Returns the number of seconds that the build system should wait before allowing the job + to be picked up again after called 'schedule'. + """ + raise NotImplementedError + def shutdown(self): """ Indicates that the build controller server is in a shutdown state and that no new jobs or workers should be performed. Existing workers should be cleaned up once their jobs diff --git a/buildman/manager/enterprise.py b/buildman/manager/enterprise.py index 584a6daf4..9abe5f3c0 100644 --- a/buildman/manager/enterprise.py +++ b/buildman/manager/enterprise.py @@ -36,6 +36,11 @@ class EnterpriseManager(BaseManager): # production, build workers in enterprise are long-lived and register dynamically. self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent) + def setup_time(self): + # Builders are already registered, so the setup time should be essentially instant. We therefore + # only return a minute here. + return 60 + def add_build_component(self): """ Adds a new build component for an Enterprise Registry. """ # Generate a new unique realm ID for the build worker. 
diff --git a/buildman/server.py b/buildman/server.py index 0e2d7e050..fd5d4f409 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -9,14 +9,17 @@ from aiowsgi import create_server as create_wsgi_server from flask import Flask from threading import Event from trollius.coroutines import From +from datetime import datetime, timedelta from buildman.buildjob import BuildJob, BuildJobLoadException +from data.queue import WorkQueue LOGGER = logging.getLogger(__name__) WORK_CHECK_TIMEOUT = 10 TIMEOUT_PERIOD_MINUTES = 20 -RESERVATION_SECONDS = (TIMEOUT_PERIOD_MINUTES + 5) * 60 +JOB_TIMEOUT_SECONDS = 300 +MINIMUM_JOB_EXTENSION = timedelta(minutes=2) class BuildJobResult(object): """ Build job result enum """ @@ -42,6 +45,7 @@ class BuilderServer(object): self._lifecycle_manager = lifecycle_manager_klass( self._register_component, self._unregister_component, + self._job_heartbeat, self._job_complete ) @@ -107,6 +111,10 @@ class BuilderServer(object): self._current_components.remove(component) self._session_factory.remove(component) + def _job_heartbeat(self, build_job): + WorkQueue.extend_processing(build_job.job_item(), seconds_from_now=JOB_TIMEOUT_SECONDS, + retry_count=1, minimum_extension=MINIMUM_JOB_EXTENSION) + def _job_complete(self, build_job, job_status): if job_status == BuildJobResult.INCOMPLETE: self._queue.incomplete(build_job.job_item(), restore_retry=True, retry_after=30) @@ -126,7 +134,7 @@ class BuilderServer(object): def _work_checker(self): while self._current_status == 'running': LOGGER.debug('Checking for more work') - job_item = self._queue.get(processing_time=RESERVATION_SECONDS) + job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time()) if job_item is None: LOGGER.debug('No additional work found. 
Going to sleep for %s seconds', WORK_CHECK_TIMEOUT) yield From(trollius.sleep(WORK_CHECK_TIMEOUT)) diff --git a/buildman/workererror.py b/buildman/workererror.py index c5a5adbd0..8271976e4 100644 --- a/buildman/workererror.py +++ b/buildman/workererror.py @@ -73,9 +73,6 @@ class WorkerError(object): if handler.get('show_base_error', False) and self._base_message: message = message + ': ' + self._base_message - if handler.get('is_internal', False): - message = message + '\nThe build will be retried shortly' - return message def extra_data(self): diff --git a/data/database.py b/data/database.py index 6562e1f63..e8874f692 100644 --- a/data/database.py +++ b/data/database.py @@ -370,6 +370,7 @@ class RepositoryTag(BaseModel): class BUILD_PHASE(object): """ Build phases enum """ ERROR = 'error' + INTERNAL_ERROR = 'internalerror' UNPACKING = 'unpacking' PULLING = 'pulling' BUILDING = 'building' diff --git a/data/queue.py b/data/queue.py index 548ee380c..203d31eeb 100644 --- a/data/queue.py +++ b/data/queue.py @@ -120,10 +120,14 @@ class WorkQueue(object): self._currently_processing = False @staticmethod - def extend_processing(queue_item, seconds_from_now): + def extend_processing(queue_item, seconds_from_now, retry_count=None, + minimum_extension=MINIMUM_EXTENSION): new_expiration = datetime.utcnow() + timedelta(seconds=seconds_from_now) # Only actually write the new expiration to the db if it moves the expiration some minimum - if new_expiration - queue_item.processing_expires > MINIMUM_EXTENSION: + if new_expiration - queue_item.processing_expires > minimum_extension: + if retry_count is not None: + queue_item.retries_remaining = retry_count + queue_item.processing_expires = new_expiration queue_item.save() diff --git a/endpoints/api/build.py b/endpoints/api/build.py index adf6f43ec..2af1e0c8b 100644 --- a/endpoints/api/build.py +++ b/endpoints/api/build.py @@ -1,5 +1,7 @@ import logging import json +import time +import datetime from flask import request, redirect @@ 
-9,7 +11,7 @@ from endpoints.api import (RepositoryParamResource, parse_args, query_param, nic ApiResource, internal_only, format_date, api, Unauthorized, NotFound) from endpoints.common import start_build from endpoints.trigger import BuildTrigger -from data import model +from data import model, database from auth.auth_context import get_authenticated_user from auth.permissions import ModifyRepositoryPermission, AdministerOrganizationPermission from data.buildlogs import BuildStatusRetrievalError @@ -65,6 +67,13 @@ def build_status_view(build_obj, can_write=False): status = {} phase = 'cannot_load' + # If the status contains a heartbeat, then check to see if it has been written in the last few + # minutes. If not, then the build timed out. + if status is not None and 'heartbeat' in status and status['heartbeat']: + heartbeat = datetime.datetime.fromtimestamp(status['heartbeat']) + if datetime.datetime.now() - heartbeat > datetime.timedelta(minutes=1): + phase = database.BUILD_PHASE.INTERNAL_ERROR + logger.debug('Can write: %s job_config: %s', can_write, build_obj.job_config) resp = { 'id': build_obj.uuid, diff --git a/static/css/quay.css b/static/css/quay.css index 1065fa586..2a2f80136 100644 --- a/static/css/quay.css +++ b/static/css/quay.css @@ -864,6 +864,10 @@ i.toggle-icon:hover { background-color: red; } +.phase-icon.internalerror { + background-color: #DFFF00; +} + .phase-icon.waiting, .phase-icon.unpacking, .phase-icon.starting, .phase-icon.initializing { background-color: #ddd; } diff --git a/static/directives/build-log-error.html b/static/directives/build-log-error.html index 13b399bb9..c012a623f 100644 --- a/static/directives/build-log-error.html +++ b/static/directives/build-log-error.html @@ -9,7 +9,6 @@ -
Note: The credentials {{ getLocalPullInfo().login.username }} for registry {{ getLocalPullInfo().login.registry }} cannot diff --git a/static/js/app.js b/static/js/app.js index 1f5d53574..3440468d4 100644 --- a/static/js/app.js +++ b/static/js/app.js @@ -4746,6 +4746,11 @@ quayApp.directive('buildLogError', function () { 'entries': '=entries' }, controller: function($scope, $element, Config) { + $scope.isInternalError = function() { + var entry = $scope.entries[$scope.entries.length - 1]; + return entry && entry.data && entry.data['internal_error']; + }; + $scope.getLocalPullInfo = function() { if ($scope.entries.__localpull !== undefined) { return $scope.entries.__localpull; @@ -5690,6 +5695,9 @@ quayApp.directive('buildMessage', function () { case 'error': return 'Dockerfile build failed'; + + case 'internalerror': + return 'A system error occurred while building - Will be retried shortly'; } }; } diff --git a/static/js/controllers.js b/static/js/controllers.js index 07041ae7a..6a4051e50 100644 --- a/static/js/controllers.js +++ b/static/js/controllers.js @@ -1238,7 +1238,7 @@ function RepoBuildCtrl($scope, Restangular, ApiService, $routeParams, $rootScope ApiService.getRepoBuildStatus(null, params, true).then(function(resp) { if (build != $scope.currentBuild) { callback(false); return; } - + // Note: We use extend here rather than replacing as Angular is depending on the // root build object to remain the same object. var matchingBuilds = $.grep($scope.builds, function(elem) {