Merge pull request #2089 from coreos-inc/build-start-timeouts
Customized Build start timeouts and better debug logs
This commit is contained in:
commit
585a010c89
5 changed files with 22 additions and 5 deletions
|
@ -17,7 +17,7 @@ class BaseManager(object):
|
||||||
every few minutes. """
|
every few minutes. """
|
||||||
self.job_heartbeat_callback(build_job)
|
self.job_heartbeat_callback(build_job)
|
||||||
|
|
||||||
def setup_time(self):
|
def overall_setup_time(self):
|
||||||
""" Returns the number of seconds that the build system should wait before allowing the job
|
""" Returns the number of seconds that the build system should wait before allowing the job
|
||||||
to be picked up again after called 'schedule'.
|
to be picked up again after called 'schedule'.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -45,7 +45,7 @@ class EnterpriseManager(BaseManager):
|
||||||
# production, build workers in enterprise are long-lived and register dynamically.
|
# production, build workers in enterprise are long-lived and register dynamically.
|
||||||
self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
|
self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
|
||||||
|
|
||||||
def setup_time(self):
|
def overall_setup_time(self):
|
||||||
# Builders are already registered, so the setup time should be essentially instant. We therefore
|
# Builders are already registered, so the setup time should be essentially instant. We therefore
|
||||||
# only return a minute here.
|
# only return a minute here.
|
||||||
return 60
|
return 60
|
||||||
|
|
|
@ -373,7 +373,7 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
# Load components for all realms currently known to the cluster
|
# Load components for all realms currently known to the cluster
|
||||||
async(self._register_existing_realms())
|
async(self._register_existing_realms())
|
||||||
|
|
||||||
def setup_time(self):
|
def overall_setup_time(self):
|
||||||
return self._ephemeral_setup_timeout
|
return self._ephemeral_setup_timeout
|
||||||
|
|
||||||
def shutdown(self):
|
def shutdown(self):
|
||||||
|
@ -418,7 +418,6 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
realm = str(uuid.uuid4())
|
realm = str(uuid.uuid4())
|
||||||
token = str(uuid.uuid4())
|
token = str(uuid.uuid4())
|
||||||
nonce = str(uuid.uuid4())
|
nonce = str(uuid.uuid4())
|
||||||
setup_time = self.setup_time()
|
|
||||||
|
|
||||||
machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
|
machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
|
||||||
max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)
|
max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)
|
||||||
|
@ -526,6 +525,9 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
})
|
})
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
setup_time = started_with_executor.setup_time or self.overall_setup_time()
|
||||||
|
logger.debug('Writing job key for job %s using executor %s with ID %s and ttl %s', build_uuid,
|
||||||
|
started_with_executor.name, execution_id, setup_time)
|
||||||
yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False,
|
yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False,
|
||||||
ttl=setup_time))
|
ttl=setup_time))
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
|
|
|
@ -55,6 +55,13 @@ class BuilderExecutor(object):
|
||||||
""" Name returns the unique name for this executor. """
|
""" Name returns the unique name for this executor. """
|
||||||
return self.executor_config.get('NAME') or self.__class__.__name__
|
return self.executor_config.get('NAME') or self.__class__.__name__
|
||||||
|
|
||||||
|
@property
|
||||||
|
def setup_time(self):
|
||||||
|
""" Returns the amount of time (in seconds) to wait for the execution to start for the build.
|
||||||
|
If None, the manager's default will be used.
|
||||||
|
"""
|
||||||
|
return self.executor_config.get('SETUP_TIME')
|
||||||
|
|
||||||
@coroutine
|
@coroutine
|
||||||
def start_builder(self, realm, token, build_uuid):
|
def start_builder(self, realm, token, build_uuid):
|
||||||
""" Create a builder with the specified config. Returns a unique id which can be used to manage
|
""" Create a builder with the specified config. Returns a unique id which can be used to manage
|
||||||
|
|
|
@ -143,6 +143,8 @@ class BuilderServer(object):
|
||||||
|
|
||||||
def _job_complete(self, build_job, job_status, executor_name=None, update_phase=False):
|
def _job_complete(self, build_job, job_status, executor_name=None, update_phase=False):
|
||||||
if job_status == BuildJobResult.INCOMPLETE:
|
if job_status == BuildJobResult.INCOMPLETE:
|
||||||
|
logger.debug('[BUILD INCOMPLETE: job complete] Build ID: %s. No retry restore.',
|
||||||
|
build_job.repo_build.uuid)
|
||||||
self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30)
|
self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30)
|
||||||
else:
|
else:
|
||||||
self._queue.complete(build_job.job_item)
|
self._queue.complete(build_job.job_item)
|
||||||
|
@ -168,7 +170,7 @@ class BuilderServer(object):
|
||||||
logger.debug('Checking for more work for %d active workers',
|
logger.debug('Checking for more work for %d active workers',
|
||||||
self._lifecycle_manager.num_workers())
|
self._lifecycle_manager.num_workers())
|
||||||
|
|
||||||
processing_time = self._lifecycle_manager.setup_time() + SETUP_LEEWAY_SECONDS
|
processing_time = self._lifecycle_manager.overall_setup_time() + SETUP_LEEWAY_SECONDS
|
||||||
job_item = self._queue.get(processing_time=processing_time, ordering_required=True)
|
job_item = self._queue.get(processing_time=processing_time, ordering_required=True)
|
||||||
if job_item is None:
|
if job_item is None:
|
||||||
logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
|
logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
|
||||||
|
@ -177,6 +179,8 @@ class BuilderServer(object):
|
||||||
try:
|
try:
|
||||||
build_job = BuildJob(job_item)
|
build_job = BuildJob(job_item)
|
||||||
except BuildJobLoadException as irbe:
|
except BuildJobLoadException as irbe:
|
||||||
|
logger.debug('[BUILD INCOMPLETE: job load exception] Jon data: %s. No retry restore.',
|
||||||
|
job_item.body)
|
||||||
logger.exception(irbe)
|
logger.exception(irbe)
|
||||||
self._queue.incomplete(job_item, restore_retry=False)
|
self._queue.incomplete(job_item, restore_retry=False)
|
||||||
continue
|
continue
|
||||||
|
@ -187,6 +191,8 @@ class BuilderServer(object):
|
||||||
try:
|
try:
|
||||||
schedule_success, retry_timeout = yield From(self._lifecycle_manager.schedule(build_job))
|
schedule_success, retry_timeout = yield From(self._lifecycle_manager.schedule(build_job))
|
||||||
except:
|
except:
|
||||||
|
logger.debug('[BUILD INCOMPLETE: scheduling] Build ID: %s. Retry restored.',
|
||||||
|
build_job.repo_build.uuid)
|
||||||
logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid)
|
logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid)
|
||||||
self._current_status = BuildServerStatus.EXCEPTION
|
self._current_status = BuildServerStatus.EXCEPTION
|
||||||
self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT)
|
self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT)
|
||||||
|
@ -201,6 +207,8 @@ class BuilderServer(object):
|
||||||
logger.debug('Build job %s scheduled. Running: %s', build_job.repo_build.uuid,
|
logger.debug('Build job %s scheduled. Running: %s', build_job.repo_build.uuid,
|
||||||
self._job_count)
|
self._job_count)
|
||||||
else:
|
else:
|
||||||
|
logger.debug('[BUILD INCOMPLETE: no schedule] Build ID: %s. Retry restored.',
|
||||||
|
build_job.repo_build.uuid)
|
||||||
logger.debug('All workers are busy for job %s Requeuing after %s seconds.',
|
logger.debug('All workers are busy for job %s Requeuing after %s seconds.',
|
||||||
build_job.repo_build.uuid, retry_timeout)
|
build_job.repo_build.uuid, retry_timeout)
|
||||||
self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)
|
self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)
|
||||||
|
|
Reference in a new issue