Merge pull request #2089 from coreos-inc/build-start-timeouts

Customized Build start timeouts and better debug logs
This commit is contained in:
josephschorr 2016-11-07 16:31:51 -05:00 committed by GitHub
commit 585a010c89
5 changed files with 22 additions and 5 deletions

View file

@ -17,7 +17,7 @@ class BaseManager(object):
every few minutes. """ every few minutes. """
self.job_heartbeat_callback(build_job) self.job_heartbeat_callback(build_job)
def setup_time(self): def overall_setup_time(self):
""" Returns the number of seconds that the build system should wait before allowing the job """ Returns the number of seconds that the build system should wait before allowing the job
to be picked up again after called 'schedule'. to be picked up again after called 'schedule'.
""" """

View file

@ -45,7 +45,7 @@ class EnterpriseManager(BaseManager):
# production, build workers in enterprise are long-lived and register dynamically. # production, build workers in enterprise are long-lived and register dynamically.
self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent) self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
def setup_time(self): def overall_setup_time(self):
# Builders are already registered, so the setup time should be essentially instant. We therefore # Builders are already registered, so the setup time should be essentially instant. We therefore
# only return a minute here. # only return a minute here.
return 60 return 60

View file

@ -373,7 +373,7 @@ class EphemeralBuilderManager(BaseManager):
# Load components for all realms currently known to the cluster # Load components for all realms currently known to the cluster
async(self._register_existing_realms()) async(self._register_existing_realms())
def setup_time(self): def overall_setup_time(self):
return self._ephemeral_setup_timeout return self._ephemeral_setup_timeout
def shutdown(self): def shutdown(self):
@ -418,7 +418,6 @@ class EphemeralBuilderManager(BaseManager):
realm = str(uuid.uuid4()) realm = str(uuid.uuid4())
token = str(uuid.uuid4()) token = str(uuid.uuid4())
nonce = str(uuid.uuid4()) nonce = str(uuid.uuid4())
setup_time = self.setup_time()
machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200) machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration) max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)
@ -526,6 +525,9 @@ class EphemeralBuilderManager(BaseManager):
}) })
try: try:
setup_time = started_with_executor.setup_time or self.overall_setup_time()
logger.debug('Writing job key for job %s using executor %s with ID %s and ttl %s', build_uuid,
started_with_executor.name, execution_id, setup_time)
yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False, yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False,
ttl=setup_time)) ttl=setup_time))
except (KeyError, etcd.EtcdKeyError): except (KeyError, etcd.EtcdKeyError):

View file

@ -55,6 +55,13 @@ class BuilderExecutor(object):
""" Name returns the unique name for this executor. """ """ Name returns the unique name for this executor. """
return self.executor_config.get('NAME') or self.__class__.__name__ return self.executor_config.get('NAME') or self.__class__.__name__
@property
def setup_time(self):
""" Returns the amount of time (in seconds) to wait for the execution to start for the build.
If None, the manager's default will be used.
"""
return self.executor_config.get('SETUP_TIME')
@coroutine @coroutine
def start_builder(self, realm, token, build_uuid): def start_builder(self, realm, token, build_uuid):
""" Create a builder with the specified config. Returns a unique id which can be used to manage """ Create a builder with the specified config. Returns a unique id which can be used to manage

View file

@ -143,6 +143,8 @@ class BuilderServer(object):
def _job_complete(self, build_job, job_status, executor_name=None, update_phase=False): def _job_complete(self, build_job, job_status, executor_name=None, update_phase=False):
if job_status == BuildJobResult.INCOMPLETE: if job_status == BuildJobResult.INCOMPLETE:
logger.debug('[BUILD INCOMPLETE: job complete] Build ID: %s. No retry restore.',
build_job.repo_build.uuid)
self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30) self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30)
else: else:
self._queue.complete(build_job.job_item) self._queue.complete(build_job.job_item)
@ -168,7 +170,7 @@ class BuilderServer(object):
logger.debug('Checking for more work for %d active workers', logger.debug('Checking for more work for %d active workers',
self._lifecycle_manager.num_workers()) self._lifecycle_manager.num_workers())
processing_time = self._lifecycle_manager.setup_time() + SETUP_LEEWAY_SECONDS processing_time = self._lifecycle_manager.overall_setup_time() + SETUP_LEEWAY_SECONDS
job_item = self._queue.get(processing_time=processing_time, ordering_required=True) job_item = self._queue.get(processing_time=processing_time, ordering_required=True)
if job_item is None: if job_item is None:
logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT) logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
@ -177,6 +179,8 @@ class BuilderServer(object):
try: try:
build_job = BuildJob(job_item) build_job = BuildJob(job_item)
except BuildJobLoadException as irbe: except BuildJobLoadException as irbe:
logger.debug('[BUILD INCOMPLETE: job load exception] Jon data: %s. No retry restore.',
job_item.body)
logger.exception(irbe) logger.exception(irbe)
self._queue.incomplete(job_item, restore_retry=False) self._queue.incomplete(job_item, restore_retry=False)
continue continue
@ -187,6 +191,8 @@ class BuilderServer(object):
try: try:
schedule_success, retry_timeout = yield From(self._lifecycle_manager.schedule(build_job)) schedule_success, retry_timeout = yield From(self._lifecycle_manager.schedule(build_job))
except: except:
logger.debug('[BUILD INCOMPLETE: scheduling] Build ID: %s. Retry restored.',
build_job.repo_build.uuid)
logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid) logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid)
self._current_status = BuildServerStatus.EXCEPTION self._current_status = BuildServerStatus.EXCEPTION
self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT) self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT)
@ -201,6 +207,8 @@ class BuilderServer(object):
logger.debug('Build job %s scheduled. Running: %s', build_job.repo_build.uuid, logger.debug('Build job %s scheduled. Running: %s', build_job.repo_build.uuid,
self._job_count) self._job_count)
else: else:
logger.debug('[BUILD INCOMPLETE: no schedule] Build ID: %s. Retry restored.',
build_job.repo_build.uuid)
logger.debug('All workers are busy for job %s Requeuing after %s seconds.', logger.debug('All workers are busy for job %s Requeuing after %s seconds.',
build_job.repo_build.uuid, retry_timeout) build_job.repo_build.uuid, retry_timeout)
self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout) self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)