diff --git a/buildman/manager/basemanager.py b/buildman/manager/basemanager.py index 83c192d36..62514f072 100644 --- a/buildman/manager/basemanager.py +++ b/buildman/manager/basemanager.py @@ -17,7 +17,7 @@ class BaseManager(object): every few minutes. """ self.job_heartbeat_callback(build_job) - def setup_time(self): + def overall_setup_time(self): """ Returns the number of seconds that the build system should wait before allowing the job to be picked up again after called 'schedule'. """ diff --git a/buildman/manager/enterprise.py b/buildman/manager/enterprise.py index d74c50ea9..4a17bfc57 100644 --- a/buildman/manager/enterprise.py +++ b/buildman/manager/enterprise.py @@ -45,7 +45,7 @@ class EnterpriseManager(BaseManager): # production, build workers in enterprise are long-lived and register dynamically. self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent) - def setup_time(self): + def overall_setup_time(self): # Builders are already registered, so the setup time should be essentially instant. We therefore # only return a minute here. return 60 diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index f9e05404e..5071b9b07 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -373,7 +373,7 @@ class EphemeralBuilderManager(BaseManager): # Load components for all realms currently known to the cluster async(self._register_existing_realms()) - def setup_time(self): + def overall_setup_time(self): return self._ephemeral_setup_timeout def shutdown(self): @@ -418,7 +418,6 @@ class EphemeralBuilderManager(BaseManager): realm = str(uuid.uuid4()) token = str(uuid.uuid4()) nonce = str(uuid.uuid4()) - setup_time = self.setup_time() machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200) max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration) @@ -526,6 +525,9 @@ class EphemeralBuilderManager(BaseManager): }) try: + setup_time = started_with_executor.setup_time or self.overall_setup_time() + logger.debug('Writing job key for job %s using executor %s with ID %s and ttl %s', build_uuid, + started_with_executor.name, execution_id, setup_time) yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False, ttl=setup_time)) except (KeyError, etcd.EtcdKeyError): diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index 1f25dddb2..0dee78439 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -55,6 +55,13 @@ class BuilderExecutor(object): """ Name returns the unique name for this executor. """ return self.executor_config.get('NAME') or self.__class__.__name__ + @property + def setup_time(self): + """ Returns the amount of time (in seconds) to wait for the execution to start for the build. + If None, the manager's default will be used. + """ + return self.executor_config.get('SETUP_TIME') + @coroutine def start_builder(self, realm, token, build_uuid): """ Create a builder with the specified config. Returns a unique id which can be used to manage diff --git a/buildman/server.py b/buildman/server.py index f04060351..fea89ea05 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -143,6 +143,8 @@ class BuilderServer(object): def _job_complete(self, build_job, job_status, executor_name=None, update_phase=False): if job_status == BuildJobResult.INCOMPLETE: + logger.debug('[BUILD INCOMPLETE: job complete] Build ID: %s. No retry restore.', + build_job.repo_build.uuid) self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30) else: self._queue.complete(build_job.job_item) @@ -168,7 +170,7 @@ class BuilderServer(object): logger.debug('Checking for more work for %d active workers', self._lifecycle_manager.num_workers()) - processing_time = self._lifecycle_manager.setup_time() + SETUP_LEEWAY_SECONDS + processing_time = self._lifecycle_manager.overall_setup_time() + SETUP_LEEWAY_SECONDS job_item = self._queue.get(processing_time=processing_time, ordering_required=True) if job_item is None: logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT) @@ -177,6 +179,8 @@ class BuilderServer(object): try: build_job = BuildJob(job_item) except BuildJobLoadException as irbe: + logger.debug('[BUILD INCOMPLETE: job load exception] Jon data: %s. No retry restore.', + job_item.body) logger.exception(irbe) self._queue.incomplete(job_item, restore_retry=False) continue @@ -187,6 +191,8 @@ class BuilderServer(object): try: schedule_success, retry_timeout = yield From(self._lifecycle_manager.schedule(build_job)) except: + logger.debug('[BUILD INCOMPLETE: scheduling] Build ID: %s. Retry restored.', + build_job.repo_build.uuid) logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid) self._current_status = BuildServerStatus.EXCEPTION self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT) @@ -201,6 +207,8 @@ class BuilderServer(object): logger.debug('Build job %s scheduled. Running: %s', build_job.repo_build.uuid, self._job_count) else: + logger.debug('[BUILD INCOMPLETE: no schedule] Build ID: %s. Retry restored.', + build_job.repo_build.uuid) logger.debug('All workers are busy for job %s Requeuing after %s seconds.', build_job.repo_build.uuid, retry_timeout) self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)