diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index f5910c6b0..902da86f0 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -215,14 +215,14 @@ class EphemeralBuilderManager(BaseManager):
     except (KeyError, etcd.EtcdKeyError):
       workers_alive = 0
     except etcd.EtcdException:
-      logger.exception('Exception when reading job count from etcd')
+      logger.exception('Exception when reading job count from etcd for job: %s', build_uuid)
       raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)

     logger.debug('Total jobs: %s', workers_alive)

     if workers_alive >= allowed_worker_count:
-      logger.info('Too many workers alive, unable to start new worker. %s >= %s', workers_alive,
-                  allowed_worker_count)
+      logger.info('Too many workers alive, unable to start new worker for build job: %s. %s >= %s',
+                  build_uuid, workers_alive, allowed_worker_count)
       raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)

     job_key = self._etcd_job_key(build_job)
@@ -249,13 +249,14 @@ class EphemeralBuilderManager(BaseManager):
                                          ttl=EC2_API_TIMEOUT))
     except (KeyError, etcd.EtcdKeyError):
       # The job was already taken by someone else, we are probably a retry
-      logger.exception('Job already exists in etcd, are timeouts misconfigured or is the queue broken?')
-      raise Return(False)
+      logger.error('Job: %s already exists in etcd, timeout may be misconfigured', build_uuid)
+      raise Return(False, EC2_API_TIMEOUT)
     except etcd.EtcdException:
       logger.exception('Exception when writing job %s to etcd', build_uuid)
       raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)

-    logger.debug('Starting builder with executor: %s', self._executor)
+    executor_type = self._executor.__class__.__name__
+    logger.debug('Starting builder for job: %s with executor: %s', build_uuid, executor_type)

     try:
       builder_id = yield From(self._executor.start_builder(realm, token, build_uuid))
@@ -284,11 +285,12 @@ class EphemeralBuilderManager(BaseManager):
       yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False,
                                          ttl=setup_time))
     except (KeyError, etcd.EtcdKeyError):
-      logger.error('Realm already exists in etcd. UUID collision or something is very very wrong.')
+      logger.error('Realm %s already exists in etcd for job %s. ' +
+                   'UUID collision or something is very very wrong.', realm, build_uuid)
       raise Return(False, setup_time)
     except etcd.EtcdException:
-      logger.exception('Exception when writing realm %s to etcd', realm)
-      raise Return(False)
+      logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
+      raise Return(False, setup_time)

     raise Return(True, None)

diff --git a/buildman/server.py b/buildman/server.py
index 179be4e5d..fa8f4ea90 100644
--- a/buildman/server.py
+++ b/buildman/server.py
@@ -169,12 +169,13 @@ class BuilderServer(object):
         self._queue.incomplete(job_item, restore_retry=False)
         continue

-      logger.debug('Build job found. Checking for an avaliable worker.')
+      logger.debug('Checking for an available worker for build job %s',
+                   build_job.repo_build.uuid)

      try:
        schedule_success, retry_timeout = yield From(self._lifecycle_manager.schedule(build_job))
      except:
-        logger.exception('Exception when scheduling job')
+        logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid)
        self._current_status = BuildServerStatus.EXCEPTION
        return

@@ -187,7 +188,8 @@ class BuilderServer(object):
        logger.debug('Build job %s scheduled. Running: %s', build_job.repo_build.uuid,
                     self._job_count)
      else:
-        logger.debug('All workers are busy. Requeuing.')
+        logger.debug('All workers are busy for job %s. Requeuing after %s seconds.',
+                     build_job.repo_build.uuid, retry_timeout)
        self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)

  @trollius.coroutine