diff --git a/buildman/enums.py b/buildman/enums.py index 3d38217fe..2a5cb1978 100644 --- a/buildman/enums.py +++ b/buildman/enums.py @@ -10,3 +10,4 @@ class BuildServerStatus(object): STARTING = 'starting' RUNNING = 'running' SHUTDOWN = 'shutting_down' + EXCEPTION = 'exception' diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 473e75fb3..0978c1b8d 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -77,7 +77,7 @@ class EphemeralBuilderManager(BaseManager): try: etcd_result = changed_key_future.result() - except (ReadTimeoutError, ProtocolError): + except (ReadTimeoutError, ProtocolError, etcd.EtcdException): return change_callback(etcd_result) @@ -233,11 +233,20 @@ class EphemeralBuilderManager(BaseManager): raise Return(False) logger.debug('Starting builder with executor: %s', self._executor) - builder_id = yield From(self._executor.start_builder(realm, token, build_uuid)) + + try: + builder_id = yield From(self._executor.start_builder(realm, token, build_uuid)) + except: + logger.exception('Exception when starting builder for job: %s', build_uuid) + raise Return(False) # Store the builder in etcd associated with the job id - payload['builder_id'] = builder_id - yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl)) + try: + payload['builder_id'] = builder_id + yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl)) + except etcd.EtcdException: + logger.exception('Exception when writing job %s to etcd', build_uuid) + raise Return(False) # Store the realm spec which will allow any manager to accept this builder when it connects realm_spec = json.dumps({ diff --git a/buildman/server.py b/buildman/server.py index 855afc212..6b55fc2b1 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -175,7 +175,14 @@ class BuilderServer(object): continue logger.debug('Build job found. Checking for an avaliable worker.') - scheduled = yield From(self._lifecycle_manager.schedule(build_job)) + + try: + scheduled = yield From(self._lifecycle_manager.schedule(build_job)) + except: + logger.exception('Exception when scheduling job') + self._current_status = BuildServerStatus.EXCEPTION + return + if scheduled: status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid) status_handler.set_phase('build-scheduled') @@ -190,6 +197,7 @@ class BuilderServer(object): def _queue_metrics_updater(self): while self._current_status == BuildServerStatus.RUNNING: yield From(trollius.sleep(30)) + logger.debug('Writing metrics') self._queue.update_metrics() @trollius.coroutine