From afe7e14254595040f74c095e41325a939db69453 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 25 Feb 2015 12:09:14 -0500 Subject: [PATCH 1/2] Add better exception handling and logging to the ephemeral build manager --- buildman/manager/ephemeral.py | 17 +++++++++++++---- buildman/server.py | 8 +++++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 473e75fb3..0978c1b8d 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -77,7 +77,7 @@ class EphemeralBuilderManager(BaseManager): try: etcd_result = changed_key_future.result() - except (ReadTimeoutError, ProtocolError): + except (ReadTimeoutError, ProtocolError, etcd.EtcdException): return change_callback(etcd_result) @@ -233,11 +233,20 @@ class EphemeralBuilderManager(BaseManager): raise Return(False) logger.debug('Starting builder with executor: %s', self._executor) - builder_id = yield From(self._executor.start_builder(realm, token, build_uuid)) + + try: + builder_id = yield From(self._executor.start_builder(realm, token, build_uuid)) + except: + logger.exception('Exception when starting builder for job: %s', build_uuid) + raise Return(False) # Store the builder in etcd associated with the job id - payload['builder_id'] = builder_id - yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl)) + try: + payload['builder_id'] = builder_id + yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl)) + except etcd.EtcdException: + logger.exception('Exception when writing job %s to etcd', build_uuid) + raise Return(False) # Store the realm spec which will allow any manager to accept this builder when it connects realm_spec = json.dumps({ diff --git a/buildman/server.py b/buildman/server.py index 855afc212..bad1d84f8 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -175,7 +175,13 @@ class BuilderServer(object): continue logger.debug('Build job found. Checking for an avaliable worker.') - scheduled = yield From(self._lifecycle_manager.schedule(build_job)) + + try: + scheduled = yield From(self._lifecycle_manager.schedule(build_job)) + except: + logger.exception('Exception when scheduling job') + scheduled = None + if scheduled: status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid) status_handler.set_phase('build-scheduled') From 390f8df4ad03e69ab34e5fb3c6f92e3244ab77e9 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 25 Feb 2015 12:19:21 -0500 Subject: [PATCH 2/2] Make sure the build manager dies on an unhandled schedule exception --- buildman/enums.py | 1 + buildman/server.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/buildman/enums.py b/buildman/enums.py index 3d38217fe..2a5cb1978 100644 --- a/buildman/enums.py +++ b/buildman/enums.py @@ -10,3 +10,4 @@ class BuildServerStatus(object): STARTING = 'starting' RUNNING = 'running' SHUTDOWN = 'shutting_down' + EXCEPTION = 'exception' diff --git a/buildman/server.py b/buildman/server.py index bad1d84f8..6b55fc2b1 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -180,7 +180,8 @@ class BuilderServer(object): scheduled = yield From(self._lifecycle_manager.schedule(build_job)) except: logger.exception('Exception when scheduling job') - scheduled = None + self._current_status = BuildServerStatus.EXCEPTION + return if scheduled: status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid) @@ -196,6 +197,7 @@ class BuilderServer(object): def _queue_metrics_updater(self): while self._current_status == BuildServerStatus.RUNNING: yield From(trollius.sleep(30)) + logger.debug('Writing metrics') self._queue.update_metrics() @trollius.coroutine