From f50bb8a1ced910bf82e77847bc7e72f4ba165060 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Fri, 30 Sep 2016 14:54:49 +0200 Subject: [PATCH] Add missing call to set_phase when a build doesn't start This change fixes the build manager ephemeral executor to tell the overall build server to call set_phase when a build never starts. Before this change, we'd properly adjust the queue item, but not the repo build row or the logs, which is why users just saw "Preparing Build Node", with no indicating the node failed to start. Fixes #1904 --- buildman/component/buildcomponent.py | 4 ++-- buildman/jobutil/buildstatus.py | 1 - buildman/manager/ephemeral.py | 13 +++++++------ buildman/server.py | 6 +++++- test/test_buildman.py | 4 +++- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index 0ddd2835d..4881bfa80 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -452,8 +452,8 @@ class BuildComponent(BaseComponent): requeued=self._current_job.has_retries_remaining()) yield From(self.parent_manager.job_completed(self._current_job, - BuildJobResult.INCOMPLETE, - self)) + BuildJobResult.INCOMPLETE, + self)) self._current_job = None # Unregister the current component so that it cannot be invoked again. diff --git a/buildman/jobutil/buildstatus.py b/buildman/jobutil/buildstatus.py index 079615812..0b65f7934 100644 --- a/buildman/jobutil/buildstatus.py +++ b/buildman/jobutil/buildstatus.py @@ -66,7 +66,6 @@ class StatusHandler(object): repo_build = model.build.get_repository_build(self._uuid) repo_build.phase = phase repo_build.save() - return True def __enter__(self): diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 5fb3baa83..0343d0126 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -198,7 +198,8 @@ class EphemeralBuilderManager(BaseManager): execution_id)) if got_lock: logger.warning('Marking job %s as incomplete', build_job.build_uuid) - self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name) + self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name, + update_phase=True) # Finally, we terminate the build execution for the job. We don't do this under a lock as # terminating a node is an atomic operation; better to make sure it is terminated than not. @@ -537,7 +538,7 @@ class EphemeralBuilderManager(BaseManager): logger.debug('Sending build %s to newly ready component on realm %s', job.build_uuid, build_component.builder_realm) yield From(build_component.start_build(job)) - + try: # log start time to prometheus realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm))) @@ -561,15 +562,15 @@ class EphemeralBuilderManager(BaseManager): logger.debug('Calling job_completed for job %s with status: %s', build_job.build_uuid, job_status) - # Mark the job as completed. + # Mark the job as completed. Since this is being invoked from the component, we don't need + # to ask for the phase to be updated as well. build_info = self._build_uuid_to_info.get(build_job.build_uuid, None) executor_name = build_info.executor_name if build_info else None - - self.job_complete_callback(build_job, job_status, executor_name) + self.job_complete_callback(build_job, job_status, executor_name, update_phase=False) # Kill the ephemeral builder. yield From(self.kill_builder_executor(build_job.build_uuid)) - + try: # log build time to prometheus realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm))) diff --git a/buildman/server.py b/buildman/server.py index 072dc2c98..f8f6f8f1c 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -141,12 +141,16 @@ class BuilderServer(object): self._queue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS, minimum_extension=MINIMUM_JOB_EXTENSION) - def _job_complete(self, build_job, job_status, executor_name=None): + def _job_complete(self, build_job, job_status, executor_name=None, update_phase=False): if job_status == BuildJobResult.INCOMPLETE: self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30) else: self._queue.complete(build_job.job_item) + if update_phase: + status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid) + status_handler.set_phase(job_status) + self._job_count = self._job_count - 1 if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count: diff --git a/test/test_buildman.py b/test/test_buildman.py index 0a4a9caec..a96056bbc 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -384,8 +384,10 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase): self.test_executor.stop_builder.assert_called_once_with('123') self.assertEqual(self.test_executor.stop_builder.call_count, 1) + # Ensure the job was marked as incomplete, with an update_phase to True (so the DB record and + # logs are updated as well) self.job_complete_callback.assert_called_once_with(ANY, BuildJobResult.INCOMPLETE, - 'MockExecutor') + 'MockExecutor', update_phase=True) @async_test def test_change_worker(self):