From c799367ac4893cb7932bdcfbd90c219c977090a8 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 11 Oct 2017 14:56:19 -0400 Subject: [PATCH] Make sure expired startup marks build jobs incomplete immediately Currently, we wait for the job to expire, which can take a very long time. We also add yet even more logs, in the attempt to track down the root cause --- buildman/manager/ephemeral.py | 43 +++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index a57589a95..1f24f9303 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -156,6 +156,27 @@ class EphemeralBuilderManager(BaseManager): self._watch_tasks[watch_task_key] = async(watch_future) + @coroutine + def _mark_job_incomplete(self, build_job, build_info): + """ Marks a job as incomplete, in response to a failure to start or a timeout. """ + executor_name = build_info.executor_name + execution_id = build_info.execution_id + + logger.warning('Build executor failed to successfully boot with execution id %s', + execution_id) + + # Take a lock to ensure that only one manager reports the build as incomplete for this + # execution. + got_lock = yield From(self._take_etcd_atomic_lock('job-expired', build_job.build_uuid, + execution_id)) + if got_lock: + logger.error('[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s', + build_job.build_uuid, executor_name, execution_id) + yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name, + update_phase=True)) + else: + logger.debug('Did not get lock for job-expiration for job %s', build_job.build_uuid) + @coroutine def _handle_job_change(self, etcd_result): """ Handler invoked whenever a job expires or is deleted in etcd. """ @@ -185,24 +206,16 @@ class EphemeralBuilderManager(BaseManager): self._build_uuid_to_info.pop(build_job.build_uuid, None) raise Return() + logger.debug('Got expiration for job %s with metadata: %s', build_job.build_uuid, + job_metadata) + executor_name = build_info.executor_name execution_id = build_info.execution_id # If we have not yet received a heartbeat, then the node failed to boot in some way. We mark # the job as incomplete here. if not job_metadata.get('had_heartbeat', False): - logger.warning('Build executor failed to successfully boot with execution id %s', - execution_id) - - # Take a lock to ensure that only one manager reports the build as incomplete for this - # execution. - got_lock = yield From(self._take_etcd_atomic_lock('job-expired', build_job.build_uuid, - execution_id)) - if got_lock: - logger.error('[BUILD INTERNAL ERROR: etcd %s] Build ID: %s. Exec name: %s. Exec ID: %s', - etcd_result.action, build_job.build_uuid, executor_name, execution_id) - yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name, - update_phase=True)) + yield From(self._mark_job_incomplete(build_job, build_info)) # Finally, we terminate the build execution for the job. We don't do this under a lock as # terminating a node is an atomic operation; better to make sure it is terminated than not. @@ -249,6 +262,12 @@ class EphemeralBuilderManager(BaseManager): execution_id = realm_spec.get('execution_id', None) executor_name = realm_spec.get('executor_name', 'EC2Executor') + # Cleanup the job, since it never started. + logger.debug('Job %s for incomplete marking: %s', build_uuid, build_info) + if build_info is not None: + yield From(self._mark_job_incomplete(build_job, build_info)) + + # Cleanup the executor. logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s', realm_id, build_uuid, executor_name, execution_id) yield From(self.terminate_executor(executor_name, execution_id))