Make sure expired startup marks build jobs incomplete immediately

Currently, we wait for the job to expire, which can take a very long time. We also add yet even more logs, in the attempt to track down the root cause
2017-10-11 14:56:19 -04:00 · 2017-10-11 14:56:19 -04:00 · c799367ac4
commit c799367ac4
parent ab304cb834
1 changed files with 31 additions and 12 deletions
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@ -156,6 +156,27 @@ class EphemeralBuilderManager(BaseManager):

      self._watch_tasks[watch_task_key] = async(watch_future)

+  @coroutine
+  def _mark_job_incomplete(self, build_job, build_info):
+    """ Marks a job as incomplete, in response to a failure to start or a timeout. """
+    executor_name = build_info.executor_name
+    execution_id = build_info.execution_id
+
+    logger.warning('Build executor failed to successfully boot with execution id %s',
+                    execution_id)
+
+    # Take a lock to ensure that only one manager reports the build as incomplete for this
+    # execution.
+    got_lock = yield From(self._take_etcd_atomic_lock('job-expired', build_job.build_uuid,
+                                                      execution_id))
+    if got_lock:
+      logger.error('[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s',
+                    build_job.build_uuid, executor_name, execution_id)
+      yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
+                                            update_phase=True))
+    else:
+      logger.debug('Did not get lock for job-expiration for job %s', build_job.build_uuid)
+
  @coroutine
  def _handle_job_change(self, etcd_result):
    """ Handler invoked whenever a job expires or is deleted in etcd. """
@ -185,24 +206,16 @@ class EphemeralBuilderManager(BaseManager):
        self._build_uuid_to_info.pop(build_job.build_uuid, None)
        raise Return()

+      logger.debug('Got expiration for job %s with metadata: %s', build_job.build_uuid,
+                   job_metadata)
+
      executor_name = build_info.executor_name
      execution_id = build_info.execution_id

      # If we have not yet received a heartbeat, then the node failed to boot in some way. We mark
      # the job as incomplete here.
      if not job_metadata.get('had_heartbeat', False):
-        logger.warning('Build executor failed to successfully boot with execution id %s',
-                       execution_id)
-
-        # Take a lock to ensure that only one manager reports the build as incomplete for this
-        # execution.
-        got_lock = yield From(self._take_etcd_atomic_lock('job-expired', build_job.build_uuid,
-                                                          execution_id))
-        if got_lock:
-          logger.error('[BUILD INTERNAL ERROR: etcd %s] Build ID: %s. Exec name: %s. Exec ID: %s',
-                       etcd_result.action, build_job.build_uuid, executor_name, execution_id)
-          yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
-                                                update_phase=True))
+        yield From(self._mark_job_incomplete(build_job, build_info))

      # Finally, we terminate the build execution for the job. We don't do this under a lock as
      # terminating a node is an atomic operation; better to make sure it is terminated than not.
@ -249,6 +262,12 @@ class EphemeralBuilderManager(BaseManager):
        execution_id = realm_spec.get('execution_id', None)
        executor_name = realm_spec.get('executor_name', 'EC2Executor')

+        # Cleanup the job, since it never started.
+        logger.debug('Job %s for incomplete marking: %s', build_uuid, build_info)
+        if build_info is not None:
+          yield From(self._mark_job_incomplete(build_job, build_info))
+
+        # Cleanup the executor.
        logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s',
                    realm_id, build_uuid, executor_name, execution_id)
        yield From(self.terminate_executor(executor_name, execution_id))