Make sure expired startup marks build jobs incomplete immediately

Currently, we wait for the job to expire, which can take a very long time. We also add yet even more logs, in the attempt to track down the root cause
This commit is contained in:
Joseph Schorr 2017-10-11 14:56:19 -04:00
parent ab304cb834
commit c799367ac4

View file

@ -156,6 +156,27 @@ class EphemeralBuilderManager(BaseManager):
self._watch_tasks[watch_task_key] = async(watch_future)
@coroutine
def _mark_job_incomplete(self, build_job, build_info):
""" Marks a job as incomplete, in response to a failure to start or a timeout. """
executor_name = build_info.executor_name
execution_id = build_info.execution_id
logger.warning('Build executor failed to successfully boot with execution id %s',
execution_id)
# Take a lock to ensure that only one manager reports the build as incomplete for this
# execution.
got_lock = yield From(self._take_etcd_atomic_lock('job-expired', build_job.build_uuid,
execution_id))
if got_lock:
logger.error('[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s',
build_job.build_uuid, executor_name, execution_id)
yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
update_phase=True))
else:
logger.debug('Did not get lock for job-expiration for job %s', build_job.build_uuid)
@coroutine
def _handle_job_change(self, etcd_result):
""" Handler invoked whenever a job expires or is deleted in etcd. """
@ -185,24 +206,16 @@ class EphemeralBuilderManager(BaseManager):
self._build_uuid_to_info.pop(build_job.build_uuid, None)
raise Return()
logger.debug('Got expiration for job %s with metadata: %s', build_job.build_uuid,
job_metadata)
executor_name = build_info.executor_name
execution_id = build_info.execution_id
# If we have not yet received a heartbeat, then the node failed to boot in some way. We mark
# the job as incomplete here.
if not job_metadata.get('had_heartbeat', False):
logger.warning('Build executor failed to successfully boot with execution id %s',
execution_id)
# Take a lock to ensure that only one manager reports the build as incomplete for this
# execution.
got_lock = yield From(self._take_etcd_atomic_lock('job-expired', build_job.build_uuid,
execution_id))
if got_lock:
logger.error('[BUILD INTERNAL ERROR: etcd %s] Build ID: %s. Exec name: %s. Exec ID: %s',
etcd_result.action, build_job.build_uuid, executor_name, execution_id)
yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
update_phase=True))
yield From(self._mark_job_incomplete(build_job, build_info))
# Finally, we terminate the build execution for the job. We don't do this under a lock as
# terminating a node is an atomic operation; better to make sure it is terminated than not.
@ -249,6 +262,12 @@ class EphemeralBuilderManager(BaseManager):
execution_id = realm_spec.get('execution_id', None)
executor_name = realm_spec.get('executor_name', 'EC2Executor')
# Cleanup the job, since it never started.
logger.debug('Job %s for incomplete marking: %s', build_uuid, build_info)
if build_info is not None:
yield From(self._mark_job_incomplete(build_job, build_info))
# Cleanup the executor.
logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s',
realm_id, build_uuid, executor_name, execution_id)
yield From(self.terminate_executor(executor_name, execution_id))