Make sure expired startup marks build jobs incomplete immediately
Currently, we wait for the job itself to expire, which can take a very long time. We also add even more logging in an attempt to track down the root cause.
This commit is contained in:
parent
ab304cb834
commit
c799367ac4
1 changed files with 31 additions and 12 deletions
|
@ -156,6 +156,27 @@ class EphemeralBuilderManager(BaseManager):
|
|||
|
||||
self._watch_tasks[watch_task_key] = async(watch_future)
|
||||
|
||||
@coroutine
def _mark_job_incomplete(self, build_job, build_info):
  """ Reports a build job as incomplete after its executor failed to start or timed out.

      The report is guarded by an etcd atomic lock keyed on the build UUID and the execution
      id, so that only one manager reports the build as incomplete for a given execution.
  """
  executor = build_info.executor_name
  exec_id = build_info.execution_id

  logger.warning('Build executor failed to successfully boot with execution id %s', exec_id)

  # Try to become the single manager that reports this execution as incomplete.
  lock_acquired = yield From(
    self._take_etcd_atomic_lock('job-expired', build_job.build_uuid, exec_id))

  if not lock_acquired:
    # Another manager holds the lock for this execution; nothing more to do here.
    logger.debug('Did not get lock for job-expiration for job %s', build_job.build_uuid)
    return

  logger.error('[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s',
               build_job.build_uuid, executor, exec_id)
  yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor,
                                        update_phase=True))
|
||||
|
||||
@coroutine
|
||||
def _handle_job_change(self, etcd_result):
|
||||
""" Handler invoked whenever a job expires or is deleted in etcd. """
|
||||
|
@ -185,24 +206,16 @@ class EphemeralBuilderManager(BaseManager):
|
|||
self._build_uuid_to_info.pop(build_job.build_uuid, None)
|
||||
raise Return()
|
||||
|
||||
logger.debug('Got expiration for job %s with metadata: %s', build_job.build_uuid,
|
||||
job_metadata)
|
||||
|
||||
executor_name = build_info.executor_name
|
||||
execution_id = build_info.execution_id
|
||||
|
||||
# If we have not yet received a heartbeat, then the node failed to boot in some way. We mark
|
||||
# the job as incomplete here.
|
||||
if not job_metadata.get('had_heartbeat', False):
|
||||
logger.warning('Build executor failed to successfully boot with execution id %s',
|
||||
execution_id)
|
||||
|
||||
# Take a lock to ensure that only one manager reports the build as incomplete for this
|
||||
# execution.
|
||||
got_lock = yield From(self._take_etcd_atomic_lock('job-expired', build_job.build_uuid,
|
||||
execution_id))
|
||||
if got_lock:
|
||||
logger.error('[BUILD INTERNAL ERROR: etcd %s] Build ID: %s. Exec name: %s. Exec ID: %s',
|
||||
etcd_result.action, build_job.build_uuid, executor_name, execution_id)
|
||||
yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
|
||||
update_phase=True))
|
||||
yield From(self._mark_job_incomplete(build_job, build_info))
|
||||
|
||||
# Finally, we terminate the build execution for the job. We don't do this under a lock as
|
||||
# terminating a node is an atomic operation; better to make sure it is terminated than not.
|
||||
|
@ -249,6 +262,12 @@ class EphemeralBuilderManager(BaseManager):
|
|||
execution_id = realm_spec.get('execution_id', None)
|
||||
executor_name = realm_spec.get('executor_name', 'EC2Executor')
|
||||
|
||||
# Cleanup the job, since it never started.
|
||||
logger.debug('Job %s for incomplete marking: %s', build_uuid, build_info)
|
||||
if build_info is not None:
|
||||
yield From(self._mark_job_incomplete(build_job, build_info))
|
||||
|
||||
# Cleanup the executor.
|
||||
logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s',
|
||||
realm_id, build_uuid, executor_name, execution_id)
|
||||
yield From(self.terminate_executor(executor_name, execution_id))
|
||||
|
|
Reference in a new issue