Merge pull request #2886 from coreos-inc/builder-expires
Make sure expired startup marks build jobs incomplete immediately
This commit is contained in:
commit
fdcf8dbdbc
1 changed file with 31 additions and 12 deletions
|
@ -156,6 +156,27 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
|
|
||||||
self._watch_tasks[watch_task_key] = async(watch_future)
|
self._watch_tasks[watch_task_key] = async(watch_future)
|
||||||
|
|
||||||
|
@coroutine
def _mark_job_incomplete(self, build_job, build_info):
  """ Flags the given build job as incomplete after its executor failed to start or timed out.

      The report is guarded by an etcd atomic lock keyed on the build UUID and execution ID, so
      that only one manager reports the job as incomplete for that execution.
  """
  exec_name = build_info.executor_name
  exec_id = build_info.execution_id
  logger.warning('Build executor failed to successfully boot with execution id %s', exec_id)

  build_uuid = build_job.build_uuid
  lock_acquired = yield From(self._take_etcd_atomic_lock('job-expired', build_uuid, exec_id))
  if not lock_acquired:
    # Lock not obtained: skip the report so that it is not issued more than once.
    logger.debug('Did not get lock for job-expiration for job %s', build_uuid)
    return

  logger.error('[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s', build_uuid,
               exec_name, exec_id)
  report = self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, exec_name,
                                      update_phase=True)
  yield From(report)
|
||||||
|
|
||||||
@coroutine
|
@coroutine
|
||||||
def _handle_job_change(self, etcd_result):
|
def _handle_job_change(self, etcd_result):
|
||||||
""" Handler invoked whenever a job expires or is deleted in etcd. """
|
""" Handler invoked whenever a job expires or is deleted in etcd. """
|
||||||
|
@ -185,24 +206,16 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
self._build_uuid_to_info.pop(build_job.build_uuid, None)
|
self._build_uuid_to_info.pop(build_job.build_uuid, None)
|
||||||
raise Return()
|
raise Return()
|
||||||
|
|
||||||
|
logger.debug('Got expiration for job %s with metadata: %s', build_job.build_uuid,
|
||||||
|
job_metadata)
|
||||||
|
|
||||||
executor_name = build_info.executor_name
|
executor_name = build_info.executor_name
|
||||||
execution_id = build_info.execution_id
|
execution_id = build_info.execution_id
|
||||||
|
|
||||||
# If we have not yet received a heartbeat, then the node failed to boot in some way. We mark
|
# If we have not yet received a heartbeat, then the node failed to boot in some way. We mark
|
||||||
# the job as incomplete here.
|
# the job as incomplete here.
|
||||||
if not job_metadata.get('had_heartbeat', False):
|
if not job_metadata.get('had_heartbeat', False):
|
||||||
logger.warning('Build executor failed to successfully boot with execution id %s',
|
yield From(self._mark_job_incomplete(build_job, build_info))
|
||||||
execution_id)
|
|
||||||
|
|
||||||
# Take a lock to ensure that only one manager reports the build as incomplete for this
|
|
||||||
# execution.
|
|
||||||
got_lock = yield From(self._take_etcd_atomic_lock('job-expired', build_job.build_uuid,
|
|
||||||
execution_id))
|
|
||||||
if got_lock:
|
|
||||||
logger.error('[BUILD INTERNAL ERROR: etcd %s] Build ID: %s. Exec name: %s. Exec ID: %s',
|
|
||||||
etcd_result.action, build_job.build_uuid, executor_name, execution_id)
|
|
||||||
yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
|
|
||||||
update_phase=True))
|
|
||||||
|
|
||||||
# Finally, we terminate the build execution for the job. We don't do this under a lock as
|
# Finally, we terminate the build execution for the job. We don't do this under a lock as
|
||||||
# terminating a node is an atomic operation; better to make sure it is terminated than not.
|
# terminating a node is an atomic operation; better to make sure it is terminated than not.
|
||||||
|
@ -249,6 +262,12 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
execution_id = realm_spec.get('execution_id', None)
|
execution_id = realm_spec.get('execution_id', None)
|
||||||
executor_name = realm_spec.get('executor_name', 'EC2Executor')
|
executor_name = realm_spec.get('executor_name', 'EC2Executor')
|
||||||
|
|
||||||
|
# Cleanup the job, since it never started.
|
||||||
|
logger.debug('Job %s for incomplete marking: %s', build_uuid, build_info)
|
||||||
|
if build_info is not None:
|
||||||
|
yield From(self._mark_job_incomplete(build_job, build_info))
|
||||||
|
|
||||||
|
# Cleanup the executor.
|
||||||
logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s',
|
logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s',
|
||||||
realm_id, build_uuid, executor_name, execution_id)
|
realm_id, build_uuid, executor_name, execution_id)
|
||||||
yield From(self.terminate_executor(executor_name, execution_id))
|
yield From(self.terminate_executor(executor_name, execution_id))
|
||||||
|
|
Reference in a new issue