Add a timeout to various operations against etcd in the build manager when it cannot connect to etcd
This will ensure that the build managers don't simply sit there thrashing against a nonexistent cluster, driving the CPU up on our production nodes and thus taking them out of service. Addresses https://jira.coreos.com/browse/QUAY-990
This commit is contained in:
parent
beebe6d5ed
commit
2d6a6a1f6c
1 changed files with 44 additions and 2 deletions
|
@ -6,6 +6,8 @@ import os.path
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
import trollius
|
||||||
|
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from trollius import From, coroutine, Return, async
|
from trollius import From, coroutine, Return, async
|
||||||
|
@ -31,7 +33,7 @@ RETRY_IMMEDIATELY_TIMEOUT = 0
|
||||||
NO_WORKER_AVAILABLE_TIMEOUT = 10
|
NO_WORKER_AVAILABLE_TIMEOUT = 10
|
||||||
DEFAULT_EPHEMERAL_API_TIMEOUT = 20
|
DEFAULT_EPHEMERAL_API_TIMEOUT = 20
|
||||||
DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 500
|
DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 500
|
||||||
|
ETCD_DOWN_SLEEP_TIMEOUT = 5
|
||||||
|
|
||||||
class EtcdAction(object):
|
class EtcdAction(object):
|
||||||
""" Enumeration of the various kinds of etcd actions we can observe via a watch. """
|
""" Enumeration of the various kinds of etcd actions we can observe via a watch. """
|
||||||
|
@ -129,6 +131,15 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
logger.debug('Etcd key already cleared: %s', etcd_key)
|
logger.debug('Etcd key already cleared: %s', etcd_key)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
except etcd.EtcdConnectionFailed:
|
||||||
|
# If the connection has failed, then etcd is most likely down, and we need to
|
||||||
|
# sleep for a bit before checking for it to come up again.
|
||||||
|
logger.exception('Connecting to etcd failed; sleeping for %s and then trying again',
|
||||||
|
ETCD_DOWN_SLEEP_TIMEOUT)
|
||||||
|
time.sleep(ETCD_DOWN_SLEEP_TIMEOUT)
|
||||||
|
logger.exception('Connecting to etcd failed; slept for %s and now trying again',
|
||||||
|
ETCD_DOWN_SLEEP_TIMEOUT)
|
||||||
|
|
||||||
except etcd.EtcdException as eex:
|
except etcd.EtcdException as eex:
|
||||||
# TODO(jschorr): This is a quick and dirty hack and should be replaced
|
# TODO(jschorr): This is a quick and dirty hack and should be replaced
|
||||||
# with a proper exception check.
|
# with a proper exception check.
|
||||||
|
@ -328,6 +339,9 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
# no realms have been registered yet
|
# no realms have been registered yet
|
||||||
pass
|
pass
|
||||||
|
except etcd.EtcdConnectionFailed:
|
||||||
|
# Not much to do.
|
||||||
|
pass
|
||||||
|
|
||||||
def _load_executor(self, executor_kind_name, executor_config):
|
def _load_executor(self, executor_kind_name, executor_config):
|
||||||
executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name)
|
executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name)
|
||||||
|
@ -427,6 +441,9 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
workers_alive = sum(1 for child in active_jobs.children if not child.dir)
|
workers_alive = sum(1 for child in active_jobs.children if not child.dir)
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
workers_alive = 0
|
workers_alive = 0
|
||||||
|
except etcd.EtcdConnectionFailed:
|
||||||
|
logger.exception('Could not read job count from etcd for job due to etcd being down')
|
||||||
|
raise Return(False, ETCD_DOWN_SLEEP_TIMEOUT)
|
||||||
except etcd.EtcdException:
|
except etcd.EtcdException:
|
||||||
logger.exception('Exception when reading job count from etcd for job: %s', build_uuid)
|
logger.exception('Exception when reading job count from etcd for job: %s', build_uuid)
|
||||||
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
|
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
|
||||||
|
@ -465,6 +482,9 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
# The job was already taken by someone else, we are probably a retry
|
# The job was already taken by someone else, we are probably a retry
|
||||||
logger.warning('Job: %s already exists in etcd, timeout may be misconfigured', build_uuid)
|
logger.warning('Job: %s already exists in etcd, timeout may be misconfigured', build_uuid)
|
||||||
raise Return(False, self._ephemeral_api_timeout)
|
raise Return(False, self._ephemeral_api_timeout)
|
||||||
|
except etcd.EtcdConnectionFailed:
|
||||||
|
logger.exception('Exception when writing job %s to etcd; could not connect', build_uuid)
|
||||||
|
raise Return(False, ETCD_DOWN_SLEEP_TIMEOUT)
|
||||||
except etcd.EtcdException:
|
except etcd.EtcdException:
|
||||||
logger.exception('Exception when writing job %s to etcd', build_uuid)
|
logger.exception('Exception when writing job %s to etcd', build_uuid)
|
||||||
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
|
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
|
||||||
|
@ -568,6 +588,9 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
logger.error('Realm %s already exists in etcd for job %s ' +
|
logger.error('Realm %s already exists in etcd for job %s ' +
|
||||||
'UUID collision or something is very very wrong.', realm, build_uuid)
|
'UUID collision or something is very very wrong.', realm, build_uuid)
|
||||||
raise Return(False, setup_time)
|
raise Return(False, setup_time)
|
||||||
|
except etcd.EtcdConnectionFailed:
|
||||||
|
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
|
||||||
|
raise Return(False, ETCD_DOWN_SLEEP_TIMEOUT)
|
||||||
except etcd.EtcdException:
|
except etcd.EtcdException:
|
||||||
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
|
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
|
||||||
raise Return(False, setup_time)
|
raise Return(False, setup_time)
|
||||||
|
@ -628,6 +651,10 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
job_key = self._etcd_job_key(build_job)
|
job_key = self._etcd_job_key(build_job)
|
||||||
try:
|
try:
|
||||||
yield From(self._etcd_client.delete(job_key))
|
yield From(self._etcd_client.delete(job_key))
|
||||||
|
except etcd.EtcdConnectionFailed:
|
||||||
|
logger.exception('Could not remove job key as etcd is not available')
|
||||||
|
yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT))
|
||||||
|
raise Return()
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
logger.debug('Builder is asking for job to be removed, but work already completed')
|
logger.debug('Builder is asking for job to be removed, but work already completed')
|
||||||
|
|
||||||
|
@ -637,6 +664,10 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
yield From(self._etcd_client.delete(metric_key))
|
yield From(self._etcd_client.delete(metric_key))
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
logger.debug('Builder is asking for metric to be removed, but key not found')
|
logger.debug('Builder is asking for metric to be removed, but key not found')
|
||||||
|
except etcd.EtcdConnectionFailed:
|
||||||
|
logger.exception('Could not remove metric key as etcd is not available')
|
||||||
|
yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT))
|
||||||
|
raise Return()
|
||||||
|
|
||||||
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
|
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
|
||||||
|
|
||||||
|
@ -695,7 +726,11 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
# Note: A TTL of < 0 in etcd results in the key *never being expired*. We use a max here
|
# Note: A TTL of < 0 in etcd results in the key *never being expired*. We use a max here
|
||||||
# to ensure that if the TTL is < 0, the key will expire immediately.
|
# to ensure that if the TTL is < 0, the key will expire immediately.
|
||||||
etcd_ttl = max(ttl, 0)
|
etcd_ttl = max(ttl, 0)
|
||||||
yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=etcd_ttl))
|
try:
|
||||||
|
yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=etcd_ttl))
|
||||||
|
except etcd.EtcdConnectionFailed:
|
||||||
|
logger.exception('Could not update heartbeat for job as etcd is not available')
|
||||||
|
yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT))
|
||||||
|
|
||||||
|
|
||||||
@coroutine
|
@coroutine
|
||||||
|
@ -710,6 +745,9 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
try:
|
try:
|
||||||
yield From(self._etcd_client.write(lock_key, {}, prevExist=False, ttl=ETCD_ATOMIC_OP_TIMEOUT))
|
yield From(self._etcd_client.write(lock_key, {}, prevExist=False, ttl=ETCD_ATOMIC_OP_TIMEOUT))
|
||||||
raise Return(True)
|
raise Return(True)
|
||||||
|
except etcd.EtcdConnectionFailed:
|
||||||
|
logger.exception('Could not get etcd atomic lock as etcd is down')
|
||||||
|
raise Return(False)
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
raise Return(False)
|
raise Return(False)
|
||||||
|
|
||||||
|
@ -769,6 +807,7 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
yield From(self.delete_etcd_key(self._etcd_realm_key(build_info.component.builder_realm)))
|
yield From(self.delete_etcd_key(self._etcd_realm_key(build_info.component.builder_realm)))
|
||||||
yield From(self.delete_etcd_key(self._etcd_metric_key(build_info.component.builder_realm)))
|
yield From(self.delete_etcd_key(self._etcd_metric_key(build_info.component.builder_realm)))
|
||||||
yield From(self.delete_etcd_key(os.path.join(self._etcd_job_prefix, build_uuid)))
|
yield From(self.delete_etcd_key(os.path.join(self._etcd_job_prefix, build_uuid)))
|
||||||
|
|
||||||
# This is outside the lock so we can un-register the component wherever it is registered to.
|
# This is outside the lock so we can un-register the component wherever it is registered to.
|
||||||
yield From(build_info.component.cancel_build())
|
yield From(build_info.component.cancel_build())
|
||||||
|
|
||||||
|
@ -778,3 +817,6 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
yield From(self._etcd_client.delete(etcd_key))
|
yield From(self._etcd_client.delete(etcd_key))
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
logger.warning('Could not delete etcd key %s', etcd_key)
|
logger.warning('Could not delete etcd key %s', etcd_key)
|
||||||
|
except etcd.EtcdConnectionFailed:
|
||||||
|
logger.exception('Could not delete etcd key as etcd is down')
|
||||||
|
yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT))
|
||||||
|
|
Reference in a new issue