Add a sleep timeout to the build manager's etcd operations when it cannot connect to etcd

This ensures that the build managers do not sit there thrashing against a nonexistent or unreachable etcd cluster, which drives up the CPU on our production nodes and takes them out of service.

Addresses https://jira.coreos.com/browse/QUAY-990
This commit is contained in:
Joseph Schorr 2018-07-08 12:25:33 +03:00
parent beebe6d5ed
commit 2d6a6a1f6c

View file

@ -6,6 +6,8 @@ import os.path
import json import json
import time import time
import trollius
from collections import namedtuple from collections import namedtuple
from datetime import datetime, timedelta from datetime import datetime, timedelta
from trollius import From, coroutine, Return, async from trollius import From, coroutine, Return, async
@ -31,7 +33,7 @@ RETRY_IMMEDIATELY_TIMEOUT = 0
NO_WORKER_AVAILABLE_TIMEOUT = 10 NO_WORKER_AVAILABLE_TIMEOUT = 10
DEFAULT_EPHEMERAL_API_TIMEOUT = 20 DEFAULT_EPHEMERAL_API_TIMEOUT = 20
DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 500 DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 500
ETCD_DOWN_SLEEP_TIMEOUT = 5
class EtcdAction(object): class EtcdAction(object):
""" Enumeration of the various kinds of etcd actions we can observe via a watch. """ """ Enumeration of the various kinds of etcd actions we can observe via a watch. """
@ -129,6 +131,15 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Etcd key already cleared: %s', etcd_key) logger.debug('Etcd key already cleared: %s', etcd_key)
return return
except etcd.EtcdConnectionFailed:
# If the connection has failed, then etcd is most likely down, and we need to
# sleep for a bit before checking for it to come up again.
logger.exception('Connecting to etcd failed; sleeping for %s and then trying again',
ETCD_DOWN_SLEEP_TIMEOUT)
time.sleep(ETCD_DOWN_SLEEP_TIMEOUT)
logger.exception('Connecting to etcd failed; slept for %s and now trying again',
ETCD_DOWN_SLEEP_TIMEOUT)
except etcd.EtcdException as eex: except etcd.EtcdException as eex:
# TODO(jschorr): This is a quick and dirty hack and should be replaced # TODO(jschorr): This is a quick and dirty hack and should be replaced
# with a proper exception check. # with a proper exception check.
@ -328,6 +339,9 @@ class EphemeralBuilderManager(BaseManager):
except (KeyError, etcd.EtcdKeyError): except (KeyError, etcd.EtcdKeyError):
# no realms have been registered yet # no realms have been registered yet
pass pass
except etcd.EtcdConnectionFailed:
# Not much to do.
pass
def _load_executor(self, executor_kind_name, executor_config): def _load_executor(self, executor_kind_name, executor_config):
executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name) executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name)
@ -427,6 +441,9 @@ class EphemeralBuilderManager(BaseManager):
workers_alive = sum(1 for child in active_jobs.children if not child.dir) workers_alive = sum(1 for child in active_jobs.children if not child.dir)
except (KeyError, etcd.EtcdKeyError): except (KeyError, etcd.EtcdKeyError):
workers_alive = 0 workers_alive = 0
except etcd.EtcdConnectionFailed:
logger.exception('Could not read job count from etcd for job due to etcd being down')
raise Return(False, ETCD_DOWN_SLEEP_TIMEOUT)
except etcd.EtcdException: except etcd.EtcdException:
logger.exception('Exception when reading job count from etcd for job: %s', build_uuid) logger.exception('Exception when reading job count from etcd for job: %s', build_uuid)
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT) raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
@ -465,6 +482,9 @@ class EphemeralBuilderManager(BaseManager):
# The job was already taken by someone else, we are probably a retry # The job was already taken by someone else, we are probably a retry
logger.warning('Job: %s already exists in etcd, timeout may be misconfigured', build_uuid) logger.warning('Job: %s already exists in etcd, timeout may be misconfigured', build_uuid)
raise Return(False, self._ephemeral_api_timeout) raise Return(False, self._ephemeral_api_timeout)
except etcd.EtcdConnectionFailed:
logger.exception('Exception when writing job %s to etcd; could not connect', build_uuid)
raise Return(False, ETCD_DOWN_SLEEP_TIMEOUT)
except etcd.EtcdException: except etcd.EtcdException:
logger.exception('Exception when writing job %s to etcd', build_uuid) logger.exception('Exception when writing job %s to etcd', build_uuid)
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT) raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
@ -568,6 +588,9 @@ class EphemeralBuilderManager(BaseManager):
logger.error('Realm %s already exists in etcd for job %s ' + logger.error('Realm %s already exists in etcd for job %s ' +
'UUID collision or something is very very wrong.', realm, build_uuid) 'UUID collision or something is very very wrong.', realm, build_uuid)
raise Return(False, setup_time) raise Return(False, setup_time)
except etcd.EtcdConnectionFailed:
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
raise Return(False, ETCD_DOWN_SLEEP_TIMEOUT)
except etcd.EtcdException: except etcd.EtcdException:
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid) logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
raise Return(False, setup_time) raise Return(False, setup_time)
@ -628,6 +651,10 @@ class EphemeralBuilderManager(BaseManager):
job_key = self._etcd_job_key(build_job) job_key = self._etcd_job_key(build_job)
try: try:
yield From(self._etcd_client.delete(job_key)) yield From(self._etcd_client.delete(job_key))
except etcd.EtcdConnectionFailed:
logger.exception('Could not remove job key as etcd is not available')
yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT))
raise Return()
except (KeyError, etcd.EtcdKeyError): except (KeyError, etcd.EtcdKeyError):
logger.debug('Builder is asking for job to be removed, but work already completed') logger.debug('Builder is asking for job to be removed, but work already completed')
@ -637,6 +664,10 @@ class EphemeralBuilderManager(BaseManager):
yield From(self._etcd_client.delete(metric_key)) yield From(self._etcd_client.delete(metric_key))
except (KeyError, etcd.EtcdKeyError): except (KeyError, etcd.EtcdKeyError):
logger.debug('Builder is asking for metric to be removed, but key not found') logger.debug('Builder is asking for metric to be removed, but key not found')
except etcd.EtcdConnectionFailed:
logger.exception('Could not remove metric key as etcd is not available')
yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT))
raise Return()
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status) logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
@ -695,7 +726,11 @@ class EphemeralBuilderManager(BaseManager):
# Note: A TTL of < 0 in etcd results in the key *never being expired*. We use a max here # Note: A TTL of < 0 in etcd results in the key *never being expired*. We use a max here
# to ensure that if the TTL is < 0, the key will expire immediately. # to ensure that if the TTL is < 0, the key will expire immediately.
etcd_ttl = max(ttl, 0) etcd_ttl = max(ttl, 0)
yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=etcd_ttl)) try:
yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=etcd_ttl))
except etcd.EtcdConnectionFailed:
logger.exception('Could not update heartbeat for job as etcd is not available')
yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT))
@coroutine @coroutine
@ -710,6 +745,9 @@ class EphemeralBuilderManager(BaseManager):
try: try:
yield From(self._etcd_client.write(lock_key, {}, prevExist=False, ttl=ETCD_ATOMIC_OP_TIMEOUT)) yield From(self._etcd_client.write(lock_key, {}, prevExist=False, ttl=ETCD_ATOMIC_OP_TIMEOUT))
raise Return(True) raise Return(True)
except etcd.EtcdConnectionFailed:
logger.exception('Could not get etcd atomic lock as etcd is down')
raise Return(False)
except (KeyError, etcd.EtcdKeyError): except (KeyError, etcd.EtcdKeyError):
raise Return(False) raise Return(False)
@ -769,6 +807,7 @@ class EphemeralBuilderManager(BaseManager):
yield From(self.delete_etcd_key(self._etcd_realm_key(build_info.component.builder_realm))) yield From(self.delete_etcd_key(self._etcd_realm_key(build_info.component.builder_realm)))
yield From(self.delete_etcd_key(self._etcd_metric_key(build_info.component.builder_realm))) yield From(self.delete_etcd_key(self._etcd_metric_key(build_info.component.builder_realm)))
yield From(self.delete_etcd_key(os.path.join(self._etcd_job_prefix, build_uuid))) yield From(self.delete_etcd_key(os.path.join(self._etcd_job_prefix, build_uuid)))
# This is outside the lock so we can un-register the component wherever it is registered to. # This is outside the lock so we can un-register the component wherever it is registered to.
yield From(build_info.component.cancel_build()) yield From(build_info.component.cancel_build())
@ -778,3 +817,6 @@ class EphemeralBuilderManager(BaseManager):
yield From(self._etcd_client.delete(etcd_key)) yield From(self._etcd_client.delete(etcd_key))
except (KeyError, etcd.EtcdKeyError): except (KeyError, etcd.EtcdKeyError):
logger.warning('Could not delete etcd key %s', etcd_key) logger.warning('Could not delete etcd key %s', etcd_key)
except etcd.EtcdConnectionFailed:
logger.exception('Could not delete etcd key as etcd is down')
yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT))