diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index f5e74706d..180f22026 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -6,6 +6,8 @@ import os.path import json import time +import trollius + from collections import namedtuple from datetime import datetime, timedelta from trollius import From, coroutine, Return, async @@ -31,7 +33,7 @@ RETRY_IMMEDIATELY_TIMEOUT = 0 NO_WORKER_AVAILABLE_TIMEOUT = 10 DEFAULT_EPHEMERAL_API_TIMEOUT = 20 DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 500 - +ETCD_DOWN_SLEEP_TIMEOUT = 5 class EtcdAction(object): """ Enumeration of the various kinds of etcd actions we can observe via a watch. """ @@ -129,6 +131,15 @@ class EphemeralBuilderManager(BaseManager): logger.debug('Etcd key already cleared: %s', etcd_key) return + except etcd.EtcdConnectionFailed: + # If the connection has failed, then etcd is most likely down, and we need to + # sleep for a bit before checking for it to come up again. + logger.exception('Connecting to etcd failed; sleeping for %s and then trying again', + ETCD_DOWN_SLEEP_TIMEOUT) + time.sleep(ETCD_DOWN_SLEEP_TIMEOUT) + logger.exception('Connecting to etcd failed; slept for %s and now trying again', + ETCD_DOWN_SLEEP_TIMEOUT) + except etcd.EtcdException as eex: # TODO(jschorr): This is a quick and dirty hack and should be replaced # with a proper exception check. @@ -328,6 +339,9 @@ class EphemeralBuilderManager(BaseManager): except (KeyError, etcd.EtcdKeyError): # no realms have been registered yet pass + except etcd.EtcdConnectionFailed: + # Not much to do. + pass def _load_executor(self, executor_kind_name, executor_config): executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name) @@ -427,6 +441,9 @@ class EphemeralBuilderManager(BaseManager): workers_alive = sum(1 for child in active_jobs.children if not child.dir) except (KeyError, etcd.EtcdKeyError): workers_alive = 0 + except etcd.EtcdConnectionFailed: + logger.exception('Could not read job count from etcd for job due to etcd being down') + raise Return(False, ETCD_DOWN_SLEEP_TIMEOUT) except etcd.EtcdException: logger.exception('Exception when reading job count from etcd for job: %s', build_uuid) raise Return(False, RETRY_IMMEDIATELY_TIMEOUT) @@ -465,6 +482,9 @@ class EphemeralBuilderManager(BaseManager): # The job was already taken by someone else, we are probably a retry logger.warning('Job: %s already exists in etcd, timeout may be misconfigured', build_uuid) raise Return(False, self._ephemeral_api_timeout) + except etcd.EtcdConnectionFailed: + logger.exception('Exception when writing job %s to etcd; could not connect', build_uuid) + raise Return(False, ETCD_DOWN_SLEEP_TIMEOUT) except etcd.EtcdException: logger.exception('Exception when writing job %s to etcd', build_uuid) raise Return(False, RETRY_IMMEDIATELY_TIMEOUT) @@ -568,6 +588,9 @@ class EphemeralBuilderManager(BaseManager): logger.error('Realm %s already exists in etcd for job %s ' + 'UUID collision or something is very very wrong.', realm, build_uuid) raise Return(False, setup_time) + except etcd.EtcdConnectionFailed: + logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid) + raise Return(False, ETCD_DOWN_SLEEP_TIMEOUT) except etcd.EtcdException: logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid) raise Return(False, setup_time) @@ -628,6 +651,10 @@ class EphemeralBuilderManager(BaseManager): job_key = self._etcd_job_key(build_job) try: yield From(self._etcd_client.delete(job_key)) + except etcd.EtcdConnectionFailed: + logger.exception('Could not remove job key as etcd is not available') + yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT)) + raise Return() except (KeyError, etcd.EtcdKeyError): logger.debug('Builder is asking for job to be removed, but work already completed') @@ -637,6 +664,10 @@ class EphemeralBuilderManager(BaseManager): yield From(self._etcd_client.delete(metric_key)) except (KeyError, etcd.EtcdKeyError): logger.debug('Builder is asking for metric to be removed, but key not found') + except etcd.EtcdConnectionFailed: + logger.exception('Could not remove metric key as etcd is not available') + yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT)) + raise Return() logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status) @@ -695,7 +726,11 @@ class EphemeralBuilderManager(BaseManager): # Note: A TTL of < 0 in etcd results in the key *never being expired*. We use a max here # to ensure that if the TTL is < 0, the key will expire immediately. etcd_ttl = max(ttl, 0) - yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=etcd_ttl)) + try: + yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=etcd_ttl)) + except etcd.EtcdConnectionFailed: + logger.exception('Could not update heartbeat for job as etcd is not available') + yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT)) @coroutine @@ -710,6 +745,9 @@ class EphemeralBuilderManager(BaseManager): try: yield From(self._etcd_client.write(lock_key, {}, prevExist=False, ttl=ETCD_ATOMIC_OP_TIMEOUT)) raise Return(True) + except etcd.EtcdConnectionFailed: + logger.exception('Could not get etcd atomic lock as etcd is down') + raise Return(False) except (KeyError, etcd.EtcdKeyError): raise Return(False) @@ -769,6 +807,7 @@ class EphemeralBuilderManager(BaseManager): yield From(self.delete_etcd_key(self._etcd_realm_key(build_info.component.builder_realm))) yield From(self.delete_etcd_key(self._etcd_metric_key(build_info.component.builder_realm))) yield From(self.delete_etcd_key(os.path.join(self._etcd_job_prefix, build_uuid))) + # This is outside the lock so we can un-register the component wherever it is registered to. yield From(build_info.component.cancel_build()) @@ -778,3 +817,6 @@ class EphemeralBuilderManager(BaseManager): yield From(self._etcd_client.delete(etcd_key)) except (KeyError, etcd.EtcdKeyError): logger.warning('Could not delete etcd key %s', etcd_key) + except etcd.EtcdConnectionFailed: + logger.exception('Could not delete etcd key as etcd is down') + yield From(trollius.sleep(ETCD_DOWN_SLEEP_TIMEOUT))