Fix etcd watching
Etcd can miss events on watches if they are occurring fast enough, so if we can get an exception indicating that we've missed an index, we reset the state of our local tracking structures by re-reading the *full* list and starting a new watch at HEAD
This commit is contained in:
parent
05d5238cc8
commit
52fa9aad5b
2 changed files with 34 additions and 1 deletions
|
@ -89,6 +89,16 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
except (ProtocolError, etcd.EtcdException):
|
except (ProtocolError, etcd.EtcdException):
|
||||||
logger.exception('Exception on etcd watch: %s', etcd_key)
|
logger.exception('Exception on etcd watch: %s', etcd_key)
|
||||||
|
|
||||||
|
except etcd.EtcdEventIndexCleared:
|
||||||
|
# This happens if etcd2 has moved forward too fast for us to start watching
|
||||||
|
# at the index we retrieved. We therefore start a new watch at HEAD and
|
||||||
|
# call the method to read the key and load ALL realms to handle any we might
|
||||||
|
# have missed.
|
||||||
|
# TODO: Remove this hack once Etcd is fixed.
|
||||||
|
logger.exception('Etcd moved forward too quickly. Restarting watch cycle.')
|
||||||
|
new_index = None
|
||||||
|
async(self._register_existing_realms())
|
||||||
|
|
||||||
if watch_task_key not in self._watch_tasks or self._watch_tasks[watch_task_key].done():
|
if watch_task_key not in self._watch_tasks or self._watch_tasks[watch_task_key].done():
|
||||||
self._watch_etcd(etcd_key, change_callback, start_index=new_index)
|
self._watch_etcd(etcd_key, change_callback, start_index=new_index)
|
||||||
|
|
||||||
|
@ -165,18 +175,35 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
logger.debug('Registering realm with manager: %s', realm_spec['realm'])
|
logger.debug('Registering realm with manager: %s', realm_spec['realm'])
|
||||||
component = self.register_component(realm_spec['realm'], BuildComponent,
|
component = self.register_component(realm_spec['realm'], BuildComponent,
|
||||||
token=realm_spec['token'])
|
token=realm_spec['token'])
|
||||||
|
|
||||||
|
if component in self._component_to_job:
|
||||||
|
logger.debug('Realm already registered with manager: %s', realm_spec['realm'])
|
||||||
|
return component
|
||||||
|
|
||||||
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
|
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
|
||||||
self._component_to_job[component] = build_job
|
self._component_to_job[component] = build_job
|
||||||
self._component_to_builder[component] = realm_spec['builder_id']
|
self._component_to_builder[component] = realm_spec['builder_id']
|
||||||
self._job_uuid_to_component[build_job.job_details['build_uuid']] = component
|
self._job_uuid_to_component[build_job.job_details['build_uuid']] = component
|
||||||
|
return component
|
||||||
|
|
||||||
@coroutine
|
@coroutine
|
||||||
def _register_existing_realms(self):
|
def _register_existing_realms(self):
|
||||||
try:
|
try:
|
||||||
all_realms = yield From(self._etcd_client.read(self._etcd_realm_prefix, recursive=True))
|
all_realms = yield From(self._etcd_client.read(self._etcd_realm_prefix, recursive=True))
|
||||||
|
|
||||||
|
# Register all existing realms found.
|
||||||
|
encountered = set()
|
||||||
for realm in all_realms.children:
|
for realm in all_realms.children:
|
||||||
if not realm.dir:
|
if not realm.dir:
|
||||||
self._register_realm(json.loads(realm.value))
|
component = self._register_realm(json.loads(realm.value))
|
||||||
|
encountered.add(component)
|
||||||
|
|
||||||
|
# Remove any components not encountered so we can clean up.
|
||||||
|
for found in list(self._component_to_job.keys()):
|
||||||
|
if not found in encountered:
|
||||||
|
self._component_to_job.pop(component)
|
||||||
|
self._component_to_builder.pop(component)
|
||||||
|
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
# no realms have been registered yet
|
# no realms have been registered yet
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -36,6 +36,7 @@ class BuilderServer(object):
|
||||||
self._loop = None
|
self._loop = None
|
||||||
self._current_status = BuildServerStatus.STARTING
|
self._current_status = BuildServerStatus.STARTING
|
||||||
self._current_components = []
|
self._current_components = []
|
||||||
|
self._realm_map = {}
|
||||||
self._job_count = 0
|
self._job_count = 0
|
||||||
|
|
||||||
self._session_factory = RouterSessionFactory(RouterFactory())
|
self._session_factory = RouterSessionFactory(RouterFactory())
|
||||||
|
@ -111,6 +112,9 @@ class BuilderServer(object):
|
||||||
BaseComponent.
|
BaseComponent.
|
||||||
"""
|
"""
|
||||||
logger.debug('Registering component with realm %s', realm)
|
logger.debug('Registering component with realm %s', realm)
|
||||||
|
if realm in self._realm_map:
|
||||||
|
logger.debug('Component with realm %s already registered', realm)
|
||||||
|
return self._realm_map[realm]
|
||||||
|
|
||||||
component = component_klass(types.ComponentConfig(realm=realm), realm=realm, **kwargs)
|
component = component_klass(types.ComponentConfig(realm=realm), realm=realm, **kwargs)
|
||||||
component.server = self
|
component.server = self
|
||||||
|
@ -119,6 +123,7 @@ class BuilderServer(object):
|
||||||
component.user_files = self._user_files
|
component.user_files = self._user_files
|
||||||
component.registry_hostname = self._registry_hostname
|
component.registry_hostname = self._registry_hostname
|
||||||
|
|
||||||
|
self._realm_map[realm] = component
|
||||||
self._current_components.append(component)
|
self._current_components.append(component)
|
||||||
self._session_factory.add(component)
|
self._session_factory.add(component)
|
||||||
return component
|
return component
|
||||||
|
@ -127,6 +132,7 @@ class BuilderServer(object):
|
||||||
logger.debug('Unregistering component with realm %s and token %s',
|
logger.debug('Unregistering component with realm %s and token %s',
|
||||||
component.builder_realm, component.expected_token)
|
component.builder_realm, component.expected_token)
|
||||||
|
|
||||||
|
self._realm_map.pop(component.builder_realm)
|
||||||
self._current_components.remove(component)
|
self._current_components.remove(component)
|
||||||
self._session_factory.remove(component)
|
self._session_factory.remove(component)
|
||||||
|
|
||||||
|
|
Reference in a new issue