diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 0978c1b8d..ad976cc64 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -4,6 +4,7 @@ import uuid import calendar import os.path import json +import boto from datetime import datetime, timedelta from trollius import From, coroutine, Return, async @@ -45,6 +46,10 @@ class EphemeralBuilderManager(BaseManager): _etcd_client_klass = etcd.Client def __init__(self, *args, **kwargs): + # Set a timeout so that boto connections don't hang indefinitely. + boto.config.add_section('Boto') + boto.config.set('Boto', 'http_socket_timeout', '10') + self._shutting_down = False self._manager_config = None @@ -90,6 +95,9 @@ class EphemeralBuilderManager(BaseManager): self._watch_tasks[watch_task_key] = async(watch_future) def _handle_builder_expiration(self, etcd_result): + if etcd_result is None: + return + if etcd_result.action == EtcdAction.EXPIRE: # Handle the expiration logger.debug('Builder expired, clean up the old build node') @@ -100,6 +108,9 @@ class EphemeralBuilderManager(BaseManager): async(self._executor.stop_builder(job_metadata['builder_id'])) def _handle_realm_change(self, etcd_result): + if etcd_result is None: + return + if etcd_result.action == EtcdAction.CREATE: # We must listen on the realm created by ourselves or another worker realm_spec = json.loads(etcd_result.value) @@ -160,7 +171,8 @@ class EphemeralBuilderManager(BaseManager): self._async_thread_executor = ThreadPoolExecutor(worker_threads) self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port, cert=etcd_auth, ca_cert=etcd_ca_cert, - protocol=etcd_protocol), + protocol=etcd_protocol, + read_timeout=5), executor=self._async_thread_executor) self._etcd_builder_prefix = self._manager_config.get('ETCD_BUILDER_PREFIX', 'building/') @@ -201,6 +213,9 @@ class EphemeralBuilderManager(BaseManager): workers_alive = sum(1 for child in building.children if not child.dir) except KeyError: workers_alive = 0 + except etcd.EtcdException: + logger.exception('Exception when reading job count from etcd') + raise Return(False) logger.debug('Total jobs: %s', workers_alive) @@ -231,6 +246,9 @@ class EphemeralBuilderManager(BaseManager): # The job was already taken by someone else, we are probably a retry logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?') raise Return(False) + except etcd.EtcdException: + logger.exception('Exception when writing job %s to etcd', build_uuid) + raise Return(False) logger.debug('Starting builder with executor: %s', self._executor) @@ -255,12 +273,16 @@ class EphemeralBuilderManager(BaseManager): 'builder_id': builder_id, 'job_queue_item': build_job.job_item, }) + try: yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False, ttl=ttl)) except KeyError: logger.error('Realm already exists in etcd. UUID collision or something is very very wrong.') raise Return(False) + except etcd.EtcdException: + logger.exception('Exception when writing realm %s to etcd', realm) + raise Return(False) raise Return(True) diff --git a/buildman/server.py b/buildman/server.py index 6346f4a2e..63ccd9309 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -150,6 +150,7 @@ class BuilderServer(object): @trollius.coroutine def _work_checker(self): + logger.debug('Initializing work checker') while self._current_status == BuildServerStatus.RUNNING: with database.CloseForLongOperation(app.config): yield From(trollius.sleep(WORK_CHECK_TIMEOUT)) @@ -192,10 +193,12 @@ class BuilderServer(object): def _queue_metrics_updater(self): logger.debug('Initializing queue metrics updater') while self._current_status == BuildServerStatus.RUNNING: - yield From(trollius.sleep(30)) logger.debug('Writing metrics') self._queue.update_metrics() + logger.debug('Metrics going to sleep for 30 seconds') + yield From(trollius.sleep(30)) + @trollius.coroutine def _initialize(self, loop, host, websocket_port, controller_port, ssl=None): self._loop = loop