Add additional logging, timeouts, and exception checks

This commit is contained in:
Joseph Schorr 2015-02-25 15:15:22 -05:00
parent 328de0201f
commit 5dd78f76c7
2 changed files with 27 additions and 2 deletions

View file

@ -4,6 +4,7 @@ import uuid
import calendar
import os.path
import json
import boto
from datetime import datetime, timedelta
from trollius import From, coroutine, Return, async
@ -45,6 +46,10 @@ class EphemeralBuilderManager(BaseManager):
_etcd_client_klass = etcd.Client
def __init__(self, *args, **kwargs):
# Set a timeout so that boto connections don't hang indefinitely.
boto.config.add_section('Boto')
boto.config.set('Boto', 'http_socket_timeout', '10')
self._shutting_down = False
self._manager_config = None
@ -90,6 +95,9 @@ class EphemeralBuilderManager(BaseManager):
self._watch_tasks[watch_task_key] = async(watch_future)
def _handle_builder_expiration(self, etcd_result):
if etcd_result is None:
return
if etcd_result.action == EtcdAction.EXPIRE:
# Handle the expiration
logger.debug('Builder expired, clean up the old build node')
@ -100,6 +108,9 @@ class EphemeralBuilderManager(BaseManager):
async(self._executor.stop_builder(job_metadata['builder_id']))
def _handle_realm_change(self, etcd_result):
if etcd_result is None:
return
if etcd_result.action == EtcdAction.CREATE:
# We must listen on the realm created by ourselves or another worker
realm_spec = json.loads(etcd_result.value)
@ -160,7 +171,8 @@ class EphemeralBuilderManager(BaseManager):
self._async_thread_executor = ThreadPoolExecutor(worker_threads)
self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port,
cert=etcd_auth, ca_cert=etcd_ca_cert,
protocol=etcd_protocol),
protocol=etcd_protocol,
read_timeout=5),
executor=self._async_thread_executor)
self._etcd_builder_prefix = self._manager_config.get('ETCD_BUILDER_PREFIX', 'building/')
@ -201,6 +213,9 @@ class EphemeralBuilderManager(BaseManager):
workers_alive = sum(1 for child in building.children if not child.dir)
except KeyError:
workers_alive = 0
except etcd.EtcdException:
logger.exception('Exception when reading job count from etcd')
raise Return(False)
logger.debug('Total jobs: %s', workers_alive)
@ -231,6 +246,9 @@ class EphemeralBuilderManager(BaseManager):
# The job was already taken by someone else, we are probably a retry
logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?')
raise Return(False)
except etcd.EtcdException:
logger.exception('Exception when writing job %s to etcd', build_uuid)
raise Return(False)
logger.debug('Starting builder with executor: %s', self._executor)
@ -255,12 +273,16 @@ class EphemeralBuilderManager(BaseManager):
'builder_id': builder_id,
'job_queue_item': build_job.job_item,
})
try:
yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False,
ttl=ttl))
except KeyError:
logger.error('Realm already exists in etcd. UUID collision or something is very very wrong.')
raise Return(False)
except etcd.EtcdException:
logger.exception('Exception when writing realm %s to etcd', realm)
raise Return(False)
raise Return(True)

View file

@ -150,6 +150,7 @@ class BuilderServer(object):
@trollius.coroutine
def _work_checker(self):
logger.debug('Initializing work checker')
while self._current_status == BuildServerStatus.RUNNING:
with database.CloseForLongOperation(app.config):
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
@ -192,10 +193,12 @@ class BuilderServer(object):
def _queue_metrics_updater(self):
logger.debug('Initializing queue metrics updater')
while self._current_status == BuildServerStatus.RUNNING:
yield From(trollius.sleep(30))
logger.debug('Writing metrics')
self._queue.update_metrics()
logger.debug('Metrics going to sleep for 30 seconds')
yield From(trollius.sleep(30))
@trollius.coroutine
def _initialize(self, loop, host, websocket_port, controller_port, ssl=None):
self._loop = loop