Make build workers report that they are unhealthy when we get an LXC error or a Docker connection issue

This commit is contained in:
Joseph Schorr 2014-07-30 17:54:58 -04:00
parent b12d63ce9a
commit 7e935f5a8c
2 changed files with 57 additions and 14 deletions

View file

@ -96,6 +96,14 @@ class Worker(object):
if self.current_queue_item is not None:
self._queue.extend_processing(self.current_queue_item, seconds_from_now)
def run_watchdog(self):
    """Run a single watchdog pass, flagging the worker as stopped on failure.

    Invokes ``self.watchdog()``. When that call raises
    ``WorkerUnhealthyException``, an error is logged and ``self._stop`` is
    set. Any other exception raised by the watchdog is not handled here and
    propagates to the caller.
    """
    logger.debug('Running watchdog.')

    try:
        self.watchdog()
    except WorkerUnhealthyException:
        # Unhealthy: fall through to the handling below.
        pass
    else:
        # The watchdog completed without reporting a problem.
        return

    logger.error('The worker has encountered an error and will not take new jobs.')
    # NOTE(review): presumably the main loop watches this event and exits
    # once it is set -- confirm against the rest of the class.
    self._stop.set()
def poll_queue(self):
logger.debug('Getting work item from queue.')
@ -112,7 +120,7 @@ class Worker(object):
logger.warning('An error occurred processing request: %s', self.current_queue_item.body)
self._queue.incomplete(self.current_queue_item)
except WorkerUnhealthyException:
logger.error('The worker has encountered an error and will not take new jobs.')
logger.error('The worker has encountered an error and will not take new jobs. Job is being requeued.')
self._stop.set()
self._queue.incomplete(self.current_queue_item, restore_retry=True)
finally:
@ -147,7 +155,7 @@ class Worker(object):
self._sched.add_interval_job(self.poll_queue, seconds=self._poll_period_seconds,
start_date=soon)
self._sched.add_interval_job(self.update_queue_metrics, seconds=60, start_date=soon)
self._sched.add_interval_job(self.watchdog, seconds=self._watchdog_period_seconds)
self._sched.add_interval_job(self.run_watchdog, seconds=self._watchdog_period_seconds)
signal.signal(signal.SIGTERM, self.terminate)
signal.signal(signal.SIGINT, self.terminate)