Make build workers report that they are unhealthy when we get an LXC error or a Docker connection issue

Joseph Schorr 2014-07-30 17:54:58 -04:00
parent b12d63ce9a
commit 7e935f5a8c
2 changed files with 57 additions and 14 deletions

workers/dockerfilebuild.py
@@ -20,6 +20,7 @@ from datetime import datetime, timedelta
 from threading import Event
 from uuid import uuid4
 from collections import defaultdict
+from requests.exceptions import ConnectionError

 from data import model
 from workers.worker import Worker, WorkerUnhealthyException, JobException
@@ -35,6 +36,19 @@ CACHE_EXPIRATION_PERIOD_HOURS = 24
 NO_TAGS = ['<none>:<none>']
 RESERVATION_TIME = (TIMEOUT_PERIOD_MINUTES + 5) * 60

+
+def matches_system_error(status_str):
+  """ Returns true if the given status string matches a known system error in the
+      Docker builder.
+  """
+  KNOWN_MATCHES = ['lxc-start: invalid', 'lxc-start: failed to', 'lxc-start: Permission denied']
+
+  for match in KNOWN_MATCHES:
+    # The match may start up to 4 characters in, to allow for a Unix control code
+    # at the beginning of the status line.
+    index = status_str.find(match)
+    if 0 <= index <= 4:
+      return True
+
+  return False
+
+
 class StatusWrapper(object):
   def __init__(self, build_uuid):
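
As a quick illustration of how matches_system_error classifies builder status lines (not part of the commit; the sample strings below are invented, and the control-code prefix stays within the 4-character allowance):

samples = [
  '\x1b[1mlxc-start: Permission denied - failed to mount rootfs',  # system error behind a short control code
  'Step 3 : RUN apt-get update',                                   # ordinary build output
]

for sample in samples:
  label = 'unhealthy' if matches_system_error(sample) else 'ok'
  print('%s: %r' % (label, sample))
  # prints 'unhealthy' for the first sample and 'ok' for the second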
@@ -141,13 +155,20 @@ class DockerfileBuildContext(object):
       message = 'Docker installation is no longer healthy.'
       logger.exception(message)
       raise WorkerUnhealthyException(message)

     return self

   def __exit__(self, exc_type, value, traceback):
-    self.__cleanup_containers()
     shutil.rmtree(self._build_dir)

+    try:
+      self.__cleanup_containers()
+    except APIError:
+      sentry.client.captureException()
+      message = 'Docker installation is no longer healthy.'
+      logger.exception(message)
+      raise WorkerUnhealthyException(message)
+
   @staticmethod
   def __inject_quay_repo_env(parsed_dockerfile, quay_reponame):
     env_command = {
@@ -247,6 +268,11 @@ class DockerfileBuildContext(object):
         fully_unwrapped = status
       status_str = str(fully_unwrapped.encode('utf-8'))
+
+      # Check for system errors when building.
+      if matches_system_error(status_str):
+        raise WorkerUnhealthyException(status_str)
+
       logger.debug('Status: %s', status_str)

       step_increment = re.search(r'Step ([0-9]+) :', status_str)
       if step_increment:
@@ -437,17 +463,20 @@ class DockerfileBuildWorker(Worker):
   def watchdog(self):
     logger.debug('Running build watchdog code.')
-    docker_cl = Client()
-
-    # Iterate the running containers and kill ones that have been running more than 20 minutes
-    for container in docker_cl.containers():
-      start_time = datetime.fromtimestamp(container['Created'])
-      running_time = datetime.now() - start_time
-      if running_time > timedelta(minutes=TIMEOUT_PERIOD_MINUTES):
-        logger.warning('Container has been running too long: %s with command: %s',
-                       container['Id'], container['Command'])
-        docker_cl.kill(container['Id'])
-        self._timeout.set()
+    try:
+      docker_cl = Client()
+
+      # Iterate the running containers and kill ones that have been running more than 20 minutes
+      for container in docker_cl.containers():
+        start_time = datetime.fromtimestamp(container['Created'])
+        running_time = datetime.now() - start_time
+        if running_time > timedelta(minutes=TIMEOUT_PERIOD_MINUTES):
+          logger.warning('Container has been running too long: %s with command: %s',
+                         container['Id'], container['Command'])
+          docker_cl.kill(container['Id'])
+          self._timeout.set()
+
+    except ConnectionError as exc:
+      raise WorkerUnhealthyException(exc.message)

   def process_queue_item(self, job_details):
     self._timeout.clear()
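
The ConnectionError handled above is the one docker-py lets bubble up (via requests) when the daemon cannot be reached. A small way to reproduce that failure mode outside the worker, assuming docker-py's Client and an address where no Docker daemon is listening:

from docker import Client
from requests.exceptions import ConnectionError

try:
  # Assumed: nothing is listening on 127.0.0.1:9999.
  Client(base_url='tcp://127.0.0.1:9999').containers()
except ConnectionError as exc:
  print('Docker daemon unreachable: %s' % exc)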
@@ -548,6 +577,12 @@ class DockerfileBuildWorker(Worker):
       # Need a separate handler for this so it doesn't get caught by catch all below
       raise exc

+    except ConnectionError as exc:
+      # A connection error means the Docker daemon is down, so the worker has become
+      # unhealthy; raise WorkerUnhealthyException to report that.
+      log_appender('Docker daemon has gone away. Will retry shortly.', build_logs.ERROR)
+      raise WorkerUnhealthyException(exc.message)
+
     except Exception as exc:
       sentry.client.captureException()
       log_appender('error', build_logs.PHASE)

workers/worker.py
@@ -96,6 +96,14 @@ class Worker(object):
     if self.current_queue_item is not None:
       self._queue.extend_processing(self.current_queue_item, seconds_from_now)

+  def run_watchdog(self):
+    logger.debug('Running watchdog.')
+    try:
+      self.watchdog()
+    except WorkerUnhealthyException:
+      logger.error('The worker has encountered an error and will not take new jobs.')
+      self._stop.set()
+
   def poll_queue(self):
     logger.debug('Getting work item from queue.')
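
A brief sketch of what the new wrapper changes for the scheduler: a watchdog that raises WorkerUnhealthyException no longer throws into the scheduler thread; the worker simply flags itself as stopped. The subclass and its trimmed-down __init__ below are illustrative stand-ins, not code from the repository:

from threading import Event
from workers.worker import Worker, WorkerUnhealthyException

class UnreachableDockerWorker(Worker):
  def __init__(self):
    # Only the state run_watchdog touches; the real constructor arguments are skipped here.
    self._stop = Event()

  def watchdog(self):
    raise WorkerUnhealthyException('Docker daemon is unreachable')

worker = UnreachableDockerWorker()
worker.run_watchdog()          # logs the error instead of raising
assert worker._stop.is_set()   # the polling loop will now wind down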
@@ -112,7 +120,7 @@ class Worker(object):
         logger.warning('An error occurred processing request: %s', self.current_queue_item.body)
         self._queue.incomplete(self.current_queue_item)
       except WorkerUnhealthyException:
-        logger.error('The worker has encountered an error and will not take new jobs.')
+        logger.error('The worker has encountered an error and will not take new jobs. Job is being requeued.')
         self._stop.set()
         self._queue.incomplete(self.current_queue_item, restore_retry=True)
       finally:
@@ -147,7 +155,7 @@ class Worker(object):
     self._sched.add_interval_job(self.poll_queue, seconds=self._poll_period_seconds,
                                  start_date=soon)
     self._sched.add_interval_job(self.update_queue_metrics, seconds=60, start_date=soon)
-    self._sched.add_interval_job(self.watchdog, seconds=self._watchdog_period_seconds)
+    self._sched.add_interval_job(self.run_watchdog, seconds=self._watchdog_period_seconds)

     signal.signal(signal.SIGTERM, self.terminate)
     signal.signal(signal.SIGINT, self.terminate)