Add more check conditions for unhealthy workers and make the messaging better.

This commit is contained in:
Joseph Schorr 2014-08-26 12:41:43 -04:00
parent 67905c277e
commit 510bbe7889
2 changed files with 7 additions and 5 deletions

View file

@@ -41,12 +41,13 @@ def matches_system_error(status_str):
""" Returns true if the given status string matches a known system error in the """ Returns true if the given status string matches a known system error in the
Docker builder. Docker builder.
""" """
KNOWN_MATCHES = ['lxc-start: invalid', 'lxc-start: failed to', 'lxc-start: Permission denied'] KNOWN_MATCHES = ['lxc-start: invalid', 'lxc-start: failed to', 'lxc-start: Permission denied',
'lxc-start: The container failed']
for match in KNOWN_MATCHES: for match in KNOWN_MATCHES:
# 4 because we might have a Unix control code at the start. # 10 because we might have a Unix control code at the start.
found = status_str.find(match[0:len(match) + 4]) found = status_str.find(match[0:len(match) + 10])
if found >= 0 and found <= 4: if found >= 0 and found <= 10:
return True return True
return False return False
@@ -613,6 +614,7 @@ class DockerfileBuildWorker(Worker):
except WorkerUnhealthyException as exc: except WorkerUnhealthyException as exc:
# Spawn a notification that the build has failed. # Spawn a notification that the build has failed.
log_appender('Worker has become unhealthy. Will retry shortly.', build_logs.ERROR)
spawn_failure(exc.message, event_data) spawn_failure(exc.message, event_data)
# Raise the exception to the queue. # Raise the exception to the queue.

View file

@@ -135,8 +135,8 @@ class Worker(object):
except WorkerUnhealthyException: except WorkerUnhealthyException:
logger.error('The worker has encountered an error and will not take new jobs. Job is being requeued.') logger.error('The worker has encountered an error and will not take new jobs. Job is being requeued.')
self._stop.set()
self.mark_current_incomplete(restore_retry=True) self.mark_current_incomplete(restore_retry=True)
self._stop.set()
finally: finally:
# Close the db handle periodically # Close the db handle periodically