Add more check conditions for unhealthy workers and make the messaging better.
This commit is contained in:
parent
67905c277e
commit
510bbe7889
2 changed files with 7 additions and 5 deletions
|
@@ -41,12 +41,13 @@ def matches_system_error(status_str):
|
||||||
""" Returns true if the given status string matches a known system error in the
|
""" Returns true if the given status string matches a known system error in the
|
||||||
Docker builder.
|
Docker builder.
|
||||||
"""
|
"""
|
||||||
KNOWN_MATCHES = ['lxc-start: invalid', 'lxc-start: failed to', 'lxc-start: Permission denied']
|
KNOWN_MATCHES = ['lxc-start: invalid', 'lxc-start: failed to', 'lxc-start: Permission denied',
|
||||||
|
'lxc-start: The container failed']
|
||||||
|
|
||||||
for match in KNOWN_MATCHES:
|
for match in KNOWN_MATCHES:
|
||||||
# 4 because we might have a Unix control code at the start.
|
# 10 because we might have a Unix control code at the start.
|
||||||
found = status_str.find(match[0:len(match) + 4])
|
found = status_str.find(match[0:len(match) + 10])
|
||||||
if found >= 0 and found <= 4:
|
if found >= 0 and found <= 10:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
@@ -613,6 +614,7 @@ class DockerfileBuildWorker(Worker):
|
||||||
|
|
||||||
except WorkerUnhealthyException as exc:
|
except WorkerUnhealthyException as exc:
|
||||||
# Spawn a notification that the build has failed.
|
# Spawn a notification that the build has failed.
|
||||||
|
log_appender('Worker has become unhealthy. Will retry shortly.', build_logs.ERROR)
|
||||||
spawn_failure(exc.message, event_data)
|
spawn_failure(exc.message, event_data)
|
||||||
|
|
||||||
# Raise the exception to the queue.
|
# Raise the exception to the queue.
|
||||||
|
|
|
@@ -135,8 +135,8 @@ class Worker(object):
|
||||||
|
|
||||||
except WorkerUnhealthyException:
|
except WorkerUnhealthyException:
|
||||||
logger.error('The worker has encountered an error and will not take new jobs. Job is being requeued.')
|
logger.error('The worker has encountered an error and will not take new jobs. Job is being requeued.')
|
||||||
self._stop.set()
|
|
||||||
self.mark_current_incomplete(restore_retry=True)
|
self.mark_current_incomplete(restore_retry=True)
|
||||||
|
self._stop.set()
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
# Close the db handle periodically
|
# Close the db handle periodically
|
||||||
|
|
Reference in a new issue