Add a heartbeat to the build status, so we know if a manager crashed

This commit is contained in:
Joseph Schorr 2014-11-14 15:31:02 -05:00
parent 01dc10b8fc
commit 043a30ee96

View file

@ -15,7 +15,8 @@ from buildman.workererror import WorkerError
from data.database import BUILD_PHASE
HEARTBEAT_DELTA = datetime.timedelta(seconds=15)
HEARTBEAT_DELTA = datetime.timedelta(seconds=30)
HEARTBEAT_TIMEOUT = 10
logger = logging.getLogger(__name__)
@ -276,9 +277,9 @@ class BuildComponent(BaseComponent):
self._set_status(COMPONENT_STATUS.RUNNING)
# Start the heartbeat check.
# Start the heartbeat check and updating loop.
loop = trollius.get_event_loop()
loop.create_task(self._check_heartbeat(loop))
loop.create_task(self._heartbeat(loop))
logger.debug('Build worker %s is connected and ready' % self.builder_realm)
return True
@ -288,17 +289,29 @@ class BuildComponent(BaseComponent):
def _on_heartbeat(self):
    """ Records the current time as the moment of the most recent heartbeat.

        The timestamp is a naive local-time `datetime` stored on
        `self._last_heartbeat`; the `_heartbeat` coroutine reads that attribute
        when deciding whether to call `_timeout()`.
    """
    # NOTE(review): presumably invoked whenever the build worker sends a
    # heartbeat message -- the registration of this callback is not visible here.
    self._last_heartbeat = datetime.datetime.now()
def _start_heartbeat(self, loop):
    """ Installs `loop` as the current trollius event loop and runs the heartbeat
        coroutine on it until that coroutine returns.

        `_heartbeat` returns once the component is no longer RUNNING/BUILDING or
        after it has called `_timeout()`, so this call blocks for that long.

        loop: the trollius event loop to install and drive the coroutine on.
    """
    trollius.set_event_loop(loop)
    # `_heartbeat` is declared as `_heartbeat(self, loop)`; calling it without the
    # loop argument raised a TypeError before the coroutine could even be created.
    loop.run_until_complete(self._heartbeat(loop))
@trollius.coroutine
def _check_heartbeat(self, loop):
def _heartbeat(self, loop):
""" Coroutine that runs every HEARTBEAT_TIMEOUT seconds, both checking the worker's heartbeat
and updating the heartbeat in the build status dictionary (if applicable). This allows
the build system to catch crashes from either end.
"""
while True:
# If the component is no longer running or actively building, nothing more to do.
if (self._component_status != COMPONENT_STATUS.RUNNING and
self._component_status != COMPONENT_STATUS.BUILDING):
return
# If there is an active build, write the heartbeat to its status.
build_status = self._build_status
if build_status is not None:
with build_status as status_dict:
status_dict['heartbeat'] = int(time.time())
# Check the heartbeat from the worker.
logger.debug('Checking heartbeat on realm %s', self.builder_realm)
if not self._last_heartbeat:
self._timeout()
@ -308,7 +321,7 @@ class BuildComponent(BaseComponent):
self._timeout()
return
yield From(trollius.sleep(5))
yield From(trollius.sleep(HEARTBEAT_TIMEOUT))
def _timeout(self):
self._set_status(COMPONENT_STATUS.TIMED_OUT)