Add a heartbeat to the build status, so we know if a manager crashed
This commit is contained in:
parent
01dc10b8fc
commit
043a30ee96
1 changed file with 20 additions and 7 deletions
|
@@ -15,7 +15,8 @@ from buildman.workererror import WorkerError
|
|||
|
||||
from data.database import BUILD_PHASE
|
||||
|
||||
HEARTBEAT_DELTA = datetime.timedelta(seconds=15)
|
||||
HEARTBEAT_DELTA = datetime.timedelta(seconds=30)
|
||||
HEARTBEAT_TIMEOUT = 10
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@@ -276,9 +277,9 @@ class BuildComponent(BaseComponent):
|
|||
|
||||
self._set_status(COMPONENT_STATUS.RUNNING)
|
||||
|
||||
# Start the heartbeat check.
|
||||
# Start the heartbeat check and updating loop.
|
||||
loop = trollius.get_event_loop()
|
||||
loop.create_task(self._check_heartbeat(loop))
|
||||
loop.create_task(self._heartbeat(loop))
|
||||
logger.debug('Build worker %s is connected and ready' % self.builder_realm)
|
||||
return True
|
||||
|
||||
|
@@ -288,17 +289,29 @@ class BuildComponent(BaseComponent):
|
|||
def _on_heartbeat(self):
|
||||
self._last_heartbeat = datetime.datetime.now()
|
||||
|
||||
def _start_heartbeat_check(self, loop):
|
||||
def _start_heartbeat(self, loop):
|
||||
trollius.set_event_loop(loop)
|
||||
loop.run_until_complete(self._check_heartbeat())
|
||||
loop.run_until_complete(self._heartbeat())
|
||||
|
||||
@trollius.coroutine
|
||||
def _check_heartbeat(self, loop):
|
||||
def _heartbeat(self, loop):
|
||||
""" Coroutine that runs every HEARTBEAT_TIMEOUT seconds, both checking the worker's heartbeat
|
||||
and updating the heartbeat in the build status dictionary (if applicable). This allows
|
||||
the build system to catch crashes from either end.
|
||||
"""
|
||||
while True:
|
||||
# If the component is no longer running or actively building, nothing more to do.
|
||||
if (self._component_status != COMPONENT_STATUS.RUNNING and
|
||||
self._component_status != COMPONENT_STATUS.BUILDING):
|
||||
return
|
||||
|
||||
# If there is an active build, write the heartbeat to its status.
|
||||
build_status = self._build_status
|
||||
if build_status is not None:
|
||||
with build_status as status_dict:
|
||||
status_dict['heartbeat'] = int(time.time())
|
||||
|
||||
# Check the heartbeat from the worker.
|
||||
logger.debug('Checking heartbeat on realm %s', self.builder_realm)
|
||||
if not self._last_heartbeat:
|
||||
self._timeout()
|
||||
|
@@ -308,7 +321,7 @@ class BuildComponent(BaseComponent):
|
|||
self._timeout()
|
||||
return
|
||||
|
||||
yield From(trollius.sleep(5))
|
||||
yield From(trollius.sleep(HEARTBEAT_TIMEOUT))
|
||||
|
||||
def _timeout(self):
|
||||
self._set_status(COMPONENT_STATUS.TIMED_OUT)
|
||||
|
|
Reference in a new issue