Add a heartbeat to the build status, so we know if a manager crashed
This commit is contained in:
parent
01dc10b8fc
commit
043a30ee96
1 changed file with 20 additions and 7 deletions
|
@ -15,7 +15,8 @@ from buildman.workererror import WorkerError
|
||||||
|
|
||||||
from data.database import BUILD_PHASE
|
from data.database import BUILD_PHASE
|
||||||
|
|
||||||
HEARTBEAT_DELTA = datetime.timedelta(seconds=15)
|
HEARTBEAT_DELTA = datetime.timedelta(seconds=30)
|
||||||
|
HEARTBEAT_TIMEOUT = 10
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -276,9 +277,9 @@ class BuildComponent(BaseComponent):
|
||||||
|
|
||||||
self._set_status(COMPONENT_STATUS.RUNNING)
|
self._set_status(COMPONENT_STATUS.RUNNING)
|
||||||
|
|
||||||
# Start the heartbeat check.
|
# Start the heartbeat check and updating loop.
|
||||||
loop = trollius.get_event_loop()
|
loop = trollius.get_event_loop()
|
||||||
loop.create_task(self._check_heartbeat(loop))
|
loop.create_task(self._heartbeat(loop))
|
||||||
logger.debug('Build worker %s is connected and ready' % self.builder_realm)
|
logger.debug('Build worker %s is connected and ready' % self.builder_realm)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@ -288,17 +289,29 @@ class BuildComponent(BaseComponent):
|
||||||
def _on_heartbeat(self):
|
def _on_heartbeat(self):
|
||||||
self._last_heartbeat = datetime.datetime.now()
|
self._last_heartbeat = datetime.datetime.now()
|
||||||
|
|
||||||
def _start_heartbeat_check(self, loop):
|
def _start_heartbeat(self, loop):
|
||||||
trollius.set_event_loop(loop)
|
trollius.set_event_loop(loop)
|
||||||
loop.run_until_complete(self._check_heartbeat())
|
loop.run_until_complete(self._heartbeat())
|
||||||
|
|
||||||
@trollius.coroutine
|
@trollius.coroutine
|
||||||
def _check_heartbeat(self, loop):
|
def _heartbeat(self, loop):
|
||||||
|
""" Coroutine that runs every HEARTBEAT_TIMEOUT seconds, both checking the worker's heartbeat
|
||||||
|
and updating the heartbeat in the build status dictionary (if applicable). This allows
|
||||||
|
the build system to catch crashes from either end.
|
||||||
|
"""
|
||||||
while True:
|
while True:
|
||||||
|
# If the component is no longer running or actively building, nothing more to do.
|
||||||
if (self._component_status != COMPONENT_STATUS.RUNNING and
|
if (self._component_status != COMPONENT_STATUS.RUNNING and
|
||||||
self._component_status != COMPONENT_STATUS.BUILDING):
|
self._component_status != COMPONENT_STATUS.BUILDING):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# If there is an active build, write the heartbeat to its status.
|
||||||
|
build_status = self._build_status
|
||||||
|
if build_status is not None:
|
||||||
|
with build_status as status_dict:
|
||||||
|
status_dict['heartbeat'] = int(time.time())
|
||||||
|
|
||||||
|
# Check the heartbeat from the worker.
|
||||||
logger.debug('Checking heartbeat on realm %s', self.builder_realm)
|
logger.debug('Checking heartbeat on realm %s', self.builder_realm)
|
||||||
if not self._last_heartbeat:
|
if not self._last_heartbeat:
|
||||||
self._timeout()
|
self._timeout()
|
||||||
|
@ -308,7 +321,7 @@ class BuildComponent(BaseComponent):
|
||||||
self._timeout()
|
self._timeout()
|
||||||
return
|
return
|
||||||
|
|
||||||
yield From(trollius.sleep(5))
|
yield From(trollius.sleep(HEARTBEAT_TIMEOUT))
|
||||||
|
|
||||||
def _timeout(self):
|
def _timeout(self):
|
||||||
self._set_status(COMPONENT_STATUS.TIMED_OUT)
|
self._set_status(COMPONENT_STATUS.TIMED_OUT)
|
||||||
|
|
Reference in a new issue