Better handling and logging of exceptions in build manager
Also increases the setup timeout for EC2
This commit is contained in:
parent
480d890442
commit
292abb5395
2 changed files with 26 additions and 24 deletions
|
@ -376,7 +376,7 @@ class BuildComponent(BaseComponent):
|
||||||
if self._component_status == ComponentStatus.TIMED_OUT:
|
if self._component_status == ComponentStatus.TIMED_OUT:
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.debug('got heartbeat on realm %s', self.builder_realm)
|
logger.debug('Got heartbeat on realm %s', self.builder_realm)
|
||||||
self._last_heartbeat = datetime.datetime.utcnow()
|
self._last_heartbeat = datetime.datetime.utcnow()
|
||||||
|
|
||||||
@trollius.coroutine
|
@trollius.coroutine
|
||||||
|
|
|
@ -29,7 +29,7 @@ ETCD_ATOMIC_OP_TIMEOUT = 120
|
||||||
RETRY_IMMEDIATELY_TIMEOUT = 0
|
RETRY_IMMEDIATELY_TIMEOUT = 0
|
||||||
NO_WORKER_AVAILABLE_TIMEOUT = 10
|
NO_WORKER_AVAILABLE_TIMEOUT = 10
|
||||||
DEFAULT_EPHEMERAL_API_TIMEOUT = 20
|
DEFAULT_EPHEMERAL_API_TIMEOUT = 20
|
||||||
DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 300
|
DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 500
|
||||||
|
|
||||||
|
|
||||||
class EtcdAction(object):
|
class EtcdAction(object):
|
||||||
|
@ -249,24 +249,24 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key)
|
logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key)
|
||||||
|
|
||||||
def _register_realm(self, realm_spec):
|
def _register_realm(self, realm_spec):
|
||||||
logger.debug('Registering realm with manager: %s', realm_spec['realm'])
|
logger.debug('Got call to register realm %s with manager', realm_spec['realm'])
|
||||||
component = self.register_component(realm_spec['realm'], BuildComponent,
|
|
||||||
token=realm_spec['token'])
|
|
||||||
|
|
||||||
if component in self._component_to_job:
|
|
||||||
logger.debug('Realm already registered with manager: %s', realm_spec['realm'])
|
|
||||||
return component
|
|
||||||
|
|
||||||
# Create the build information block for the registered realm.
|
# Create the build information block for the registered realm.
|
||||||
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
|
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
|
||||||
execution_id = realm_spec.get('execution_id', None)
|
execution_id = realm_spec.get('execution_id', None)
|
||||||
executor_name = realm_spec.get('executor_name', 'EC2Executor')
|
executor_name = realm_spec.get('executor_name', 'EC2Executor')
|
||||||
|
|
||||||
|
logger.debug('Registering realm %s with manager: %s', realm_spec['realm'], realm_spec)
|
||||||
|
component = self.register_component(realm_spec['realm'], BuildComponent,
|
||||||
|
token=realm_spec['token'])
|
||||||
|
|
||||||
build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id,
|
build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id,
|
||||||
executor_name=executor_name)
|
executor_name=executor_name)
|
||||||
|
|
||||||
self._component_to_job[component] = build_job
|
self._component_to_job[component] = build_job
|
||||||
self._build_uuid_to_info[build_job.build_uuid] = build_info
|
self._build_uuid_to_info[build_job.build_uuid] = build_info
|
||||||
|
|
||||||
|
logger.debug('Registered realm %s with manager', realm_spec['realm'])
|
||||||
return component
|
return component
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -362,7 +362,7 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
async(self._register_existing_realms())
|
async(self._register_existing_realms())
|
||||||
|
|
||||||
def setup_time(self):
|
def setup_time(self):
|
||||||
return self._manager_config.get('MACHINE_SETUP_TIME', 300)
|
return self._ephemeral_setup_timeout
|
||||||
|
|
||||||
def shutdown(self):
|
def shutdown(self):
|
||||||
logger.debug('Shutting down worker.')
|
logger.debug('Shutting down worker.')
|
||||||
|
@ -505,24 +505,26 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
|
|
||||||
@coroutine
|
@coroutine
|
||||||
def build_component_ready(self, build_component):
|
def build_component_ready(self, build_component):
|
||||||
try:
|
logger.debug('Got component ready for component with realm %s', build_component.builder_realm)
|
||||||
# Pop off the job for the component. We do so before we send out the etcd watch below,
|
|
||||||
# as it will also remove this mapping.
|
|
||||||
job = self._component_to_job.pop(build_component)
|
|
||||||
if job is None:
|
|
||||||
logger.error('Could not find job for the build component on realm %s',
|
|
||||||
build_component.builder_realm)
|
|
||||||
raise Return()
|
|
||||||
|
|
||||||
|
# Pop off the job for the component. We do so before we send out the etcd watch below,
|
||||||
|
# as it will also remove this mapping.
|
||||||
|
job = self._component_to_job.pop(build_component, None)
|
||||||
|
if job is None:
|
||||||
|
logger.error('Could not find job for the build component on realm %s; it may have expired',
|
||||||
|
build_component.builder_realm)
|
||||||
|
raise Return()
|
||||||
|
|
||||||
|
# Start the build job.
|
||||||
|
logger.debug('Sending build %s to newly ready component on realm %s',
|
||||||
|
job.build_uuid, build_component.builder_realm)
|
||||||
|
yield From(build_component.start_build(job))
|
||||||
|
|
||||||
|
try:
|
||||||
# Clean up the bookkeeping for allowing any manager to take the job.
|
# Clean up the bookkeeping for allowing any manager to take the job.
|
||||||
yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm)))
|
yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm)))
|
||||||
|
|
||||||
# Start the build job.
|
|
||||||
logger.debug('Sending build %s to newly ready component on realm %s',
|
|
||||||
job.build_uuid, build_component.builder_realm)
|
|
||||||
yield From(build_component.start_build(job))
|
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
logger.warning('Builder is asking for more work, but work already completed')
|
logger.warning('Could not delete realm key %s', build_component.builder_realm)
|
||||||
|
|
||||||
def build_component_disposed(self, build_component, timed_out):
|
def build_component_disposed(self, build_component, timed_out):
|
||||||
logger.debug('Calling build_component_disposed.')
|
logger.debug('Calling build_component_disposed.')
|
||||||
|
|
Reference in a new issue