From 292abb53956f2bc0cdad984fc8bee06747dc95b8 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 30 Aug 2016 13:38:22 -0400 Subject: [PATCH 1/3] Better handling and logging of exceptions in build manager Also increases the setup timeout for EC2 --- buildman/component/buildcomponent.py | 2 +- buildman/manager/ephemeral.py | 48 +++++++++++++++------------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index f23df0d8b..2884b4469 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -376,7 +376,7 @@ class BuildComponent(BaseComponent): if self._component_status == ComponentStatus.TIMED_OUT: return - logger.debug('got heartbeat on realm %s', self.builder_realm) + logger.debug('Got heartbeat on realm %s', self.builder_realm) self._last_heartbeat = datetime.datetime.utcnow() @trollius.coroutine diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index a8030362f..879f525c3 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -29,7 +29,7 @@ ETCD_ATOMIC_OP_TIMEOUT = 120 RETRY_IMMEDIATELY_TIMEOUT = 0 NO_WORKER_AVAILABLE_TIMEOUT = 10 DEFAULT_EPHEMERAL_API_TIMEOUT = 20 -DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 300 +DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 500 class EtcdAction(object): @@ -249,24 +249,24 @@ class EphemeralBuilderManager(BaseManager): logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key) def _register_realm(self, realm_spec): - logger.debug('Registering realm with manager: %s', realm_spec['realm']) - component = self.register_component(realm_spec['realm'], BuildComponent, - token=realm_spec['token']) - - if component in self._component_to_job: - logger.debug('Realm already registered with manager: %s', realm_spec['realm']) - return component + logger.debug('Got call to register realm %s with manager', realm_spec['realm']) # Create the build information block for the registered realm. build_job = BuildJob(AttrDict(realm_spec['job_queue_item'])) execution_id = realm_spec.get('execution_id', None) executor_name = realm_spec.get('executor_name', 'EC2Executor') + logger.debug('Registering realm %s with manager: %s', realm_spec['realm'], realm_spec) + component = self.register_component(realm_spec['realm'], BuildComponent, + token=realm_spec['token']) + build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id, executor_name=executor_name) self._component_to_job[component] = build_job self._build_uuid_to_info[build_job.build_uuid] = build_info + + logger.debug('Registered realm %s with manager', realm_spec['realm']) return component @property @@ -362,7 +362,7 @@ class EphemeralBuilderManager(BaseManager): async(self._register_existing_realms()) def setup_time(self): - return self._manager_config.get('MACHINE_SETUP_TIME', 300) + return self._ephemeral_setup_timeout def shutdown(self): logger.debug('Shutting down worker.') @@ -505,24 +505,26 @@ class EphemeralBuilderManager(BaseManager): @coroutine def build_component_ready(self, build_component): - try: - # Pop off the job for the component. We do so before we send out the etcd watch below, - # as it will also remove this mapping. - job = self._component_to_job.pop(build_component) - if job is None: - logger.error('Could not find job for the build component on realm %s', - build_component.builder_realm) - raise Return() + logger.debug('Got component ready for component with realm %s', build_component.builder_realm) + # Pop off the job for the component. We do so before we send out the etcd watch below, + # as it will also remove this mapping. + job = self._component_to_job.pop(build_component, None) + if job is None: + logger.error('Could not find job for the build component on realm %s; it may have expired', + build_component.builder_realm) + raise Return() + + # Start the build job. + logger.debug('Sending build %s to newly ready component on realm %s', + job.build_uuid, build_component.builder_realm) + yield From(build_component.start_build(job)) + + try: # Clean up the bookkeeping for allowing any manager to take the job. yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm))) - - # Start the build job. - logger.debug('Sending build %s to newly ready component on realm %s', - job.build_uuid, build_component.builder_realm) - yield From(build_component.start_build(job)) except (KeyError, etcd.EtcdKeyError): - logger.warning('Builder is asking for more work, but work already completed') + logger.warning('Could not delete realm key %s', build_component.builder_realm) def build_component_disposed(self, build_component, timed_out): logger.debug('Calling build_component_disposed.') From 2fe896ba6a02e09f785b0f6f22c6814153bfd014 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 30 Aug 2016 13:57:26 -0400 Subject: [PATCH 2/3] Restore retries of jobs not started and add some leeway to the processing time --- buildman/server.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/buildman/server.py b/buildman/server.py index fc8dd54e2..f94ef3a69 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -24,6 +24,7 @@ logger = logging.getLogger(__name__) WORK_CHECK_TIMEOUT = 10 TIMEOUT_PERIOD_MINUTES = 20 JOB_TIMEOUT_SECONDS = 300 +SETUP_LEEWAY_SECONDS = 10 MINIMUM_JOB_EXTENSION = timedelta(minutes=2) HEARTBEAT_PERIOD_SEC = 30 @@ -164,8 +165,8 @@ class BuilderServer(object): logger.debug('Checking for more work for %d active workers', self._lifecycle_manager.num_workers()) - job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time(), - ordering_required=True) + processing_time = self._lifecycle_manager.setup_time() + SETUP_LEEWAY_SECONDS + job_item = self._queue.get(processing_time=processing_time, ordering_required=True) if job_item is None: logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT) continue @@ -185,6 +186,7 @@ class BuilderServer(object): except: logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid) self._current_status = BuildServerStatus.EXCEPTION + self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT) return if schedule_success: From e17e0e4172970debaa96aa61f9d2ad3d3b88e32b Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 30 Aug 2016 14:08:56 -0400 Subject: [PATCH 3/3] Add log for when the job key is written --- buildman/manager/ephemeral.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 879f525c3..35b118bc4 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -420,6 +420,8 @@ class EphemeralBuilderManager(BaseManager): lock_payload = json.dumps(payload) + logger.debug('Writing key for job %s with expiration in %s seconds', build_uuid, + self._ephemeral_setup_timeout) try: yield From(self._etcd_client.write(job_key, lock_payload, prevExist=False, ttl=self._ephemeral_setup_timeout))