From 74b87fa8135bcf0e53066a15b6ddfc6a2949e05f Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 14 Jul 2016 11:49:01 -0400 Subject: [PATCH] Build manager cleanup and more logging --- buildman/manager/ephemeral.py | 37 ++++++++++++++++++-------- buildman/manager/executor.py | 10 +++++-- test/test_buildman.py | 49 ++++++++++++++++++++++++++++++++++- 3 files changed, 82 insertions(+), 14 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index e6d3403e8..298cd681f 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -25,6 +25,7 @@ logger = logging.getLogger(__name__) ETCD_MAX_WATCH_TIMEOUT = 30 RETRY_IMMEDIATELY_TIMEOUT = 0 +NO_WORKER_AVAILABLE_TIMEOUT = 10 DEFAULT_EPHEMERAL_API_TIMEOUT = 20 EXECUTORS = { @@ -317,12 +318,12 @@ class EphemeralBuilderManager(BaseManager): logger.exception('Exception when reading job count from etcd for job: %s', build_uuid) raise Return(False, RETRY_IMMEDIATELY_TIMEOUT) - logger.debug('Total jobs: %s', workers_alive) + logger.debug('Total jobs (scheduling job %s): %s', build_uuid, workers_alive) if workers_alive >= allowed_worker_count: logger.info('Too many workers alive, unable to start new worker for build job: %s. %s >= %s', build_uuid, workers_alive, allowed_worker_count) - raise Return(False, RETRY_IMMEDIATELY_TIMEOUT) + raise Return(False, NO_WORKER_AVAILABLE_TIMEOUT) job_key = self._etcd_job_key(build_job) @@ -358,8 +359,9 @@ class EphemeralBuilderManager(BaseManager): raise Return(False, RETRY_IMMEDIATELY_TIMEOUT) started_with_executor = None - logger.debug("executors are: %s", self._executors) + builder_id = None + logger.debug("Registered executors are: %s", [ex.__class__.__name__ for ex in self._executors]) for executor in self._executors: executor_type = executor.__class__.__name__ @@ -372,27 +374,37 @@ class EphemeralBuilderManager(BaseManager): # Check if we can use this executor based on the retries remaining. if executor.minimum_retry_threshold > build_job.retries_remaining: - logger.debug('Job %s cannot use executor %s due to not meeting retry threshold', build_uuid, - executor_type) + logger.debug('Job %s cannot use executor %s as it is below retry threshold (retry #: %s)', + build_uuid, executor_type, build_job.retries_remaining) continue - logger.debug('Starting builder for job: %s with executor: %s', build_uuid, executor_type) + logger.debug('Starting builder for job %s with selected executor: %s', build_uuid, + executor_type) try: builder_id = yield From(executor.start_builder(realm, token, build_uuid)) - metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count') - metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid]) - started_with_executor = executor - break except: logger.exception('Exception when starting builder for job: %s', build_uuid) continue + try: + metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count') + metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid]) + except: + logger.exception('Exception when writing start metrics for builder %s for job %s', + builder_id, build_uuid) + + started_with_executor = executor + + # Break out of the loop now that we've started a builder successfully. + break + if started_with_executor is None: logger.error('Could not start ephemeral worker for build %s', build_uuid) raise Return(False, self._ephemeral_api_timeout) - logger.debug('Started builder for job: %s with executor: %s', build_uuid, executor_type) + logger.debug('Started builder with ID %s for job: %s with executor: %s', builder_id, build_uuid, + started_with_executor.__class__.__name__) # Store the builder in etcd associated with the job id try: @@ -423,6 +435,9 @@ class EphemeralBuilderManager(BaseManager): raise Return(False, setup_time) self._job_to_executor[builder_id] = started_with_executor + + logger.debug('Builder spawn complete for job %s using executor %s with ID %s ', build_uuid, + started_with_executor.__class__.__name__, builder_id) raise Return(True, None) @coroutine diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index c9b222f12..f6ea79667 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -136,7 +136,7 @@ class EC2Executor(BuilderExecutor): coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable)) user_data = self.generate_cloud_config(realm, token, channel, self.manager_hostname) - logger.debug('Generated cloud config: %s', user_data) + logger.debug('Generated cloud config for build %s: %s', build_uuid, user_data) ec2_conn = self._get_conn() @@ -180,6 +180,10 @@ class EC2Executor(BuilderExecutor): launched = AsyncWrapper(reservation.instances[0]) + # Sleep a few seconds to wait for AWS to spawn the instance. + yield From(trollius.sleep(_TAG_RETRY_SLEEP)) + + # Tag the instance with its metadata. for i in range(0, _TAG_RETRY_COUNT): try: yield From(launched.add_tags({ @@ -191,7 +195,8 @@ class EC2Executor(BuilderExecutor): except boto.exception.EC2ResponseError as ec2e: if ec2e.error_code == 'InvalidInstanceID.NotFound': if i < _TAG_RETRY_COUNT - 1: - logger.warning('Failed to write EC2 tags (attempt #%s)', i) + logger.warning('Failed to write EC2 tags for instance %s for build %s (attempt #%s)', + launched.id, build_uuid, i) yield From(trollius.sleep(_TAG_RETRY_SLEEP)) continue @@ -199,6 +204,7 @@ class EC2Executor(BuilderExecutor): logger.exception('Failed to write EC2 tags (attempt #%s)', i) + logger.debug('Machine with ID %s started for build %s', launched.id, build_uuid) raise Return(launched.id) @coroutine diff --git a/test/test_buildman.py b/test/test_buildman.py index 163ef37ec..b85e4fa05 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -8,7 +8,7 @@ from trollius import coroutine, get_event_loop, From, Future, Return from mock import Mock from threading import Event -from buildman.manager.executor import BuilderExecutor +from buildman.manager.executor import BuilderExecutor, ExecutorException from buildman.manager.ephemeral import EphemeralBuilderManager, EXECUTORS @@ -34,6 +34,13 @@ class TestExecutor(BuilderExecutor): raise Return(str(uuid.uuid4)) + +class BadExecutor(BuilderExecutor): + @coroutine + def start_builder(self, realm, token, build_uuid): + raise ExecutorException('raised on purpose!') + + class TestEphemeral(unittest.TestCase): def __init__(self, *args, **kwargs): self.etcd_client_mock = None @@ -264,6 +271,46 @@ class TestEphemeral(unittest.TestCase): self.manager.registered_executors[0].job_started = None self.manager.registered_executors[1].job_started = None + + @async_test + def test_schedule_job_single_executor(self): + EXECUTORS['test'] = TestExecutor + + self.manager.initialize({ + 'EXECUTOR': 'test', + 'EXECUTOR_CONFIG': {}, + }) + + build_job = self._create_build_job(namespace='something', retries=3) + result = yield From(self.manager.schedule(build_job)) + self.assertTrue(result[0]) + + self.assertIsNotNone(self.manager.registered_executors[0].job_started) + self.manager.registered_executors[0].job_started = None + + + build_job = self._create_build_job(namespace='something', retries=0) + result = yield From(self.manager.schedule(build_job)) + self.assertTrue(result[0]) + + self.assertIsNotNone(self.manager.registered_executors[0].job_started) + self.manager.registered_executors[0].job_started = None + + + @async_test + def test_executor_exception(self): + EXECUTORS['bad'] = BadExecutor + + self.manager.initialize({ + 'EXECUTOR': 'bad', + 'EXECUTOR_CONFIG': {}, + }) + + build_job = self._create_build_job(namespace='something', retries=3) + result = yield From(self.manager.schedule(build_job)) + self.assertFalse(result[0]) + + if __name__ == '__main__': unittest.main()