Build manager cleanup and more logging

This commit is contained in:
Joseph Schorr 2016-07-14 11:49:01 -04:00
parent 65bbdda545
commit 74b87fa813
3 changed files with 82 additions and 14 deletions

View file

@ -25,6 +25,7 @@ logger = logging.getLogger(__name__)
ETCD_MAX_WATCH_TIMEOUT = 30 ETCD_MAX_WATCH_TIMEOUT = 30
RETRY_IMMEDIATELY_TIMEOUT = 0 RETRY_IMMEDIATELY_TIMEOUT = 0
NO_WORKER_AVAILABLE_TIMEOUT = 10
DEFAULT_EPHEMERAL_API_TIMEOUT = 20 DEFAULT_EPHEMERAL_API_TIMEOUT = 20
EXECUTORS = { EXECUTORS = {
@ -317,12 +318,12 @@ class EphemeralBuilderManager(BaseManager):
logger.exception('Exception when reading job count from etcd for job: %s', build_uuid) logger.exception('Exception when reading job count from etcd for job: %s', build_uuid)
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT) raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
logger.debug('Total jobs: %s', workers_alive) logger.debug('Total jobs (scheduling job %s): %s', build_uuid, workers_alive)
if workers_alive >= allowed_worker_count: if workers_alive >= allowed_worker_count:
logger.info('Too many workers alive, unable to start new worker for build job: %s. %s >= %s', logger.info('Too many workers alive, unable to start new worker for build job: %s. %s >= %s',
build_uuid, workers_alive, allowed_worker_count) build_uuid, workers_alive, allowed_worker_count)
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT) raise Return(False, NO_WORKER_AVAILABLE_TIMEOUT)
job_key = self._etcd_job_key(build_job) job_key = self._etcd_job_key(build_job)
@ -358,8 +359,9 @@ class EphemeralBuilderManager(BaseManager):
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT) raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
started_with_executor = None started_with_executor = None
logger.debug("executors are: %s", self._executors) builder_id = None
logger.debug("Registered executors are: %s", [ex.__class__.__name__ for ex in self._executors])
for executor in self._executors: for executor in self._executors:
executor_type = executor.__class__.__name__ executor_type = executor.__class__.__name__
@ -372,27 +374,37 @@ class EphemeralBuilderManager(BaseManager):
# Check if we can use this executor based on the retries remaining. # Check if we can use this executor based on the retries remaining.
if executor.minimum_retry_threshold > build_job.retries_remaining: if executor.minimum_retry_threshold > build_job.retries_remaining:
logger.debug('Job %s cannot use executor %s due to not meeting retry threshold', build_uuid, logger.debug('Job %s cannot use executor %s as it is below retry threshold (retry #: %s)',
executor_type) build_uuid, executor_type, build_job.retries_remaining)
continue continue
logger.debug('Starting builder for job: %s with executor: %s', build_uuid, executor_type) logger.debug('Starting builder for job %s with selected executor: %s', build_uuid,
executor_type)
try: try:
builder_id = yield From(executor.start_builder(realm, token, build_uuid)) builder_id = yield From(executor.start_builder(realm, token, build_uuid))
metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count')
metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid])
started_with_executor = executor
break
except: except:
logger.exception('Exception when starting builder for job: %s', build_uuid) logger.exception('Exception when starting builder for job: %s', build_uuid)
continue continue
try:
metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count')
metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid])
except:
logger.exception('Exception when writing start metrics for builder %s for job %s',
builder_id, build_uuid)
started_with_executor = executor
# Break out of the loop now that we've started a builder successfully.
break
if started_with_executor is None: if started_with_executor is None:
logger.error('Could not start ephemeral worker for build %s', build_uuid) logger.error('Could not start ephemeral worker for build %s', build_uuid)
raise Return(False, self._ephemeral_api_timeout) raise Return(False, self._ephemeral_api_timeout)
logger.debug('Started builder for job: %s with executor: %s', build_uuid, executor_type) logger.debug('Started builder with ID %s for job: %s with executor: %s', builder_id, build_uuid,
started_with_executor.__class__.__name__)
# Store the builder in etcd associated with the job id # Store the builder in etcd associated with the job id
try: try:
@ -423,6 +435,9 @@ class EphemeralBuilderManager(BaseManager):
raise Return(False, setup_time) raise Return(False, setup_time)
self._job_to_executor[builder_id] = started_with_executor self._job_to_executor[builder_id] = started_with_executor
logger.debug('Builder spawn complete for job %s using executor %s with ID %s ', build_uuid,
started_with_executor.__class__.__name__, builder_id)
raise Return(True, None) raise Return(True, None)
@coroutine @coroutine

View file

@ -136,7 +136,7 @@ class EC2Executor(BuilderExecutor):
coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable)) coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
user_data = self.generate_cloud_config(realm, token, channel, self.manager_hostname) user_data = self.generate_cloud_config(realm, token, channel, self.manager_hostname)
logger.debug('Generated cloud config: %s', user_data) logger.debug('Generated cloud config for build %s: %s', build_uuid, user_data)
ec2_conn = self._get_conn() ec2_conn = self._get_conn()
@ -180,6 +180,10 @@ class EC2Executor(BuilderExecutor):
launched = AsyncWrapper(reservation.instances[0]) launched = AsyncWrapper(reservation.instances[0])
# Sleep a few seconds to wait for AWS to spawn the instance.
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
# Tag the instance with its metadata.
for i in range(0, _TAG_RETRY_COUNT): for i in range(0, _TAG_RETRY_COUNT):
try: try:
yield From(launched.add_tags({ yield From(launched.add_tags({
@ -191,7 +195,8 @@ class EC2Executor(BuilderExecutor):
except boto.exception.EC2ResponseError as ec2e: except boto.exception.EC2ResponseError as ec2e:
if ec2e.error_code == 'InvalidInstanceID.NotFound': if ec2e.error_code == 'InvalidInstanceID.NotFound':
if i < _TAG_RETRY_COUNT - 1: if i < _TAG_RETRY_COUNT - 1:
logger.warning('Failed to write EC2 tags (attempt #%s)', i) logger.warning('Failed to write EC2 tags for instance %s for build %s (attempt #%s)',
launched.id, build_uuid, i)
yield From(trollius.sleep(_TAG_RETRY_SLEEP)) yield From(trollius.sleep(_TAG_RETRY_SLEEP))
continue continue
@ -199,6 +204,7 @@ class EC2Executor(BuilderExecutor):
logger.exception('Failed to write EC2 tags (attempt #%s)', i) logger.exception('Failed to write EC2 tags (attempt #%s)', i)
logger.debug('Machine with ID %s started for build %s', launched.id, build_uuid)
raise Return(launched.id) raise Return(launched.id)
@coroutine @coroutine

View file

@ -8,7 +8,7 @@ from trollius import coroutine, get_event_loop, From, Future, Return
from mock import Mock from mock import Mock
from threading import Event from threading import Event
from buildman.manager.executor import BuilderExecutor from buildman.manager.executor import BuilderExecutor, ExecutorException
from buildman.manager.ephemeral import EphemeralBuilderManager, EXECUTORS from buildman.manager.ephemeral import EphemeralBuilderManager, EXECUTORS
@ -34,6 +34,13 @@ class TestExecutor(BuilderExecutor):
raise Return(str(uuid.uuid4)) raise Return(str(uuid.uuid4))
class BadExecutor(BuilderExecutor):
@coroutine
def start_builder(self, realm, token, build_uuid):
raise ExecutorException('raised on purpose!')
class TestEphemeral(unittest.TestCase): class TestEphemeral(unittest.TestCase):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.etcd_client_mock = None self.etcd_client_mock = None
@ -264,6 +271,46 @@ class TestEphemeral(unittest.TestCase):
self.manager.registered_executors[0].job_started = None self.manager.registered_executors[0].job_started = None
self.manager.registered_executors[1].job_started = None self.manager.registered_executors[1].job_started = None
@async_test
def test_schedule_job_single_executor(self):
EXECUTORS['test'] = TestExecutor
self.manager.initialize({
'EXECUTOR': 'test',
'EXECUTOR_CONFIG': {},
})
build_job = self._create_build_job(namespace='something', retries=3)
result = yield From(self.manager.schedule(build_job))
self.assertTrue(result[0])
self.assertIsNotNone(self.manager.registered_executors[0].job_started)
self.manager.registered_executors[0].job_started = None
build_job = self._create_build_job(namespace='something', retries=0)
result = yield From(self.manager.schedule(build_job))
self.assertTrue(result[0])
self.assertIsNotNone(self.manager.registered_executors[0].job_started)
self.manager.registered_executors[0].job_started = None
@async_test
def test_executor_exception(self):
EXECUTORS['bad'] = BadExecutor
self.manager.initialize({
'EXECUTOR': 'bad',
'EXECUTOR_CONFIG': {},
})
build_job = self._create_build_job(namespace='something', retries=3)
result = yield From(self.manager.schedule(build_job))
self.assertFalse(result[0])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()