Build manager cleanup and more logging
parent 65bbdda545
commit 74b87fa813
3 changed files with 82 additions and 14 deletions
@@ -25,6 +25,7 @@ logger = logging.getLogger(__name__)
 ETCD_MAX_WATCH_TIMEOUT = 30
 RETRY_IMMEDIATELY_TIMEOUT = 0
+NO_WORKER_AVAILABLE_TIMEOUT = 10
 DEFAULT_EPHEMERAL_API_TIMEOUT = 20
 
 EXECUTORS = {
@@ -317,12 +318,12 @@ class EphemeralBuilderManager(BaseManager):
       logger.exception('Exception when reading job count from etcd for job: %s', build_uuid)
       raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
 
-    logger.debug('Total jobs: %s', workers_alive)
+    logger.debug('Total jobs (scheduling job %s): %s', build_uuid, workers_alive)
 
     if workers_alive >= allowed_worker_count:
       logger.info('Too many workers alive, unable to start new worker for build job: %s. %s >= %s',
                   build_uuid, workers_alive, allowed_worker_count)
-      raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
+      raise Return(False, NO_WORKER_AVAILABLE_TIMEOUT)
 
     job_key = self._etcd_job_key(build_job)
 
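
The hunk above swaps the immediate retry for the new NO_WORKER_AVAILABLE_TIMEOUT when all workers are busy, so the job is requeued after a pause instead of being rescheduled right away. A minimal, self-contained sketch of how such a (scheduled, retry_timeout) result can be consumed; the helper names below are illustrative and not part of this commit:

NO_WORKER_AVAILABLE_TIMEOUT = 10


def schedule(workers_alive, allowed_worker_count):
  """Returns (scheduled, retry_timeout_in_seconds)."""
  if workers_alive >= allowed_worker_count:
    # At capacity: ask the caller to retry after a pause instead of immediately.
    return False, NO_WORKER_AVAILABLE_TIMEOUT
  return True, None


def requeue_delay(result):
  # The caller only needs a delay when scheduling did not succeed.
  scheduled, retry_timeout = result
  return None if scheduled else retry_timeout


if __name__ == '__main__':
  print(requeue_delay(schedule(workers_alive=5, allowed_worker_count=5)))  # 10
  print(requeue_delay(schedule(workers_alive=2, allowed_worker_count=5)))  # None
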
@@ -358,8 +359,9 @@ class EphemeralBuilderManager(BaseManager):
       raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
 
     started_with_executor = None
-    logger.debug("executors are: %s", self._executors)
+    builder_id = None
 
+    logger.debug("Registered executors are: %s", [ex.__class__.__name__ for ex in self._executors])
     for executor in self._executors:
       executor_type = executor.__class__.__name__
 
@@ -372,27 +374,37 @@ class EphemeralBuilderManager(BaseManager):
 
       # Check if we can use this executor based on the retries remaining.
       if executor.minimum_retry_threshold > build_job.retries_remaining:
-        logger.debug('Job %s cannot use executor %s due to not meeting retry threshold', build_uuid,
-                     executor_type)
+        logger.debug('Job %s cannot use executor %s as it is below retry threshold (retry #: %s)',
+                     build_uuid, executor_type, build_job.retries_remaining)
         continue
 
-      logger.debug('Starting builder for job: %s with executor: %s', build_uuid, executor_type)
+      logger.debug('Starting builder for job %s with selected executor: %s', build_uuid,
+                   executor_type)
 
       try:
         builder_id = yield From(executor.start_builder(realm, token, build_uuid))
-        metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count')
-        metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid])
-        started_with_executor = executor
-        break
       except:
         logger.exception('Exception when starting builder for job: %s', build_uuid)
         continue
 
+      try:
+        metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count')
+        metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid])
+      except:
+        logger.exception('Exception when writing start metrics for builder %s for job %s',
+                         builder_id, build_uuid)
+
+      started_with_executor = executor
+
+      # Break out of the loop now that we've started a builder successfully.
+      break
+
     if started_with_executor is None:
       logger.error('Could not start ephemeral worker for build %s', build_uuid)
       raise Return(False, self._ephemeral_api_timeout)
 
-    logger.debug('Started builder for job: %s with executor: %s', build_uuid, executor_type)
+    logger.debug('Started builder with ID %s for job: %s with executor: %s', builder_id, build_uuid,
+                 started_with_executor.__class__.__name__)
 
     # Store the builder in etcd associated with the job id
     try:
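
The reshuffled hunk above moves the metric_queue calls out of the builder-start try block into their own try/except, so a metrics failure can no longer be mistaken for a failed builder start. A self-contained sketch of that best-effort pattern; report_start_metrics and record_builder_started are hypothetical stand-ins for the real calls:

import logging

logger = logging.getLogger(__name__)


def report_start_metrics(builder_id, build_uuid):
  # Stand-in for the metric_queue calls above; assume it may raise.
  raise RuntimeError('metrics backend unavailable')


def record_builder_started(builder_id, build_uuid):
  # Metrics are best-effort: log the failure and carry on, so the caller still
  # learns which builder was started.
  try:
    report_start_metrics(builder_id, build_uuid)
  except Exception:
    logger.exception('Exception when writing start metrics for builder %s for job %s',
                     builder_id, build_uuid)
  return builder_id


if __name__ == '__main__':
  logging.basicConfig()
  print(record_builder_started('i-0abc123', 'some-build-uuid'))
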
@@ -423,6 +435,9 @@ class EphemeralBuilderManager(BaseManager):
       raise Return(False, setup_time)
 
     self._job_to_executor[builder_id] = started_with_executor
+
+    logger.debug('Builder spawn complete for job %s using executor %s with ID %s ', build_uuid,
+                 started_with_executor.__class__.__name__, builder_id)
     raise Return(True, None)
 
   @coroutine
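
The added debug line and the _job_to_executor map record which executor produced which builder ID, presumably so later cleanup can be routed back to the executor that started the builder. A rough, illustrative sketch of that bookkeeping, assuming an executor with start_builder/stop_builder methods (the in-memory class below is made up for the example):

class InMemoryExecutor(object):
  def __init__(self):
    self.running = set()

  def start_builder(self, build_uuid):
    builder_id = 'builder-for-%s' % build_uuid
    self.running.add(builder_id)
    return builder_id

  def stop_builder(self, builder_id):
    self.running.discard(builder_id)


if __name__ == '__main__':
  # Map builder_id -> executor so shutdown goes to whichever executor
  # actually started the builder.
  job_to_executor = {}
  executor = InMemoryExecutor()

  builder_id = executor.start_builder('some-build-uuid')
  job_to_executor[builder_id] = executor

  # Later, during cleanup, the map tells us which executor owns the builder.
  job_to_executor.pop(builder_id).stop_builder(builder_id)
  print(executor.running)  # set()
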
@@ -136,7 +136,7 @@ class EC2Executor(BuilderExecutor):
     coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
 
     user_data = self.generate_cloud_config(realm, token, channel, self.manager_hostname)
-    logger.debug('Generated cloud config: %s', user_data)
+    logger.debug('Generated cloud config for build %s: %s', build_uuid, user_data)
 
     ec2_conn = self._get_conn()
 
@@ -180,6 +180,10 @@ class EC2Executor(BuilderExecutor):
 
     launched = AsyncWrapper(reservation.instances[0])
 
+    # Sleep a few seconds to wait for AWS to spawn the instance.
+    yield From(trollius.sleep(_TAG_RETRY_SLEEP))
+
+    # Tag the instance with its metadata.
     for i in range(0, _TAG_RETRY_COUNT):
       try:
         yield From(launched.add_tags({
@@ -191,7 +195,8 @@ class EC2Executor(BuilderExecutor):
       except boto.exception.EC2ResponseError as ec2e:
         if ec2e.error_code == 'InvalidInstanceID.NotFound':
           if i < _TAG_RETRY_COUNT - 1:
-            logger.warning('Failed to write EC2 tags (attempt #%s)', i)
+            logger.warning('Failed to write EC2 tags for instance %s for build %s (attempt #%s)',
+                           launched.id, build_uuid, i)
             yield From(trollius.sleep(_TAG_RETRY_SLEEP))
             continue
 
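
Tagging right after launch can race the instance becoming visible, which boto reports as InvalidInstanceID.NotFound; the hunks above add an up-front sleep and more context in the retry warning. A schematic, synchronous version of that retry loop, with a caller-supplied add_tags callable and an assumed exception type so the sketch runs without boto or AWS credentials:

import logging
import time

logger = logging.getLogger(__name__)

_TAG_RETRY_COUNT = 3  # values assumed for illustration
_TAG_RETRY_SLEEP = 2


class InstanceNotVisibleYet(Exception):
  """Stand-in for boto's EC2ResponseError with code InvalidInstanceID.NotFound."""


def tag_with_retries(add_tags, instance_id, build_uuid, tags):
  # Give AWS a moment to make the instance visible, then retry tagging.
  time.sleep(_TAG_RETRY_SLEEP)
  for i in range(_TAG_RETRY_COUNT):
    try:
      add_tags(tags)
      return
    except InstanceNotVisibleYet:
      if i < _TAG_RETRY_COUNT - 1:
        logger.warning('Failed to write EC2 tags for instance %s for build %s (attempt #%s)',
                       instance_id, build_uuid, i)
        time.sleep(_TAG_RETRY_SLEEP)
        continue
      raise


if __name__ == '__main__':
  logging.basicConfig()
  state = {'calls': 0}

  def flaky_add_tags(tags):
    # Fails on the first attempt, succeeds on the second.
    state['calls'] += 1
    if state['calls'] < 2:
      raise InstanceNotVisibleYet()

  tag_with_retries(flaky_add_tags, 'i-0abc123', 'some-build-uuid', {'Name': 'builder'})
  print('tagged after %s attempts' % state['calls'])
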
@@ -199,6 +204,7 @@ class EC2Executor(BuilderExecutor):
 
         logger.exception('Failed to write EC2 tags (attempt #%s)', i)
 
+    logger.debug('Machine with ID %s started for build %s', launched.id, build_uuid)
     raise Return(launched.id)
 
   @coroutine
@@ -8,7 +8,7 @@ from trollius import coroutine, get_event_loop, From, Future, Return
 from mock import Mock
 from threading import Event
 
-from buildman.manager.executor import BuilderExecutor
+from buildman.manager.executor import BuilderExecutor, ExecutorException
 from buildman.manager.ephemeral import EphemeralBuilderManager, EXECUTORS
 
 
@@ -34,6 +34,13 @@ class TestExecutor(BuilderExecutor):
     raise Return(str(uuid.uuid4))
 
 
+class BadExecutor(BuilderExecutor):
+  @coroutine
+  def start_builder(self, realm, token, build_uuid):
+    raise ExecutorException('raised on purpose!')
+
+
 class TestEphemeral(unittest.TestCase):
   def __init__(self, *args, **kwargs):
     self.etcd_client_mock = None
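
BadExecutor above is a test double whose start_builder always raises ExecutorException, which lets the tests drive the manager's failure path without a real executor. The same always-failing-fake idea, reduced to a self-contained unittest; the schedule() loop here is a simplified stand-in, not the manager's real implementation:

import unittest


class ExecutorError(Exception):
  pass


class AlwaysFailingExecutor(object):
  def start_builder(self, realm, token, build_uuid):
    raise ExecutorError('raised on purpose!')


def schedule(executors, build_uuid):
  # Mirrors the manager's loop in spirit: try each executor and treat an
  # exception as "this executor could not start a builder", then move on.
  for executor in executors:
    try:
      return True, executor.start_builder('realm', 'token', build_uuid)
    except Exception:
      continue
  return False, None


class FailurePathTest(unittest.TestCase):
  def test_all_executors_fail(self):
    scheduled, builder_id = schedule([AlwaysFailingExecutor()], 'some-build')
    self.assertFalse(scheduled)
    self.assertIsNone(builder_id)


if __name__ == '__main__':
  unittest.main()
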
@@ -264,6 +271,46 @@ class TestEphemeral(unittest.TestCase):
     self.manager.registered_executors[0].job_started = None
     self.manager.registered_executors[1].job_started = None
 
+  @async_test
+  def test_schedule_job_single_executor(self):
+    EXECUTORS['test'] = TestExecutor
+
+    self.manager.initialize({
+      'EXECUTOR': 'test',
+      'EXECUTOR_CONFIG': {},
+    })
+
+    build_job = self._create_build_job(namespace='something', retries=3)
+    result = yield From(self.manager.schedule(build_job))
+    self.assertTrue(result[0])
+
+    self.assertIsNotNone(self.manager.registered_executors[0].job_started)
+    self.manager.registered_executors[0].job_started = None
+
+    build_job = self._create_build_job(namespace='something', retries=0)
+    result = yield From(self.manager.schedule(build_job))
+    self.assertTrue(result[0])
+
+    self.assertIsNotNone(self.manager.registered_executors[0].job_started)
+    self.manager.registered_executors[0].job_started = None
+
+  @async_test
+  def test_executor_exception(self):
+    EXECUTORS['bad'] = BadExecutor
+
+    self.manager.initialize({
+      'EXECUTOR': 'bad',
+      'EXECUTOR_CONFIG': {},
+    })
+
+    build_job = self._create_build_job(namespace='something', retries=3)
+    result = yield From(self.manager.schedule(build_job))
+    self.assertFalse(result[0])
+
 
 if __name__ == '__main__':
   unittest.main()
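
The new tests configure the manager by registering a class in the EXECUTORS dict and then calling initialize() with an 'EXECUTOR' name and an 'EXECUTOR_CONFIG' block. A minimal sketch of that name-to-class registry pattern, independent of the real EphemeralBuilderManager; EchoExecutor and initialize() below are made-up examples:

EXECUTORS = {}


class EchoExecutor(object):
  def __init__(self, executor_config):
    self.executor_config = executor_config

  def start_builder(self, realm, token, build_uuid):
    return 'builder-for-%s' % build_uuid


def initialize(manager_config):
  # Look up the configured executor class by name and instantiate it with its config.
  executor_class = EXECUTORS[manager_config['EXECUTOR']]
  return executor_class(manager_config.get('EXECUTOR_CONFIG', {}))


if __name__ == '__main__':
  EXECUTORS['test'] = EchoExecutor
  executor = initialize({'EXECUTOR': 'test', 'EXECUTOR_CONFIG': {}})
  print(executor.start_builder('realm', 'token', 'some-build-uuid'))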