Switch a few of the buildman methods to coroutines in order to support network calls in methods. Add a test for the ephemeral build manager.

This commit is contained in:
Jake Moshenko 2014-12-22 12:14:16 -05:00
parent a280bbcb6d
commit 12ee8e0fc0
11 changed files with 233 additions and 52 deletions

View file

@ -1,3 +1,5 @@
from trollius import coroutine
class BaseManager(object):
""" Base for all worker managers. """
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
@ -26,6 +28,7 @@ class BaseManager(object):
"""
raise NotImplementedError
@coroutine
def schedule(self, build_job, loop):
""" Schedules a queue item to be built. Returns True if the item was properly scheduled
and False if all workers are busy.
@ -48,8 +51,11 @@ class BaseManager(object):
"""
raise NotImplementedError
@coroutine
def job_completed(self, build_job, job_status, build_component):
""" Method invoked once a job_item has completed, in some manner. The job_status will be
one of: incomplete, error, complete. If incomplete, the job should be requeued.
one of: incomplete, error, complete. Implementations of this method should call
self.job_complete_callback with a status of Incomplete if they wish for the job to be
automatically requeued.
"""
raise NotImplementedError

View file

@ -5,7 +5,7 @@ from buildman.component.basecomponent import BaseComponent
from buildman.component.buildcomponent import BuildComponent
from buildman.manager.basemanager import BaseManager
from trollius.coroutines import From
from trollius.coroutines import From, Return, coroutine
REGISTRATION_REALM = 'registration'
logger = logging.getLogger(__name__)
@ -50,14 +50,15 @@ class EnterpriseManager(BaseManager):
self.register_component(realm, BuildComponent, token="")
return realm
@coroutine
def schedule(self, build_job, loop):
""" Schedules a build for an Enterprise Registry. """
if self.shutting_down or not self.ready_components:
return False
raise Return(False)
component = self.ready_components.pop()
loop.call_soon(component.start_build, build_job)
return True
raise Return(True)
def build_component_ready(self, build_component, loop):
self.ready_components.add(build_component)
@ -65,6 +66,7 @@ class EnterpriseManager(BaseManager):
def shutdown(self):
self.shutting_down = True
@coroutine
def job_completed(self, build_job, job_status, build_component):
self.job_complete_callback(build_job, job_status)

View file

@ -1,12 +1,15 @@
import logging
import etcd
import uuid
import calendar
from datetime import datetime, timedelta
from trollius import From, coroutine, Return
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor
from buildman.component.buildcomponent import BuildComponent
from buildman.asyncutil import AsyncWrapper
logger = logging.getLogger(__name__)
@ -32,6 +35,13 @@ class EphemeralBuilderManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
shutting_down = False
_executors = {
'popen': PopenExecutor,
'ec2': EC2Executor,
}
_etcd_client_klass = etcd.Client
def __init__(self, *args, **kwargs):
self._manager_config = None
self._etcd_client = None
@ -39,10 +49,6 @@ class EphemeralBuilderManager(BaseManager):
self._component_to_job = {}
self._component_to_builder = {}
self._executors = {
'popen': PopenExecutor,
'ec2': EC2Executor,
}
self._executor = None
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
@ -58,9 +64,8 @@ class EphemeralBuilderManager(BaseManager):
etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
etcd_port = self._manager_config.get('ETCD_PORT', 2379)
logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
self._etcd_client = etcd.Client(host=etcd_host, port=etcd_port)
clear_etcd(self._etcd_client)
self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port))
def setup_time(self):
setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
@ -71,13 +76,14 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Calling shutdown.')
raise NotImplementedError
@coroutine
def schedule(self, build_job, loop):
logger.debug('Calling schedule with job: %s', build_job.repo_build.uuid)
logger.debug('Calling schedule with job: %s', build_job.job_details['build_uuid'])
# Check if there are worker slots available by checking the number of jobs in etcd
allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 2)
allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
try:
building = self._etcd_client.read(ETCD_BUILDER_PREFIX, recursive=True)
building = yield From(self._etcd_client.read(ETCD_BUILDER_PREFIX, recursive=True))
workers_alive = sum(1 for child in building.children if not child.dir)
except KeyError:
workers_alive = 0
@ -87,7 +93,7 @@ class EphemeralBuilderManager(BaseManager):
if workers_alive >= allowed_worker_count:
logger.info('Too many workers alive, unable to start new worker. %s >= %s', workers_alive,
allowed_worker_count)
return False
raise Return(False)
job_key = self._etcd_job_key(build_job)
@ -97,28 +103,33 @@ class EphemeralBuilderManager(BaseManager):
expiration = datetime.utcnow() + timedelta(seconds=self.setup_time())
payload = {
'expiration': expiration.isoformat(),
'expiration': calendar.timegm(expiration.timetuple()),
}
try:
self._etcd_client.write(job_key, payload, prevExist=False)
yield From(self._etcd_client.write(job_key, payload, prevExist=False))
component = self.register_component(realm, BuildComponent, token=token)
self._component_to_job[component] = build_job
except KeyError:
# The job was already taken by someone else, we are probably a retry
logger.warning('Job already exists in etcd, did an old worker die?')
return False
logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?')
raise Return(False)
builder_id = self._executor.start_builder(realm, token)
logger.debug('Starting builder with executor: %s', self._executor)
builder_id = yield From(self._executor.start_builder(realm, token))
self._component_to_builder[component] = builder_id
return True
# Store the builder in etcd associated with the job id
payload['builder_id'] = builder_id
yield From(self._etcd_client.write(job_key, payload, prevExist=True))
raise Return(True)
def build_component_ready(self, build_component, loop):
try:
job = self._component_to_job.pop(build_component)
logger.debug('Sending build %s to newly ready component on realm %s', job.repo_build.uuid,
build_component.builder_realm)
logger.debug('Sending build %s to newly ready component on realm %s',
job.job_details['build_uuid'], build_component.builder_realm)
loop.call_soon(build_component.start_build, job)
except KeyError:
logger.warning('Builder is asking for more work, but work already completed')
@ -126,6 +137,7 @@ class EphemeralBuilderManager(BaseManager):
def build_component_disposed(self, build_component, timed_out):
logger.debug('Calling build_component_disposed.')
@coroutine
def job_completed(self, build_job, job_status, build_component):
logger.debug('Calling job_completed with status: %s', job_status)
@ -134,12 +146,24 @@ class EphemeralBuilderManager(BaseManager):
# Release the lock in etcd
job_key = self._etcd_job_key(build_job)
self._etcd_client.delete(job_key)
yield From(self._etcd_client.delete(job_key))
self.job_complete_callback(build_job, job_status)
@coroutine
def _clean_up_old_builder(self, job_key, job_payload):
""" Terminate an old builder once its expiration date has passed.

Stops the builder recorded under 'builder_id' in job_payload (when that key is present)
through the executor, and deletes job_key through the etcd client.
"""
# NOTE(review): nothing in this method checks the expiration itself; presumably the caller
# decides when a job has expired -- confirm against the call site.
logger.debug('Cleaning up the old builder for job: %s', job_key)
if 'builder_id' in job_payload:
logger.info('Terminating expired build node.')
yield From(self._executor.stop_builder(job_payload['builder_id']))
# NOTE(review): indentation is not visible in this view, so whether the delete below sits
# inside the 'builder_id' check cannot be determined here -- confirm in the real file.
yield From(self._etcd_client.delete(job_key))
@staticmethod
def _etcd_job_key(build_job):
""" Create a key which is used to track a job in etcd.
"""
return '{0}{1}'.format(ETCD_BUILDER_PREFIX, build_job.repo_build.uuid)
return '{0}{1}'.format(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid'])

View file

@ -7,6 +7,10 @@ import requests
import cachetools
from jinja2 import FileSystemLoader, Environment
from trollius import coroutine, From, Return, get_event_loop
from functools import partial
from buildman.asyncutil import AsyncWrapper
logger = logging.getLogger(__name__)
@ -32,12 +36,14 @@ class BuilderExecutor(object):
""" Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
starting and stopping builders.
"""
@coroutine
def start_builder(self, realm, token):
""" Create a builder with the specified config. Returns a unique id which can be used to manage
the builder.
"""
raise NotImplementedError
@coroutine
def stop_builder(self, builder_id):
""" Stop a builder which is currently running.
"""
@ -74,14 +80,18 @@ class EC2Executor(BuilderExecutor):
"""
COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt'
def __init__(self, *args, **kwargs):
self._loop = get_event_loop()
super(EC2Executor, self).__init__(*args, **kwargs)
def _get_conn(self):
""" Creates an ec2 connection which can be used to manage instances.
"""
return boto.ec2.connect_to_region(
return AsyncWrapper(boto.ec2.connect_to_region(
self.executor_config['EC2_REGION'],
aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'],
aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'],
)
))
@classmethod
@cachetools.ttl_cache(ttl=ONE_HOUR)
@ -92,25 +102,24 @@ class EC2Executor(BuilderExecutor):
stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
return stack_amis[ec2_region]
@coroutine
def start_builder(self, realm, token):
region = self.executor_config['EC2_REGION']
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
coreos_ami = self._get_coreos_ami(region, channel)
get_ami_callable = partial(self._get_coreos_ami, region, channel)
coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
user_data = self.generate_cloud_config(realm, token, channel, self.manager_public_ip)
logger.debug('Generated cloud config: %s', user_data)
ec2_conn = self._get_conn()
# class FakeReservation(object):
# def __init__(self):
# self.instances = None
# reservation = FakeReservation()
reservation = ec2_conn.run_instances(
reservation = yield ec2_conn.run_instances(
coreos_ami,
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
security_groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
key_name=self.executor_config.get('EC2_KEY_NAME', None),
user_data=user_data,
instance_initiated_shutdown_behavior='terminate',
)
if not reservation.instances:
@ -124,12 +133,13 @@ class EC2Executor(BuilderExecutor):
'Realm': realm,
'Token': token,
})
return launched.id
raise Return(launched.id)
@coroutine
def stop_builder(self, builder_id):
""" Stop the EC2 instance identified by builder_id.

Raises ExecutorException when builder_id is not among the instances returned by the
stop_instances call.
"""
ec2_conn = self._get_conn()
stopped_instance_ids = [si.id for si in ec2_conn.stop_instances([builder_id], force=True)]
if builder_id not in stopped_instance_ids:
# NOTE(review): the etcd and executor calls elsewhere in this change are awaited with
# `yield From(...)`; the call below uses a bare `yield` -- confirm the missing From is intended.
stopped_instances = yield ec2_conn.stop_instances([builder_id], force=True)
if builder_id not in [si.id for si in stopped_instances]:
raise ExecutorException('Unable to stop instance: %s' % builder_id)
class PopenExecutor(BuilderExecutor):
@ -142,6 +152,7 @@ class PopenExecutor(BuilderExecutor):
""" Executor which uses Popen to fork a quay-builder process.
"""
@coroutine
def start_builder(self, realm, token):
# Now start a machine for this job, adding the machine id to the etcd information
logger.debug('Forking process for build')
@ -162,9 +173,9 @@ class PopenExecutor(BuilderExecutor):
builder_id = str(uuid.uuid4())
self._jobs[builder_id] = (spawned, logpipe)
logger.debug('Builder spawned with id: %s', builder_id)
return builder_id
raise Return(builder_id)
@coroutine
def stop_builder(self, builder_id):
if builder_id not in self._jobs:
raise ExecutorException('Builder id not being tracked by executor.')