Generalize the ephemeral build managers so that any manager may manage a builder spawned by any other manager.
This commit is contained in:
parent
ccb19571d6
commit
cc70225043
11 changed files with 258 additions and 125 deletions
|
@ -3,12 +3,12 @@ from trollius import coroutine
|
|||
class BaseManager(object):
|
||||
""" Base for all worker managers. """
|
||||
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
|
||||
job_complete_callback, public_ip_address, heartbeat_period_sec):
|
||||
job_complete_callback, manager_hostname, heartbeat_period_sec):
|
||||
self.register_component = register_component
|
||||
self.unregister_component = unregister_component
|
||||
self.job_heartbeat_callback = job_heartbeat_callback
|
||||
self.job_complete_callback = job_complete_callback
|
||||
self.public_ip_address = public_ip_address
|
||||
self.manager_hostname = manager_hostname
|
||||
self.heartbeat_period_sec = heartbeat_period_sec
|
||||
|
||||
@coroutine
|
||||
|
@ -31,7 +31,7 @@ class BaseManager(object):
|
|||
raise NotImplementedError
|
||||
|
||||
@coroutine
|
||||
def schedule(self, build_job, loop):
|
||||
def schedule(self, build_job):
|
||||
""" Schedules a queue item to be built. Returns True if the item was properly scheduled
|
||||
and False if all workers are busy.
|
||||
"""
|
||||
|
@ -42,7 +42,8 @@ class BaseManager(object):
|
|||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def build_component_ready(self, build_component, loop):
|
||||
@coroutine
|
||||
def build_component_ready(self, build_component):
|
||||
""" Method invoked whenever a build component announces itself as ready.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
|
|
@ -5,7 +5,7 @@ from buildman.component.basecomponent import BaseComponent
|
|||
from buildman.component.buildcomponent import BuildComponent
|
||||
from buildman.manager.basemanager import BaseManager
|
||||
|
||||
from trollius.coroutines import From, Return, coroutine
|
||||
from trollius import From, Return, coroutine, async
|
||||
|
||||
REGISTRATION_REALM = 'registration'
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -51,16 +51,19 @@ class EnterpriseManager(BaseManager):
|
|||
return realm
|
||||
|
||||
@coroutine
|
||||
def schedule(self, build_job, loop):
|
||||
def schedule(self, build_job):
|
||||
""" Schedules a build for an Enterprise Registry. """
|
||||
if self.shutting_down or not self.ready_components:
|
||||
raise Return(False)
|
||||
|
||||
component = self.ready_components.pop()
|
||||
loop.call_soon(component.start_build, build_job)
|
||||
|
||||
yield From(component.start_build(build_job))
|
||||
|
||||
raise Return(True)
|
||||
|
||||
def build_component_ready(self, build_component, loop):
|
||||
@coroutine
|
||||
def build_component_ready(self, build_component):
|
||||
self.ready_components.add(build_component)
|
||||
|
||||
def shutdown(self):
|
||||
|
|
|
@ -13,16 +13,28 @@ from urllib3.exceptions import ReadTimeoutError
|
|||
from buildman.manager.basemanager import BaseManager
|
||||
from buildman.manager.executor import PopenExecutor, EC2Executor
|
||||
from buildman.component.buildcomponent import BuildComponent
|
||||
from buildman.jobutil.buildjob import BuildJob
|
||||
from buildman.asyncutil import AsyncWrapper
|
||||
from util.morecollections import AttrDict
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
ETCD_BUILDER_PREFIX = 'building/'
|
||||
ETCD_EXPIRE_RESULT = 'expire'
|
||||
ETCD_REALM_PREFIX = 'realm/'
|
||||
ETCD_DISABLE_TIMEOUT = 0
|
||||
|
||||
class EtcdAction(object):
|
||||
GET = 'get'
|
||||
SET = 'set'
|
||||
EXPIRE = 'expire'
|
||||
UPDATE = 'update'
|
||||
DELETE = 'delete'
|
||||
CREATE = 'create'
|
||||
COMPARE_AND_SWAP = 'compareAndSwap'
|
||||
COMPARE_AND_DELETE = 'compareAndDelete'
|
||||
|
||||
|
||||
class EphemeralBuilderManager(BaseManager):
|
||||
""" Build manager implementation for the Enterprise Registry. """
|
||||
|
@ -41,52 +53,82 @@ class EphemeralBuilderManager(BaseManager):
|
|||
self._etcd_client = None
|
||||
|
||||
self._component_to_job = {}
|
||||
self._job_uuid_to_component = {}
|
||||
self._component_to_builder = {}
|
||||
|
||||
self._executor = None
|
||||
|
||||
self._worker_watch_task = None
|
||||
# Map of etcd keys being watched to the tasks watching them
|
||||
self._watch_tasks = {}
|
||||
|
||||
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
|
||||
|
||||
def _watch_builders(self):
|
||||
""" Watch the builders key for expirations.
|
||||
"""
|
||||
def _watch_etcd(self, etcd_key, change_callback, recursive=True):
|
||||
watch_task_key = (etcd_key, recursive)
|
||||
def callback_wrapper(changed_key_future):
|
||||
|
||||
if watch_task_key not in self._watch_tasks or self._watch_tasks[watch_task_key].done():
|
||||
self._watch_etcd(etcd_key, change_callback)
|
||||
|
||||
if changed_key_future.cancelled():
|
||||
# Due to lack of interest, tomorrow has been cancelled
|
||||
return
|
||||
|
||||
try:
|
||||
etcd_result = changed_key_future.result()
|
||||
except ReadTimeoutError:
|
||||
return
|
||||
|
||||
change_callback(etcd_result)
|
||||
|
||||
if not self._shutting_down:
|
||||
workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True,
|
||||
timeout=ETCD_DISABLE_TIMEOUT)
|
||||
workers_future.add_done_callback(self._handle_key_expiration)
|
||||
logger.debug('Scheduling watch task.')
|
||||
self._worker_watch_task = async(workers_future)
|
||||
watch_future = self._etcd_client.watch(etcd_key, recursive=recursive,
|
||||
timeout=ETCD_DISABLE_TIMEOUT)
|
||||
watch_future.add_done_callback(callback_wrapper)
|
||||
logger.debug('Scheduling watch of key: %s%s', etcd_key, '/*' if recursive else '')
|
||||
self._watch_tasks[watch_task_key] = async(watch_future)
|
||||
|
||||
def _handle_key_expiration(self, changed_key_future):
|
||||
""" Handle when a builder expires
|
||||
"""
|
||||
if self._worker_watch_task is None or self._worker_watch_task.done():
|
||||
self._watch_builders()
|
||||
|
||||
if changed_key_future.cancelled():
|
||||
# Due to lack of interest, tomorrow has been cancelled
|
||||
return
|
||||
|
||||
try:
|
||||
etcd_result = changed_key_future.result()
|
||||
except ReadTimeoutError:
|
||||
return
|
||||
|
||||
if etcd_result.action == ETCD_EXPIRE_RESULT:
|
||||
def _handle_builder_expiration(self, etcd_result):
|
||||
if etcd_result.action == EtcdAction.EXPIRE:
|
||||
# Handle the expiration
|
||||
logger.debug('Builder expired, clean up the old build node')
|
||||
job_metadata = json.loads(etcd_result._prev_node.value)
|
||||
async(self._clean_up_old_builder(etcd_result.key, job_metadata))
|
||||
|
||||
def _handle_realm_change(self, etcd_result):
|
||||
if etcd_result.action == EtcdAction.SET:
|
||||
# We must listen on the realm created by ourselves or another worker
|
||||
realm_spec = json.loads(etcd_result.value)
|
||||
component = self.register_component(realm_spec['realm'], BuildComponent,
|
||||
token=realm_spec['token'])
|
||||
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
|
||||
self._component_to_job[component] = build_job
|
||||
self._component_to_builder[component] = realm_spec['builder_id']
|
||||
self._job_uuid_to_component[build_job.job_details['build_uuid']] = component
|
||||
|
||||
elif etcd_result.action == EtcdAction.DELETE or etcd_result.action == EtcdAction.EXPIRE:
|
||||
# We must stop listening for new connections on the specified realm, if we did not get the
|
||||
# connection
|
||||
realm_spec = json.loads(etcd_result._prev_node.value)
|
||||
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
|
||||
component = self._job_uuid_to_component.pop(build_job.job_details['build_uuid'], None)
|
||||
if component is not None:
|
||||
# We were not the manager which the worker connected to, remove the bookkeeping for it
|
||||
logger.debug('Unregistering unused component on realm: %s', realm_spec['realm'])
|
||||
del self._component_to_job[component]
|
||||
del self._component_to_builder[component]
|
||||
self.unregister_component(component)
|
||||
|
||||
else:
|
||||
logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key)
|
||||
|
||||
def initialize(self, manager_config):
|
||||
logger.debug('Calling initialize')
|
||||
self._manager_config = manager_config
|
||||
|
||||
executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor)
|
||||
self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}),
|
||||
self.public_ip_address)
|
||||
self.manager_hostname)
|
||||
|
||||
etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
|
||||
etcd_port = self._manager_config.get('ETCD_PORT', 2379)
|
||||
|
@ -97,7 +139,8 @@ class EphemeralBuilderManager(BaseManager):
|
|||
self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port),
|
||||
executor=self._async_thread_executor)
|
||||
|
||||
self._watch_builders()
|
||||
self._watch_etcd(ETCD_BUILDER_PREFIX, self._handle_builder_expiration)
|
||||
self._watch_etcd(ETCD_REALM_PREFIX, self._handle_realm_change)
|
||||
|
||||
def setup_time(self):
|
||||
setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
|
||||
|
@ -108,17 +151,17 @@ class EphemeralBuilderManager(BaseManager):
|
|||
logger.debug('Shutting down worker.')
|
||||
self._shutting_down = True
|
||||
|
||||
if self._worker_watch_task is not None:
|
||||
logger.debug('Canceling watch task.')
|
||||
self._worker_watch_task.cancel()
|
||||
self._worker_watch_task = None
|
||||
for (etcd_key, _), task in self._watch_tasks.items():
|
||||
if not task.done():
|
||||
logger.debug('Canceling watch task for %s', etcd_key)
|
||||
task.cancel()
|
||||
|
||||
if self._async_thread_executor is not None:
|
||||
logger.debug('Shutting down thread pool executor.')
|
||||
self._async_thread_executor.shutdown()
|
||||
|
||||
@coroutine
|
||||
def schedule(self, build_job, loop):
|
||||
def schedule(self, build_job):
|
||||
logger.debug('Calling schedule with job: %s', build_job.job_details['build_uuid'])
|
||||
|
||||
# Check if there are worker slots avialable by checking the number of jobs in etcd
|
||||
|
@ -154,8 +197,6 @@ class EphemeralBuilderManager(BaseManager):
|
|||
|
||||
try:
|
||||
yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=False, ttl=ttl))
|
||||
component = self.register_component(realm, BuildComponent, token=token)
|
||||
self._component_to_job[component] = build_job
|
||||
except KeyError:
|
||||
# The job was already taken by someone else, we are probably a retry
|
||||
logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?')
|
||||
|
@ -163,20 +204,38 @@ class EphemeralBuilderManager(BaseManager):
|
|||
|
||||
logger.debug('Starting builder with executor: %s', self._executor)
|
||||
builder_id = yield From(self._executor.start_builder(realm, token))
|
||||
self._component_to_builder[component] = builder_id
|
||||
|
||||
# Store the builder in etcd associated with the job id
|
||||
payload['builder_id'] = builder_id
|
||||
yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl))
|
||||
|
||||
# Store the realm spec which will allow any manager to accept this builder when it connects
|
||||
realm_spec = json.dumps({
|
||||
'realm': realm,
|
||||
'token': token,
|
||||
'builder_id': builder_id,
|
||||
'job_queue_item': build_job.job_item,
|
||||
})
|
||||
try:
|
||||
yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False,
|
||||
ttl=ttl))
|
||||
except KeyError:
|
||||
logger.error('Realm already exists in etcd. UUID collision or something is very very wrong.')
|
||||
raise Return(False)
|
||||
|
||||
raise Return(True)
|
||||
|
||||
def build_component_ready(self, build_component, loop):
|
||||
@coroutine
|
||||
def build_component_ready(self, build_component):
|
||||
try:
|
||||
# Clean up the bookkeeping for allowing any manager to take the job
|
||||
job = self._component_to_job.pop(build_component)
|
||||
del self._job_uuid_to_component[job.job_details['build_uuid']]
|
||||
yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm)))
|
||||
|
||||
logger.debug('Sending build %s to newly ready component on realm %s',
|
||||
job.job_details['build_uuid'], build_component.builder_realm)
|
||||
loop.call_soon(build_component.start_build, job)
|
||||
yield From(build_component.start_build(job))
|
||||
except KeyError:
|
||||
logger.warning('Builder is asking for more work, but work already completed')
|
||||
|
||||
|
@ -240,6 +299,12 @@ class EphemeralBuilderManager(BaseManager):
|
|||
"""
|
||||
return os.path.join(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid'])
|
||||
|
||||
@staticmethod
|
||||
def _etcd_realm_key(realm):
|
||||
""" Create a key which is used to track an incoming connection on a realm.
|
||||
"""
|
||||
return os.path.join(ETCD_REALM_PREFIX, realm)
|
||||
|
||||
def num_workers(self):
|
||||
""" Return the number of workers we're managing locally.
|
||||
"""
|
||||
|
|
|
@ -29,9 +29,9 @@ class ExecutorException(Exception):
|
|||
|
||||
|
||||
class BuilderExecutor(object):
|
||||
def __init__(self, executor_config, manager_public_ip):
|
||||
def __init__(self, executor_config, manager_hostname):
|
||||
self.executor_config = executor_config
|
||||
self.manager_public_ip = manager_public_ip
|
||||
self.manager_hostname = manager_hostname
|
||||
|
||||
""" Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
|
||||
starting and stopping builders.
|
||||
|
@ -52,7 +52,7 @@ class BuilderExecutor(object):
|
|||
def get_manager_websocket_url(self):
|
||||
return 'ws://{0}:'
|
||||
|
||||
def generate_cloud_config(self, realm, token, coreos_channel, manager_ip,
|
||||
def generate_cloud_config(self, realm, token, coreos_channel, manager_hostname,
|
||||
quay_username=None, quay_password=None, etcd_token=None):
|
||||
if quay_username is None:
|
||||
quay_username = self.executor_config['QUAY_USERNAME']
|
||||
|
@ -69,7 +69,7 @@ class BuilderExecutor(object):
|
|||
quay_username=quay_username,
|
||||
quay_password=quay_password,
|
||||
etcd_token=etcd_token,
|
||||
manager_ip=manager_ip,
|
||||
manager_hostname=manager_hostname,
|
||||
coreos_channel=coreos_channel,
|
||||
)
|
||||
|
||||
|
@ -108,7 +108,7 @@ class EC2Executor(BuilderExecutor):
|
|||
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
|
||||
get_ami_callable = partial(self._get_coreos_ami, region, channel)
|
||||
coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
|
||||
user_data = self.generate_cloud_config(realm, token, channel, self.manager_public_ip)
|
||||
user_data = self.generate_cloud_config(realm, token, channel, self.manager_hostname)
|
||||
|
||||
logger.debug('Generated cloud config: %s', user_data)
|
||||
|
||||
|
@ -155,10 +155,10 @@ class EC2Executor(BuilderExecutor):
|
|||
class PopenExecutor(BuilderExecutor):
|
||||
""" Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
|
||||
"""
|
||||
def __init__(self, executor_config, manager_public_ip):
|
||||
def __init__(self, executor_config, manager_hostname):
|
||||
self._jobs = {}
|
||||
|
||||
super(PopenExecutor, self).__init__(executor_config, manager_public_ip)
|
||||
super(PopenExecutor, self).__init__(executor_config, manager_hostname)
|
||||
|
||||
""" Executor which uses Popen to fork a quay-builder process.
|
||||
"""
|
||||
|
|
Reference in a new issue