Generalize the ephemeral build managers so that any manager may manage a builder spawned by any other manager.

Jake Moshenko 2014-12-31 11:33:56 -05:00
parent ccb19571d6
commit cc70225043
11 changed files with 258 additions and 125 deletions
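In summary: previously the manager that scheduled a job registered its WAMP component immediately, so only that manager could service the builder it spawned. Now the scheduling manager writes a realm spec (realm, token, builder id, and the JSON-serializable queue item) to a shared etcd key under realm/; every manager watches that prefix and registers a component when the key appears, and whichever manager the builder actually connects to deletes the key, prompting the rest to unregister their unused components. The diffs below implement that handoff.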

View file

@@ -41,8 +41,11 @@ def run_build_manager():
   if manager_klass is None:
     return

-  public_ip = os.environ.get('PUBLIC_IP', '127.0.0.1')
-  logger.debug('Will pass public IP address %s to builders for websocket connection', public_ip)
+  manager_hostname = os.environ.get('BUILDMAN_HOSTNAME',
+                                    app.config.get('BUILDMAN_HOSTNAME',
+                                                   app.config['SERVER_HOSTNAME']))
+  logger.debug('Will pass buildman hostname %s to builders for websocket connection',
+               manager_hostname)

   logger.debug('Starting build manager with lifecycle "%s"', build_manager_config[0])
   ssl_context = None
@@ -53,7 +56,7 @@ def run_build_manager():
                             os.path.join(os.environ.get('SSL_CONFIG'), 'ssl.key'))

   server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs,
-                         user_files, manager_klass, build_manager_config[1], public_ip)
+                         user_files, manager_klass, build_manager_config[1], manager_hostname)
   server.run('0.0.0.0', ssl=ssl_context)

 if __name__ == '__main__':
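The hostname passed to builders is now resolved with a clear precedence: the BUILDMAN_HOSTNAME environment variable, then the BUILDMAN_HOSTNAME config key, then the existing SERVER_HOSTNAME, replacing the old PUBLIC_IP variable and its 127.0.0.1 default.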

View file

@@ -9,6 +9,7 @@ from autobahn.wamp.exception import ApplicationError

 from buildman.server import BuildJobResult
 from buildman.component.basecomponent import BaseComponent
+from buildman.jobutil.buildjob import BuildJobLoadException
 from buildman.jobutil.buildpack import BuildPackage, BuildPackageException
 from buildman.jobutil.buildstatus import StatusHandler
 from buildman.jobutil.workererror import WorkerError
@@ -58,19 +59,20 @@ class BuildComponent(BaseComponent):
     yield trollius.From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat'))
     yield trollius.From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage'))

-    self._set_status(ComponentStatus.WAITING)
+    yield trollius.From(self._set_status(ComponentStatus.WAITING))

   def is_ready(self):
     """ Determines whether a build component is ready to begin a build. """
     return self._component_status == ComponentStatus.RUNNING

+  @trollius.coroutine
   def start_build(self, build_job):
     """ Starts a build. """
     self._current_job = build_job
     self._build_status = StatusHandler(self.build_logs, build_job.repo_build.uuid)
     self._image_info = {}

-    self._set_status(ComponentStatus.BUILDING)
+    yield trollius.From(self._set_status(ComponentStatus.BUILDING))

     # Retrieve the job's buildpack.
     buildpack_url = self.user_files.get_file_url(build_job.repo_build.resource_key,
@@ -82,23 +84,27 @@ class BuildComponent(BaseComponent):
       buildpack = BuildPackage.from_url(buildpack_url)
     except BuildPackageException as bpe:
       self._build_failure('Could not retrieve build package', bpe)
-      return
+      raise trollius.Return()

     # Extract the base image information from the Dockerfile.
     parsed_dockerfile = None
     logger.debug('Parsing dockerfile')

-    build_config = build_job.build_config
+    try:
+      build_config = build_job.build_config
+    except BuildJobLoadException as irbe:
+      self._build_failure('Could not load build job information', irbe)

     try:
       parsed_dockerfile = buildpack.parse_dockerfile(build_config.get('build_subdir'))
     except BuildPackageException as bpe:
       self._build_failure('Could not find Dockerfile in build package', bpe)
-      return
+      raise trollius.Return()

     image_and_tag_tuple = parsed_dockerfile.get_image_and_tag()
     if image_and_tag_tuple is None or image_and_tag_tuple[0] is None:
       self._build_failure('Missing FROM line in Dockerfile')
-      return
+      raise trollius.Return()

     base_image_information = {
       'repository': image_and_tag_tuple[0],
@@ -147,9 +153,7 @@ class BuildComponent(BaseComponent):
     logger.debug('Invoking build: %s', self.builder_realm)
     logger.debug('With Arguments: %s', build_arguments)

-    return (self
-            .call("io.quay.builder.build", **build_arguments)
-            .add_done_callback(self._build_complete))
+    self.call("io.quay.builder.build", **build_arguments).add_done_callback(self._build_complete)

   @staticmethod
   def _total_completion(statuses, total_images):
@@ -276,38 +280,42 @@ class BuildComponent(BaseComponent):
     self._current_job = None

     # Set the component back to a running state.
-    self._set_status(ComponentStatus.RUNNING)
+    yield trollius.From(self._set_status(ComponentStatus.RUNNING))

   @staticmethod
   def _ping():
     """ Ping pong. """
     return 'pong'

+  @trollius.coroutine
   def _on_ready(self, token, version):
     if not version in SUPPORTED_WORKER_VERSIONS:
-      logger.warning('Build component (token "%s") is running an out-of-date version: %s', version)
-      return False
+      logger.warning('Build component (token "%s") is running an out-of-date version: %s', token,
+                     version)
+      raise trollius.Return(False)

     if self._component_status != 'waiting':
       logger.warning('Build component (token "%s") is already connected', self.expected_token)
-      return False
+      raise trollius.Return(False)

     if token != self.expected_token:
-      logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token, token)
-      return False
+      logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token,
+                     token)
+      raise trollius.Return(False)

-    self._set_status(ComponentStatus.RUNNING)
+    yield trollius.From(self._set_status(ComponentStatus.RUNNING))

     # Start the heartbeat check and updating loop.
     loop = trollius.get_event_loop()
     loop.create_task(self._heartbeat())
     logger.debug('Build worker %s is connected and ready', self.builder_realm)
-    return True
+    raise trollius.Return(True)

+  @trollius.coroutine
   def _set_status(self, phase):
     if phase == ComponentStatus.RUNNING:
       loop = trollius.get_event_loop()
-      self.parent_manager.build_component_ready(self, loop)
+      yield trollius.From(self.parent_manager.build_component_ready(self, loop))

     self._component_status = phase
@@ -344,13 +352,14 @@ class BuildComponent(BaseComponent):
       logger.debug('Checking heartbeat on realm %s', self.builder_realm)
       if (self._last_heartbeat and
           self._last_heartbeat < datetime.datetime.utcnow() - HEARTBEAT_DELTA):
-        self._timeout()
+        yield trollius.From(self._timeout())
         return

       yield trollius.From(trollius.sleep(HEARTBEAT_TIMEOUT))

+  @trollius.coroutine
   def _timeout(self):
-    self._set_status(ComponentStatus.TIMED_OUT)
+    yield trollius.From(self._set_status(ComponentStatus.TIMED_OUT))
     logger.warning('Build component with realm %s has timed out', self.builder_realm)
     self._dispose(timed_out=True)
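A note for readers unfamiliar with trollius (the asyncio backport Quay runs on Python 2, used throughout these files): inside a Python 2 generator a plain `return value` is a syntax error, so trollius coroutines return values with `raise trollius.Return(value)`, and awaiting is spelled `yield trollius.From(...)`. That is why every `return False`/`return True` in `_on_ready` becomes a `raise trollius.Return(...)` once the method gains `@trollius.coroutine`. A minimal, self-contained sketch of the convention:

import trollius

@trollius.coroutine
def ping():
  # awaits like `await asyncio.sleep(0.1)` would in modern asyncio
  yield trollius.From(trollius.sleep(0.1))
  # hands 'pong' back to the caller; Python 2 generators cannot `return` a value
  raise trollius.Return('pong')

loop = trollius.get_event_loop()
print(loop.run_until_complete(ping()))  # prints: pong
loop.close()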

View file

@@ -1,6 +1,9 @@
+import json
+
+from cachetools import lru_cache
+
 from data import model
-import json

 class BuildJobLoadException(Exception):
   """ Exception raised if a build job could not be instantiated for some reason. """
@@ -18,14 +21,22 @@ class BuildJob(object):
         'Could not parse build queue item config with ID %s' % self.job_details['build_uuid']
       )

+  @lru_cache(maxsize=1)
+  def _load_repo_build(self):
     try:
-      self.repo_build = model.get_repository_build(self.job_details['build_uuid'])
+      return model.get_repository_build(self.job_details['build_uuid'])
     except model.InvalidRepositoryBuildException:
       raise BuildJobLoadException(
         'Could not load repository build with ID %s' % self.job_details['build_uuid'])

+  @property
+  def repo_build(self):
+    return self._load_repo_build()
+
+  @property
+  def build_config(self):
     try:
-      self.build_config = json.loads(self.repo_build.job_config)
+      return json.loads(self.repo_build.job_config)
     except ValueError:
       raise BuildJobLoadException(
         'Could not parse repository build job config with ID %s' % self.job_details['build_uuid']
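The BuildJob change replaces eager attribute assignment with lazy, memoized accessors: `repo_build` is now a property backed by `_load_repo_build`, whose `@lru_cache(maxsize=1)` keeps the database row from being fetched more than once. This matters for the new flow, where every manager constructs a BuildJob from the realm spec but most of them never touch the build row. A rough equivalent of the pattern, with hypothetical names and plain memoization in place of cachetools:

class LazyJob(object):
  """ Hypothetical stand-in for BuildJob's lazy-load pattern. """
  def __init__(self, build_uuid):
    self.build_uuid = build_uuid
    self._repo_build = None

  def _load_repo_build(self):
    # Stands in for model.get_repository_build(...), the expensive database hit.
    return {'uuid': self.build_uuid}

  @property
  def repo_build(self):
    # Fetch on first access, reuse the cached value afterwards.
    if self._repo_build is None:
      self._repo_build = self._load_repo_build()
    return self._repo_build

One caveat of the cachetools variant in the diff: `lru_cache` keys on its arguments (here just `self`) and `maxsize=1` holds a single entry, so the cache only retains the result for the most recently accessed instance.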

View file

@@ -3,12 +3,12 @@ from trollius import coroutine

 class BaseManager(object):
   """ Base for all worker managers. """
   def __init__(self, register_component, unregister_component, job_heartbeat_callback,
-               job_complete_callback, public_ip_address, heartbeat_period_sec):
+               job_complete_callback, manager_hostname, heartbeat_period_sec):
     self.register_component = register_component
     self.unregister_component = unregister_component
     self.job_heartbeat_callback = job_heartbeat_callback
     self.job_complete_callback = job_complete_callback
-    self.public_ip_address = public_ip_address
+    self.manager_hostname = manager_hostname
     self.heartbeat_period_sec = heartbeat_period_sec

   @coroutine
@@ -31,7 +31,7 @@ class BaseManager(object):
     raise NotImplementedError

   @coroutine
-  def schedule(self, build_job, loop):
+  def schedule(self, build_job):
     """ Schedules a queue item to be built. Returns True if the item was properly scheduled
         and False if all workers are busy.
     """
@@ -42,7 +42,8 @@ class BaseManager(object):
     """
     raise NotImplementedError

-  def build_component_ready(self, build_component, loop):
+  @coroutine
+  def build_component_ready(self, build_component):
     """ Method invoked whenever a build component announces itself as ready.
     """
     raise NotImplementedError

View file

@@ -5,7 +5,7 @@ from buildman.component.basecomponent import BaseComponent
 from buildman.component.buildcomponent import BuildComponent
 from buildman.manager.basemanager import BaseManager

-from trollius.coroutines import From, Return, coroutine
+from trollius import From, Return, coroutine, async

 REGISTRATION_REALM = 'registration'

 logger = logging.getLogger(__name__)
@@ -51,16 +51,19 @@ class EnterpriseManager(BaseManager):
     return realm

   @coroutine
-  def schedule(self, build_job, loop):
+  def schedule(self, build_job):
     """ Schedules a build for an Enterprise Registry. """
     if self.shutting_down or not self.ready_components:
       raise Return(False)

     component = self.ready_components.pop()

-    loop.call_soon(component.start_build, build_job)
+    yield From(component.start_build(build_job))

     raise Return(True)

-  def build_component_ready(self, build_component, loop):
+  @coroutine
+  def build_component_ready(self, build_component):
     self.ready_components.add(build_component)

   def shutdown(self):
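Note `schedule` losing its `loop` parameter here and in the base class: `start_build` is now a coroutine, so the enterprise manager awaits it with `yield From(...)` instead of scheduling it on the event loop with `loop.call_soon`.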

View file

@@ -13,16 +13,28 @@ from urllib3.exceptions import ReadTimeoutError
 from buildman.manager.basemanager import BaseManager
 from buildman.manager.executor import PopenExecutor, EC2Executor
 from buildman.component.buildcomponent import BuildComponent
+from buildman.jobutil.buildjob import BuildJob
 from buildman.asyncutil import AsyncWrapper
+from util.morecollections import AttrDict

 logger = logging.getLogger(__name__)

 ETCD_BUILDER_PREFIX = 'building/'
-ETCD_EXPIRE_RESULT = 'expire'
+ETCD_REALM_PREFIX = 'realm/'
 ETCD_DISABLE_TIMEOUT = 0

+class EtcdAction(object):
+  GET = 'get'
+  SET = 'set'
+  EXPIRE = 'expire'
+  UPDATE = 'update'
+  DELETE = 'delete'
+  CREATE = 'create'
+  COMPARE_AND_SWAP = 'compareAndSwap'
+  COMPARE_AND_DELETE = 'compareAndDelete'
+
 class EphemeralBuilderManager(BaseManager):
   """ Build manager implementation for the Enterprise Registry. """
@@ -41,52 +53,82 @@ class EphemeralBuilderManager(BaseManager):
     self._etcd_client = None

     self._component_to_job = {}
+    self._job_uuid_to_component = {}
     self._component_to_builder = {}

     self._executor = None

-    self._worker_watch_task = None
+    # Map of etcd keys being watched to the tasks watching them
+    self._watch_tasks = {}

     super(EphemeralBuilderManager, self).__init__(*args, **kwargs)

-  def _watch_builders(self):
-    """ Watch the builders key for expirations.
-    """
+  def _watch_etcd(self, etcd_key, change_callback, recursive=True):
+    watch_task_key = (etcd_key, recursive)
+    def callback_wrapper(changed_key_future):
+      if watch_task_key not in self._watch_tasks or self._watch_tasks[watch_task_key].done():
+        self._watch_etcd(etcd_key, change_callback)
+
+      if changed_key_future.cancelled():
+        # Due to lack of interest, tomorrow has been cancelled
+        return
+
+      try:
+        etcd_result = changed_key_future.result()
+      except ReadTimeoutError:
+        return
+
+      change_callback(etcd_result)
+
     if not self._shutting_down:
-      workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True,
-                                               timeout=ETCD_DISABLE_TIMEOUT)
-      workers_future.add_done_callback(self._handle_key_expiration)
-      logger.debug('Scheduling watch task.')
-      self._worker_watch_task = async(workers_future)
+      watch_future = self._etcd_client.watch(etcd_key, recursive=recursive,
+                                             timeout=ETCD_DISABLE_TIMEOUT)
+      watch_future.add_done_callback(callback_wrapper)
+      logger.debug('Scheduling watch of key: %s%s', etcd_key, '/*' if recursive else '')
+      self._watch_tasks[watch_task_key] = async(watch_future)

-  def _handle_key_expiration(self, changed_key_future):
-    """ Handle when a builder expires
-    """
-    if self._worker_watch_task is None or self._worker_watch_task.done():
-      self._watch_builders()
-
-    if changed_key_future.cancelled():
-      # Due to lack of interest, tomorrow has been cancelled
-      return
-
-    try:
-      etcd_result = changed_key_future.result()
-    except ReadTimeoutError:
-      return
-
-    if etcd_result.action == ETCD_EXPIRE_RESULT:
+  def _handle_builder_expiration(self, etcd_result):
+    if etcd_result.action == EtcdAction.EXPIRE:
       # Handle the expiration
       logger.debug('Builder expired, clean up the old build node')
       job_metadata = json.loads(etcd_result._prev_node.value)
       async(self._clean_up_old_builder(etcd_result.key, job_metadata))

+  def _handle_realm_change(self, etcd_result):
+    if etcd_result.action == EtcdAction.SET:
+      # We must listen on the realm created by ourselves or another worker
+      realm_spec = json.loads(etcd_result.value)
+      component = self.register_component(realm_spec['realm'], BuildComponent,
+                                          token=realm_spec['token'])
+      build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
+      self._component_to_job[component] = build_job
+      self._component_to_builder[component] = realm_spec['builder_id']
+      self._job_uuid_to_component[build_job.job_details['build_uuid']] = component
+
+    elif etcd_result.action == EtcdAction.DELETE or etcd_result.action == EtcdAction.EXPIRE:
+      # We must stop listening for new connections on the specified realm, if we did not get the
+      # connection
+      realm_spec = json.loads(etcd_result._prev_node.value)
+      build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
+      component = self._job_uuid_to_component.pop(build_job.job_details['build_uuid'], None)
+      if component is not None:
+        # We were not the manager which the worker connected to, remove the bookkeeping for it
+        logger.debug('Unregistering unused component on realm: %s', realm_spec['realm'])
+        del self._component_to_job[component]
+        del self._component_to_builder[component]
+        self.unregister_component(component)
+
+    else:
+      logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key)
+
   def initialize(self, manager_config):
     logger.debug('Calling initialize')
     self._manager_config = manager_config

     executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor)
     self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}),
-                                    self.public_ip_address)
+                                    self.manager_hostname)

     etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
     etcd_port = self._manager_config.get('ETCD_PORT', 2379)
@@ -97,7 +139,8 @@ class EphemeralBuilderManager(BaseManager):
     self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port),
                                      executor=self._async_thread_executor)

-    self._watch_builders()
+    self._watch_etcd(ETCD_BUILDER_PREFIX, self._handle_builder_expiration)
+    self._watch_etcd(ETCD_REALM_PREFIX, self._handle_realm_change)

   def setup_time(self):
     setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
@@ -108,17 +151,17 @@ class EphemeralBuilderManager(BaseManager):
     logger.debug('Shutting down worker.')
     self._shutting_down = True

-    if self._worker_watch_task is not None:
-      logger.debug('Canceling watch task.')
-      self._worker_watch_task.cancel()
-      self._worker_watch_task = None
+    for (etcd_key, _), task in self._watch_tasks.items():
+      if not task.done():
+        logger.debug('Canceling watch task for %s', etcd_key)
+        task.cancel()

     if self._async_thread_executor is not None:
       logger.debug('Shutting down thread pool executor.')
       self._async_thread_executor.shutdown()

   @coroutine
-  def schedule(self, build_job, loop):
+  def schedule(self, build_job):
     logger.debug('Calling schedule with job: %s', build_job.job_details['build_uuid'])

     # Check if there are worker slots avialable by checking the number of jobs in etcd
@@ -154,8 +197,6 @@ class EphemeralBuilderManager(BaseManager):
     try:
       yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=False, ttl=ttl))
-      component = self.register_component(realm, BuildComponent, token=token)
-      self._component_to_job[component] = build_job
     except KeyError:
       # The job was already taken by someone else, we are probably a retry
       logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?')
@@ -163,20 +204,38 @@ class EphemeralBuilderManager(BaseManager):
     logger.debug('Starting builder with executor: %s', self._executor)
     builder_id = yield From(self._executor.start_builder(realm, token))
-    self._component_to_builder[component] = builder_id

     # Store the builder in etcd associated with the job id
     payload['builder_id'] = builder_id
     yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl))

+    # Store the realm spec which will allow any manager to accept this builder when it connects
+    realm_spec = json.dumps({
+      'realm': realm,
+      'token': token,
+      'builder_id': builder_id,
+      'job_queue_item': build_job.job_item,
+    })
+    try:
+      yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False,
+                                         ttl=ttl))
+    except KeyError:
+      logger.error('Realm already exists in etcd. UUID collision or something is very very wrong.')
+      raise Return(False)
+
     raise Return(True)

-  def build_component_ready(self, build_component, loop):
+  @coroutine
+  def build_component_ready(self, build_component):
     try:
+      # Clean up the bookkeeping for allowing any manager to take the job
       job = self._component_to_job.pop(build_component)
+      del self._job_uuid_to_component[job.job_details['build_uuid']]
+      yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm)))
+
       logger.debug('Sending build %s to newly ready component on realm %s',
                    job.job_details['build_uuid'], build_component.builder_realm)
-      loop.call_soon(build_component.start_build, job)
+      yield From(build_component.start_build(job))
     except KeyError:
       logger.warning('Builder is asking for more work, but work already completed')
@@ -240,6 +299,12 @@ class EphemeralBuilderManager(BaseManager):
     """
     return os.path.join(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid'])

+  @staticmethod
+  def _etcd_realm_key(realm):
+    """ Create a key which is used to track an incoming connection on a realm.
+    """
+    return os.path.join(ETCD_REALM_PREFIX, realm)
+
   def num_workers(self):
     """ Return the number of workers we're managing locally.
     """

View file

@@ -29,9 +29,9 @@ class ExecutorException(Exception):

 class BuilderExecutor(object):
-  def __init__(self, executor_config, manager_public_ip):
+  def __init__(self, executor_config, manager_hostname):
     self.executor_config = executor_config
-    self.manager_public_ip = manager_public_ip
+    self.manager_hostname = manager_hostname

   """ Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
       starting and stopping builders.
@@ -52,7 +52,7 @@ class BuilderExecutor(object):
   def get_manager_websocket_url(self):
     return 'ws://{0}:'

-  def generate_cloud_config(self, realm, token, coreos_channel, manager_ip,
+  def generate_cloud_config(self, realm, token, coreos_channel, manager_hostname,
                             quay_username=None, quay_password=None, etcd_token=None):
     if quay_username is None:
       quay_username = self.executor_config['QUAY_USERNAME']
@@ -69,7 +69,7 @@ class BuilderExecutor(object):
       quay_username=quay_username,
       quay_password=quay_password,
       etcd_token=etcd_token,
-      manager_ip=manager_ip,
+      manager_hostname=manager_hostname,
       coreos_channel=coreos_channel,
     )
@@ -108,7 +108,7 @@ class EC2Executor(BuilderExecutor):
     channel = self.executor_config.get('COREOS_CHANNEL', 'stable')

     get_ami_callable = partial(self._get_coreos_ami, region, channel)
     coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
-    user_data = self.generate_cloud_config(realm, token, channel, self.manager_public_ip)
+    user_data = self.generate_cloud_config(realm, token, channel, self.manager_hostname)

     logger.debug('Generated cloud config: %s', user_data)
@@ -155,10 +155,10 @@ class EC2Executor(BuilderExecutor):
 class PopenExecutor(BuilderExecutor):
   """ Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
   """
-  def __init__(self, executor_config, manager_public_ip):
+  def __init__(self, executor_config, manager_hostname):
     self._jobs = {}

-    super(PopenExecutor, self).__init__(executor_config, manager_public_ip)
+    super(PopenExecutor, self).__init__(executor_config, manager_hostname)

   """ Executor which uses Popen to fork a quay-builder process.
   """

View file

@@ -37,7 +37,7 @@ class BuilderServer(object):
       controller.
   """
   def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass,
-               lifecycle_manager_config, manager_public_ip):
+               lifecycle_manager_config, manager_hostname):
     self._loop = None
     self._current_status = 'starting'
     self._current_components = []
@@ -53,7 +53,7 @@ class BuilderServer(object):
       self._unregister_component,
       self._job_heartbeat,
       self._job_complete,
-      manager_public_ip,
+      manager_hostname,
       HEARTBEAT_PERIOD_SEC,
     )
     self._lifecycle_manager_config = lifecycle_manager_config
@@ -158,7 +158,7 @@ class BuilderServer(object):
         self._queue.incomplete(job_item, restore_retry=False)

       logger.debug('Build job found. Checking for an avaliable worker.')
-      scheduled = yield From(self._lifecycle_manager.schedule(build_job, self._loop))
+      scheduled = yield From(self._lifecycle_manager.schedule(build_job))
       if scheduled:
         self._job_count = self._job_count + 1
         logger.debug('Build job scheduled. Running: %s', self._job_count)
@@ -168,7 +168,6 @@ class BuilderServer(object):
       yield From(trollius.sleep(WORK_CHECK_TIMEOUT))

   @trollius.coroutine
   def _initialize(self, loop, host, ssl=None):
     self._loop = loop

View file

@@ -6,7 +6,7 @@ write_files:
   content: |
     REALM={{ realm }}
     TOKEN={{ token }}
-    ENDPOINT=wss://buildman.quay.io:8787
+    SERVER=wss://{{ manager_hostname }}

 coreos:
   update:
@@ -31,7 +31,6 @@ coreos:
       [Service]
       TimeoutStartSec=600
       TimeoutStopSec=2000
-      ExecStartPre=/bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0"
      ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io
      ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest
      ExecStop=/usr/bin/docker stop quay-builder
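This template change removes the last hard dependency on a fixed manager address: instead of dialing wss://buildman.quay.io:8787 and relying on an /etc/hosts line to point that name at one manager's IP, the builder connects straight to wss://{{ manager_hostname }} (the env var is also renamed from ENDPOINT to SERVER). A rendered overrides file would then contain something like the following, with hypothetical values:

REALM=1234-realm
TOKEN=beef
SERVER=wss://builds.example.com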

View file

@@ -78,7 +78,8 @@ class WorkQueue(object):
   def get(self, processing_time=300):
     """
     Get an available item and mark it as unavailable for the default of five
-    minutes.
+    minutes. The result of this method must always be composed of simple
+    python objects which are JSON serializable for network portability reasons.
     """
     now = datetime.utcnow()
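This docstring addition is load-bearing for the handoff: `schedule()` embeds the entire queue item into the realm spec as `job_queue_item`, and a different manager later reconstructs the job with `BuildJob(AttrDict(realm_spec['job_queue_item']))`, so any non-JSON-serializable value in a queue item would break cross-manager scheduling.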

View file

@@ -4,19 +4,20 @@ import os.path
 import time
 import json

-from trollius import coroutine, get_event_loop, From, Future, sleep
+from trollius import coroutine, get_event_loop, From, Future, sleep, Return
 from mock import Mock
 from threading import Event
 from urllib3.exceptions import ReadTimeoutError

 from buildman.manager.executor import BuilderExecutor
 from buildman.manager.ephemeral import (EphemeralBuilderManager, ETCD_BUILDER_PREFIX,
-                                        ETCD_EXPIRE_RESULT)
+                                        ETCD_REALM_PREFIX, EtcdAction)
 from buildman.server import BuildJobResult
 from buildman.component.buildcomponent import BuildComponent

 BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead'
+REALM_ID = '1234-realm'

 def async_test(f):
@@ -43,17 +44,17 @@ class TestEphemeral(unittest.TestCase):
     self.etcd_client_mock.watch = Mock(side_effect=hang_until_event)
     return self.etcd_client_mock

-  def _create_mock_executor(self, *args, **kwargs):
-    def create_completed_future(result=None):
-      def inner(*args, **kwargs):
-        new_future = Future()
-        new_future.set_result(result)
-        return new_future
-      return inner
+  def _create_completed_future(self, result=None):
+    def inner(*args, **kwargs):
+      new_future = Future()
+      new_future.set_result(result)
+      return new_future
+    return inner

+  def _create_mock_executor(self, *args, **kwargs):
     self.test_executor = Mock(spec=BuilderExecutor)
-    self.test_executor.start_builder = Mock(side_effect=create_completed_future('123'))
-    self.test_executor.stop_builder = Mock(side_effect=create_completed_future())
+    self.test_executor.start_builder = Mock(side_effect=self._create_completed_future('123'))
+    self.test_executor.stop_builder = Mock(side_effect=self._create_completed_future())
     return self.test_executor

   def _create_build_job(self):
@@ -61,6 +62,10 @@ class TestEphemeral(unittest.TestCase):
     mock_job.job_details = {
       'build_uuid': BUILD_UUID,
     }
+    mock_job.job_item = {
+      'body': json.dumps(mock_job.job_details),
+      'id': 1,
+    }
     return mock_job

   def setUp(self):
@@ -71,13 +76,13 @@ class TestEphemeral(unittest.TestCase):
     self.etcd_wait_event.clear()

     self.register_component_callback = Mock()
-    self.uniregister_component_callback = Mock()
+    self.unregister_component_callback = Mock()
     self.job_heartbeat_callback = Mock()
     self.job_complete_callback = Mock()

     self.manager = EphemeralBuilderManager(
       self.register_component_callback,
-      self.uniregister_component_callback,
+      self.unregister_component_callback,
       self.job_heartbeat_callback,
       self.job_complete_callback,
       '127.0.0.1',
@@ -97,15 +102,19 @@ class TestEphemeral(unittest.TestCase):
     del EphemeralBuilderManager._executors['test']
     EphemeralBuilderManager._etcd_client_klass = self.old_etcd_client_klass

-  @async_test
-  def test_schedule_and_complete(self):
+  @coroutine
+  def _setup_job_for_managers(self):
+    # Test that we are watching the realm location before anything else happens
+    self.etcd_client_mock.watch.assert_any_call(ETCD_REALM_PREFIX, recursive=True, timeout=0)
+
     self.etcd_client_mock.read = Mock(side_effect=KeyError)

-    test_component = BuildComponent(None)
+    test_component = Mock(spec=BuildComponent)
+    test_component.builder_realm = REALM_ID
+    test_component.start_build = Mock(side_effect=self._create_completed_future())
     self.register_component_callback.return_value = test_component

     # Ask for a builder to be scheduled
-    loop = get_event_loop()
-    is_scheduled = yield From(self.manager.schedule(self.mock_job, loop))
+    is_scheduled = yield From(self.manager.schedule(self.mock_job))

     self.assertTrue(is_scheduled)
@@ -114,29 +123,76 @@ class TestEphemeral(unittest.TestCase):
     self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
     self.assertEqual(self.etcd_client_mock.write.call_args_list[1][0][0], self.mock_job_key)

+    # Right now the job is not registered with any managers because etcd has not accepted the job
+    self.assertEqual(self.register_component_callback.call_count, 0)
+
+    realm_created = Mock(spec=etcd.EtcdResult)
+    realm_created.action = EtcdAction.SET
+    realm_created.key = os.path.join(ETCD_REALM_PREFIX, REALM_ID)
+    realm_created.value = json.dumps({
+      'realm': REALM_ID,
+      'token': 'beef',
+      'builder_id': '123',
+      'job_queue_item': self.mock_job.job_item,
+    })
+
+    self.manager._handle_realm_change(realm_created)
+
     self.assertEqual(self.register_component_callback.call_count, 1)

+    raise Return(test_component)
+
+  @async_test
+  def test_schedule_and_complete(self):
+    # Test that a job is properly registered with all of the managers
+    test_component = yield From(self._setup_job_for_managers())
+
+    # Take the job ourselves
+    yield From(self.manager.build_component_ready(test_component))
+
+    self.etcd_client_mock.delete.assert_called_once_with(os.path.join(ETCD_REALM_PREFIX, REALM_ID))
+    self.etcd_client_mock.delete.reset_mock()
+
+    # Finish the job
     yield From(self.manager.job_completed(self.mock_job, BuildJobResult.COMPLETE, test_component))

     self.assertEqual(self.test_executor.stop_builder.call_count, 1)
     self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key)

+  @async_test
+  def test_another_manager_takes_job(self):
+    # Prepare a job to be taken by another manager
+    test_component = yield From(self._setup_job_for_managers())
+
+    realm_deleted = Mock(spec=etcd.EtcdResult)
+    realm_deleted.action = EtcdAction.DELETE
+    realm_deleted.key = os.path.join(ETCD_REALM_PREFIX, REALM_ID)
+
+    realm_deleted._prev_node = Mock(spec=etcd.EtcdResult)
+    realm_deleted._prev_node.value = json.dumps({
+      'realm': REALM_ID,
+      'token': 'beef',
+      'builder_id': '123',
+      'job_queue_item': self.mock_job.job_item,
+    })
+
+    self.manager._handle_realm_change(realm_deleted)
+
+    self.unregister_component_callback.assert_called_once_with(test_component)
+
   @async_test
   def test_expiring_worker(self):
     # Test that we are watching before anything else happens
-    self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True,
-                                                        timeout=0)
+    self.etcd_client_mock.watch.assert_any_call(ETCD_BUILDER_PREFIX, recursive=True, timeout=0)

     # Send a signal to the callback that a worker has expired
     expired_result = Mock(spec=etcd.EtcdResult)
-    expired_result.action = ETCD_EXPIRE_RESULT
+    expired_result.action = EtcdAction.EXPIRE
     expired_result.key = self.mock_job_key
     expired_result._prev_node = Mock(spec=etcd.EtcdResult)
     expired_result._prev_node.value = json.dumps({'builder_id': '1234'})

-    expired_future = Future()
-    expired_future.set_result(expired_result)
-
-    self.manager._handle_key_expiration(expired_future)
+    self.manager._handle_builder_expiration(expired_result)

     yield From(sleep(.01))
@@ -151,10 +207,8 @@ class TestEphemeral(unittest.TestCase):
     set_result = Mock(sepc=etcd.EtcdResult)
     set_result.action = 'set'
     set_result.key = self.mock_job_key

-    set_future = Future()
-    set_future.set_result(set_result)
-    self.manager._handle_key_expiration(set_future)
+    self.manager._handle_builder_expiration(set_result)

     yield From(sleep(.01))
@@ -179,15 +233,3 @@ class TestEphemeral(unittest.TestCase):
     self.job_heartbeat_callback.assert_called_once_with(self.mock_job)
     self.assertEqual(self.etcd_client_mock.write.call_count, 1)
     self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
-
-  @async_test
-  def test_etcd_read_timeout(self):
-    # Send a signal to the callback that a worker key has been changed
-    read_timeout_future = Future()
-    read_timeout_future.set_exception(ReadTimeoutError(None, None, None))
-
-    self.manager._handle_key_expiration(read_timeout_future)
-
-    yield From(sleep(.01))
-
-    self.assertEquals(self.test_executor.stop_builder.call_count, 0)
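The deleted test_etcd_read_timeout appears to have lost its target: ReadTimeoutError handling now lives inside `_watch_etcd`'s private `callback_wrapper`, which re-arms the watch and swallows the timeout, leaving no public handler to feed a failed future.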