From 2d7e84475343192ca2876c39fbd747e376140953 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 16 Dec 2014 13:41:30 -0500
Subject: [PATCH 001/127] First implementation of ephemeral build lifecycle
manager.
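
The manager expects its configuration as the second element of the build
manager config tuple, which builder.py now threads through BuilderServer as
lifecycle_manager_config. A rough sketch of that tuple (key names are taken
from the code in this patch; the values are purely illustrative):

    ('ephemeral', {
        'EXECUTOR': 'ec2',                 # or 'popen' for local development
        'ALLOWED_WORKER_COUNT': 2,         # max jobs tracked in etcd at once
        'MACHINE_SETUP_TIME': 300,         # seconds before a builder is considered expired
        'ETCD_HOST': '127.0.0.1',
        'ETCD_PORT': 2379,
        'EXECUTOR_CONFIG': {
            'EC2_REGION': 'us-east-1',
            'AWS_ACCESS_KEY': '...',
            'AWS_SECRET_KEY': '...',
            'EC2_INSTANCE_TYPE': 'm3.medium',
            'EC2_SECURITY_GROUP_IDS': ['sg-xxxxxxxx'],
            'EC2_KEY_NAME': 'builder-key',
            'COREOS_CHANNEL': 'stable',
            'QUAY_USERNAME': '...',
            'QUAY_PASSWORD': '...',
            'ETCD_DISCOVERY_TOKEN': '...',
        },
    })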
---
buildman/builder.py | 9 +-
buildman/component/buildcomponent.py | 22 +--
buildman/jobutil/buildjob.py | 32 ++---
buildman/manager/basemanager.py | 10 +-
buildman/manager/enterprise.py | 26 ++--
buildman/manager/ephemeral.py | 145 +++++++++++++++++++
buildman/manager/executor.py | 204 +++++++++++++++++++++++++++
buildman/server.py | 21 +--
buildman/templates/cloudconfig.yaml | 38 +++++
requirements-nover.txt | 2 +
10 files changed, 453 insertions(+), 56 deletions(-)
create mode 100644 buildman/manager/ephemeral.py
create mode 100644 buildman/manager/executor.py
create mode 100644 buildman/templates/cloudconfig.yaml
diff --git a/buildman/builder.py b/buildman/builder.py
index 3e14db3eb..df485f142 100644
--- a/buildman/builder.py
+++ b/buildman/builder.py
@@ -6,6 +6,7 @@ import time
from app import app, userfiles as user_files, build_logs, dockerfile_build_queue
from buildman.manager.enterprise import EnterpriseManager
+from buildman.manager.ephemeral import EphemeralBuilderManager
from buildman.server import BuilderServer
from trollius import SSLContext
@@ -13,7 +14,8 @@ from trollius import SSLContext
logger = logging.getLogger(__name__)
BUILD_MANAGERS = {
- 'enterprise': EnterpriseManager
+ 'enterprise': EnterpriseManager,
+ 'ephemeral': EphemeralBuilderManager,
}
EXTERNALLY_MANAGED = 'external'
@@ -39,6 +41,9 @@ def run_build_manager():
if manager_klass is None:
return
+ public_ip = os.environ.get('PUBLIC_IP', '127.0.0.1')
+ logger.debug('Will pass public IP address %s to builders for websocket connection', public_ip)
+
logger.debug('Starting build manager with lifecycle "%s"', build_manager_config[0])
ssl_context = None
if os.environ.get('SSL_CONFIG'):
@@ -48,7 +53,7 @@ def run_build_manager():
os.environ.get('SSL_CONFIG') + '/ssl.key')
server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs,
- user_files, manager_klass)
+ user_files, manager_klass, build_manager_config[1], public_ip)
server.run('0.0.0.0', ssl=ssl_context)
if __name__ == '__main__':
diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py
index d518d3453..05d342628 100644
--- a/buildman/component/buildcomponent.py
+++ b/buildman/component/buildcomponent.py
@@ -39,7 +39,7 @@ class BuildComponent(BaseComponent):
self.builder_realm = realm
self.parent_manager = None
- self.server_hostname = None
+ self.registry_hostname = None
self._component_status = ComponentStatus.JOINING
self._last_heartbeat = None
@@ -68,13 +68,13 @@ class BuildComponent(BaseComponent):
def start_build(self, build_job):
""" Starts a build. """
self._current_job = build_job
- self._build_status = StatusHandler(self.build_logs, build_job.repo_build())
+ self._build_status = StatusHandler(self.build_logs, build_job.repo_build)
self._image_info = {}
self._set_status(ComponentStatus.BUILDING)
# Retrieve the job's buildpack.
- buildpack_url = self.user_files.get_file_url(build_job.repo_build().resource_key,
+ buildpack_url = self.user_files.get_file_url(build_job.repo_build.resource_key,
requires_cors=False)
logger.debug('Retrieving build package: %s', buildpack_url)
@@ -89,7 +89,7 @@ class BuildComponent(BaseComponent):
parsed_dockerfile = None
logger.debug('Parsing dockerfile')
- build_config = build_job.build_config()
+ build_config = build_job.build_config
try:
parsed_dockerfile = buildpack.parse_dockerfile(build_config.get('build_subdir'))
except BuildPackageException as bpe:
@@ -116,7 +116,7 @@ class BuildComponent(BaseComponent):
base_image_information['password'] = build_config['pull_credentials'].get('password', '')
# Retrieve the repository's fully qualified name.
- repo = build_job.repo_build().repository
+ repo = build_job.repo_build.repository
repository_name = repo.namespace_user.username + '/' + repo.name
# Parse the build queue item into build arguments.
@@ -136,9 +136,9 @@ class BuildComponent(BaseComponent):
'build_package': buildpack_url,
'sub_directory': build_config.get('build_subdir', ''),
'repository': repository_name,
- 'registry': self.server_hostname,
- 'pull_token': build_job.repo_build().access_token.code,
- 'push_token': build_job.repo_build().access_token.code,
+ 'registry': self.registry_hostname,
+ 'pull_token': build_job.repo_build.access_token.code,
+ 'push_token': build_job.repo_build.access_token.code,
'tag_names': build_config.get('docker_tags', ['latest']),
'base_image': base_image_information,
'cached_tag': build_job.determine_cached_tag() or ''
@@ -244,7 +244,7 @@ class BuildComponent(BaseComponent):
'internal_error': exception.message if exception else None
})
- build_id = self._current_job.repo_build().uuid
+ build_id = self._current_job.repo_build.uuid
logger.warning('Build %s failed with message: %s', build_id, error_message)
# Mark that the build has finished (in an error state)
@@ -305,6 +305,10 @@ class BuildComponent(BaseComponent):
return True
def _set_status(self, phase):
+ if phase == ComponentStatus.RUNNING:
+ loop = trollius.get_event_loop()
+ self.parent_manager.build_component_ready(self, loop)
+
self._component_status = phase
def _on_heartbeat(self):
diff --git a/buildman/jobutil/buildjob.py b/buildman/jobutil/buildjob.py
index 6ec02a830..e92be23a6 100644
--- a/buildman/jobutil/buildjob.py
+++ b/buildman/jobutil/buildjob.py
@@ -9,50 +9,38 @@ class BuildJobLoadException(Exception):
class BuildJob(object):
""" Represents a single in-progress build job. """
def __init__(self, job_item):
- self._job_item = job_item
+ self.job_item = job_item
try:
- self._job_details = json.loads(job_item.body)
+ self.job_details = json.loads(job_item.body)
except ValueError:
raise BuildJobLoadException(
- 'Could not parse build queue item config with ID %s' % self._job_details['build_uuid']
+ 'Could not parse build queue item config with ID %s' % self.job_details['build_uuid']
)
try:
- self._repo_build = model.get_repository_build(self._job_details['build_uuid'])
+ self.repo_build = model.get_repository_build(self.job_details['build_uuid'])
except model.InvalidRepositoryBuildException:
raise BuildJobLoadException(
- 'Could not load repository build with ID %s' % self._job_details['build_uuid'])
+ 'Could not load repository build with ID %s' % self.job_details['build_uuid'])
try:
- self._build_config = json.loads(self._repo_build.job_config)
+ self.build_config = json.loads(self.repo_build.job_config)
except ValueError:
raise BuildJobLoadException(
- 'Could not parse repository build job config with ID %s' % self._job_details['build_uuid']
+ 'Could not parse repository build job config with ID %s' % self.job_details['build_uuid']
)
def determine_cached_tag(self):
""" Returns the tag to pull to prime the cache or None if none. """
# TODO(jschorr): Change this to use the more complicated caching rules, once we have caching
# be a pull of things besides the constructed tags.
- tags = self._build_config.get('docker_tags', ['latest'])
- existing_tags = model.list_repository_tags(self._repo_build.repository.namespace_user.username,
- self._repo_build.repository.name)
+ tags = self.build_config.get('docker_tags', ['latest'])
+ existing_tags = model.list_repository_tags(self.repo_build.repository.namespace_user.username,
+ self.repo_build.repository.name)
cached_tags = set(tags) & set([tag.name for tag in existing_tags])
if cached_tags:
return list(cached_tags)[0]
return None
-
- def job_item(self):
- """ Returns the job's queue item. """
- return self._job_item
-
- def repo_build(self):
- """ Returns the repository build DB row for the job. """
- return self._repo_build
-
- def build_config(self):
- """ Returns the parsed repository build config for the job. """
- return self._build_config
diff --git a/buildman/manager/basemanager.py b/buildman/manager/basemanager.py
index f66054c45..f71971997 100644
--- a/buildman/manager/basemanager.py
+++ b/buildman/manager/basemanager.py
@@ -1,11 +1,12 @@
class BaseManager(object):
""" Base for all worker managers. """
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
- job_complete_callback):
+ job_complete_callback, public_ip_address):
self.register_component = register_component
self.unregister_component = unregister_component
self.job_heartbeat_callback = job_heartbeat_callback
self.job_complete_callback = job_complete_callback
+ self.public_ip_address = public_ip_address
def job_heartbeat(self, build_job):
""" Method invoked to tell the manager that a job is still running. This method will be called
@@ -31,11 +32,16 @@ class BaseManager(object):
"""
raise NotImplementedError
- def initialize(self):
+ def initialize(self, manager_config):
""" Runs any initialization code for the manager. Called once the server is in a ready state.
"""
raise NotImplementedError
+ def build_component_ready(self, build_component, loop):
+ """ Method invoked whenever a build component announces itself as ready.
+ """
+ raise NotImplementedError
+
def build_component_disposed(self, build_component, timed_out):
""" Method invoked whenever a build component has been disposed. The timed_out boolean indicates
whether the component's heartbeat timed out.
diff --git a/buildman/manager/enterprise.py b/buildman/manager/enterprise.py
index 824e02d53..1eedf2790 100644
--- a/buildman/manager/enterprise.py
+++ b/buildman/manager/enterprise.py
@@ -28,10 +28,12 @@ class DynamicRegistrationComponent(BaseComponent):
class EnterpriseManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
- build_components = []
- shutting_down = False
- def initialize(self):
+ def __init__(self, *args, **kwargs):
+ self.ready_components = set()
+ self.shutting_down = False
+
+ def initialize(self, manager_config):
# Add a component which is used by build workers for dynamic registration. Unlike
# production, build workers in enterprise are long-lived and register dynamically.
self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
@@ -45,21 +47,20 @@ class EnterpriseManager(BaseManager):
""" Adds a new build component for an Enterprise Registry. """
# Generate a new unique realm ID for the build worker.
realm = str(uuid.uuid4())
- component = self.register_component(realm, BuildComponent, token="")
- self.build_components.append(component)
+ self.register_component(realm, BuildComponent, token="")
return realm
def schedule(self, build_job, loop):
""" Schedules a build for an Enterprise Registry. """
- if self.shutting_down:
+ if self.shutting_down or not self.ready_components:
return False
- for component in self.build_components:
- if component.is_ready():
- loop.call_soon(component.start_build, build_job)
- return True
+ component = self.ready_components.pop()
+ loop.call_soon(component.start_build, build_job)
+ return True
- return False
+ def build_component_ready(self, build_component, loop):
+ self.ready_components.add(build_component)
def shutdown(self):
self.shutting_down = True
@@ -68,5 +69,6 @@ class EnterpriseManager(BaseManager):
self.job_complete_callback(build_job, job_status)
def build_component_disposed(self, build_component, timed_out):
- self.build_components.remove(build_component)
+ if build_component in self.ready_components:
+ self.ready_components.remove(build_component)
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
new file mode 100644
index 000000000..68af9de0e
--- /dev/null
+++ b/buildman/manager/ephemeral.py
@@ -0,0 +1,145 @@
+import logging
+import etcd
+import uuid
+
+from datetime import datetime, timedelta
+
+from buildman.manager.basemanager import BaseManager
+from buildman.manager.executor import PopenExecutor, EC2Executor
+from buildman.component.buildcomponent import BuildComponent
+
+
+logger = logging.getLogger(__name__)
+
+
+ETCD_BUILDER_PREFIX = 'building/'
+
+
+def clear_etcd(client):
+ """ Debugging method used to clear out the section of etcd we are using to track jobs in flight.
+ """
+ try:
+ building = client.read(ETCD_BUILDER_PREFIX, recursive=True)
+ for child in building.leaves:
+ if not child.dir:
+ logger.warning('Deleting key: %s', child.key)
+ client.delete(child.key)
+ except KeyError:
+ pass
+
+
+class EphemeralBuilderManager(BaseManager):
+ """ Build manager implementation for the Enterprise Registry. """
+ shutting_down = False
+
+ def __init__(self, *args, **kwargs):
+ self._manager_config = None
+ self._etcd_client = None
+
+ self._component_to_job = {}
+ self._component_to_builder = {}
+
+ self._executors = {
+ 'popen': PopenExecutor,
+ 'ec2': EC2Executor,
+ }
+ self._executor = None
+
+ super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
+
+ def initialize(self, manager_config):
+ logger.debug('Calling initialize')
+ self._manager_config = manager_config
+
+ executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor)
+ self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}),
+ self.public_ip_address)
+
+ etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
+ etcd_port = self._manager_config.get('ETCD_PORT', 2379)
+ logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
+ self._etcd_client = etcd.Client(host=etcd_host, port=etcd_port)
+
+ clear_etcd(self._etcd_client)
+
+ def setup_time(self):
+ setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
+ logger.debug('Returning setup_time: %s', setup_time)
+ return setup_time
+
+ def shutdown(self):
+ logger.debug('Calling shutdown.')
+ raise NotImplementedError
+
+ def schedule(self, build_job, loop):
+ logger.debug('Calling schedule with job: %s', build_job.repo_build.uuid)
+
+ # Check if there are worker slots available by checking the number of jobs in etcd
+ allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 2)
+ try:
+ building = self._etcd_client.read(ETCD_BUILDER_PREFIX, recursive=True)
+ workers_alive = sum(1 for child in building.children if not child.dir)
+ except KeyError:
+ workers_alive = 0
+
+ logger.debug('Total jobs: %s', workers_alive)
+
+ if workers_alive >= allowed_worker_count:
+ logger.info('Too many workers alive, unable to start new worker. %s >= %s', workers_alive,
+ allowed_worker_count)
+ return False
+
+ job_key = self._etcd_job_key(build_job)
+
+ # First try to take a lock for this job, meaning we will be responsible for its lifeline
+ realm = str(uuid.uuid4())
+ token = str(uuid.uuid4())
+ expiration = datetime.utcnow() + timedelta(seconds=self.setup_time())
+
+ payload = {
+ 'expiration': expiration.isoformat(),
+ }
+
+ try:
+ self._etcd_client.write(job_key, payload, prevExist=False)
+ component = self.register_component(realm, BuildComponent, token=token)
+ self._component_to_job[component] = build_job
+ except KeyError:
+ # The job was already taken by someone else, we are probably a retry
+ logger.warning('Job already exists in etcd, did an old worker die?')
+ return False
+
+ builder_id = self._executor.start_builder(realm, token)
+ self._component_to_builder[component] = builder_id
+
+ return True
+
+ def build_component_ready(self, build_component, loop):
+ try:
+ job = self._component_to_job.pop(build_component)
+ logger.debug('Sending build %s to newly ready component on realm %s', job.repo_build.uuid,
+ build_component.builder_realm)
+ loop.call_soon(build_component.start_build, job)
+ except KeyError:
+ logger.warning('Builder is asking for more work, but work already completed')
+
+ def build_component_disposed(self, build_component, timed_out):
+ logger.debug('Calling build_component_disposed.')
+
+ def job_completed(self, build_job, job_status, build_component):
+ logger.debug('Calling job_completed with status: %s', job_status)
+
+ # Kill the ephemeral builder
+ self._executor.stop_builder(self._component_to_builder.pop(build_component))
+
+ # Release the lock in etcd
+ job_key = self._etcd_job_key(build_job)
+ self._etcd_client.delete(job_key)
+
+ self.job_complete_callback(build_job, job_status)
+
+ @staticmethod
+ def _etcd_job_key(build_job):
+ """ Create a key which is used to track a job in etcd.
+ """
+ return '{0}{1}'.format(ETCD_BUILDER_PREFIX, build_job.repo_build.uuid)
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
new file mode 100644
index 000000000..a3cd4981b
--- /dev/null
+++ b/buildman/manager/executor.py
@@ -0,0 +1,204 @@
+import logging
+import os
+import uuid
+import threading
+import boto.ec2
+import requests
+import cachetools
+
+from jinja2 import FileSystemLoader, Environment
+
+
+logger = logging.getLogger(__name__)
+
+
+ONE_HOUR = 60*60
+
+ENV = Environment(loader=FileSystemLoader('buildman/templates'))
+TEMPLATE = ENV.get_template('cloudconfig.yaml')
+
+
+class ExecutorException(Exception):
+ """ Exception raised when there is a problem starting or stopping a builder.
+ """
+ pass
+
+
+class BuilderExecutor(object):
+ def __init__(self, executor_config, manager_public_ip):
+ self.executor_config = executor_config
+ self.manager_public_ip = manager_public_ip
+
+ """ Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
+ starting and stopping builders.
+ """
+ def start_builder(self, realm, token):
+ """ Create a builder with the specified config. Returns a unique id which can be used to manage
+ the builder.
+ """
+ raise NotImplementedError
+
+ def stop_builder(self, builder_id):
+ """ Stop a builder which is currently running.
+ """
+ raise NotImplementedError
+
+ def get_manager_websocket_url(self):
+ return 'ws://{0}:'
+
+ def generate_cloud_config(self, realm, token, coreos_channel, manager_ip,
+ quay_username=None, quay_password=None, etcd_token=None):
+ if quay_username is None:
+ quay_username = self.executor_config['QUAY_USERNAME']
+
+ if quay_password is None:
+ quay_password = self.executor_config['QUAY_PASSWORD']
+
+ if etcd_token is None:
+ etcd_token = self.executor_config['ETCD_DISCOVERY_TOKEN']
+
+ return TEMPLATE.render(
+ realm=realm,
+ token=token,
+ quay_username=quay_username,
+ quay_password=quay_password,
+ etcd_token=etcd_token,
+ manager_ip=manager_ip,
+ coreos_channel=coreos_channel,
+ )
+
+
+class EC2Executor(BuilderExecutor):
+ """ Implementation of BuilderExecutor which uses libcloud to start machines on a variety of cloud
+ providers.
+ """
+ COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt'
+
+ def _get_conn(self):
+ """ Creates an ec2 connection which can be used to manage instances.
+ """
+ return boto.ec2.connect_to_region(
+ self.executor_config['EC2_REGION'],
+ aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'],
+ aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'],
+ )
+
+ @classmethod
+ @cachetools.ttl_cache(ttl=ONE_HOUR)
+ def _get_coreos_ami(cls, ec2_region, coreos_channel):
+ """ Retrieve the CoreOS AMI id from the canonical listing.
+ """
+ stack_list_string = requests.get(EC2Executor.COREOS_STACK_URL % coreos_channel).text
+ stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
+ return stack_amis[ec2_region]
+
+ def start_builder(self, realm, token):
+ region = self.executor_config['EC2_REGION']
+ channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
+ coreos_ami = self._get_coreos_ami(region, channel)
+ user_data = self.generate_cloud_config(realm, token, channel, self.manager_public_ip)
+
+ logger.debug('Generated cloud config: %s', user_data)
+
+ ec2_conn = self._get_conn()
+ # class FakeReservation(object):
+ # def __init__(self):
+ # self.instances = None
+ # reservation = FakeReservation()
+ reservation = ec2_conn.run_instances(
+ coreos_ami,
+ instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
+ security_groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
+ key_name=self.executor_config.get('EC2_KEY_NAME', None),
+ user_data=user_data,
+ )
+
+ if not reservation.instances:
+ raise ExecutorException('Unable to spawn builder instance.')
+ elif len(reservation.instances) != 1:
+ raise ExecutorException('EC2 started wrong number of instances!')
+
+ return reservation.instances[0]
+
+ def stop_builder(self, builder_id):
+ ec2_conn = self._get_conn()
+ stopped_instances = ec2_conn.stop_instances([builder_id], force=True)
+ if builder_id not in stopped_instances:
+ raise ExecutorException('Unable to stop instance: %s' % builder_id)
+
+class PopenExecutor(BuilderExecutor):
+ """ Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
+ """
+ def __init__(self, executor_config, manager_public_ip):
+ self._jobs = {}
+
+ super(PopenExecutor, self).__init__(executor_config, manager_public_ip)
+
+ """ Executor which uses Popen to fork a quay-builder process.
+ """
+ def start_builder(self, realm, token):
+ # Now start a machine for this job, adding the machine id to the etcd information
+ logger.debug('Forking process for build')
+ import subprocess
+ builder_env = {
+ 'TOKEN': token,
+ 'REALM': realm,
+ 'ENDPOINT': 'ws://localhost:8787',
+ 'DOCKER_TLS_VERIFY': os.environ.get('DOCKER_TLS_VERIFY', ''),
+ 'DOCKER_CERT_PATH': os.environ.get('DOCKER_CERT_PATH', ''),
+ 'DOCKER_HOST': os.environ.get('DOCKER_HOST', ''),
+ }
+
+ logpipe = LogPipe(logging.INFO)
+ spawned = subprocess.Popen('/Users/jake/bin/quay-builder', stdout=logpipe, stderr=logpipe,
+ env=builder_env)
+
+ builder_id = str(uuid.uuid4())
+ self._jobs[builder_id] = (spawned, logpipe)
+ logger.debug('Builder spawned with id: %s', builder_id)
+ return builder_id
+
+
+ def stop_builder(self, builder_id):
+ if builder_id not in self._jobs:
+ raise ExecutorException('Builder id not being tracked by executor.')
+
+ logger.debug('Killing builder with id: %s', builder_id)
+ spawned, logpipe = self._jobs[builder_id]
+
+ if spawned.poll() is None:
+ spawned.kill()
+ logpipe.close()
+
+
+class LogPipe(threading.Thread):
+ """ Adapted from http://codereview.stackexchange.com/a/17959
+ """
+ def __init__(self, level):
+ """Setup the object with a logger and a loglevel
+ and start the thread
+ """
+ threading.Thread.__init__(self)
+ self.daemon = False
+ self.level = level
+ self.fd_read, self.fd_write = os.pipe()
+ self.pipe_reader = os.fdopen(self.fd_read)
+ self.start()
+
+ def fileno(self):
+ """Return the write file descriptor of the pipe
+ """
+ return self.fd_write
+
+ def run(self):
+ """Run the thread, logging everything.
+ """
+ for line in iter(self.pipe_reader.readline, ''):
+ logging.log(self.level, line.strip('\n'))
+
+ self.pipe_reader.close()
+
+ def close(self):
+ """Close the write end of the pipe.
+ """
+ os.close(self.fd_write)
diff --git a/buildman/server.py b/buildman/server.py
index 3863406f2..6f57b6627 100644
--- a/buildman/server.py
+++ b/buildman/server.py
@@ -34,14 +34,15 @@ class BuilderServer(object):
""" Server which handles both HTTP and WAMP requests, managing the full state of the build
controller.
"""
- def __init__(self, server_hostname, queue, build_logs, user_files, lifecycle_manager_klass):
+ def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass,
+ lifecycle_manager_config, manager_public_ip):
self._loop = None
self._current_status = 'starting'
self._current_components = []
self._job_count = 0
self._session_factory = RouterSessionFactory(RouterFactory())
- self._server_hostname = server_hostname
+ self._registry_hostname = registry_hostname
self._queue = queue
self._build_logs = build_logs
self._user_files = user_files
@@ -49,8 +50,10 @@ class BuilderServer(object):
self._register_component,
self._unregister_component,
self._job_heartbeat,
- self._job_complete
+ self._job_complete,
+ manager_public_ip,
)
+ self._lifecycle_manager_config = lifecycle_manager_config
self._shutdown_event = Event()
self._current_status = 'running'
@@ -69,7 +72,7 @@ class BuilderServer(object):
def run(self, host, ssl=None):
logger.debug('Initializing the lifecycle manager')
- self._lifecycle_manager.initialize()
+ self._lifecycle_manager.initialize(self._lifecycle_manager_config)
logger.debug('Initializing all members of the event loop')
loop = trollius.get_event_loop()
@@ -102,7 +105,7 @@ class BuilderServer(object):
component.parent_manager = self._lifecycle_manager
component.build_logs = self._build_logs
component.user_files = self._user_files
- component.server_hostname = self._server_hostname
+ component.registry_hostname = self._registry_hostname
self._current_components.append(component)
self._session_factory.add(component)
@@ -116,16 +119,16 @@ class BuilderServer(object):
self._session_factory.remove(component)
def _job_heartbeat(self, build_job):
- WorkQueue.extend_processing(build_job.job_item(), seconds_from_now=JOB_TIMEOUT_SECONDS,
+ WorkQueue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS,
retry_count=1, minimum_extension=MINIMUM_JOB_EXTENSION)
def _job_complete(self, build_job, job_status):
if job_status == BuildJobResult.INCOMPLETE:
- self._queue.incomplete(build_job.job_item(), restore_retry=True, retry_after=30)
+ self._queue.incomplete(build_job.job_item, restore_retry=True, retry_after=30)
elif job_status == BuildJobResult.ERROR:
- self._queue.incomplete(build_job.job_item(), restore_retry=False)
+ self._queue.incomplete(build_job.job_item, restore_retry=False)
else:
- self._queue.complete(build_job.job_item())
+ self._queue.complete(build_job.job_item)
self._job_count = self._job_count - 1
diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml
new file mode 100644
index 000000000..ca9c6c16a
--- /dev/null
+++ b/buildman/templates/cloudconfig.yaml
@@ -0,0 +1,38 @@
+#cloud-config
+
+write_files:
+- path: /root/overrides.list
+ permission: '0644'
+ content: |
+ REALM={{ realm }}
+ TOKEN={{ token }}
+ ENDPOINT=wss://buildman.quay.io:8787
+
+coreos:
+ update:
+ reboot-strategy: off
+ group: {{ coreos_channel }}
+
+ etcd:
+ discovery: https://discovery.etcd.io/{{ etcd_token }}
+ # multi-region and multi-cloud deployments need to use $public_ipv4
+ addr: $private_ipv4:4001
+ peer-addr: $private_ipv4:7001
+
+ units:
+ - name: quay-builder.service
+ command: start
+ content: |
+ [Unit]
+ Description=Quay builder container
+ Author=Jake Moshenko
+ After=docker.service
+
+ [Service]
+ Restart=always
+ TimeoutStartSec=600
+ TimeoutStopSec=2000
+ ExecStartPre=/usr/bin/sudo /bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0"
+ ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io
+ ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest
+ ExecStop=/usr/bin/docker stop quay-builder
diff --git a/requirements-nover.txt b/requirements-nover.txt
index c1bf6c19f..51cd42e3c 100644
--- a/requirements-nover.txt
+++ b/requirements-nover.txt
@@ -41,3 +41,5 @@ git+https://github.com/DevTable/aniso8601-fake.git
git+https://github.com/DevTable/anunidecode.git
git+https://github.com/DevTable/avatar-generator.git
gipc
+python-etcd
+cachetools
From 1d68594dc220d5cc16dda89aa5cad62d969210e6 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 16 Dec 2014 15:10:50 -0500
Subject: [PATCH 002/127] Extract instance ids from the instance objects
returned by boto.
---
buildman/manager/executor.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index a3cd4981b..e82ecf672 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -118,12 +118,12 @@ class EC2Executor(BuilderExecutor):
elif len(reservation.instances) != 1:
raise ExecutorException('EC2 started wrong number of instances!')
- return reservation.instances[0]
+ return reservation.instances[0].id
def stop_builder(self, builder_id):
ec2_conn = self._get_conn()
- stopped_instances = ec2_conn.stop_instances([builder_id], force=True)
- if builder_id not in stopped_instances:
+ stopped_instance_ids = [si.id for si in ec2_conn.stop_instances([builder_id], force=True)]
+ if builder_id not in stopped_instance_ids:
raise ExecutorException('Unable to stop instance: %s' % builder_id)
class PopenExecutor(BuilderExecutor):
From a280bbcb6db471e3275e4d3937404da6f09479b5 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 16 Dec 2014 15:17:39 -0500
Subject: [PATCH 003/127] Add tag metadata to the instances.
---
buildman/manager/executor.py | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index e82ecf672..b35a90c97 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -118,7 +118,13 @@ class EC2Executor(BuilderExecutor):
elif len(reservation.instances) != 1:
raise ExecutorException('EC2 started wrong number of instances!')
- return reservation.instances[0].id
+ launched = reservation.instances[0]
+ launched.add_tags({
+ 'Name': 'Quay Ephemeral Builder',
+ 'Realm': realm,
+ 'Token': token,
+ })
+ return launched.id
def stop_builder(self, builder_id):
ec2_conn = self._get_conn()
From 12ee8e0fc02a7abe1c0cf457698fa618a7a26a76 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Mon, 22 Dec 2014 12:14:16 -0500
Subject: [PATCH 004/127] Switch a few of the buildman methods to coroutines in
order to support network calls from within those methods. Add a test for the
ephemeral build manager.
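
The AsyncWrapper added here shells synchronous client calls out to an
executor so they can be awaited from trollius coroutines. A minimal usage
sketch, not part of the patch itself (names as introduced below):

    import etcd
    from trollius import coroutine, From, Return, get_event_loop

    from buildman.asyncutil import AsyncWrapper

    client = AsyncWrapper(etcd.Client(host='127.0.0.1', port=2379))

    @coroutine
    def count_running_builders():
        # read() is synchronous on etcd.Client; the wrapper runs it in a thread
        # pool and hands back a future we can yield on.
        building = yield From(client.read('building/', recursive=True))
        raise Return(sum(1 for child in building.children if not child.dir))

    # e.g.: get_event_loop().run_until_complete(count_running_builders())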
---
buildman/asyncutil.py | 27 +++++++
buildman/component/buildcomponent.py | 22 +++---
buildman/manager/basemanager.py | 8 +-
buildman/manager/enterprise.py | 8 +-
buildman/manager/ephemeral.py | 64 +++++++++++-----
buildman/manager/executor.py | 37 +++++----
buildman/server.py | 3 +-
buildman/templates/cloudconfig.yaml | 2 +-
endpoints/api/build.py | 4 +-
requirements-nover.txt | 1 +
test/test_buildman.py | 109 +++++++++++++++++++++++++++
11 files changed, 233 insertions(+), 52 deletions(-)
create mode 100644 buildman/asyncutil.py
create mode 100644 test/test_buildman.py
diff --git a/buildman/asyncutil.py b/buildman/asyncutil.py
new file mode 100644
index 000000000..4f2d4e1a9
--- /dev/null
+++ b/buildman/asyncutil.py
@@ -0,0 +1,27 @@
+from functools import partial, wraps
+from trollius import get_event_loop
+
+
+class AsyncWrapper(object):
+ """ Wrapper class which will transform a syncronous library to one that can be used with
+ trollius coroutines.
+ """
+ def __init__(self, delegate, loop=None, executor=None):
+ self._loop = loop if loop is not None else get_event_loop()
+ self._delegate = delegate
+ self._executor = executor
+
+ def __getattr__(self, attrib):
+ delegate_attr = getattr(self._delegate, attrib)
+
+ if not callable(delegate_attr):
+ return delegate_attr
+
+ def wrapper(*args, **kwargs):
+ """ Wraps the delegate_attr with primitives that will transform sync calls to ones shelled
+ out to a thread pool.
+ """
+ callable_delegate_attr = partial(delegate_attr, *args, **kwargs)
+ return self._loop.run_in_executor(self._executor, callable_delegate_attr)
+
+ return wrapper
diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py
index 05d342628..53b04bf87 100644
--- a/buildman/component/buildcomponent.py
+++ b/buildman/component/buildcomponent.py
@@ -6,7 +6,6 @@ import trollius
import re
from autobahn.wamp.exception import ApplicationError
-from trollius.coroutines import From
from buildman.server import BuildJobResult
from buildman.component.basecomponent import BaseComponent
@@ -54,10 +53,10 @@ class BuildComponent(BaseComponent):
def onJoin(self, details):
logger.debug('Registering methods and listeners for component %s', self.builder_realm)
- yield From(self.register(self._on_ready, u'io.quay.buildworker.ready'))
- yield From(self.register(self._ping, u'io.quay.buildworker.ping'))
- yield From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat'))
- yield From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage'))
+ yield trollius.From(self.register(self._on_ready, u'io.quay.buildworker.ready'))
+ yield trollius.From(self.register(self._ping, u'io.quay.buildworker.ping'))
+ yield trollius.From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat'))
+ yield trollius.From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage'))
self._set_status(ComponentStatus.WAITING)
@@ -270,9 +269,10 @@ class BuildComponent(BaseComponent):
else:
self._build_finished(BuildJobResult.ERROR)
+ @trollius.coroutine
def _build_finished(self, job_status):
""" Alerts the parent that a build has completed and sets the status back to running. """
- self.parent_manager.job_completed(self._current_job, job_status, self)
+ yield trollius.From(self.parent_manager.job_completed(self._current_job, job_status, self))
self._current_job = None
# Set the component back to a running state.
@@ -313,7 +313,7 @@ class BuildComponent(BaseComponent):
def _on_heartbeat(self):
""" Updates the last known heartbeat. """
- self._last_heartbeat = datetime.datetime.now()
+ self._last_heartbeat = datetime.datetime.utcnow()
@trollius.coroutine
def _heartbeat(self):
@@ -321,7 +321,7 @@ class BuildComponent(BaseComponent):
and updating the heartbeat in the build status dictionary (if applicable). This allows
the build system to catch crashes from either end.
"""
- yield From(trollius.sleep(INITIAL_TIMEOUT))
+ yield trollius.From(trollius.sleep(INITIAL_TIMEOUT))
while True:
# If the component is no longer running or actively building, nothing more to do.
@@ -335,7 +335,6 @@ class BuildComponent(BaseComponent):
with build_status as status_dict:
status_dict['heartbeat'] = int(time.time())
-
# Mark the build item.
current_job = self._current_job
if current_job is not None:
@@ -343,11 +342,12 @@ class BuildComponent(BaseComponent):
# Check the heartbeat from the worker.
logger.debug('Checking heartbeat on realm %s', self.builder_realm)
- if self._last_heartbeat and self._last_heartbeat < datetime.datetime.now() - HEARTBEAT_DELTA:
+ if (self._last_heartbeat and
+ self._last_heartbeat < datetime.datetime.utcnow() - HEARTBEAT_DELTA):
self._timeout()
return
- yield From(trollius.sleep(HEARTBEAT_TIMEOUT))
+ yield trollius.From(trollius.sleep(HEARTBEAT_TIMEOUT))
def _timeout(self):
self._set_status(ComponentStatus.TIMED_OUT)
diff --git a/buildman/manager/basemanager.py b/buildman/manager/basemanager.py
index f71971997..fc9fd70cf 100644
--- a/buildman/manager/basemanager.py
+++ b/buildman/manager/basemanager.py
@@ -1,3 +1,5 @@
+from trollius import coroutine
+
class BaseManager(object):
""" Base for all worker managers. """
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
@@ -26,6 +28,7 @@ class BaseManager(object):
"""
raise NotImplementedError
+ @coroutine
def schedule(self, build_job, loop):
""" Schedules a queue item to be built. Returns True if the item was properly scheduled
and False if all workers are busy.
@@ -48,8 +51,11 @@ class BaseManager(object):
"""
raise NotImplementedError
+ @coroutine
def job_completed(self, build_job, job_status, build_component):
""" Method invoked once a job_item has completed, in some manner. The job_status will be
- one of: incomplete, error, complete. If incomplete, the job should be requeued.
+ one of: incomplete, error, complete. Implementations of this method should call
+ self.job_complete_callback with a status of Incomplete if they wish for the job to be
+ automatically requeued.
"""
raise NotImplementedError
diff --git a/buildman/manager/enterprise.py b/buildman/manager/enterprise.py
index 1eedf2790..516464ff3 100644
--- a/buildman/manager/enterprise.py
+++ b/buildman/manager/enterprise.py
@@ -5,7 +5,7 @@ from buildman.component.basecomponent import BaseComponent
from buildman.component.buildcomponent import BuildComponent
from buildman.manager.basemanager import BaseManager
-from trollius.coroutines import From
+from trollius.coroutines import From, Return, coroutine
REGISTRATION_REALM = 'registration'
logger = logging.getLogger(__name__)
@@ -50,14 +50,15 @@ class EnterpriseManager(BaseManager):
self.register_component(realm, BuildComponent, token="")
return realm
+ @coroutine
def schedule(self, build_job, loop):
""" Schedules a build for an Enterprise Registry. """
if self.shutting_down or not self.ready_components:
- return False
+ raise Return(False)
component = self.ready_components.pop()
loop.call_soon(component.start_build, build_job)
- return True
+ raise Return(True)
def build_component_ready(self, build_component, loop):
self.ready_components.add(build_component)
@@ -65,6 +66,7 @@ class EnterpriseManager(BaseManager):
def shutdown(self):
self.shutting_down = True
+ @coroutine
def job_completed(self, build_job, job_status, build_component):
self.job_complete_callback(build_job, job_status)
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index 68af9de0e..ed2da908e 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -1,12 +1,15 @@
import logging
import etcd
import uuid
+import calendar
from datetime import datetime, timedelta
+from trollius import From, coroutine, Return
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor
from buildman.component.buildcomponent import BuildComponent
+from buildman.asyncutil import AsyncWrapper
logger = logging.getLogger(__name__)
@@ -32,6 +35,13 @@ class EphemeralBuilderManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
shutting_down = False
+ _executors = {
+ 'popen': PopenExecutor,
+ 'ec2': EC2Executor,
+ }
+
+ _etcd_client_klass = etcd.Client
+
def __init__(self, *args, **kwargs):
self._manager_config = None
self._etcd_client = None
@@ -39,10 +49,6 @@ class EphemeralBuilderManager(BaseManager):
self._component_to_job = {}
self._component_to_builder = {}
- self._executors = {
- 'popen': PopenExecutor,
- 'ec2': EC2Executor,
- }
self._executor = None
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
@@ -58,9 +64,8 @@ class EphemeralBuilderManager(BaseManager):
etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
etcd_port = self._manager_config.get('ETCD_PORT', 2379)
logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
- self._etcd_client = etcd.Client(host=etcd_host, port=etcd_port)
- clear_etcd(self._etcd_client)
+ self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port))
def setup_time(self):
setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
@@ -71,13 +76,14 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Calling shutdown.')
raise NotImplementedError
+ @coroutine
def schedule(self, build_job, loop):
- logger.debug('Calling schedule with job: %s', build_job.repo_build.uuid)
+ logger.debug('Calling schedule with job: %s', build_job.job_details['build_uuid'])
# Check if there are worker slots available by checking the number of jobs in etcd
- allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 2)
+ allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
try:
- building = self._etcd_client.read(ETCD_BUILDER_PREFIX, recursive=True)
+ building = yield From(self._etcd_client.read(ETCD_BUILDER_PREFIX, recursive=True))
workers_alive = sum(1 for child in building.children if not child.dir)
except KeyError:
workers_alive = 0
@@ -87,7 +93,7 @@ class EphemeralBuilderManager(BaseManager):
if workers_alive >= allowed_worker_count:
logger.info('Too many workers alive, unable to start new worker. %s >= %s', workers_alive,
allowed_worker_count)
- return False
+ raise Return(False)
job_key = self._etcd_job_key(build_job)
@@ -97,28 +103,33 @@ class EphemeralBuilderManager(BaseManager):
expiration = datetime.utcnow() + timedelta(seconds=self.setup_time())
payload = {
- 'expiration': expiration.isoformat(),
+ 'expiration': calendar.timegm(expiration.timetuple()),
}
try:
- self._etcd_client.write(job_key, payload, prevExist=False)
+ yield From(self._etcd_client.write(job_key, payload, prevExist=False))
component = self.register_component(realm, BuildComponent, token=token)
self._component_to_job[component] = build_job
except KeyError:
# The job was already taken by someone else, we are probably a retry
- logger.warning('Job already exists in etcd, did an old worker die?')
- return False
+ logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?')
+ raise Return(False)
- builder_id = self._executor.start_builder(realm, token)
+ logger.debug('Starting builder with executor: %s', self._executor)
+ builder_id = yield From(self._executor.start_builder(realm, token))
self._component_to_builder[component] = builder_id
- return True
+ # Store the builder in etcd associated with the job id
+ payload['builder_id'] = builder_id
+ yield From(self._etcd_client.write(job_key, payload, prevExist=True))
+
+ raise Return(True)
def build_component_ready(self, build_component, loop):
try:
job = self._component_to_job.pop(build_component)
- logger.debug('Sending build %s to newly ready component on realm %s', job.repo_build.uuid,
- build_component.builder_realm)
+ logger.debug('Sending build %s to newly ready component on realm %s',
+ job.job_details['build_uuid'], build_component.builder_realm)
loop.call_soon(build_component.start_build, job)
except KeyError:
logger.warning('Builder is asking for more work, but work already completed')
@@ -126,6 +137,7 @@ class EphemeralBuilderManager(BaseManager):
def build_component_disposed(self, build_component, timed_out):
logger.debug('Calling build_component_disposed.')
+ @coroutine
def job_completed(self, build_job, job_status, build_component):
logger.debug('Calling job_completed with status: %s', job_status)
@@ -134,12 +146,24 @@ class EphemeralBuilderManager(BaseManager):
# Release the lock in etcd
job_key = self._etcd_job_key(build_job)
- self._etcd_client.delete(job_key)
+ yield From(self._etcd_client.delete(job_key))
self.job_complete_callback(build_job, job_status)
+ @coroutine
+ def _clean_up_old_builder(self, job_key, job_payload):
+ """ Terminate an old builders once the expiration date has passed.
+ """
+ logger.debug('Cleaning up the old builder for job: %s', job_key)
+ if 'builder_id' in job_payload:
+ logger.info('Terminating expired build node.')
+ yield From(self._executor.stop_builder(job_payload['builder_id']))
+
+ yield From(self._etcd_client.delete(job_key))
+
+
@staticmethod
def _etcd_job_key(build_job):
""" Create a key which is used to track a job in etcd.
"""
- return '{0}{1}'.format(ETCD_BUILDER_PREFIX, build_job.repo_build.uuid)
+ return '{0}{1}'.format(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid'])
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index b35a90c97..82b98ef5c 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -7,6 +7,10 @@ import requests
import cachetools
from jinja2 import FileSystemLoader, Environment
+from trollius import coroutine, From, Return, get_event_loop
+from functools import partial
+
+from buildman.asyncutil import AsyncWrapper
logger = logging.getLogger(__name__)
@@ -32,12 +36,14 @@ class BuilderExecutor(object):
""" Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
starting and stopping builders.
"""
+ @coroutine
def start_builder(self, realm, token):
""" Create a builder with the specified config. Returns a unique id which can be used to manage
the builder.
"""
raise NotImplementedError
+ @coroutine
def stop_builder(self, builder_id):
""" Stop a builder which is currently running.
"""
@@ -74,14 +80,18 @@ class EC2Executor(BuilderExecutor):
"""
COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt'
+ def __init__(self, *args, **kwargs):
+ self._loop = get_event_loop()
+ super(EC2Executor, self).__init__(*args, **kwargs)
+
def _get_conn(self):
""" Creates an ec2 connection which can be used to manage instances.
"""
- return boto.ec2.connect_to_region(
+ return AsyncWrapper(boto.ec2.connect_to_region(
self.executor_config['EC2_REGION'],
aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'],
aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'],
- )
+ ))
@classmethod
@cachetools.ttl_cache(ttl=ONE_HOUR)
@@ -92,25 +102,24 @@ class EC2Executor(BuilderExecutor):
stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
return stack_amis[ec2_region]
+ @coroutine
def start_builder(self, realm, token):
region = self.executor_config['EC2_REGION']
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
- coreos_ami = self._get_coreos_ami(region, channel)
+ get_ami_callable = partial(self._get_coreos_ami, region, channel)
+ coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
user_data = self.generate_cloud_config(realm, token, channel, self.manager_public_ip)
logger.debug('Generated cloud config: %s', user_data)
ec2_conn = self._get_conn()
- # class FakeReservation(object):
- # def __init__(self):
- # self.instances = None
- # reservation = FakeReservation()
- reservation = ec2_conn.run_instances(
+ reservation = yield ec2_conn.run_instances(
coreos_ami,
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
security_groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
key_name=self.executor_config.get('EC2_KEY_NAME', None),
user_data=user_data,
+ instance_initiated_shutdown_behavior='terminate',
)
if not reservation.instances:
@@ -124,12 +133,13 @@ class EC2Executor(BuilderExecutor):
'Realm': realm,
'Token': token,
})
- return launched.id
+ raise Return(launched.id)
+ @coroutine
def stop_builder(self, builder_id):
ec2_conn = self._get_conn()
- stopped_instance_ids = [si.id for si in ec2_conn.stop_instances([builder_id], force=True)]
- if builder_id not in stopped_instance_ids:
+ stopped_instances = yield ec2_conn.stop_instances([builder_id], force=True)
+ if builder_id not in [si.id for si in stopped_instances]:
raise ExecutorException('Unable to stop instance: %s' % builder_id)
class PopenExecutor(BuilderExecutor):
@@ -142,6 +152,7 @@ class PopenExecutor(BuilderExecutor):
""" Executor which uses Popen to fork a quay-builder process.
"""
+ @coroutine
def start_builder(self, realm, token):
# Now start a machine for this job, adding the machine id to the etcd information
logger.debug('Forking process for build')
@@ -162,9 +173,9 @@ class PopenExecutor(BuilderExecutor):
builder_id = str(uuid.uuid4())
self._jobs[builder_id] = (spawned, logpipe)
logger.debug('Builder spawned with id: %s', builder_id)
- return builder_id
-
+ raise Return(builder_id)
+ @coroutine
def stop_builder(self, builder_id):
if builder_id not in self._jobs:
raise ExecutorException('Builder id not being tracked by executor.')
diff --git a/buildman/server.py b/buildman/server.py
index 6f57b6627..66f0010b6 100644
--- a/buildman/server.py
+++ b/buildman/server.py
@@ -154,7 +154,8 @@ class BuilderServer(object):
self._queue.incomplete(job_item, restore_retry=False)
logger.debug('Build job found. Checking for an available worker.')
- if self._lifecycle_manager.schedule(build_job, self._loop):
+ scheduled = yield From(self._lifecycle_manager.schedule(build_job, self._loop))
+ if scheduled:
self._job_count = self._job_count + 1
logger.debug('Build job scheduled. Running: %s', self._job_count)
else:
diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml
index ca9c6c16a..e75ce5626 100644
--- a/buildman/templates/cloudconfig.yaml
+++ b/buildman/templates/cloudconfig.yaml
@@ -29,10 +29,10 @@ coreos:
After=docker.service
[Service]
- Restart=always
TimeoutStartSec=600
TimeoutStopSec=2000
ExecStartPre=/usr/bin/sudo /bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0"
ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io
ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest
ExecStop=/usr/bin/docker stop quay-builder
+ ExecStopPost=/usr/bin/sudo /bin/sh -xc "/bin/sleep 600; /sbin/shutdown -h now"
diff --git a/endpoints/api/build.py b/endpoints/api/build.py
index e7fdf2f11..506c250da 100644
--- a/endpoints/api/build.py
+++ b/endpoints/api/build.py
@@ -72,8 +72,8 @@ def build_status_view(build_obj, can_write=False):
# minutes. If not, then the build timed out.
if phase != database.BUILD_PHASE.COMPLETE and phase != database.BUILD_PHASE.ERROR:
if status is not None and 'heartbeat' in status and status['heartbeat']:
- heartbeat = datetime.datetime.fromtimestamp(status['heartbeat'])
- if datetime.datetime.now() - heartbeat > datetime.timedelta(minutes=1):
+ heartbeat = datetime.datetime.utcfromtimestamp(status['heartbeat'])
+ if datetime.datetime.utcnow() - heartbeat > datetime.timedelta(minutes=1):
phase = database.BUILD_PHASE.INTERNAL_ERROR
logger.debug('Can write: %s job_config: %s', can_write, build_obj.job_config)
diff --git a/requirements-nover.txt b/requirements-nover.txt
index 51cd42e3c..2993895d7 100644
--- a/requirements-nover.txt
+++ b/requirements-nover.txt
@@ -43,3 +43,4 @@ git+https://github.com/DevTable/avatar-generator.git
gipc
python-etcd
cachetools
+mock
diff --git a/test/test_buildman.py b/test/test_buildman.py
new file mode 100644
index 000000000..0886b671a
--- /dev/null
+++ b/test/test_buildman.py
@@ -0,0 +1,109 @@
+import unittest
+import etcd
+
+from trollius import coroutine, get_event_loop, From, Future
+from mock import Mock
+from functools import partial
+
+from buildman.manager.executor import BuilderExecutor
+from buildman.manager.ephemeral import EphemeralBuilderManager, ETCD_BUILDER_PREFIX
+from buildman.server import BuildJobResult
+from buildman.component.buildcomponent import BuildComponent
+
+
+BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead'
+
+
+import logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+def async_test(f):
+ def wrapper(*args, **kwargs):
+ coro = coroutine(f)
+ future = coro(*args, **kwargs)
+ loop = get_event_loop()
+ loop.run_until_complete(future)
+ return wrapper
+
+class TestEphemeral(unittest.TestCase):
+ def __init__(self, *args, **kwargs):
+ self.etcd_client_mock = None
+ self.test_executor = None
+ super(TestEphemeral, self).__init__(*args, **kwargs)
+
+ def _create_mock_etcd_client(self, *args, **kwargs):
+ self.etcd_client_mock = Mock(spec=etcd.Client, name='etcd.Client')
+ return self.etcd_client_mock
+
+ def _create_mock_executor(self, *args, **kwargs):
+ def create_completed_future(result=None):
+ def inner(*args, **kwargs):
+ new_future = Future()
+ new_future.set_result(result)
+ return new_future
+ return inner
+
+ self.test_executor = Mock(spec=BuilderExecutor)
+ self.test_executor.start_builder = Mock(side_effect=create_completed_future('123'))
+ self.test_executor.stop_builder = Mock(side_effect=create_completed_future())
+ return self.test_executor
+
+ def _create_build_job(self):
+ mock_job = Mock()
+ mock_job.job_details = {
+ 'build_uuid': BUILD_UUID,
+ }
+ return mock_job
+
+ def setUp(self):
+ EphemeralBuilderManager._executors['test'] = self._create_mock_executor
+
+ self.old_etcd_client_klass = EphemeralBuilderManager._etcd_client_klass
+ EphemeralBuilderManager._etcd_client_klass = self._create_mock_etcd_client
+
+ self.register_component_callback = Mock()
+ self.uniregister_component_callback = Mock()
+ self.job_heartbeat_callback = Mock()
+ self.job_complete_callback = Mock()
+
+ self.manager = EphemeralBuilderManager(
+ self.register_component_callback,
+ self.uniregister_component_callback,
+ self.job_heartbeat_callback,
+ self.job_complete_callback,
+ '127.0.0.1'
+ )
+
+ self.manager.initialize({'EXECUTOR': 'test'})
+
+ def tearDown(self):
+ del EphemeralBuilderManager._executors['test']
+ EphemeralBuilderManager._etcd_client_klass = self.old_etcd_client_klass
+
+ @async_test
+ def test_schedule_and_complete(self):
+ mock_job = self._create_build_job()
+
+ self.etcd_client_mock.read = Mock(side_effect=KeyError)
+ test_component = BuildComponent(None)
+ self.register_component_callback.return_value = test_component
+
+ # Ask for a builder to be scheduled
+ loop = get_event_loop()
+ is_scheduled = yield From(self.manager.schedule(mock_job, loop))
+
+ self.assertTrue(is_scheduled)
+
+ job_key = ETCD_BUILDER_PREFIX + mock_job.job_details['build_uuid']
+ self.etcd_client_mock.read.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True)
+ self.assertEqual(len(self.test_executor.start_builder.call_args_list), 1)
+ self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], job_key)
+ self.assertEqual(self.etcd_client_mock.write.call_args_list[1][0][0], job_key)
+
+ self.assertEqual(len(self.register_component_callback.call_args_list), 1)
+
+ yield From(self.manager.job_completed(mock_job, BuildJobResult.COMPLETE, test_component))
+
+ self.assertEqual(len(self.test_executor.stop_builder.call_args_list), 1)
+ self.etcd_client_mock.delete.assert_called_once_with(job_key)
From 2b6c2a2a50daeccb975e243faf24b11de40b55f3 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Mon, 22 Dec 2014 16:22:07 -0500
Subject: [PATCH 005/127] Improve tests for the ephemeral build manager.
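
For context, and not part of the patch: when a key under the builder prefix
expires (for example via a TTL), the etcd v2 watch resolves with an event
whose action is 'expire', which is what the new ETCD_EXPIRE_RESULT constant
matches. The result handed to _handle_key_expiration then looks roughly like
this (illustrative; the attribute names are the ones the code below relies on):

    # plain python-etcd client shown for illustration; the manager goes
    # through its AsyncWrapper instead
    result = etcd.Client(host='127.0.0.1', port=2379).watch('building/', recursive=True)
    result.action             # 'expire'
    result.key                # the expired job key under the builder prefix
    result._prev_node.value   # the payload written at schedule time, including 'builder_id'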
---
buildman/manager/ephemeral.py | 72 ++++++++++++++++++++++----------
test/test_buildman.py | 78 ++++++++++++++++++++++++++++-------
2 files changed, 115 insertions(+), 35 deletions(-)
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index ed2da908e..80a96d336 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -2,9 +2,11 @@ import logging
import etcd
import uuid
import calendar
+import os.path
from datetime import datetime, timedelta
-from trollius import From, coroutine, Return
+from trollius import From, coroutine, Return, async
+from concurrent.futures import ThreadPoolExecutor
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor
@@ -16,25 +18,11 @@ logger = logging.getLogger(__name__)
ETCD_BUILDER_PREFIX = 'building/'
-
-
-def clear_etcd(client):
- """ Debugging method used to clear out the section of etcd we are using to track jobs in flight.
- """
- try:
- building = client.read(ETCD_BUILDER_PREFIX, recursive=True)
- for child in building.leaves:
- if not child.dir:
- logger.warning('Deleting key: %s', child.key)
- client.delete(child.key)
- except KeyError:
- pass
+ETCD_EXPIRE_RESULT = 'expire'
class EphemeralBuilderManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
- shutting_down = False
-
_executors = {
'popen': PopenExecutor,
'ec2': EC2Executor,
@@ -43,7 +31,10 @@ class EphemeralBuilderManager(BaseManager):
_etcd_client_klass = etcd.Client
def __init__(self, *args, **kwargs):
+ self._shutting_down = False
+
self._manager_config = None
+ self._async_thread_executor = None
self._etcd_client = None
self._component_to_job = {}
@@ -51,8 +42,35 @@ class EphemeralBuilderManager(BaseManager):
self._executor = None
+ self._worker_watch_task = None
+
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
+ def _watch_builders(self):
+ """ Watch the builders key for expirations.
+ """
+ if not self._shutting_down:
+ workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True)
+ workers_future.add_done_callback(self._handle_key_expiration)
+ logger.debug('Scheduling watch task.')
+ self._worker_watch_task = async(workers_future)
+
+ def _handle_key_expiration(self, changed_key_future):
+ """ Handle when a builder expires
+ """
+ if self._worker_watch_task is None or self._worker_watch_task.done():
+ self._watch_builders()
+
+ if changed_key_future.cancelled():
+ # Due to lack of interest, tomorrow has been cancelled
+ return
+
+ etcd_result = changed_key_future.result()
+ if etcd_result.action == ETCD_EXPIRE_RESULT:
+ # Handle the expiration
+ logger.debug('Builder expired, clean up the old build node')
+ async(self._clean_up_old_builder(etcd_result.key, etcd_result._prev_node.value))
+
def initialize(self, manager_config):
logger.debug('Calling initialize')
self._manager_config = manager_config
@@ -65,7 +83,11 @@ class EphemeralBuilderManager(BaseManager):
etcd_port = self._manager_config.get('ETCD_PORT', 2379)
logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
- self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port))
+ self._async_thread_executor = ThreadPoolExecutor(self._manager_config.get('ETCD_WORKERS', 5))
+ self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port),
+ executor=self._async_thread_executor)
+
+ self._watch_builders()
def setup_time(self):
setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
@@ -73,8 +95,17 @@ class EphemeralBuilderManager(BaseManager):
return setup_time
def shutdown(self):
- logger.debug('Calling shutdown.')
- raise NotImplementedError
+ logger.debug('Shutting down worker.')
+ self._shutting_down = True
+
+ if self._worker_watch_task is not None:
+ logger.debug('Canceling watch task.')
+ self._worker_watch_task.cancel()
+ self._worker_watch_task = None
+
+ if self._async_thread_executor is not None:
+ logger.debug('Shutting down thread pool executor.')
+ self._async_thread_executor.shutdown()
@coroutine
def schedule(self, build_job, loop):
@@ -161,9 +192,8 @@ class EphemeralBuilderManager(BaseManager):
yield From(self._etcd_client.delete(job_key))
-
@staticmethod
def _etcd_job_key(build_job):
""" Create a key which is used to track a job in etcd.
"""
- return '{0}{1}'.format(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid'])
+ return os.path.join(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid'])
diff --git a/test/test_buildman.py b/test/test_buildman.py
index 0886b671a..d5a7423e6 100644
--- a/test/test_buildman.py
+++ b/test/test_buildman.py
@@ -1,12 +1,15 @@
import unittest
import etcd
+import os.path
+import time
-from trollius import coroutine, get_event_loop, From, Future
+from trollius import coroutine, get_event_loop, From, Future, sleep
from mock import Mock
-from functools import partial
+from threading import Event
from buildman.manager.executor import BuilderExecutor
-from buildman.manager.ephemeral import EphemeralBuilderManager, ETCD_BUILDER_PREFIX
+from buildman.manager.ephemeral import (EphemeralBuilderManager, ETCD_BUILDER_PREFIX,
+ ETCD_EXPIRE_RESULT)
from buildman.server import BuildJobResult
from buildman.component.buildcomponent import BuildComponent
@@ -14,10 +17,6 @@ from buildman.component.buildcomponent import BuildComponent
BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead'
-import logging
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger(__name__)
-
def async_test(f):
def wrapper(*args, **kwargs):
coro = coroutine(f)
@@ -29,11 +28,17 @@ def async_test(f):
class TestEphemeral(unittest.TestCase):
def __init__(self, *args, **kwargs):
self.etcd_client_mock = None
+ self.etcd_wait_event = Event()
self.test_executor = None
super(TestEphemeral, self).__init__(*args, **kwargs)
def _create_mock_etcd_client(self, *args, **kwargs):
+ def hang_until_event(*args, **kwargs):
+ time.sleep(.01) # 10ms to simulate network latency
+ self.etcd_wait_event.wait()
+
self.etcd_client_mock = Mock(spec=etcd.Client, name='etcd.Client')
+ self.etcd_client_mock.watch = Mock(side_effect=hang_until_event)
return self.etcd_client_mock
def _create_mock_executor(self, *args, **kwargs):
@@ -61,6 +66,7 @@ class TestEphemeral(unittest.TestCase):
self.old_etcd_client_klass = EphemeralBuilderManager._etcd_client_klass
EphemeralBuilderManager._etcd_client_klass = self._create_mock_etcd_client
+ self.etcd_wait_event.clear()
self.register_component_callback = Mock()
self.uniregister_component_callback = Mock()
@@ -77,7 +83,13 @@ class TestEphemeral(unittest.TestCase):
self.manager.initialize({'EXECUTOR': 'test'})
+ self.mock_job_key = os.path.join(ETCD_BUILDER_PREFIX, BUILD_UUID)
+
def tearDown(self):
+ self.etcd_wait_event.set()
+
+ self.manager.shutdown()
+
del EphemeralBuilderManager._executors['test']
EphemeralBuilderManager._etcd_client_klass = self.old_etcd_client_klass
@@ -95,15 +107,53 @@ class TestEphemeral(unittest.TestCase):
self.assertTrue(is_scheduled)
- job_key = ETCD_BUILDER_PREFIX + mock_job.job_details['build_uuid']
self.etcd_client_mock.read.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True)
- self.assertEqual(len(self.test_executor.start_builder.call_args_list), 1)
- self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], job_key)
- self.assertEqual(self.etcd_client_mock.write.call_args_list[1][0][0], job_key)
+ self.assertEqual(self.test_executor.start_builder.call_count, 1)
+ self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
+ self.assertEqual(self.etcd_client_mock.write.call_args_list[1][0][0], self.mock_job_key)
- self.assertEqual(len(self.register_component_callback.call_args_list), 1)
+ self.assertEqual(self.register_component_callback.call_count, 1)
yield From(self.manager.job_completed(mock_job, BuildJobResult.COMPLETE, test_component))
- self.assertEqual(len(self.test_executor.stop_builder.call_args_list), 1)
- self.etcd_client_mock.delete.assert_called_once_with(job_key)
+ self.assertEqual(self.test_executor.stop_builder.call_count, 1)
+ self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key)
+
+ @async_test
+ def test_expiring_worker(self):
+ # Test that we are watching before anything else happens
+ self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True)
+
+ # Send a signal to the callback that a worker has expired
+ expired_result = Mock(sepc=etcd.EtcdResult)
+ expired_result.action = ETCD_EXPIRE_RESULT
+ expired_result.key = self.mock_job_key
+ expired_result._prev_node = Mock(spec=etcd.EtcdResult)
+ expired_result._prev_node.value = {'builder_id': '1234'}
+ expired_future = Future()
+ expired_future.set_result(expired_result)
+
+ self.manager._handle_key_expiration(expired_future)
+
+ yield From(sleep(.01))
+
+ self.test_executor.stop_builder.assert_called_once_with('1234')
+ self.assertEqual(self.test_executor.stop_builder.call_count, 1)
+
+ self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key)
+
+ @async_test
+ def test_change_worker(self):
+ # Send a signal to the callback that a worker key has been changed
+ set_result = Mock(spec=etcd.EtcdResult)
+ set_result.action = 'set'
+ set_result.key = self.mock_job_key
+ set_future = Future()
+ set_future.set_result(set_result)
+
+ self.manager._handle_key_expiration(set_future)
+
+ yield From(sleep(.01))
+
+ self.assertEquals(self.test_executor.stop_builder.call_count, 0)
+
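For readers tracing the watch logic above, a minimal sketch of the re-arming watch pattern, assuming an etcd client whose watch() already returns a future (as the AsyncWrapper-backed client here does); watch_expirations, on_expire and is_shutting_down are illustrative names rather than code from this series.

    def watch_expirations(etcd_client, prefix, on_expire, is_shutting_down):
        def handle_watch(done_future):
            # Re-arm the watch first so an event arriving while this one is
            # being handled is not lost.
            watch_expirations(etcd_client, prefix, on_expire, is_shutting_down)
            if done_future.cancelled():
                return
            result = done_future.result()
            if result.action == 'expire':
                on_expire(result)

        if not is_shutting_down():
            watch_future = etcd_client.watch(prefix, recursive=True)
            watch_future.add_done_callback(handle_watch)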
From 34bf92673bfc1654f2a7020ed4b41d5a321a03e2 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Mon, 22 Dec 2014 17:24:44 -0500
Subject: [PATCH 006/127] Add support for adjusting etcd ttl on job_heartbeat.
Switch the heartbeat method to a coroutine.
---
buildman/component/buildcomponent.py | 2 +-
buildman/manager/basemanager.py | 4 +++-
buildman/manager/ephemeral.py | 36 ++++++++++++++++++++++++----
buildman/manager/executor.py | 1 +
buildman/server.py | 6 ++++-
test/test_buildman.py | 26 +++++++++++++++-----
6 files changed, 62 insertions(+), 13 deletions(-)
diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py
index 53b04bf87..fb81e6aa5 100644
--- a/buildman/component/buildcomponent.py
+++ b/buildman/component/buildcomponent.py
@@ -338,7 +338,7 @@ class BuildComponent(BaseComponent):
# Mark the build item.
current_job = self._current_job
if current_job is not None:
- self.parent_manager.job_heartbeat(current_job)
+ yield trollius.From(self.parent_manager.job_heartbeat(current_job))
# Check the heartbeat from the worker.
logger.debug('Checking heartbeat on realm %s', self.builder_realm)
diff --git a/buildman/manager/basemanager.py b/buildman/manager/basemanager.py
index 76e97e5ac..ee17cf531 100644
--- a/buildman/manager/basemanager.py
+++ b/buildman/manager/basemanager.py
@@ -3,13 +3,15 @@ from trollius import coroutine
class BaseManager(object):
""" Base for all worker managers. """
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
- job_complete_callback, public_ip_address):
+ job_complete_callback, public_ip_address, heartbeat_period_sec):
self.register_component = register_component
self.unregister_component = unregister_component
self.job_heartbeat_callback = job_heartbeat_callback
self.job_complete_callback = job_complete_callback
self.public_ip_address = public_ip_address
+ self.heartbeat_period_sec = heartbeat_period_sec
+ @coroutine
def job_heartbeat(self, build_job):
""" Method invoked to tell the manager that a job is still running. This method will be called
every few minutes. """
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index 80a96d336..fdc116e5b 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -83,7 +83,8 @@ class EphemeralBuilderManager(BaseManager):
etcd_port = self._manager_config.get('ETCD_PORT', 2379)
logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
- self._async_thread_executor = ThreadPoolExecutor(self._manager_config.get('ETCD_WORKERS', 5))
+ worker_threads = self._manager_config.get('ETCD_WORKER_THREADS', 5)
+ self._async_thread_executor = ThreadPoolExecutor(worker_threads)
self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port),
executor=self._async_thread_executor)
@@ -131,14 +132,15 @@ class EphemeralBuilderManager(BaseManager):
# First try to take a lock for this job, meaning we will be responsible for its lifeline
realm = str(uuid.uuid4())
token = str(uuid.uuid4())
- expiration = datetime.utcnow() + timedelta(seconds=self.setup_time())
+ ttl = self.setup_time()
+ expiration = datetime.utcnow() + timedelta(seconds=ttl)
payload = {
'expiration': calendar.timegm(expiration.timetuple()),
}
try:
- yield From(self._etcd_client.write(job_key, payload, prevExist=False))
+ yield From(self._etcd_client.write(job_key, payload, prevExist=False, ttl=ttl))
component = self.register_component(realm, BuildComponent, token=token)
self._component_to_job[component] = build_job
except KeyError:
@@ -168,11 +170,14 @@ class EphemeralBuilderManager(BaseManager):
def build_component_disposed(self, build_component, timed_out):
logger.debug('Calling build_component_disposed.')
+ # TODO make it so that I don't have to unregister the component if it timed out
+ self.unregister_component(build_component)
+
@coroutine
def job_completed(self, build_job, job_status, build_component):
logger.debug('Calling job_completed with status: %s', job_status)
- # Kill he ephmeral builder
+ # Kill the ephemeral builder
self._executor.stop_builder(self._component_to_builder.pop(build_component))
# Release the lock in etcd
@@ -181,6 +186,24 @@ class EphemeralBuilderManager(BaseManager):
self.job_complete_callback(build_job, job_status)
+ @coroutine
+ def job_heartbeat(self, build_job):
+ # Extend the deadline in etcd
+ job_key = self._etcd_job_key(build_job)
+ build_job_response = yield From(self._etcd_client.read(job_key))
+
+ ttl = self.heartbeat_period_sec * 2
+ new_expiration = datetime.utcnow() + timedelta(seconds=ttl)
+
+ payload = {
+ 'expiration': calendar.timegm(new_expiration.timetuple()),
+ 'builder_id': build_job_response.value['builder_id'],
+ }
+
+ yield From(self._etcd_client.write(job_key, payload, ttl=ttl))
+
+ self.job_heartbeat_callback(build_job)
+
@coroutine
def _clean_up_old_builder(self, job_key, job_payload):
""" Terminate an old builders once the expiration date has passed.
@@ -197,3 +220,8 @@ class EphemeralBuilderManager(BaseManager):
""" Create a key which is used to track a job in etcd.
"""
return os.path.join(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid'])
+
+ def num_workers(self):
+ """ Return the number of workers we're managing locally.
+ """
+ return len(self._component_to_builder)
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index 82b98ef5c..e3a6a4f4a 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -142,6 +142,7 @@ class EC2Executor(BuilderExecutor):
if builder_id not in [si.id for si in stopped_instances]:
raise ExecutorException('Unable to stop instance: %s' % builder_id)
+
class PopenExecutor(BuilderExecutor):
""" Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
"""
diff --git a/buildman/server.py b/buildman/server.py
index 576bb3a10..ba9536c1e 100644
--- a/buildman/server.py
+++ b/buildman/server.py
@@ -24,6 +24,8 @@ MINIMUM_JOB_EXTENSION = timedelta(minutes=2)
WEBSOCKET_PORT = 8787
CONTROLLER_PORT = 8686
+HEARTBEAT_PERIOD_SEC = 30
+
class BuildJobResult(object):
""" Build job result enum """
INCOMPLETE = 'incomplete'
@@ -52,6 +54,7 @@ class BuilderServer(object):
self._job_heartbeat,
self._job_complete,
manager_public_ip,
+ HEARTBEAT_PERIOD_SEC,
)
self._lifecycle_manager_config = lifecycle_manager_config
@@ -140,7 +143,8 @@ class BuilderServer(object):
@trollius.coroutine
def _work_checker(self):
while self._current_status == 'running':
- logger.debug('Checking for more work for %d active workers', self._lifecycle_manager.num_workers())
+ logger.debug('Checking for more work for %d active workers',
+ self._lifecycle_manager.num_workers())
job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time())
if job_item is None:
logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
diff --git a/test/test_buildman.py b/test/test_buildman.py
index d5a7423e6..d31539a3d 100644
--- a/test/test_buildman.py
+++ b/test/test_buildman.py
@@ -78,11 +78,13 @@ class TestEphemeral(unittest.TestCase):
self.uniregister_component_callback,
self.job_heartbeat_callback,
self.job_complete_callback,
- '127.0.0.1'
+ '127.0.0.1',
+ 30,
)
self.manager.initialize({'EXECUTOR': 'test'})
+ self.mock_job = self._create_build_job()
self.mock_job_key = os.path.join(ETCD_BUILDER_PREFIX, BUILD_UUID)
def tearDown(self):
@@ -95,15 +97,13 @@ class TestEphemeral(unittest.TestCase):
@async_test
def test_schedule_and_complete(self):
- mock_job = self._create_build_job()
-
self.etcd_client_mock.read = Mock(side_effect=KeyError)
test_component = BuildComponent(None)
self.register_component_callback.return_value = test_component
# Ask for a builder to be scheduled
loop = get_event_loop()
- is_scheduled = yield From(self.manager.schedule(mock_job, loop))
+ is_scheduled = yield From(self.manager.schedule(self.mock_job, loop))
self.assertTrue(is_scheduled)
@@ -114,7 +114,7 @@ class TestEphemeral(unittest.TestCase):
self.assertEqual(self.register_component_callback.call_count, 1)
- yield From(self.manager.job_completed(mock_job, BuildJobResult.COMPLETE, test_component))
+ yield From(self.manager.job_completed(self.mock_job, BuildJobResult.COMPLETE, test_component))
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key)
@@ -125,7 +125,7 @@ class TestEphemeral(unittest.TestCase):
self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True)
# Send a signal to the callback that a worker has expired
- expired_result = Mock(sepc=etcd.EtcdResult)
+ expired_result = Mock(spec=etcd.EtcdResult)
expired_result.action = ETCD_EXPIRE_RESULT
expired_result.key = self.mock_job_key
expired_result._prev_node = Mock(spec=etcd.EtcdResult)
@@ -157,3 +157,17 @@ class TestEphemeral(unittest.TestCase):
self.assertEquals(self.test_executor.stop_builder.call_count, 0)
+ @async_test
+ def test_heartbeat_response(self):
+ builder_result = Mock(spec=etcd.EtcdResult)
+ builder_result.value = {'builder_id': '123', 'expiration': '123'}
+ self.etcd_client_mock.read = Mock(return_value=builder_result)
+
+ yield From(self.manager.job_heartbeat(self.mock_job))
+
+ # Wait for threads to complete
+ yield From(sleep(.01))
+
+ self.job_heartbeat_callback.assert_called_once_with(self.mock_job)
+ self.assertEqual(self.etcd_client_mock.write.call_count, 1)
+ self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
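A rough sketch, for orientation only, of the coroutine chain this change creates: the component's heartbeat loop now yields on the manager's job_heartbeat, so the etcd TTL refresh has completed before the loop sleeps again. heartbeat_loop and the accessors on component are hypothetical.

    import trollius
    from trollius import From, coroutine

    @coroutine
    def heartbeat_loop(component, manager, period_sec):
        while component.has_current_job():                              # hypothetical accessor
            yield From(manager.job_heartbeat(component.current_job()))  # refreshes the etcd TTL
            yield From(trollius.sleep(period_sec))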
From aac7feb20b36542ea3594aa6343687b602558088 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 11:17:23 -0500
Subject: [PATCH 007/127] Refresh the build_job from the database before we
write updates.
---
buildman/component/buildcomponent.py | 2 +-
buildman/jobutil/buildstatus.py | 13 ++++++++-----
2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py
index fb81e6aa5..391f8ffed 100644
--- a/buildman/component/buildcomponent.py
+++ b/buildman/component/buildcomponent.py
@@ -67,7 +67,7 @@ class BuildComponent(BaseComponent):
def start_build(self, build_job):
""" Starts a build. """
self._current_job = build_job
- self._build_status = StatusHandler(self.build_logs, build_job.repo_build)
+ self._build_status = StatusHandler(self.build_logs, build_job.repo_build.uuid)
self._image_info = {}
self._set_status(ComponentStatus.BUILDING)
diff --git a/buildman/jobutil/buildstatus.py b/buildman/jobutil/buildstatus.py
index 68b8cd5e3..b79776c46 100644
--- a/buildman/jobutil/buildstatus.py
+++ b/buildman/jobutil/buildstatus.py
@@ -1,12 +1,12 @@
from data.database import BUILD_PHASE
+from data import model
class StatusHandler(object):
""" Context wrapper for writing status to build logs. """
- def __init__(self, build_logs, repository_build):
+ def __init__(self, build_logs, repository_build_uuid):
self._current_phase = None
- self._repository_build = repository_build
- self._uuid = repository_build.uuid
+ self._uuid = repository_build_uuid
self._build_logs = build_logs
self._status = {
@@ -41,8 +41,11 @@ class StatusHandler(object):
self._current_phase = phase
self._append_log_message(phase, self._build_logs.PHASE, extra_data)
- self._repository_build.phase = phase
- self._repository_build.save()
+
+ # Update the repository build with the new phase
+ repo_build = model.get_repository_build(self._uuid)
+ repo_build.phase = phase
+ repo_build.save()
return True
def __enter__(self):
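A small usage sketch of the new StatusHandler contract, assuming a build_logs handle is already in scope and reusing the test suite's example UUID: the handler keeps only the UUID and re-fetches the build row on each phase write, so a long-running build never saves over a stale model instance.

    from data.database import BUILD_PHASE
    from buildman.jobutil.buildstatus import StatusHandler

    status = StatusHandler(build_logs, 'deadbeef-dead-beef-dead-deadbeefdead')
    status.set_phase(BUILD_PHASE.COMPLETE)  # looks the build up by UUID, then saves the new phase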
From 055a6b0c377f143038c1413b14f5e0137f143fdd Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 11:18:10 -0500
Subject: [PATCH 008/127] Add a total maximum time that a machine is allowed to
stick around before we terminate it more forcefully.
---
buildman/manager/ephemeral.py | 13 +++++++++++--
test/test_buildman.py | 7 ++++++-
2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index fdc116e5b..7d9eacdc2 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -135,8 +135,12 @@ class EphemeralBuilderManager(BaseManager):
ttl = self.setup_time()
expiration = datetime.utcnow() + timedelta(seconds=ttl)
+ machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
+ max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)
+
payload = {
'expiration': calendar.timegm(expiration.timetuple()),
+ 'max_expiration': calendar.timegm(max_expiration.timetuple()),
}
try:
@@ -154,7 +158,7 @@ class EphemeralBuilderManager(BaseManager):
# Store the builder in etcd associated with the job id
payload['builder_id'] = builder_id
- yield From(self._etcd_client.write(job_key, payload, prevExist=True))
+ yield From(self._etcd_client.write(job_key, payload, prevExist=True, ttl=ttl))
raise Return(True)
@@ -192,12 +196,17 @@ class EphemeralBuilderManager(BaseManager):
job_key = self._etcd_job_key(build_job)
build_job_response = yield From(self._etcd_client.read(job_key))
- ttl = self.heartbeat_period_sec * 2
+ max_expiration = datetime.utcfromtimestamp(build_job_response.value['max_expiration'])
+ max_expiration_remaining = max_expiration - datetime.utcnow()
+ max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds()))
+
+ ttl = min(self.heartbeat_period_sec * 2, max_expiration_sec)
new_expiration = datetime.utcnow() + timedelta(seconds=ttl)
payload = {
'expiration': calendar.timegm(new_expiration.timetuple()),
'builder_id': build_job_response.value['builder_id'],
+ 'max_expiration': build_job_response.value['max_expiration'],
}
yield From(self._etcd_client.write(job_key, payload, ttl=ttl))
diff --git a/test/test_buildman.py b/test/test_buildman.py
index d31539a3d..6835cdd49 100644
--- a/test/test_buildman.py
+++ b/test/test_buildman.py
@@ -159,8 +159,13 @@ class TestEphemeral(unittest.TestCase):
@async_test
def test_heartbeat_response(self):
+ expiration_timestamp = time.time() + 60
builder_result = Mock(spec=etcd.EtcdResult)
- builder_result.value = {'builder_id': '123', 'expiration': '123'}
+ builder_result.value = {
+ 'builder_id': '123',
+ 'expiration': expiration_timestamp,
+ 'max_expiration': expiration_timestamp,
+ }
self.etcd_client_mock.read = Mock(return_value=builder_result)
yield From(self.manager.job_heartbeat(self.mock_job))
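To make the clamping concrete, the TTL arithmetic in isolation; this is just the math the patch applies, using the heartbeat period and the max_expiration timestamp stored with the job.

    from datetime import datetime

    def next_lease_ttl(heartbeat_period_sec, max_expiration_timestamp):
        # Remaining wall-clock time before the machine's hard deadline.
        remaining = datetime.utcfromtimestamp(max_expiration_timestamp) - datetime.utcnow()
        max_expiration_sec = max(0, int(remaining.total_seconds()))
        # Normally two heartbeat periods; shrinks to zero as the hard deadline
        # nears, at which point the key expires and the expiration watch
        # reclaims the builder.
        return min(heartbeat_period_sec * 2, max_expiration_sec)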
From 709e571b78ff7b7345b3f1771368e673cc610065 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 12:13:49 -0500
Subject: [PATCH 009/127] Handle read timeouts from etcd when watching a key.
---
buildman/manager/ephemeral.py | 7 ++++++-
test/test_buildman.py | 13 +++++++++++++
2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index 7d9eacdc2..39776b60e 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -7,6 +7,7 @@ import os.path
from datetime import datetime, timedelta
from trollius import From, coroutine, Return, async
from concurrent.futures import ThreadPoolExecutor
+from urllib3.exceptions import ReadTimeoutError
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor
@@ -65,7 +66,11 @@ class EphemeralBuilderManager(BaseManager):
# Due to lack of interest, tomorrow has been cancelled
return
- etcd_result = changed_key_future.result()
+ try:
+ etcd_result = changed_key_future.result()
+ except ReadTimeoutError:
+ return
+
if etcd_result.action == ETCD_EXPIRE_RESULT:
# Handle the expiration
logger.debug('Builder expired, clean up the old build node')
diff --git a/test/test_buildman.py b/test/test_buildman.py
index 6835cdd49..9d0f5c1f4 100644
--- a/test/test_buildman.py
+++ b/test/test_buildman.py
@@ -6,6 +6,7 @@ import time
from trollius import coroutine, get_event_loop, From, Future, sleep
from mock import Mock
from threading import Event
+from urllib3.exceptions import ReadTimeoutError
from buildman.manager.executor import BuilderExecutor
from buildman.manager.ephemeral import (EphemeralBuilderManager, ETCD_BUILDER_PREFIX,
@@ -176,3 +177,15 @@ class TestEphemeral(unittest.TestCase):
self.job_heartbeat_callback.assert_called_once_with(self.mock_job)
self.assertEqual(self.etcd_client_mock.write.call_count, 1)
self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
+
+ @async_test
+ def test_etcd_read_timeout(self):
+ # Send a signal to the callback that a worker key has been changed
+ read_timeout_future = Future()
+ read_timeout_future.set_exception(ReadTimeoutError(None, None, None))
+
+ self.manager._handle_key_expiration(read_timeout_future)
+
+ yield From(sleep(.01))
+
+ self.assertEquals(self.test_executor.stop_builder.call_count, 0)
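The defensive pattern in isolation, assuming urllib3: a watch whose underlying HTTP request times out simply yields no event, and the watch that was already re-armed carries on.

    from urllib3.exceptions import ReadTimeoutError

    def result_or_none(changed_key_future):
        try:
            return changed_key_future.result()
        except ReadTimeoutError:
            return None  # a timed-out watch is treated as "nothing changed"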
From 4e22e22ba12b9c6899d9f49f76838bbec491ae4f Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 14:09:04 -0500
Subject: [PATCH 010/127] We have to serialize our build data before sending it
to etcd.
---
buildman/manager/ephemeral.py | 19 +++++++++++--------
test/test_buildman.py | 7 ++++---
2 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index 39776b60e..3b467bb23 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -3,6 +3,7 @@ import etcd
import uuid
import calendar
import os.path
+import json
from datetime import datetime, timedelta
from trollius import From, coroutine, Return, async
@@ -74,7 +75,8 @@ class EphemeralBuilderManager(BaseManager):
if etcd_result.action == ETCD_EXPIRE_RESULT:
# Handle the expiration
logger.debug('Builder expired, clean up the old build node')
- async(self._clean_up_old_builder(etcd_result.key, etcd_result._prev_node.value))
+ job_metadata = json.loads(etcd_result._prev_node.value)
+ async(self._clean_up_old_builder(etcd_result.key, job_metadata))
def initialize(self, manager_config):
logger.debug('Calling initialize')
@@ -149,7 +151,7 @@ class EphemeralBuilderManager(BaseManager):
}
try:
- yield From(self._etcd_client.write(job_key, payload, prevExist=False, ttl=ttl))
+ yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=False, ttl=ttl))
component = self.register_component(realm, BuildComponent, token=token)
self._component_to_job[component] = build_job
except KeyError:
@@ -163,7 +165,7 @@ class EphemeralBuilderManager(BaseManager):
# Store the builder in etcd associated with the job id
payload['builder_id'] = builder_id
- yield From(self._etcd_client.write(job_key, payload, prevExist=True, ttl=ttl))
+ yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl))
raise Return(True)
@@ -199,9 +201,10 @@ class EphemeralBuilderManager(BaseManager):
def job_heartbeat(self, build_job):
# Extend the deadline in etcd
job_key = self._etcd_job_key(build_job)
- build_job_response = yield From(self._etcd_client.read(job_key))
+ build_job_metadata_response = yield From(self._etcd_client.read(job_key))
+ build_job_metadata = json.loads(build_job_metadata_response.value)
- max_expiration = datetime.utcfromtimestamp(build_job_response.value['max_expiration'])
+ max_expiration = datetime.utcfromtimestamp(build_job_metadata['max_expiration'])
max_expiration_remaining = max_expiration - datetime.utcnow()
max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds()))
@@ -210,11 +213,11 @@ class EphemeralBuilderManager(BaseManager):
payload = {
'expiration': calendar.timegm(new_expiration.timetuple()),
- 'builder_id': build_job_response.value['builder_id'],
- 'max_expiration': build_job_response.value['max_expiration'],
+ 'builder_id': build_job_metadata['builder_id'],
+ 'max_expiration': build_job_metadata['max_expiration'],
}
- yield From(self._etcd_client.write(job_key, payload, ttl=ttl))
+ yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=ttl))
self.job_heartbeat_callback(build_job)
diff --git a/test/test_buildman.py b/test/test_buildman.py
index 9d0f5c1f4..e33adccbd 100644
--- a/test/test_buildman.py
+++ b/test/test_buildman.py
@@ -2,6 +2,7 @@ import unittest
import etcd
import os.path
import time
+import json
from trollius import coroutine, get_event_loop, From, Future, sleep
from mock import Mock
@@ -130,7 +131,7 @@ class TestEphemeral(unittest.TestCase):
expired_result.action = ETCD_EXPIRE_RESULT
expired_result.key = self.mock_job_key
expired_result._prev_node = Mock(spec=etcd.EtcdResult)
- expired_result._prev_node.value = {'builder_id': '1234'}
+ expired_result._prev_node.value = json.dumps({'builder_id': '1234'})
expired_future = Future()
expired_future.set_result(expired_result)
@@ -162,11 +163,11 @@ class TestEphemeral(unittest.TestCase):
def test_heartbeat_response(self):
expiration_timestamp = time.time() + 60
builder_result = Mock(spec=etcd.EtcdResult)
- builder_result.value = {
+ builder_result.value = json.dumps({
'builder_id': '123',
'expiration': expiration_timestamp,
'max_expiration': expiration_timestamp,
- }
+ })
self.etcd_client_mock.read = Mock(return_value=builder_result)
yield From(self.manager.job_heartbeat(self.mock_job))
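The contract this patch enforces, shown on its own: etcd stores plain string values, so the job metadata has to round-trip through JSON on every write and read. The values below are made up for illustration.

    import json

    payload = {'builder_id': 'builder-1234', 'expiration': 1419350400, 'max_expiration': 1419357600}
    stored = json.dumps(payload)   # the string actually written to the etcd key
    loaded = json.loads(stored)    # what job_heartbeat and the expiration handler read back
    assert loaded['builder_id'] == payload['builder_id']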
From b2d7fad6676774cf22ce0c3324453f6d8c3ce285 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 14:09:24 -0500
Subject: [PATCH 011/127] Fix a typo with the automatic node shutdown fallback
in the ephemeral nodes.
---
buildman/templates/cloudconfig.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml
index e75ce5626..fc8ec6a4f 100644
--- a/buildman/templates/cloudconfig.yaml
+++ b/buildman/templates/cloudconfig.yaml
@@ -35,4 +35,4 @@ coreos:
ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io
ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest
ExecStop=/usr/bin/docker stop quay-builder
- ExecStopPost=/usr/bin/sudo /bin/sh -xc "/bin/sleep 600; /sbin/shutown -h now"
+ ExecStopPost=/usr/bin/sudo /bin/sh -xc "/bin/sleep 600; /sbin/shutdown -h now"
From 2ed9b3d2437e19c7369dfd83c4eeed71d89dcf14 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 14:54:34 -0500
Subject: [PATCH 012/127] Disable the etcd timeout on watch calls to prevent
them from disconnecting the client.
---
buildman/manager/ephemeral.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index 3b467bb23..ac603b9ce 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -21,6 +21,7 @@ logger = logging.getLogger(__name__)
ETCD_BUILDER_PREFIX = 'building/'
ETCD_EXPIRE_RESULT = 'expire'
+ETCD_DISABLE_TIMEOUT = 0
class EphemeralBuilderManager(BaseManager):
@@ -52,7 +53,8 @@ class EphemeralBuilderManager(BaseManager):
""" Watch the builders key for expirations.
"""
if not self._shutting_down:
- workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True)
+ workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True,
+ timeout=ETCD_DISABLE_TIMEOUT)
workers_future.add_done_callback(self._handle_key_expiration)
logger.debug('Scheduling watch task.')
self._worker_watch_task = async(workers_future)
From 723fb27671bd08690f74172f195ff315892a50c4 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 14:54:58 -0500
Subject: [PATCH 013/127] Calls to the ec2 service must be async, and responses
must be wrapped as well.
---
buildman/manager/executor.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index e3a6a4f4a..beef14881 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -113,32 +113,32 @@ class EC2Executor(BuilderExecutor):
logger.debug('Generated cloud config: %s', user_data)
ec2_conn = self._get_conn()
- reservation = yield ec2_conn.run_instances(
+ reservation = yield From(ec2_conn.run_instances(
coreos_ami,
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
security_groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
key_name=self.executor_config.get('EC2_KEY_NAME', None),
user_data=user_data,
instance_initiated_shutdown_behavior='terminate',
- )
+ ))
if not reservation.instances:
raise ExecutorException('Unable to spawn builder instance.')
elif len(reservation.instances) != 1:
raise ExecutorException('EC2 started wrong number of instances!')
- launched = reservation.instances[0]
- launched.add_tags({
+ launched = AsyncWrapper(reservation.instances[0])
+ yield From(launched.add_tags({
'Name': 'Quay Ephemeral Builder',
'Realm': realm,
'Token': token,
- })
+ }))
raise Return(launched.id)
@coroutine
def stop_builder(self, builder_id):
ec2_conn = self._get_conn()
- stopped_instances = yield ec2_conn.stop_instances([builder_id], force=True)
+ stopped_instances = yield From(ec2_conn.stop_instances([builder_id], force=True))
if builder_id not in [si.id for si in stopped_instances]:
raise ExecutorException('Unable to stop instance: %s' % builder_id)
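For context, a hypothetical stand-in for the AsyncWrapper pattern the executor leans on here: each method call on the wrapped boto connection runs on a thread pool and comes back as a future the coroutine can yield on. This is a sketch of the idea, not the project's actual AsyncWrapper.

    from functools import partial
    from concurrent.futures import ThreadPoolExecutor
    import trollius

    class ThreadProxy(object):
        """ Hypothetical AsyncWrapper-style proxy: runs each method of the wrapped
            object on a thread pool and returns a future for the event loop. """
        def __init__(self, delegate, executor=None):
            self._delegate = delegate
            self._executor = executor or ThreadPoolExecutor(2)

        def __getattr__(self, name):
            method = getattr(self._delegate, name)
            def call_async(*args, **kwargs):
                loop = trollius.get_event_loop()
                return loop.run_in_executor(self._executor, partial(method, *args, **kwargs))
            return call_async

A coroutine can then write, for example, reservation = yield From(proxy.run_instances(...)) without blocking the event loop.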
From 2f2a88825d4fdc8c970adb1dfcc8e714d297d626 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 15:35:21 -0500
Subject: [PATCH 014/127] Try using SSD for root volumes.
---
buildman/manager/executor.py | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index beef14881..b2081d581 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -113,6 +113,14 @@ class EC2Executor(BuilderExecutor):
logger.debug('Generated cloud config: %s', user_data)
ec2_conn = self._get_conn()
+
+ ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType(
+ size=8,
+ volume_type='gp2',
+ delete_on_termination=True,
+ )
+ block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping()
+ block_devices['/dev/sda1'] = ssd_root_ebs
reservation = yield From(ec2_conn.run_instances(
coreos_ami,
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
@@ -120,6 +128,7 @@ class EC2Executor(BuilderExecutor):
key_name=self.executor_config.get('EC2_KEY_NAME', None),
user_data=user_data,
instance_initiated_shutdown_behavior='terminate',
+ block_device_map=block_devices,
))
if not reservation.instances:
From 4a2295373f5e997bbd65eee664c80bd3f1e53c62 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 15:35:34 -0500
Subject: [PATCH 015/127] Fix tests for no timeout watches.
---
test/test_buildman.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/test/test_buildman.py b/test/test_buildman.py
index e33adccbd..0d0b6ced2 100644
--- a/test/test_buildman.py
+++ b/test/test_buildman.py
@@ -124,7 +124,8 @@ class TestEphemeral(unittest.TestCase):
@async_test
def test_expiring_worker(self):
# Test that we are watching before anything else happens
- self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True)
+ self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True,
+ timeout=0)
# Send a signal to the callback that a worker has expired
expired_result = Mock(spec=etcd.EtcdResult)
From 8e16fbf59b9530672629ab9959a27bdb84a62e20 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 15:41:58 -0500
Subject: [PATCH 016/127] The root device on CoreOS is /dev/xvda.
---
buildman/manager/executor.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index b2081d581..b80e87922 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -120,7 +120,7 @@ class EC2Executor(BuilderExecutor):
delete_on_termination=True,
)
block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping()
- block_devices['/dev/sda1'] = ssd_root_ebs
+ block_devices['/dev/xvda'] = ssd_root_ebs
reservation = yield From(ec2_conn.run_instances(
coreos_ami,
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
From ef70432b117f0fa85d9e5e0d840a439c894447be Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 16:04:10 -0500
Subject: [PATCH 017/127] We need to call build_finished async.
---
buildman/component/buildcomponent.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py
index 391f8ffed..726435bcc 100644
--- a/buildman/component/buildcomponent.py
+++ b/buildman/component/buildcomponent.py
@@ -247,7 +247,7 @@ class BuildComponent(BaseComponent):
logger.warning('Build %s failed with message: %s', build_id, error_message)
# Mark that the build has finished (in an error state)
- self._build_finished(BuildJobResult.ERROR)
+ trollius.async(self._build_finished(BuildJobResult.ERROR))
def _build_complete(self, result):
""" Wraps up a completed build. Handles any errors and calls self._build_finished. """
@@ -255,7 +255,7 @@ class BuildComponent(BaseComponent):
# Retrieve the result. This will raise an ApplicationError on any error that occurred.
result.result()
self._build_status.set_phase(BUILD_PHASE.COMPLETE)
- self._build_finished(BuildJobResult.COMPLETE)
+ trollius.async(self._build_finished(BuildJobResult.COMPLETE))
except ApplicationError as aex:
worker_error = WorkerError(aex.error, aex.kwargs.get('base_error'))
@@ -265,9 +265,9 @@ class BuildComponent(BaseComponent):
# Mark the build as completed.
if worker_error.is_internal_error():
- self._build_finished(BuildJobResult.INCOMPLETE)
+ trollius.async(self._build_finished(BuildJobResult.INCOMPLETE))
else:
- self._build_finished(BuildJobResult.ERROR)
+ trollius.async(self._build_finished(BuildJobResult.ERROR))
@trollius.coroutine
def _build_finished(self, job_status):
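An illustration of the failure mode this change addresses, assuming trollius and purely illustrative names: calling a coroutine function from an ordinary callback only creates a coroutine object, so it has to be scheduled with trollius.async() for the event loop to actually run it.

    import trollius

    @trollius.coroutine
    def build_finished(job_status):
        # Stands in for the real cleanup coroutine.
        yield trollius.From(trollius.sleep(0))
        print('build finished with status: %s' % job_status)

    def on_build_complete(result_future):
        # Calling build_finished('complete') by itself would only create a coroutine
        # object; wrapping it in a task makes the event loop execute it.
        trollius.async(build_finished('complete'))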
From 3ce64b4a7f5b34cb79fc1882d7654f358ad4f1e1 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 16:12:10 -0500
Subject: [PATCH 018/127] We must yield from stop_builder.
---
buildman/manager/ephemeral.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index ac603b9ce..63f03a6b7 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -191,7 +191,7 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Calling job_completed with status: %s', job_status)
# Kill the ephemeral builder
- self._executor.stop_builder(self._component_to_builder.pop(build_component))
+ yield From(self._executor.stop_builder(self._component_to_builder.pop(build_component)))
# Release the lock in etcd
job_key = self._etcd_job_key(build_job)
From cece94e1dad95208303dffe109641729f7056973 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 16:20:42 -0500
Subject: [PATCH 019/127] We want to terminate instances, not stop them.
---
buildman/manager/executor.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index b80e87922..1a7c4e114 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -147,9 +147,9 @@ class EC2Executor(BuilderExecutor):
@coroutine
def stop_builder(self, builder_id):
ec2_conn = self._get_conn()
- stopped_instances = yield From(ec2_conn.stop_instances([builder_id], force=True))
- if builder_id not in [si.id for si in stopped_instances]:
- raise ExecutorException('Unable to stop instance: %s' % builder_id)
+ terminated_instances = yield From(ec2_conn.terminate_instances([builder_id], force=True))
+ if builder_id not in [si.id for si in terminated_instances]:
+ raise ExecutorException('Unable to terminate instance: %s' % builder_id)
class PopenExecutor(BuilderExecutor):
From 1005c29b6b914d9fa837f4015d4ab31b186e5b89 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 17:08:16 -0500
Subject: [PATCH 020/127] Fix the shutdown command for when the builder
terminates itself.
---
buildman/templates/cloudconfig.yaml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml
index fc8ec6a4f..321e75927 100644
--- a/buildman/templates/cloudconfig.yaml
+++ b/buildman/templates/cloudconfig.yaml
@@ -31,8 +31,8 @@ coreos:
[Service]
TimeoutStartSec=600
TimeoutStopSec=2000
- ExecStartPre=/usr/bin/sudo /bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0"
+ ExecStartPre=/bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0"
ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io
ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest
ExecStop=/usr/bin/docker stop quay-builder
- ExecStopPost=/usr/bin/sudo /bin/sh -xc "/bin/sleep 600; /sbin/shutdown -h now"
+ ExecStopPost=/bin/sh -xc "/bin/sleep 600; /usr/bin/systemctl --no-block poweroff"
From ec87e37d8c5f5788e8af231a7d5b6acb8cc471b3 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 17:17:53 -0500
Subject: [PATCH 021/127] EC2 terminate_instances does not take a force flag.
---
buildman/manager/executor.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index 1a7c4e114..814b95a5b 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -147,7 +147,7 @@ class EC2Executor(BuilderExecutor):
@coroutine
def stop_builder(self, builder_id):
ec2_conn = self._get_conn()
- terminated_instances = yield From(ec2_conn.terminate_instances([builder_id], force=True))
+ terminated_instances = yield From(ec2_conn.terminate_instances([builder_id]))
if builder_id not in [si.id for si in terminated_instances]:
raise ExecutorException('Unable to terminate instance: %s' % builder_id)
From ccb19571d635f57fd7c0c22459a7719692759f77 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Tue, 23 Dec 2014 17:42:47 -0500
Subject: [PATCH 022/127] Try lowering the sleep on the shutdown timeout to
avoid the service dispatch timeout built into systemd.
---
buildman/templates/cloudconfig.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml
index 321e75927..3bebde670 100644
--- a/buildman/templates/cloudconfig.yaml
+++ b/buildman/templates/cloudconfig.yaml
@@ -35,4 +35,4 @@ coreos:
ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io
ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest
ExecStop=/usr/bin/docker stop quay-builder
- ExecStopPost=/bin/sh -xc "/bin/sleep 600; /usr/bin/systemctl --no-block poweroff"
+ ExecStopPost=/bin/sh -xc "/bin/sleep 120; /usr/bin/systemctl --no-block poweroff"
From cc70225043f5990d62a3acc3faba83c618d9a7db Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Wed, 31 Dec 2014 11:33:56 -0500
Subject: [PATCH 023/127] Generalize the ephemeral build managers so that any
manager may manage a builder spawned by any other manager.
---
buildman/builder.py | 9 +-
buildman/component/buildcomponent.py | 49 ++++++----
buildman/jobutil/buildjob.py | 17 +++-
buildman/manager/basemanager.py | 9 +-
buildman/manager/enterprise.py | 11 ++-
buildman/manager/ephemeral.py | 141 +++++++++++++++++++--------
buildman/manager/executor.py | 14 +--
buildman/server.py | 7 +-
buildman/templates/cloudconfig.yaml | 3 +-
data/queue.py | 3 +-
test/test_buildman.py | 120 +++++++++++++++--------
11 files changed, 258 insertions(+), 125 deletions(-)
diff --git a/buildman/builder.py b/buildman/builder.py
index 4e88d3ed7..e1c7a852b 100644
--- a/buildman/builder.py
+++ b/buildman/builder.py
@@ -41,8 +41,11 @@ def run_build_manager():
if manager_klass is None:
return
- public_ip = os.environ.get('PUBLIC_IP', '127.0.0.1')
- logger.debug('Will pass public IP address %s to builders for websocket connection', public_ip)
+ manager_hostname = os.environ.get('BUILDMAN_HOSTNAME',
+ app.config.get('BUILDMAN_HOSTNAME',
+ app.config['SERVER_HOSTNAME']))
+ logger.debug('Will pass buildman hostname %s to builders for websocket connection',
+ manager_hostname)
logger.debug('Starting build manager with lifecycle "%s"', build_manager_config[0])
ssl_context = None
@@ -53,7 +56,7 @@ def run_build_manager():
os.path.join(os.environ.get('SSL_CONFIG'), 'ssl.key'))
server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs,
- user_files, manager_klass, build_manager_config[1], public_ip)
+ user_files, manager_klass, build_manager_config[1], manager_hostname)
server.run('0.0.0.0', ssl=ssl_context)
if __name__ == '__main__':
diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py
index 726435bcc..42e6696f2 100644
--- a/buildman/component/buildcomponent.py
+++ b/buildman/component/buildcomponent.py
@@ -9,6 +9,7 @@ from autobahn.wamp.exception import ApplicationError
from buildman.server import BuildJobResult
from buildman.component.basecomponent import BaseComponent
+from buildman.jobutil.buildjob import BuildJobLoadException
from buildman.jobutil.buildpack import BuildPackage, BuildPackageException
from buildman.jobutil.buildstatus import StatusHandler
from buildman.jobutil.workererror import WorkerError
@@ -58,19 +59,20 @@ class BuildComponent(BaseComponent):
yield trollius.From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat'))
yield trollius.From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage'))
- self._set_status(ComponentStatus.WAITING)
+ yield trollius.From(self._set_status(ComponentStatus.WAITING))
def is_ready(self):
""" Determines whether a build component is ready to begin a build. """
return self._component_status == ComponentStatus.RUNNING
+ @trollius.coroutine
def start_build(self, build_job):
""" Starts a build. """
self._current_job = build_job
self._build_status = StatusHandler(self.build_logs, build_job.repo_build.uuid)
self._image_info = {}
- self._set_status(ComponentStatus.BUILDING)
+ yield trollius.From(self._set_status(ComponentStatus.BUILDING))
# Retrieve the job's buildpack.
buildpack_url = self.user_files.get_file_url(build_job.repo_build.resource_key,
@@ -82,23 +84,27 @@ class BuildComponent(BaseComponent):
buildpack = BuildPackage.from_url(buildpack_url)
except BuildPackageException as bpe:
self._build_failure('Could not retrieve build package', bpe)
- return
+ raise trollius.Return()
# Extract the base image information from the Dockerfile.
parsed_dockerfile = None
logger.debug('Parsing dockerfile')
- build_config = build_job.build_config
+ try:
+ build_config = build_job.build_config
+ except BuildJobLoadException as irbe:
+ self._build_failure('Could not load build job information', irbe)
+
try:
parsed_dockerfile = buildpack.parse_dockerfile(build_config.get('build_subdir'))
except BuildPackageException as bpe:
self._build_failure('Could not find Dockerfile in build package', bpe)
- return
+ raise trollius.Return()
image_and_tag_tuple = parsed_dockerfile.get_image_and_tag()
if image_and_tag_tuple is None or image_and_tag_tuple[0] is None:
self._build_failure('Missing FROM line in Dockerfile')
- return
+ raise trollius.Return()
base_image_information = {
'repository': image_and_tag_tuple[0],
@@ -147,9 +153,7 @@ class BuildComponent(BaseComponent):
logger.debug('Invoking build: %s', self.builder_realm)
logger.debug('With Arguments: %s', build_arguments)
- return (self
- .call("io.quay.builder.build", **build_arguments)
- .add_done_callback(self._build_complete))
+ self.call("io.quay.builder.build", **build_arguments).add_done_callback(self._build_complete)
@staticmethod
def _total_completion(statuses, total_images):
@@ -276,38 +280,42 @@ class BuildComponent(BaseComponent):
self._current_job = None
# Set the component back to a running state.
- self._set_status(ComponentStatus.RUNNING)
+ yield trollius.From(self._set_status(ComponentStatus.RUNNING))
@staticmethod
def _ping():
""" Ping pong. """
return 'pong'
+ @trollius.coroutine
def _on_ready(self, token, version):
if not version in SUPPORTED_WORKER_VERSIONS:
- logger.warning('Build component (token "%s") is running an out-of-date version: %s', version)
- return False
+ logger.warning('Build component (token "%s") is running an out-of-date version: %s', token,
+ version)
+ raise trollius.Return(False)
if self._component_status != 'waiting':
logger.warning('Build component (token "%s") is already connected', self.expected_token)
- return False
+ raise trollius.Return(False)
if token != self.expected_token:
- logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token, token)
- return False
+ logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token,
+ token)
+ raise trollius.Return(False)
- self._set_status(ComponentStatus.RUNNING)
+ yield trollius.From(self._set_status(ComponentStatus.RUNNING))
# Start the heartbeat check and updating loop.
loop = trollius.get_event_loop()
loop.create_task(self._heartbeat())
logger.debug('Build worker %s is connected and ready', self.builder_realm)
- return True
+ raise trollius.Return(True)
+ @trollius.coroutine
def _set_status(self, phase):
if phase == ComponentStatus.RUNNING:
loop = trollius.get_event_loop()
- self.parent_manager.build_component_ready(self, loop)
+ yield trollius.From(self.parent_manager.build_component_ready(self, loop))
self._component_status = phase
@@ -344,13 +352,14 @@ class BuildComponent(BaseComponent):
logger.debug('Checking heartbeat on realm %s', self.builder_realm)
if (self._last_heartbeat and
self._last_heartbeat < datetime.datetime.utcnow() - HEARTBEAT_DELTA):
- self._timeout()
+ yield trollius.From(self._timeout())
return
yield trollius.From(trollius.sleep(HEARTBEAT_TIMEOUT))
+ @trollius.coroutine
def _timeout(self):
- self._set_status(ComponentStatus.TIMED_OUT)
+ yield trollius.From(self._set_status(ComponentStatus.TIMED_OUT))
logger.warning('Build component with realm %s has timed out', self.builder_realm)
self._dispose(timed_out=True)
diff --git a/buildman/jobutil/buildjob.py b/buildman/jobutil/buildjob.py
index e92be23a6..c2d2769db 100644
--- a/buildman/jobutil/buildjob.py
+++ b/buildman/jobutil/buildjob.py
@@ -1,6 +1,9 @@
+import json
+
+from cachetools import lru_cache
+
from data import model
-import json
class BuildJobLoadException(Exception):
""" Exception raised if a build job could not be instantiated for some reason. """
@@ -18,14 +21,22 @@ class BuildJob(object):
'Could not parse build queue item config with ID %s' % self.job_details['build_uuid']
)
+ @lru_cache(maxsize=1)
+ def _load_repo_build(self):
try:
- self.repo_build = model.get_repository_build(self.job_details['build_uuid'])
+ return model.get_repository_build(self.job_details['build_uuid'])
except model.InvalidRepositoryBuildException:
raise BuildJobLoadException(
'Could not load repository build with ID %s' % self.job_details['build_uuid'])
+ @property
+ def repo_build(self):
+ return self._load_repo_build()
+
+ @property
+ def build_config(self):
try:
- self.build_config = json.loads(self.repo_build.job_config)
+ return json.loads(self.repo_build.job_config)
except ValueError:
raise BuildJobLoadException(
'Could not parse repository build job config with ID %s' % self.job_details['build_uuid']
diff --git a/buildman/manager/basemanager.py b/buildman/manager/basemanager.py
index ee17cf531..2c57ac095 100644
--- a/buildman/manager/basemanager.py
+++ b/buildman/manager/basemanager.py
@@ -3,12 +3,12 @@ from trollius import coroutine
class BaseManager(object):
""" Base for all worker managers. """
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
- job_complete_callback, public_ip_address, heartbeat_period_sec):
+ job_complete_callback, manager_hostname, heartbeat_period_sec):
self.register_component = register_component
self.unregister_component = unregister_component
self.job_heartbeat_callback = job_heartbeat_callback
self.job_complete_callback = job_complete_callback
- self.public_ip_address = public_ip_address
+ self.manager_hostname = manager_hostname
self.heartbeat_period_sec = heartbeat_period_sec
@coroutine
@@ -31,7 +31,7 @@ class BaseManager(object):
raise NotImplementedError
@coroutine
- def schedule(self, build_job, loop):
+ def schedule(self, build_job):
""" Schedules a queue item to be built. Returns True if the item was properly scheduled
and False if all workers are busy.
"""
@@ -42,7 +42,8 @@ class BaseManager(object):
"""
raise NotImplementedError
- def build_component_ready(self, build_component, loop):
+ @coroutine
+ def build_component_ready(self, build_component):
""" Method invoked whenever a build component announces itself as ready.
"""
raise NotImplementedError
diff --git a/buildman/manager/enterprise.py b/buildman/manager/enterprise.py
index 5a97c0955..d7fdea39a 100644
--- a/buildman/manager/enterprise.py
+++ b/buildman/manager/enterprise.py
@@ -5,7 +5,7 @@ from buildman.component.basecomponent import BaseComponent
from buildman.component.buildcomponent import BuildComponent
from buildman.manager.basemanager import BaseManager
-from trollius.coroutines import From, Return, coroutine
+from trollius import From, Return, coroutine, async
REGISTRATION_REALM = 'registration'
logger = logging.getLogger(__name__)
@@ -51,16 +51,19 @@ class EnterpriseManager(BaseManager):
return realm
@coroutine
- def schedule(self, build_job, loop):
+ def schedule(self, build_job):
""" Schedules a build for an Enterprise Registry. """
if self.shutting_down or not self.ready_components:
raise Return(False)
component = self.ready_components.pop()
- loop.call_soon(component.start_build, build_job)
+
+ yield From(component.start_build(build_job))
+
raise Return(True)
- def build_component_ready(self, build_component, loop):
+ @coroutine
+ def build_component_ready(self, build_component):
self.ready_components.add(build_component)
def shutdown(self):
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index 63f03a6b7..7126ec836 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -13,16 +13,28 @@ from urllib3.exceptions import ReadTimeoutError
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor
from buildman.component.buildcomponent import BuildComponent
+from buildman.jobutil.buildjob import BuildJob
from buildman.asyncutil import AsyncWrapper
+from util.morecollections import AttrDict
logger = logging.getLogger(__name__)
ETCD_BUILDER_PREFIX = 'building/'
-ETCD_EXPIRE_RESULT = 'expire'
+ETCD_REALM_PREFIX = 'realm/'
ETCD_DISABLE_TIMEOUT = 0
+class EtcdAction(object):
+ GET = 'get'
+ SET = 'set'
+ EXPIRE = 'expire'
+ UPDATE = 'update'
+ DELETE = 'delete'
+ CREATE = 'create'
+ COMPARE_AND_SWAP = 'compareAndSwap'
+ COMPARE_AND_DELETE = 'compareAndDelete'
+
class EphemeralBuilderManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
@@ -41,52 +53,82 @@ class EphemeralBuilderManager(BaseManager):
self._etcd_client = None
self._component_to_job = {}
+ self._job_uuid_to_component = {}
self._component_to_builder = {}
self._executor = None
- self._worker_watch_task = None
+ # Map of etcd keys being watched to the tasks watching them
+ self._watch_tasks = {}
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
- def _watch_builders(self):
- """ Watch the builders key for expirations.
- """
+ def _watch_etcd(self, etcd_key, change_callback, recursive=True):
+ watch_task_key = (etcd_key, recursive)
+ def callback_wrapper(changed_key_future):
+
+ if watch_task_key not in self._watch_tasks or self._watch_tasks[watch_task_key].done():
+ self._watch_etcd(etcd_key, change_callback)
+
+ if changed_key_future.cancelled():
+ # Due to lack of interest, tomorrow has been cancelled
+ return
+
+ try:
+ etcd_result = changed_key_future.result()
+ except ReadTimeoutError:
+ return
+
+ change_callback(etcd_result)
+
if not self._shutting_down:
- workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True,
- timeout=ETCD_DISABLE_TIMEOUT)
- workers_future.add_done_callback(self._handle_key_expiration)
- logger.debug('Scheduling watch task.')
- self._worker_watch_task = async(workers_future)
+ watch_future = self._etcd_client.watch(etcd_key, recursive=recursive,
+ timeout=ETCD_DISABLE_TIMEOUT)
+ watch_future.add_done_callback(callback_wrapper)
+ logger.debug('Scheduling watch of key: %s%s', etcd_key, '/*' if recursive else '')
+ self._watch_tasks[watch_task_key] = async(watch_future)
- def _handle_key_expiration(self, changed_key_future):
- """ Handle when a builder expires
- """
- if self._worker_watch_task is None or self._worker_watch_task.done():
- self._watch_builders()
-
- if changed_key_future.cancelled():
- # Due to lack of interest, tomorrow has been cancelled
- return
-
- try:
- etcd_result = changed_key_future.result()
- except ReadTimeoutError:
- return
-
- if etcd_result.action == ETCD_EXPIRE_RESULT:
+ def _handle_builder_expiration(self, etcd_result):
+ if etcd_result.action == EtcdAction.EXPIRE:
# Handle the expiration
logger.debug('Builder expired, clean up the old build node')
job_metadata = json.loads(etcd_result._prev_node.value)
async(self._clean_up_old_builder(etcd_result.key, job_metadata))
+ def _handle_realm_change(self, etcd_result):
+ if etcd_result.action == EtcdAction.SET:
+ # We must listen on the realm created by ourselves or another worker
+ realm_spec = json.loads(etcd_result.value)
+ component = self.register_component(realm_spec['realm'], BuildComponent,
+ token=realm_spec['token'])
+ build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
+ self._component_to_job[component] = build_job
+ self._component_to_builder[component] = realm_spec['builder_id']
+ self._job_uuid_to_component[build_job.job_details['build_uuid']] = component
+
+ elif etcd_result.action == EtcdAction.DELETE or etcd_result.action == EtcdAction.EXPIRE:
+ # We must stop listening for new connections on the specified realm, if we did not get the
+ # connection
+ realm_spec = json.loads(etcd_result._prev_node.value)
+ build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
+ component = self._job_uuid_to_component.pop(build_job.job_details['build_uuid'], None)
+ if component is not None:
+ # We were not the manager which the worker connected to, remove the bookkeeping for it
+ logger.debug('Unregistering unused component on realm: %s', realm_spec['realm'])
+ del self._component_to_job[component]
+ del self._component_to_builder[component]
+ self.unregister_component(component)
+
+ else:
+ logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key)
+
def initialize(self, manager_config):
logger.debug('Calling initialize')
self._manager_config = manager_config
executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor)
self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}),
- self.public_ip_address)
+ self.manager_hostname)
etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
etcd_port = self._manager_config.get('ETCD_PORT', 2379)
@@ -97,7 +139,8 @@ class EphemeralBuilderManager(BaseManager):
self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port),
executor=self._async_thread_executor)
- self._watch_builders()
+ self._watch_etcd(ETCD_BUILDER_PREFIX, self._handle_builder_expiration)
+ self._watch_etcd(ETCD_REALM_PREFIX, self._handle_realm_change)
def setup_time(self):
setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
@@ -108,17 +151,17 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Shutting down worker.')
self._shutting_down = True
- if self._worker_watch_task is not None:
- logger.debug('Canceling watch task.')
- self._worker_watch_task.cancel()
- self._worker_watch_task = None
+ for (etcd_key, _), task in self._watch_tasks.items():
+ if not task.done():
+ logger.debug('Canceling watch task for %s', etcd_key)
+ task.cancel()
if self._async_thread_executor is not None:
logger.debug('Shutting down thread pool executor.')
self._async_thread_executor.shutdown()
@coroutine
- def schedule(self, build_job, loop):
+ def schedule(self, build_job):
logger.debug('Calling schedule with job: %s', build_job.job_details['build_uuid'])
# Check if there are worker slots available by counting the number of jobs in etcd
@@ -154,8 +197,6 @@ class EphemeralBuilderManager(BaseManager):
try:
yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=False, ttl=ttl))
- component = self.register_component(realm, BuildComponent, token=token)
- self._component_to_job[component] = build_job
except KeyError:
# The job was already taken by someone else, we are probably a retry
logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?')
@@ -163,20 +204,38 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Starting builder with executor: %s', self._executor)
builder_id = yield From(self._executor.start_builder(realm, token))
- self._component_to_builder[component] = builder_id
# Store the builder in etcd associated with the job id
payload['builder_id'] = builder_id
yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl))
+ # Store the realm spec which will allow any manager to accept this builder when it connects
+ realm_spec = json.dumps({
+ 'realm': realm,
+ 'token': token,
+ 'builder_id': builder_id,
+ 'job_queue_item': build_job.job_item,
+ })
+ try:
+ yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False,
+ ttl=ttl))
+ except KeyError:
+ logger.error('Realm already exists in etcd. UUID collision or something is very very wrong.')
+ raise Return(False)
+
raise Return(True)
- def build_component_ready(self, build_component, loop):
+ @coroutine
+ def build_component_ready(self, build_component):
try:
+ # Clean up the bookkeeping for allowing any manager to take the job
job = self._component_to_job.pop(build_component)
+ del self._job_uuid_to_component[job.job_details['build_uuid']]
+ yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm)))
+
logger.debug('Sending build %s to newly ready component on realm %s',
job.job_details['build_uuid'], build_component.builder_realm)
- loop.call_soon(build_component.start_build, job)
+ yield From(build_component.start_build(job))
except KeyError:
logger.warning('Builder is asking for more work, but work already completed')
@@ -240,6 +299,12 @@ class EphemeralBuilderManager(BaseManager):
"""
return os.path.join(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid'])
+ @staticmethod
+ def _etcd_realm_key(realm):
+ """ Create a key which is used to track an incoming connection on a realm.
+ """
+ return os.path.join(ETCD_REALM_PREFIX, realm)
+
def num_workers(self):
""" Return the number of workers we're managing locally.
"""
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index 814b95a5b..c4b38366d 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -29,9 +29,9 @@ class ExecutorException(Exception):
class BuilderExecutor(object):
- def __init__(self, executor_config, manager_public_ip):
+ def __init__(self, executor_config, manager_hostname):
self.executor_config = executor_config
- self.manager_public_ip = manager_public_ip
+ self.manager_hostname = manager_hostname
""" Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
starting and stopping builders.
@@ -52,7 +52,7 @@ class BuilderExecutor(object):
def get_manager_websocket_url(self):
return 'ws://{0}:'
- def generate_cloud_config(self, realm, token, coreos_channel, manager_ip,
+ def generate_cloud_config(self, realm, token, coreos_channel, manager_hostname,
quay_username=None, quay_password=None, etcd_token=None):
if quay_username is None:
quay_username = self.executor_config['QUAY_USERNAME']
@@ -69,7 +69,7 @@ class BuilderExecutor(object):
quay_username=quay_username,
quay_password=quay_password,
etcd_token=etcd_token,
- manager_ip=manager_ip,
+ manager_hostname=manager_hostname,
coreos_channel=coreos_channel,
)
@@ -108,7 +108,7 @@ class EC2Executor(BuilderExecutor):
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
get_ami_callable = partial(self._get_coreos_ami, region, channel)
coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
- user_data = self.generate_cloud_config(realm, token, channel, self.manager_public_ip)
+ user_data = self.generate_cloud_config(realm, token, channel, self.manager_hostname)
logger.debug('Generated cloud config: %s', user_data)
@@ -155,10 +155,10 @@ class EC2Executor(BuilderExecutor):
class PopenExecutor(BuilderExecutor):
""" Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
"""
- def __init__(self, executor_config, manager_public_ip):
+ def __init__(self, executor_config, manager_hostname):
self._jobs = {}
- super(PopenExecutor, self).__init__(executor_config, manager_public_ip)
+ super(PopenExecutor, self).__init__(executor_config, manager_hostname)
""" Executor which uses Popen to fork a quay-builder process.
"""
diff --git a/buildman/server.py b/buildman/server.py
index ba9536c1e..e1175f718 100644
--- a/buildman/server.py
+++ b/buildman/server.py
@@ -37,7 +37,7 @@ class BuilderServer(object):
controller.
"""
def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass,
- lifecycle_manager_config, manager_public_ip):
+ lifecycle_manager_config, manager_hostname):
self._loop = None
self._current_status = 'starting'
self._current_components = []
@@ -53,7 +53,7 @@ class BuilderServer(object):
self._unregister_component,
self._job_heartbeat,
self._job_complete,
- manager_public_ip,
+ manager_hostname,
HEARTBEAT_PERIOD_SEC,
)
self._lifecycle_manager_config = lifecycle_manager_config
@@ -158,7 +158,7 @@ class BuilderServer(object):
self._queue.incomplete(job_item, restore_retry=False)
logger.debug('Build job found. Checking for an available worker.')
- scheduled = yield From(self._lifecycle_manager.schedule(build_job, self._loop))
+ scheduled = yield From(self._lifecycle_manager.schedule(build_job))
if scheduled:
self._job_count = self._job_count + 1
logger.debug('Build job scheduled. Running: %s', self._job_count)
@@ -168,7 +168,6 @@ class BuilderServer(object):
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
-
@trollius.coroutine
def _initialize(self, loop, host, ssl=None):
self._loop = loop
diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml
index 3bebde670..d6ae3aeca 100644
--- a/buildman/templates/cloudconfig.yaml
+++ b/buildman/templates/cloudconfig.yaml
@@ -6,7 +6,7 @@ write_files:
content: |
REALM={{ realm }}
TOKEN={{ token }}
- ENDPOINT=wss://buildman.quay.io:8787
+ SERVER=wss://{{ manager_hostname }}
coreos:
update:
@@ -31,7 +31,6 @@ coreos:
[Service]
TimeoutStartSec=600
TimeoutStopSec=2000
- ExecStartPre=/bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0"
ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io
ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest
ExecStop=/usr/bin/docker stop quay-builder
diff --git a/data/queue.py b/data/queue.py
index 5c720eed2..865511519 100644
--- a/data/queue.py
+++ b/data/queue.py
@@ -78,7 +78,8 @@ class WorkQueue(object):
def get(self, processing_time=300):
"""
Get an available item and mark it as unavailable for the default of five
- minutes.
+ minutes. The result of this method must always be composed of simple
+ Python objects which are JSON serializable for network portability reasons.
"""
now = datetime.utcnow()
diff --git a/test/test_buildman.py b/test/test_buildman.py
index 0d0b6ced2..f10ba473e 100644
--- a/test/test_buildman.py
+++ b/test/test_buildman.py
@@ -4,19 +4,20 @@ import os.path
import time
import json
-from trollius import coroutine, get_event_loop, From, Future, sleep
+from trollius import coroutine, get_event_loop, From, Future, sleep, Return
from mock import Mock
from threading import Event
from urllib3.exceptions import ReadTimeoutError
from buildman.manager.executor import BuilderExecutor
from buildman.manager.ephemeral import (EphemeralBuilderManager, ETCD_BUILDER_PREFIX,
- ETCD_EXPIRE_RESULT)
+ ETCD_REALM_PREFIX, EtcdAction)
from buildman.server import BuildJobResult
from buildman.component.buildcomponent import BuildComponent
BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead'
+REALM_ID = '1234-realm'
def async_test(f):
@@ -43,17 +44,17 @@ class TestEphemeral(unittest.TestCase):
self.etcd_client_mock.watch = Mock(side_effect=hang_until_event)
return self.etcd_client_mock
- def _create_mock_executor(self, *args, **kwargs):
- def create_completed_future(result=None):
- def inner(*args, **kwargs):
- new_future = Future()
- new_future.set_result(result)
- return new_future
- return inner
+ def _create_completed_future(self, result=None):
+ def inner(*args, **kwargs):
+ new_future = Future()
+ new_future.set_result(result)
+ return new_future
+ return inner
+ def _create_mock_executor(self, *args, **kwargs):
self.test_executor = Mock(spec=BuilderExecutor)
- self.test_executor.start_builder = Mock(side_effect=create_completed_future('123'))
- self.test_executor.stop_builder = Mock(side_effect=create_completed_future())
+ self.test_executor.start_builder = Mock(side_effect=self._create_completed_future('123'))
+ self.test_executor.stop_builder = Mock(side_effect=self._create_completed_future())
return self.test_executor
def _create_build_job(self):
@@ -61,6 +62,10 @@ class TestEphemeral(unittest.TestCase):
mock_job.job_details = {
'build_uuid': BUILD_UUID,
}
+ mock_job.job_item = {
+ 'body': json.dumps(mock_job.job_details),
+ 'id': 1,
+ }
return mock_job
def setUp(self):
@@ -71,13 +76,13 @@ class TestEphemeral(unittest.TestCase):
self.etcd_wait_event.clear()
self.register_component_callback = Mock()
- self.uniregister_component_callback = Mock()
+ self.unregister_component_callback = Mock()
self.job_heartbeat_callback = Mock()
self.job_complete_callback = Mock()
self.manager = EphemeralBuilderManager(
self.register_component_callback,
- self.uniregister_component_callback,
+ self.unregister_component_callback,
self.job_heartbeat_callback,
self.job_complete_callback,
'127.0.0.1',
@@ -97,15 +102,19 @@ class TestEphemeral(unittest.TestCase):
del EphemeralBuilderManager._executors['test']
EphemeralBuilderManager._etcd_client_klass = self.old_etcd_client_klass
- @async_test
- def test_schedule_and_complete(self):
+ @coroutine
+ def _setup_job_for_managers(self):
+ # Test that we are watching the realm location before anything else happens
+ self.etcd_client_mock.watch.assert_any_call(ETCD_REALM_PREFIX, recursive=True, timeout=0)
+
self.etcd_client_mock.read = Mock(side_effect=KeyError)
- test_component = BuildComponent(None)
+ test_component = Mock(spec=BuildComponent)
+ test_component.builder_realm = REALM_ID
+ test_component.start_build = Mock(side_effect=self._create_completed_future())
self.register_component_callback.return_value = test_component
# Ask for a builder to be scheduled
- loop = get_event_loop()
- is_scheduled = yield From(self.manager.schedule(self.mock_job, loop))
+ is_scheduled = yield From(self.manager.schedule(self.mock_job))
self.assertTrue(is_scheduled)
@@ -114,29 +123,76 @@ class TestEphemeral(unittest.TestCase):
self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
self.assertEqual(self.etcd_client_mock.write.call_args_list[1][0][0], self.mock_job_key)
+ # Right now the job is not registered with any managers because etcd has not accepted the job
+ self.assertEqual(self.register_component_callback.call_count, 0)
+
+ realm_created = Mock(spec=etcd.EtcdResult)
+ realm_created.action = EtcdAction.SET
+ realm_created.key = os.path.join(ETCD_REALM_PREFIX, REALM_ID)
+ realm_created.value = json.dumps({
+ 'realm': REALM_ID,
+ 'token': 'beef',
+ 'builder_id': '123',
+ 'job_queue_item': self.mock_job.job_item,
+ })
+
+ self.manager._handle_realm_change(realm_created)
+
self.assertEqual(self.register_component_callback.call_count, 1)
+ raise Return(test_component)
+
+ @async_test
+ def test_schedule_and_complete(self):
+ # Test that a job is properly registered with all of the managers
+ test_component = yield From(self._setup_job_for_managers())
+
+ # Take the job ourselves
+ yield From(self.manager.build_component_ready(test_component))
+
+ self.etcd_client_mock.delete.assert_called_once_with(os.path.join(ETCD_REALM_PREFIX, REALM_ID))
+ self.etcd_client_mock.delete.reset_mock()
+
+ # Finish the job
yield From(self.manager.job_completed(self.mock_job, BuildJobResult.COMPLETE, test_component))
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key)
+ @async_test
+ def test_another_manager_takes_job(self):
+ # Prepare a job to be taken by another manager
+ test_component = yield From(self._setup_job_for_managers())
+
+ realm_deleted = Mock(spec=etcd.EtcdResult)
+ realm_deleted.action = EtcdAction.DELETE
+ realm_deleted.key = os.path.join(ETCD_REALM_PREFIX, REALM_ID)
+
+ realm_deleted._prev_node = Mock(spec=etcd.EtcdResult)
+ realm_deleted._prev_node.value = json.dumps({
+ 'realm': REALM_ID,
+ 'token': 'beef',
+ 'builder_id': '123',
+ 'job_queue_item': self.mock_job.job_item,
+ })
+
+ self.manager._handle_realm_change(realm_deleted)
+
+ self.unregister_component_callback.assert_called_once_with(test_component)
+
@async_test
def test_expiring_worker(self):
# Test that we are watching before anything else happens
- self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True,
- timeout=0)
+ self.etcd_client_mock.watch.assert_any_call(ETCD_BUILDER_PREFIX, recursive=True, timeout=0)
# Send a signal to the callback that a worker has expired
expired_result = Mock(spec=etcd.EtcdResult)
- expired_result.action = ETCD_EXPIRE_RESULT
+ expired_result.action = EtcdAction.EXPIRE
expired_result.key = self.mock_job_key
expired_result._prev_node = Mock(spec=etcd.EtcdResult)
expired_result._prev_node.value = json.dumps({'builder_id': '1234'})
- expired_future = Future()
- expired_future.set_result(expired_result)
- self.manager._handle_key_expiration(expired_future)
+ self.manager._handle_builder_expiration(expired_result)
yield From(sleep(.01))
@@ -151,10 +207,8 @@ class TestEphemeral(unittest.TestCase):
set_result = Mock(spec=etcd.EtcdResult)
set_result.action = 'set'
set_result.key = self.mock_job_key
- set_future = Future()
- set_future.set_result(set_result)
- self.manager._handle_key_expiration(set_future)
+ self.manager._handle_builder_expiration(set_result)
yield From(sleep(.01))
@@ -179,15 +233,3 @@ class TestEphemeral(unittest.TestCase):
self.job_heartbeat_callback.assert_called_once_with(self.mock_job)
self.assertEqual(self.etcd_client_mock.write.call_count, 1)
self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
-
- @async_test
- def test_etcd_read_timeout(self):
- # Send a signal to the callback that a worker key has been changed
- read_timeout_future = Future()
- read_timeout_future.set_exception(ReadTimeoutError(None, None, None))
-
- self.manager._handle_key_expiration(read_timeout_future)
-
- yield From(sleep(.01))
-
- self.assertEquals(self.test_executor.stop_builder.call_count, 0)
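The realm spec written during schedule() is the coordination contract between managers: whichever manager receives the builder's websocket connection claims the job and deletes the key. A minimal sketch of that record's shape, not taken from the patch itself; the helper name and example values are invented for illustration, and the field names come from the hunks above.

import json

def make_realm_spec(realm, token, builder_id, job_queue_item):
    # The record round-trips through etcd as JSON, which is why WorkQueue.get()
    # is documented above as returning only JSON-serializable python objects.
    return json.dumps({
        'realm': realm,
        'token': token,
        'builder_id': builder_id,
        'job_queue_item': job_queue_item,
    })

# Example: written under the realm key with a TTL at schedule time; the manager
# that actually receives the builder's connection deletes the key again.
print(make_realm_spec('1234-realm', 'beef', 'i-0123456789', {'id': 1, 'body': '{}'}))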
From a9839021af9554b86b7ab46ff7665ad7b9ce71ad Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Wed, 31 Dec 2014 11:46:02 -0500
Subject: [PATCH 024/127] When the etcd key tracking realms is first created,
the action is create, not set.
---
buildman/manager/ephemeral.py | 2 +-
test/test_buildman.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
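The point of this fix in isolation: a watch on a brand-new key reports the action 'create', so a handler that only recognizes 'set' never registers the realm. A standalone sketch of the dispatch, assuming EtcdAction is a plain holder of lowercase action strings, as the bare 'set' string used in the test below suggests:

class EtcdAction(object):
    CREATE = 'create'
    SET = 'set'
    DELETE = 'delete'
    EXPIRE = 'expire'

def realm_event_disposition(action):
    """ What a manager should do when a key under the realm prefix changes. """
    if action == EtcdAction.CREATE:
        return 'register'     # a new realm was announced; start listening for its builder
    if action in (EtcdAction.DELETE, EtcdAction.EXPIRE):
        return 'unregister'   # the realm was claimed by another manager or timed out
    return 'warn'             # anything else (including a plain set) is unexpected

assert realm_event_disposition(EtcdAction.CREATE) == 'register'
assert realm_event_disposition(EtcdAction.SET) == 'warn'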
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index 7126ec836..9ab10fd15 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -96,7 +96,7 @@ class EphemeralBuilderManager(BaseManager):
async(self._clean_up_old_builder(etcd_result.key, job_metadata))
def _handle_realm_change(self, etcd_result):
- if etcd_result.action == EtcdAction.SET:
+ if etcd_result.action == EtcdAction.CREATE:
# We must listen on the realm created by ourselves or another worker
realm_spec = json.loads(etcd_result.value)
component = self.register_component(realm_spec['realm'], BuildComponent,
diff --git a/test/test_buildman.py b/test/test_buildman.py
index f10ba473e..a9029c22a 100644
--- a/test/test_buildman.py
+++ b/test/test_buildman.py
@@ -127,7 +127,7 @@ class TestEphemeral(unittest.TestCase):
self.assertEqual(self.register_component_callback.call_count, 0)
realm_created = Mock(spec=etcd.EtcdResult)
- realm_created.action = EtcdAction.SET
+ realm_created.action = EtcdAction.CREATE
realm_created.key = os.path.join(ETCD_REALM_PREFIX, REALM_ID)
realm_created.value = json.dumps({
'realm': REALM_ID,
From b33ee1a474010d473d5900dc7d6ff86274d21fce Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Mon, 5 Jan 2015 11:21:36 -0500
Subject: [PATCH 025/127] Register existing builders to watch their
expirations.
---
buildman/manager/ephemeral.py | 42 ++++++++++++++++++++---------------
test/test_buildman.py | 2 --
2 files changed, 24 insertions(+), 20 deletions(-)
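The startup pass added here is essentially one recursive read of the realm prefix followed by registration of every leaf node. A rough synchronous sketch of the extraction step, with the AsyncWrapper/coroutine plumbing omitted; the fake result classes are invented purely so the example runs on its own:

import json

def realms_from_read_result(read_result):
    """ Pull realm specs out of a recursive etcd read; directory nodes are skipped
        because only leaf nodes carry a JSON-encoded spec. """
    return [json.loads(node.value) for node in read_result.children if not node.dir]

class FakeNode(object):
    def __init__(self, value, is_dir=False):
        self.value, self.dir = value, is_dir

class FakeResult(object):
    def __init__(self, children):
        self.children = children

specs = realms_from_read_result(FakeResult([
    FakeNode(json.dumps({'realm': '1234-realm', 'token': 'beef',
                         'builder_id': '123', 'job_queue_item': {'id': 1}})),
    FakeNode(None, is_dir=True),
]))
assert specs[0]['realm'] == '1234-realm'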
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index 9ab10fd15..701465c3f 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -93,18 +93,16 @@ class EphemeralBuilderManager(BaseManager):
# Handle the expiration
logger.debug('Builder expired, clean up the old build node')
job_metadata = json.loads(etcd_result._prev_node.value)
- async(self._clean_up_old_builder(etcd_result.key, job_metadata))
+
+ if 'builder_id' in job_metadata:
+ logger.info('Terminating expired build node.')
+ async(self._executor.stop_builder(job_metadata['builder_id']))
def _handle_realm_change(self, etcd_result):
if etcd_result.action == EtcdAction.CREATE:
# We must listen on the realm created by ourselves or another worker
realm_spec = json.loads(etcd_result.value)
- component = self.register_component(realm_spec['realm'], BuildComponent,
- token=realm_spec['token'])
- build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
- self._component_to_job[component] = build_job
- self._component_to_builder[component] = realm_spec['builder_id']
- self._job_uuid_to_component[build_job.job_details['build_uuid']] = component
+ self._register_realm(realm_spec)
elif etcd_result.action == EtcdAction.DELETE or etcd_result.action == EtcdAction.EXPIRE:
# We must stop listening for new connections on the specified realm, if we did not get the
@@ -122,6 +120,22 @@ class EphemeralBuilderManager(BaseManager):
else:
logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key)
+ def _register_realm(self, realm_spec):
+ logger.debug('Registering realm with manager: %s', realm_spec['realm'])
+ component = self.register_component(realm_spec['realm'], BuildComponent,
+ token=realm_spec['token'])
+ build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
+ self._component_to_job[component] = build_job
+ self._component_to_builder[component] = realm_spec['builder_id']
+ self._job_uuid_to_component[build_job.job_details['build_uuid']] = component
+
+ @coroutine
+ def _register_existing_realms(self):
+ all_realms = yield From(self._etcd_client.read(ETCD_REALM_PREFIX, recursive=True))
+ for realm in all_realms.children:
+ if not realm.dir:
+ self._register_realm(json.loads(realm.value))
+
def initialize(self, manager_config):
logger.debug('Calling initialize')
self._manager_config = manager_config
@@ -142,6 +156,9 @@ class EphemeralBuilderManager(BaseManager):
self._watch_etcd(ETCD_BUILDER_PREFIX, self._handle_builder_expiration)
self._watch_etcd(ETCD_REALM_PREFIX, self._handle_realm_change)
+ # Load components for all realms currently known to the cluster
+ async(self._register_existing_realms())
+
def setup_time(self):
setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
logger.debug('Returning setup_time: %s', setup_time)
@@ -282,17 +299,6 @@ class EphemeralBuilderManager(BaseManager):
self.job_heartbeat_callback(build_job)
- @coroutine
- def _clean_up_old_builder(self, job_key, job_payload):
- """ Terminate an old builders once the expiration date has passed.
- """
- logger.debug('Cleaning up the old builder for job: %s', job_key)
- if 'builder_id' in job_payload:
- logger.info('Terminating expired build node.')
- yield From(self._executor.stop_builder(job_payload['builder_id']))
-
- yield From(self._etcd_client.delete(job_key))
-
@staticmethod
def _etcd_job_key(build_job):
""" Create a key which is used to track a job in etcd.
diff --git a/test/test_buildman.py b/test/test_buildman.py
index a9029c22a..89658f65d 100644
--- a/test/test_buildman.py
+++ b/test/test_buildman.py
@@ -199,8 +199,6 @@ class TestEphemeral(unittest.TestCase):
self.test_executor.stop_builder.assert_called_once_with('1234')
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
- self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key)
-
@async_test
def test_change_worker(self):
# Send a signal to the callback that a worker key has been changed
From 320ae63ccdbc96fa68ee2e4d8073be039027e861 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Mon, 5 Jan 2015 12:23:54 -0500
Subject: [PATCH 026/127] Handle the case where there are no realms registered.
---
buildman/manager/ephemeral.py | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
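The guarded read amounts to "treat a missing prefix as zero realms". A small sketch of the same shape; it assumes, as the except clause in the hunk below implies, that the wrapped etcd client surfaces a missing key as a KeyError:

import json

def load_realm_specs(read_prefix):
    """ read_prefix is any callable performing the recursive read of the realm prefix. """
    try:
        result = read_prefix()
    except KeyError:
        return []    # a fresh cluster: nothing scheduled yet, so nothing to re-register
    return [json.loads(node.value) for node in result.children if not node.dir]

def missing_prefix():
    raise KeyError('realm prefix does not exist yet')

assert load_realm_specs(missing_prefix) == []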
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index 701465c3f..aa9bb7193 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -131,10 +131,14 @@ class EphemeralBuilderManager(BaseManager):
@coroutine
def _register_existing_realms(self):
- all_realms = yield From(self._etcd_client.read(ETCD_REALM_PREFIX, recursive=True))
- for realm in all_realms.children:
- if not realm.dir:
- self._register_realm(json.loads(realm.value))
+ try:
+ all_realms = yield From(self._etcd_client.read(ETCD_REALM_PREFIX, recursive=True))
+ for realm in all_realms.children:
+ if not realm.dir:
+ self._register_realm(json.loads(realm.value))
+ except KeyError:
+ # no realms have been registered yet
+ pass
def initialize(self, manager_config):
logger.debug('Calling initialize')
From f58b09a0647f28ecda0b4753ce553d22539338c4 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Mon, 5 Jan 2015 13:08:25 -0500
Subject: [PATCH 027/127] Remove the loop argument from the call to
build_component_ready.
---
buildman/component/buildcomponent.py | 3 +--
buildman/manager/ephemeral.py | 1 -
2 files changed, 1 insertion(+), 3 deletions(-)
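With build_component_ready now a coroutine, the caller simply yields on it instead of scheduling it through an explicit event-loop handle. A self-contained trollius sketch of the new calling convention; the function names are illustrative rather than the real classes:

import trollius
from trollius import coroutine, From

@coroutine
def component_ready(start_build):
    # New style: await the coroutine directly instead of loop.call_soon(start_build, job).
    yield From(start_build())

@coroutine
def fake_start_build():
    yield From(trollius.sleep(0))

loop = trollius.get_event_loop()
loop.run_until_complete(component_ready(fake_start_build))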
diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py
index 42e6696f2..c31d7aafe 100644
--- a/buildman/component/buildcomponent.py
+++ b/buildman/component/buildcomponent.py
@@ -314,8 +314,7 @@ class BuildComponent(BaseComponent):
@trollius.coroutine
def _set_status(self, phase):
if phase == ComponentStatus.RUNNING:
- loop = trollius.get_event_loop()
- yield trollius.From(self.parent_manager.build_component_ready(self, loop))
+ yield trollius.From(self.parent_manager.build_component_ready(self))
self._component_status = phase
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index aa9bb7193..c7a084888 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -66,7 +66,6 @@ class EphemeralBuilderManager(BaseManager):
def _watch_etcd(self, etcd_key, change_callback, recursive=True):
watch_task_key = (etcd_key, recursive)
def callback_wrapper(changed_key_future):
-
if watch_task_key not in self._watch_tasks or self._watch_tasks[watch_task_key].done():
self._watch_etcd(etcd_key, change_callback)
From 803796271651d590c697235c713c6b01bea65bd1 Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Mon, 5 Jan 2015 14:44:54 -0500
Subject: [PATCH 028/127] Change the severity of a log message which is
actually expected in the happy case.
---
buildman/manager/ephemeral.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index c7a084888..07c773a59 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -257,7 +257,7 @@ class EphemeralBuilderManager(BaseManager):
job.job_details['build_uuid'], build_component.builder_realm)
yield From(build_component.start_build(job))
except KeyError:
- logger.warning('Builder is asking for more work, but work already completed')
+ logger.debug('Builder is asking for more work, but work already completed')
def build_component_disposed(self, build_component, timed_out):
logger.debug('Calling build_component_disposed.')
From dd7664328c3e3f673fef8521140769f6d8c720df Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Mon, 5 Jan 2015 15:09:03 -0500
Subject: [PATCH 029/127] Make the build manager ports configurable.
---
buildman/builder.py | 12 +++++++++++-
buildman/server.py | 17 +++++++----------
2 files changed, 18 insertions(+), 11 deletions(-)
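The port lookup mirrors the precedence already used for BUILDMAN_HOSTNAME: environment variable first, then app config, then a built-in default. A tiny sketch of that resolution; the plain dict stands in for app.config:

import os

def resolve_port(name, config, default):
    """ Prefer the process environment, then app config, then the default. """
    return int(os.environ.get(name, config.get(name, default)))

# With neither the environment variable nor a config entry set, the defaults apply:
print(resolve_port('BUILDMAN_WEBSOCKET_PORT', {}, 8787))
print(resolve_port('BUILDMAN_CONTROLLER_PORT', {'BUILDMAN_CONTROLLER_PORT': 9000}, 8686))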
diff --git a/buildman/builder.py b/buildman/builder.py
index e1c7a852b..467ac2f6d 100644
--- a/buildman/builder.py
+++ b/buildman/builder.py
@@ -20,6 +20,9 @@ BUILD_MANAGERS = {
EXTERNALLY_MANAGED = 'external'
+DEFAULT_WEBSOCKET_PORT = 8787
+DEFAULT_CONTROLLER_PORT = 8686
+
def run_build_manager():
if not features.BUILD_SUPPORT:
logger.debug('Building is disabled. Please enable the feature flag')
@@ -44,6 +47,13 @@ def run_build_manager():
manager_hostname = os.environ.get('BUILDMAN_HOSTNAME',
app.config.get('BUILDMAN_HOSTNAME',
app.config['SERVER_HOSTNAME']))
+ websocket_port = int(os.environ.get('BUILDMAN_WEBSOCKET_PORT',
+ app.config.get('BUILDMAN_WEBSOCKET_PORT',
+ DEFAULT_WEBSOCKET_PORT)))
+ controller_port = int(os.environ.get('BUILDMAN_CONTROLLER_PORT',
+ app.config.get('BUILDMAN_CONTROLLER_PORT',
+ DEFAULT_CONTROLLER_PORT)))
+
logger.debug('Will pass buildman hostname %s to builders for websocket connection',
manager_hostname)
@@ -57,7 +67,7 @@ def run_build_manager():
server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs,
user_files, manager_klass, build_manager_config[1], manager_hostname)
- server.run('0.0.0.0', ssl=ssl_context)
+ server.run('0.0.0.0', websocket_port, controller_port, ssl=ssl_context)
if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
diff --git a/buildman/server.py b/buildman/server.py
index e1175f718..369f90313 100644
--- a/buildman/server.py
+++ b/buildman/server.py
@@ -21,9 +21,6 @@ TIMEOUT_PERIOD_MINUTES = 20
JOB_TIMEOUT_SECONDS = 300
MINIMUM_JOB_EXTENSION = timedelta(minutes=2)
-WEBSOCKET_PORT = 8787
-CONTROLLER_PORT = 8686
-
HEARTBEAT_PERIOD_SEC = 30
class BuildJobResult(object):
@@ -73,16 +70,16 @@ class BuilderServer(object):
self._controller_app = controller_app
- def run(self, host, ssl=None):
+ def run(self, host, websocket_port, controller_port, ssl=None):
logger.debug('Initializing the lifecycle manager')
self._lifecycle_manager.initialize(self._lifecycle_manager_config)
logger.debug('Initializing all members of the event loop')
loop = trollius.get_event_loop()
- trollius.Task(self._initialize(loop, host, ssl))
+ trollius.Task(self._initialize(loop, host, websocket_port, controller_port, ssl))
- logger.debug('Starting server on port %s, with controller on port %s', WEBSOCKET_PORT,
- CONTROLLER_PORT)
+ logger.debug('Starting server on port %s, with controller on port %s', websocket_port,
+ controller_port)
try:
loop.run_forever()
except KeyboardInterrupt:
@@ -169,7 +166,7 @@ class BuilderServer(object):
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
@trollius.coroutine
- def _initialize(self, loop, host, ssl=None):
+ def _initialize(self, loop, host, websocket_port, controller_port, ssl=None):
self._loop = loop
# Create the WAMP server.
@@ -177,8 +174,8 @@ class BuilderServer(object):
transport_factory.setProtocolOptions(failByDrop=True)
# Initialize the controller server and the WAMP server
- create_wsgi_server(self._controller_app, loop=loop, host=host, port=CONTROLLER_PORT, ssl=ssl)
- yield From(loop.create_server(transport_factory, host, WEBSOCKET_PORT, ssl=ssl))
+ create_wsgi_server(self._controller_app, loop=loop, host=host, port=controller_port, ssl=ssl)
+ yield From(loop.create_server(transport_factory, host, websocket_port, ssl=ssl))
# Initialize the work queue checker.
yield From(self._work_checker())
From fc757fecada1c2db0b413bfcd4e87d1e866ee8ba Mon Sep 17 00:00:00 2001
From: Jake Moshenko
Date: Mon, 5 Jan 2015 15:35:14 -0500
Subject: [PATCH 030/127] Tag the EC2 instances with the build uuid.
---
buildman/manager/ephemeral.py | 5 +++--
buildman/manager/executor.py | 7 ++++---
2 files changed, 7 insertions(+), 5 deletions(-)
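The new tag is pure bookkeeping, but it lets an operator trace a leaked or expired EC2 instance back to the build that launched it. A sketch of the tag payload, mirroring the literal dict in the hunk below; the helper function itself is invented for illustration:

def builder_instance_tags(realm, token, build_uuid):
    """ Tags attached to the ephemeral builder instance at launch time. """
    return {
        'Name': 'Quay Ephemeral Builder',
        'Realm': realm,
        'Token': token,
        'BuildUUID': build_uuid,
    }

print(builder_instance_tags('1234-realm', 'beef', 'deadbeef-dead-beef-dead-deadbeefdead'))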
diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py
index 07c773a59..6abd10a5c 100644
--- a/buildman/manager/ephemeral.py
+++ b/buildman/manager/ephemeral.py
@@ -182,7 +182,8 @@ class EphemeralBuilderManager(BaseManager):
@coroutine
def schedule(self, build_job):
- logger.debug('Calling schedule with job: %s', build_job.job_details['build_uuid'])
+ build_uuid = build_job.job_details['build_uuid']
+ logger.debug('Calling schedule with job: %s', build_uuid)
# Check if there are worker slots available by counting the number of jobs in etcd
allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
@@ -223,7 +224,7 @@ class EphemeralBuilderManager(BaseManager):
raise Return(False)
logger.debug('Starting builder with executor: %s', self._executor)
- builder_id = yield From(self._executor.start_builder(realm, token))
+ builder_id = yield From(self._executor.start_builder(realm, token, build_uuid))
# Store the builder in etcd associated with the job id
payload['builder_id'] = builder_id
diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py
index c4b38366d..c122a89fc 100644
--- a/buildman/manager/executor.py
+++ b/buildman/manager/executor.py
@@ -37,7 +37,7 @@ class BuilderExecutor(object):
starting and stopping builders.
"""
@coroutine
- def start_builder(self, realm, token):
+ def start_builder(self, realm, token, build_uuid):
""" Create a builder with the specified config. Returns a unique id which can be used to manage
the builder.
"""
@@ -103,7 +103,7 @@ class EC2Executor(BuilderExecutor):
return stack_amis[ec2_region]
@coroutine
- def start_builder(self, realm, token):
+ def start_builder(self, realm, token, build_uuid):
region = self.executor_config['EC2_REGION']
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
get_ami_callable = partial(self._get_coreos_ami, region, channel)
@@ -141,6 +141,7 @@ class EC2Executor(BuilderExecutor):
'Name': 'Quay Ephemeral Builder',
'Realm': realm,
'Token': token,
+ 'BuildUUID': build_uuid,
}))
raise Return(launched.id)
@@ -163,7 +164,7 @@ class PopenExecutor(BuilderExecutor):
""" Executor which uses Popen to fork a quay-builder process.
"""
@coroutine
- def start_builder(self, realm, token):
+ def start_builder(self, realm, token, build_uuid):
# Now start a machine for this job, adding the machine id to the etcd information
logger.debug('Forking process for build')
import subprocess
From f268a5d66114724c21bbe0046fb86233431f0373 Mon Sep 17 00:00:00 2001
From: Jimmy Zelinskie
Date: Tue, 13 Jan 2015 11:02:08 -0500
Subject: [PATCH 031/127] Fix twitter-view once and for all!
One image URL was broken and it was accidentally using the avatar
directive, so the class has been changed to 'twitter-avatar' and made
explicit.
---
static/css/quay.css | 2 +-
static/directives/twitter-view.html | 2 +-
static/partials/landing-normal.html | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/static/css/quay.css b/static/css/quay.css
index 4df625503..08438df08 100644
--- a/static/css/quay.css
+++ b/static/css/quay.css
@@ -1667,7 +1667,7 @@ i.toggle-icon:hover {
padding-left: 70px;
}
-.landing-page .twitter-tweet .avatar img {
+.landing-page .twitter-tweet .twitter-avatar img {
border-radius: 4px;
border: 2px solid rgb(70, 70, 70);
width: 50px;
diff --git a/static/directives/twitter-view.html b/static/directives/twitter-view.html
index e78776ea1..68e30f366 100644
--- a/static/directives/twitter-view.html
+++ b/static/directives/twitter-view.html
@@ -4,7 +4,7 @@
-
+
{{ authorName }} (@{{authorUser}})
{{ messageDate }}
diff --git a/static/partials/landing-normal.html b/static/partials/landing-normal.html
index 0a0dedc3a..274b56ac0 100644
--- a/static/partials/landing-normal.html
+++ b/static/partials/landing-normal.html
@@ -207,7 +207,7 @@
-