From 2d7e84475343192ca2876c39fbd747e376140953 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 16 Dec 2014 13:41:30 -0500 Subject: [PATCH 001/127] First implementation of ephemeral build lifecycle manager. --- buildman/builder.py | 9 +- buildman/component/buildcomponent.py | 22 +-- buildman/jobutil/buildjob.py | 32 ++--- buildman/manager/basemanager.py | 10 +- buildman/manager/enterprise.py | 26 ++-- buildman/manager/ephemeral.py | 145 +++++++++++++++++++ buildman/manager/executor.py | 204 +++++++++++++++++++++++++++ buildman/server.py | 21 +-- buildman/templates/cloudconfig.yaml | 38 +++++ requirements-nover.txt | 2 + 10 files changed, 453 insertions(+), 56 deletions(-) create mode 100644 buildman/manager/ephemeral.py create mode 100644 buildman/manager/executor.py create mode 100644 buildman/templates/cloudconfig.yaml diff --git a/buildman/builder.py b/buildman/builder.py index 3e14db3eb..df485f142 100644 --- a/buildman/builder.py +++ b/buildman/builder.py @@ -6,6 +6,7 @@ import time from app import app, userfiles as user_files, build_logs, dockerfile_build_queue from buildman.manager.enterprise import EnterpriseManager +from buildman.manager.ephemeral import EphemeralBuilderManager from buildman.server import BuilderServer from trollius import SSLContext @@ -13,7 +14,8 @@ from trollius import SSLContext logger = logging.getLogger(__name__) BUILD_MANAGERS = { - 'enterprise': EnterpriseManager + 'enterprise': EnterpriseManager, + 'ephemeral': EphemeralBuilderManager, } EXTERNALLY_MANAGED = 'external' @@ -39,6 +41,9 @@ def run_build_manager(): if manager_klass is None: return + public_ip = os.environ.get('PUBLIC_IP', '127.0.0.1') + logger.debug('Will pass public IP address %s to builders for websocket connection', public_ip) + logger.debug('Starting build manager with lifecycle "%s"', build_manager_config[0]) ssl_context = None if os.environ.get('SSL_CONFIG'): @@ -48,7 +53,7 @@ def run_build_manager(): os.environ.get('SSL_CONFIG') + '/ssl.key') server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs, - user_files, manager_klass) + user_files, manager_klass, build_manager_config[1], public_ip) server.run('0.0.0.0', ssl=ssl_context) if __name__ == '__main__': diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index d518d3453..05d342628 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -39,7 +39,7 @@ class BuildComponent(BaseComponent): self.builder_realm = realm self.parent_manager = None - self.server_hostname = None + self.registry_hostname = None self._component_status = ComponentStatus.JOINING self._last_heartbeat = None @@ -68,13 +68,13 @@ class BuildComponent(BaseComponent): def start_build(self, build_job): """ Starts a build. """ self._current_job = build_job - self._build_status = StatusHandler(self.build_logs, build_job.repo_build()) + self._build_status = StatusHandler(self.build_logs, build_job.repo_build) self._image_info = {} self._set_status(ComponentStatus.BUILDING) # Retrieve the job's buildpack. 
- buildpack_url = self.user_files.get_file_url(build_job.repo_build().resource_key, + buildpack_url = self.user_files.get_file_url(build_job.repo_build.resource_key, requires_cors=False) logger.debug('Retreiving build package: %s', buildpack_url) @@ -89,7 +89,7 @@ class BuildComponent(BaseComponent): parsed_dockerfile = None logger.debug('Parsing dockerfile') - build_config = build_job.build_config() + build_config = build_job.build_config try: parsed_dockerfile = buildpack.parse_dockerfile(build_config.get('build_subdir')) except BuildPackageException as bpe: @@ -116,7 +116,7 @@ class BuildComponent(BaseComponent): base_image_information['password'] = build_config['pull_credentials'].get('password', '') # Retrieve the repository's fully qualified name. - repo = build_job.repo_build().repository + repo = build_job.repo_build.repository repository_name = repo.namespace_user.username + '/' + repo.name # Parse the build queue item into build arguments. @@ -136,9 +136,9 @@ class BuildComponent(BaseComponent): 'build_package': buildpack_url, 'sub_directory': build_config.get('build_subdir', ''), 'repository': repository_name, - 'registry': self.server_hostname, - 'pull_token': build_job.repo_build().access_token.code, - 'push_token': build_job.repo_build().access_token.code, + 'registry': self.registry_hostname, + 'pull_token': build_job.repo_build.access_token.code, + 'push_token': build_job.repo_build.access_token.code, 'tag_names': build_config.get('docker_tags', ['latest']), 'base_image': base_image_information, 'cached_tag': build_job.determine_cached_tag() or '' @@ -244,7 +244,7 @@ class BuildComponent(BaseComponent): 'internal_error': exception.message if exception else None }) - build_id = self._current_job.repo_build().uuid + build_id = self._current_job.repo_build.uuid logger.warning('Build %s failed with message: %s', build_id, error_message) # Mark that the build has finished (in an error state) @@ -305,6 +305,10 @@ class BuildComponent(BaseComponent): return True def _set_status(self, phase): + if phase == ComponentStatus.RUNNING: + loop = trollius.get_event_loop() + self.parent_manager.build_component_ready(self, loop) + self._component_status = phase def _on_heartbeat(self): diff --git a/buildman/jobutil/buildjob.py b/buildman/jobutil/buildjob.py index 6ec02a830..e92be23a6 100644 --- a/buildman/jobutil/buildjob.py +++ b/buildman/jobutil/buildjob.py @@ -9,50 +9,38 @@ class BuildJobLoadException(Exception): class BuildJob(object): """ Represents a single in-progress build job. 
""" def __init__(self, job_item): - self._job_item = job_item + self.job_item = job_item try: - self._job_details = json.loads(job_item.body) + self.job_details = json.loads(job_item.body) except ValueError: raise BuildJobLoadException( - 'Could not parse build queue item config with ID %s' % self._job_details['build_uuid'] + 'Could not parse build queue item config with ID %s' % self.job_details['build_uuid'] ) try: - self._repo_build = model.get_repository_build(self._job_details['build_uuid']) + self.repo_build = model.get_repository_build(self.job_details['build_uuid']) except model.InvalidRepositoryBuildException: raise BuildJobLoadException( - 'Could not load repository build with ID %s' % self._job_details['build_uuid']) + 'Could not load repository build with ID %s' % self.job_details['build_uuid']) try: - self._build_config = json.loads(self._repo_build.job_config) + self.build_config = json.loads(self.repo_build.job_config) except ValueError: raise BuildJobLoadException( - 'Could not parse repository build job config with ID %s' % self._job_details['build_uuid'] + 'Could not parse repository build job config with ID %s' % self.job_details['build_uuid'] ) def determine_cached_tag(self): """ Returns the tag to pull to prime the cache or None if none. """ # TODO(jschorr): Change this to use the more complicated caching rules, once we have caching # be a pull of things besides the constructed tags. - tags = self._build_config.get('docker_tags', ['latest']) - existing_tags = model.list_repository_tags(self._repo_build.repository.namespace_user.username, - self._repo_build.repository.name) + tags = self.build_config.get('docker_tags', ['latest']) + existing_tags = model.list_repository_tags(self.repo_build.repository.namespace_user.username, + self.repo_build.repository.name) cached_tags = set(tags) & set([tag.name for tag in existing_tags]) if cached_tags: return list(cached_tags)[0] return None - - def job_item(self): - """ Returns the job's queue item. """ - return self._job_item - - def repo_build(self): - """ Returns the repository build DB row for the job. """ - return self._repo_build - - def build_config(self): - """ Returns the parsed repository build config for the job. """ - return self._build_config diff --git a/buildman/manager/basemanager.py b/buildman/manager/basemanager.py index f66054c45..f71971997 100644 --- a/buildman/manager/basemanager.py +++ b/buildman/manager/basemanager.py @@ -1,11 +1,12 @@ class BaseManager(object): """ Base for all worker managers. """ def __init__(self, register_component, unregister_component, job_heartbeat_callback, - job_complete_callback): + job_complete_callback, public_ip_address): self.register_component = register_component self.unregister_component = unregister_component self.job_heartbeat_callback = job_heartbeat_callback self.job_complete_callback = job_complete_callback + self.public_ip_address = public_ip_address def job_heartbeat(self, build_job): """ Method invoked to tell the manager that a job is still running. This method will be called @@ -31,11 +32,16 @@ class BaseManager(object): """ raise NotImplementedError - def initialize(self): + def initialize(self, manager_config): """ Runs any initialization code for the manager. Called once the server is in a ready state. """ raise NotImplementedError + def build_component_ready(self, build_component, loop): + """ Method invoked whenever a build component announces itself as ready. 
+ """ + raise NotImplementedError + def build_component_disposed(self, build_component, timed_out): """ Method invoked whenever a build component has been disposed. The timed_out boolean indicates whether the component's heartbeat timed out. diff --git a/buildman/manager/enterprise.py b/buildman/manager/enterprise.py index 824e02d53..1eedf2790 100644 --- a/buildman/manager/enterprise.py +++ b/buildman/manager/enterprise.py @@ -28,10 +28,12 @@ class DynamicRegistrationComponent(BaseComponent): class EnterpriseManager(BaseManager): """ Build manager implementation for the Enterprise Registry. """ - build_components = [] - shutting_down = False - def initialize(self): + def __init__(self, *args, **kwargs): + self.ready_components = set() + self.shutting_down = False + + def initialize(self, manager_config): # Add a component which is used by build workers for dynamic registration. Unlike # production, build workers in enterprise are long-lived and register dynamically. self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent) @@ -45,21 +47,20 @@ class EnterpriseManager(BaseManager): """ Adds a new build component for an Enterprise Registry. """ # Generate a new unique realm ID for the build worker. realm = str(uuid.uuid4()) - component = self.register_component(realm, BuildComponent, token="") - self.build_components.append(component) + self.register_component(realm, BuildComponent, token="") return realm def schedule(self, build_job, loop): """ Schedules a build for an Enterprise Registry. """ - if self.shutting_down: + if self.shutting_down or not self.ready_components: return False - for component in self.build_components: - if component.is_ready(): - loop.call_soon(component.start_build, build_job) - return True + component = self.ready_components.pop() + loop.call_soon(component.start_build, build_job) + return True - return False + def build_component_ready(self, build_component, loop): + self.ready_components.add(build_component) def shutdown(self): self.shutting_down = True @@ -68,5 +69,6 @@ class EnterpriseManager(BaseManager): self.job_complete_callback(build_job, job_status) def build_component_disposed(self, build_component, timed_out): - self.build_components.remove(build_component) + if build_component in self.ready_components: + self.ready_components.remove(build_component) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py new file mode 100644 index 000000000..68af9de0e --- /dev/null +++ b/buildman/manager/ephemeral.py @@ -0,0 +1,145 @@ +import logging +import etcd +import uuid + +from datetime import datetime, timedelta + +from buildman.manager.basemanager import BaseManager +from buildman.manager.executor import PopenExecutor, EC2Executor +from buildman.component.buildcomponent import BuildComponent + + +logger = logging.getLogger(__name__) + + +ETCD_BUILDER_PREFIX = 'building/' + + +def clear_etcd(client): + """ Debugging method used to clear out the section of etcd we are using to track jobs in flight. + """ + try: + building = client.read(ETCD_BUILDER_PREFIX, recursive=True) + for child in building.leaves: + if not child.dir: + logger.warning('Deleting key: %s', child.key) + client.delete(child.key) + except KeyError: + pass + + +class EphemeralBuilderManager(BaseManager): + """ Build manager implementation for the Enterprise Registry. 
""" + shutting_down = False + + def __init__(self, *args, **kwargs): + self._manager_config = None + self._etcd_client = None + + self._component_to_job = {} + self._component_to_builder = {} + + self._executors = { + 'popen': PopenExecutor, + 'ec2': EC2Executor, + } + self._executor = None + + super(EphemeralBuilderManager, self).__init__(*args, **kwargs) + + def initialize(self, manager_config): + logger.debug('Calling initialize') + self._manager_config = manager_config + + executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor) + self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}), + self.public_ip_address) + + etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1') + etcd_port = self._manager_config.get('ETCD_PORT', 2379) + logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port) + self._etcd_client = etcd.Client(host=etcd_host, port=etcd_port) + + clear_etcd(self._etcd_client) + + def setup_time(self): + setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300) + logger.debug('Returning setup_time: %s', setup_time) + return setup_time + + def shutdown(self): + logger.debug('Calling shutdown.') + raise NotImplementedError + + def schedule(self, build_job, loop): + logger.debug('Calling schedule with job: %s', build_job.repo_build.uuid) + + # Check if there are worker slots avialable by checking the number of jobs in etcd + allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 2) + try: + building = self._etcd_client.read(ETCD_BUILDER_PREFIX, recursive=True) + workers_alive = sum(1 for child in building.children if not child.dir) + except KeyError: + workers_alive = 0 + + logger.debug('Total jobs: %s', workers_alive) + + if workers_alive >= allowed_worker_count: + logger.info('Too many workers alive, unable to start new worker. 
%s >= %s', workers_alive, + allowed_worker_count) + return False + + job_key = self._etcd_job_key(build_job) + + # First try to take a lock for this job, meaning we will be responsible for its lifeline + realm = str(uuid.uuid4()) + token = str(uuid.uuid4()) + expiration = datetime.utcnow() + timedelta(seconds=self.setup_time()) + + payload = { + 'expiration': expiration.isoformat(), + } + + try: + self._etcd_client.write(job_key, payload, prevExist=False) + component = self.register_component(realm, BuildComponent, token=token) + self._component_to_job[component] = build_job + except KeyError: + # The job was already taken by someone else, we are probably a retry + logger.warning('Job already exists in etcd, did an old worker die?') + return False + + builder_id = self._executor.start_builder(realm, token) + self._component_to_builder[component] = builder_id + + return True + + def build_component_ready(self, build_component, loop): + try: + job = self._component_to_job.pop(build_component) + logger.debug('Sending build %s to newly ready component on realm %s', job.repo_build.uuid, + build_component.builder_realm) + loop.call_soon(build_component.start_build, job) + except KeyError: + logger.warning('Builder is asking for more work, but work already completed') + + def build_component_disposed(self, build_component, timed_out): + logger.debug('Calling build_component_disposed.') + + def job_completed(self, build_job, job_status, build_component): + logger.debug('Calling job_completed with status: %s', job_status) + + # Kill he ephmeral builder + self._executor.stop_builder(self._component_to_builder.pop(build_component)) + + # Release the lock in etcd + job_key = self._etcd_job_key(build_job) + self._etcd_client.delete(job_key) + + self.job_complete_callback(build_job, job_status) + + @staticmethod + def _etcd_job_key(build_job): + """ Create a key which is used to track a job in etcd. + """ + return '{0}{1}'.format(ETCD_BUILDER_PREFIX, build_job.repo_build.uuid) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py new file mode 100644 index 000000000..a3cd4981b --- /dev/null +++ b/buildman/manager/executor.py @@ -0,0 +1,204 @@ +import logging +import os +import uuid +import threading +import boto.ec2 +import requests +import cachetools + +from jinja2 import FileSystemLoader, Environment + + +logger = logging.getLogger(__name__) + + +ONE_HOUR = 60*60 + +ENV = Environment(loader=FileSystemLoader('buildman/templates')) +TEMPLATE = ENV.get_template('cloudconfig.yaml') + + +class ExecutorException(Exception): + """ Exception raised when there is a problem starting or stopping a builder. + """ + pass + + +class BuilderExecutor(object): + def __init__(self, executor_config, manager_public_ip): + self.executor_config = executor_config + self.manager_public_ip = manager_public_ip + + """ Interface which can be plugged into the EphemeralNodeManager to provide a strategy for + starting and stopping builders. + """ + def start_builder(self, realm, token): + """ Create a builder with the specified config. Returns a unique id which can be used to manage + the builder. + """ + raise NotImplementedError + + def stop_builder(self, builder_id): + """ Stop a builder which is currently running. 
+ """ + raise NotImplementedError + + def get_manager_websocket_url(self): + return 'ws://{0}:' + + def generate_cloud_config(self, realm, token, coreos_channel, manager_ip, + quay_username=None, quay_password=None, etcd_token=None): + if quay_username is None: + quay_username = self.executor_config['QUAY_USERNAME'] + + if quay_password is None: + quay_password = self.executor_config['QUAY_PASSWORD'] + + if etcd_token is None: + etcd_token = self.executor_config['ETCD_DISCOVERY_TOKEN'] + + return TEMPLATE.render( + realm=realm, + token=token, + quay_username=quay_username, + quay_password=quay_password, + etcd_token=etcd_token, + manager_ip=manager_ip, + coreos_channel=coreos_channel, + ) + + +class EC2Executor(BuilderExecutor): + """ Implementation of BuilderExecutor which uses libcloud to start machines on a variety of cloud + providers. + """ + COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt' + + def _get_conn(self): + """ Creates an ec2 connection which can be used to manage instances. + """ + return boto.ec2.connect_to_region( + self.executor_config['EC2_REGION'], + aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'], + aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'], + ) + + @classmethod + @cachetools.ttl_cache(ttl=ONE_HOUR) + def _get_coreos_ami(cls, ec2_region, coreos_channel): + """ Retrieve the CoreOS AMI id from the canonical listing. + """ + stack_list_string = requests.get(EC2Executor.COREOS_STACK_URL % coreos_channel).text + stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')]) + return stack_amis[ec2_region] + + def start_builder(self, realm, token): + region = self.executor_config['EC2_REGION'] + channel = self.executor_config.get('COREOS_CHANNEL', 'stable') + coreos_ami = self._get_coreos_ami(region, channel) + user_data = self.generate_cloud_config(realm, token, channel, self.manager_public_ip) + + logger.debug('Generated cloud config: %s', user_data) + + ec2_conn = self._get_conn() + # class FakeReservation(object): + # def __init__(self): + # self.instances = None + # reservation = FakeReservation() + reservation = ec2_conn.run_instances( + coreos_ami, + instance_type=self.executor_config['EC2_INSTANCE_TYPE'], + security_groups=self.executor_config['EC2_SECURITY_GROUP_IDS'], + key_name=self.executor_config.get('EC2_KEY_NAME', None), + user_data=user_data, + ) + + if not reservation.instances: + raise ExecutorException('Unable to spawn builder instance.') + elif len(reservation.instances) != 1: + raise ExecutorException('EC2 started wrong number of instances!') + + return reservation.instances[0] + + def stop_builder(self, builder_id): + ec2_conn = self._get_conn() + stopped_instances = ec2_conn.stop_instances([builder_id], force=True) + if builder_id not in stopped_instances: + raise ExecutorException('Unable to stop instance: %s' % builder_id) + +class PopenExecutor(BuilderExecutor): + """ Implementation of BuilderExecutor which uses Popen to fork a quay-builder process. + """ + def __init__(self, executor_config, manager_public_ip): + self._jobs = {} + + super(PopenExecutor, self).__init__(executor_config, manager_public_ip) + + """ Executor which uses Popen to fork a quay-builder process. 
+ """ + def start_builder(self, realm, token): + # Now start a machine for this job, adding the machine id to the etcd information + logger.debug('Forking process for build') + import subprocess + builder_env = { + 'TOKEN': token, + 'REALM': realm, + 'ENDPOINT': 'ws://localhost:8787', + 'DOCKER_TLS_VERIFY': os.environ.get('DOCKER_TLS_VERIFY', ''), + 'DOCKER_CERT_PATH': os.environ.get('DOCKER_CERT_PATH', ''), + 'DOCKER_HOST': os.environ.get('DOCKER_HOST', ''), + } + + logpipe = LogPipe(logging.INFO) + spawned = subprocess.Popen('/Users/jake/bin/quay-builder', stdout=logpipe, stderr=logpipe, + env=builder_env) + + builder_id = str(uuid.uuid4()) + self._jobs[builder_id] = (spawned, logpipe) + logger.debug('Builder spawned with id: %s', builder_id) + return builder_id + + + def stop_builder(self, builder_id): + if builder_id not in self._jobs: + raise ExecutorException('Builder id not being tracked by executor.') + + logger.debug('Killing builder with id: %s', builder_id) + spawned, logpipe = self._jobs[builder_id] + + if spawned.poll() is None: + spawned.kill() + logpipe.close() + + +class LogPipe(threading.Thread): + """ Adapted from http://codereview.stackexchange.com/a/17959 + """ + def __init__(self, level): + """Setup the object with a logger and a loglevel + and start the thread + """ + threading.Thread.__init__(self) + self.daemon = False + self.level = level + self.fd_read, self.fd_write = os.pipe() + self.pipe_reader = os.fdopen(self.fd_read) + self.start() + + def fileno(self): + """Return the write file descriptor of the pipe + """ + return self.fd_write + + def run(self): + """Run the thread, logging everything. + """ + for line in iter(self.pipe_reader.readline, ''): + logging.log(self.level, line.strip('\n')) + + self.pipe_reader.close() + + def close(self): + """Close the write end of the pipe. + """ + os.close(self.fd_write) diff --git a/buildman/server.py b/buildman/server.py index 3863406f2..6f57b6627 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -34,14 +34,15 @@ class BuilderServer(object): """ Server which handles both HTTP and WAMP requests, managing the full state of the build controller. 
""" - def __init__(self, server_hostname, queue, build_logs, user_files, lifecycle_manager_klass): + def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass, + lifecycle_manager_config, manager_public_ip): self._loop = None self._current_status = 'starting' self._current_components = [] self._job_count = 0 self._session_factory = RouterSessionFactory(RouterFactory()) - self._server_hostname = server_hostname + self._registry_hostname = registry_hostname self._queue = queue self._build_logs = build_logs self._user_files = user_files @@ -49,8 +50,10 @@ class BuilderServer(object): self._register_component, self._unregister_component, self._job_heartbeat, - self._job_complete + self._job_complete, + manager_public_ip, ) + self._lifecycle_manager_config = lifecycle_manager_config self._shutdown_event = Event() self._current_status = 'running' @@ -69,7 +72,7 @@ class BuilderServer(object): def run(self, host, ssl=None): logger.debug('Initializing the lifecycle manager') - self._lifecycle_manager.initialize() + self._lifecycle_manager.initialize(self._lifecycle_manager_config) logger.debug('Initializing all members of the event loop') loop = trollius.get_event_loop() @@ -102,7 +105,7 @@ class BuilderServer(object): component.parent_manager = self._lifecycle_manager component.build_logs = self._build_logs component.user_files = self._user_files - component.server_hostname = self._server_hostname + component.registry_hostname = self._registry_hostname self._current_components.append(component) self._session_factory.add(component) @@ -116,16 +119,16 @@ class BuilderServer(object): self._session_factory.remove(component) def _job_heartbeat(self, build_job): - WorkQueue.extend_processing(build_job.job_item(), seconds_from_now=JOB_TIMEOUT_SECONDS, + WorkQueue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS, retry_count=1, minimum_extension=MINIMUM_JOB_EXTENSION) def _job_complete(self, build_job, job_status): if job_status == BuildJobResult.INCOMPLETE: - self._queue.incomplete(build_job.job_item(), restore_retry=True, retry_after=30) + self._queue.incomplete(build_job.job_item, restore_retry=True, retry_after=30) elif job_status == BuildJobResult.ERROR: - self._queue.incomplete(build_job.job_item(), restore_retry=False) + self._queue.incomplete(build_job.job_item, restore_retry=False) else: - self._queue.complete(build_job.job_item()) + self._queue.complete(build_job.job_item) self._job_count = self._job_count - 1 diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml new file mode 100644 index 000000000..ca9c6c16a --- /dev/null +++ b/buildman/templates/cloudconfig.yaml @@ -0,0 +1,38 @@ +#cloud-config + +write_files: +- path: /root/overrides.list + permission: '0644' + content: | + REALM={{ realm }} + TOKEN={{ token }} + ENDPOINT=wss://buildman.quay.io:8787 + +coreos: + update: + reboot-strategy: off + group: {{ coreos_channel }} + + etcd: + discovery: https://discovery.etcd.io/{{ etcd_token }} + # multi-region and multi-cloud deployments need to use $public_ipv4 + addr: $private_ipv4:4001 + peer-addr: $private_ipv4:7001 + + units: + - name: quay-builder.service + command: start + content: | + [Unit] + Description=Quay builder container + Author=Jake Moshenko + After=docker.service + + [Service] + Restart=always + TimeoutStartSec=600 + TimeoutStopSec=2000 + ExecStartPre=/usr/bin/sudo /bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0" + ExecStartPre=/usr/bin/docker login -u {{ 
quay_username }} -p {{ quay_password }} -e unused quay.io + ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest + ExecStop=/usr/bin/docker stop quay-builder diff --git a/requirements-nover.txt b/requirements-nover.txt index c1bf6c19f..51cd42e3c 100644 --- a/requirements-nover.txt +++ b/requirements-nover.txt @@ -41,3 +41,5 @@ git+https://github.com/DevTable/aniso8601-fake.git git+https://github.com/DevTable/anunidecode.git git+https://github.com/DevTable/avatar-generator.git gipc +python-etcd +cachetools From 1d68594dc220d5cc16dda89aa5cad62d969210e6 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 16 Dec 2014 15:10:50 -0500 Subject: [PATCH 002/127] Extract instance ids from the instance objects returned by boto. --- buildman/manager/executor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index a3cd4981b..e82ecf672 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -118,12 +118,12 @@ class EC2Executor(BuilderExecutor): elif len(reservation.instances) != 1: raise ExecutorException('EC2 started wrong number of instances!') - return reservation.instances[0] + return reservation.instances[0].id def stop_builder(self, builder_id): ec2_conn = self._get_conn() - stopped_instances = ec2_conn.stop_instances([builder_id], force=True) - if builder_id not in stopped_instances: + stopped_instance_ids = [si.id for si in ec2_conn.stop_instances([builder_id], force=True)] + if builder_id not in stopped_instance_ids: raise ExecutorException('Unable to stop instance: %s' % builder_id) class PopenExecutor(BuilderExecutor): From a280bbcb6db471e3275e4d3937404da6f09479b5 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 16 Dec 2014 15:17:39 -0500 Subject: [PATCH 003/127] Add tag metadata to the instances. --- buildman/manager/executor.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index e82ecf672..b35a90c97 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -118,7 +118,13 @@ class EC2Executor(BuilderExecutor): elif len(reservation.instances) != 1: raise ExecutorException('EC2 started wrong number of instances!') - return reservation.instances[0].id + launched = reservation.instances[0] + launched.add_tags({ + 'Name': 'Quay Ephemeral Builder', + 'Realm': realm, + 'Token': token, + }) + return launched.id def stop_builder(self, builder_id): ec2_conn = self._get_conn() From 12ee8e0fc02a7abe1c0cf457698fa618a7a26a76 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Mon, 22 Dec 2014 12:14:16 -0500 Subject: [PATCH 004/127] Switch a few of the buildman methods to coroutines in order to support network calls in methods. Add a test for the ephemeral build manager. 
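
For context, a minimal sketch of the pattern this series moves to: a blocking client is
wrapped in the new buildman.asyncutil.AsyncWrapper so each call runs in a thread executor
and can be yielded on from a trollius coroutine. The host, port, key prefix, and function
name below are illustrative placeholders, not part of this patch; the etcd read mirrors
what EphemeralBuilderManager.schedule does when counting jobs in flight.

  import etcd
  from trollius import coroutine, From, Return, get_event_loop

  from buildman.asyncutil import AsyncWrapper

  @coroutine
  def count_building_jobs():
    # AsyncWrapper turns each blocking etcd.Client call into a future we can yield on.
    client = AsyncWrapper(etcd.Client(host='127.0.0.1', port=2379))
    result = yield From(client.read('building/', recursive=True))
    raise Return(sum(1 for child in result.children if not child.dir))

  loop = get_event_loop()
  print(loop.run_until_complete(count_building_jobs()))
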
--- buildman/asyncutil.py | 27 +++++++ buildman/component/buildcomponent.py | 22 +++--- buildman/manager/basemanager.py | 8 +- buildman/manager/enterprise.py | 8 +- buildman/manager/ephemeral.py | 64 +++++++++++----- buildman/manager/executor.py | 37 +++++---- buildman/server.py | 3 +- buildman/templates/cloudconfig.yaml | 2 +- endpoints/api/build.py | 4 +- requirements-nover.txt | 1 + test/test_buildman.py | 109 +++++++++++++++++++++++++++ 11 files changed, 233 insertions(+), 52 deletions(-) create mode 100644 buildman/asyncutil.py create mode 100644 test/test_buildman.py diff --git a/buildman/asyncutil.py b/buildman/asyncutil.py new file mode 100644 index 000000000..4f2d4e1a9 --- /dev/null +++ b/buildman/asyncutil.py @@ -0,0 +1,27 @@ +from functools import partial, wraps +from trollius import get_event_loop + + +class AsyncWrapper(object): + """ Wrapper class which will transform a syncronous library to one that can be used with + trollius coroutines. + """ + def __init__(self, delegate, loop=None, executor=None): + self._loop = loop if loop is not None else get_event_loop() + self._delegate = delegate + self._executor = executor + + def __getattr__(self, attrib): + delegate_attr = getattr(self._delegate, attrib) + + if not callable(delegate_attr): + return delegate_attr + + def wrapper(*args, **kwargs): + """ Wraps the delegate_attr with primitives that will transform sync calls to ones shelled + out to a thread pool. + """ + callable_delegate_attr = partial(delegate_attr, *args, **kwargs) + return self._loop.run_in_executor(self._executor, callable_delegate_attr) + + return wrapper diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index 05d342628..53b04bf87 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -6,7 +6,6 @@ import trollius import re from autobahn.wamp.exception import ApplicationError -from trollius.coroutines import From from buildman.server import BuildJobResult from buildman.component.basecomponent import BaseComponent @@ -54,10 +53,10 @@ class BuildComponent(BaseComponent): def onJoin(self, details): logger.debug('Registering methods and listeners for component %s', self.builder_realm) - yield From(self.register(self._on_ready, u'io.quay.buildworker.ready')) - yield From(self.register(self._ping, u'io.quay.buildworker.ping')) - yield From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat')) - yield From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage')) + yield trollius.From(self.register(self._on_ready, u'io.quay.buildworker.ready')) + yield trollius.From(self.register(self._ping, u'io.quay.buildworker.ping')) + yield trollius.From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat')) + yield trollius.From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage')) self._set_status(ComponentStatus.WAITING) @@ -270,9 +269,10 @@ class BuildComponent(BaseComponent): else: self._build_finished(BuildJobResult.ERROR) + @trollius.coroutine def _build_finished(self, job_status): """ Alerts the parent that a build has completed and sets the status back to running. """ - self.parent_manager.job_completed(self._current_job, job_status, self) + yield trollius.From(self.parent_manager.job_completed(self._current_job, job_status, self)) self._current_job = None # Set the component back to a running state. @@ -313,7 +313,7 @@ class BuildComponent(BaseComponent): def _on_heartbeat(self): """ Updates the last known heartbeat. 
""" - self._last_heartbeat = datetime.datetime.now() + self._last_heartbeat = datetime.datetime.utcnow() @trollius.coroutine def _heartbeat(self): @@ -321,7 +321,7 @@ class BuildComponent(BaseComponent): and updating the heartbeat in the build status dictionary (if applicable). This allows the build system to catch crashes from either end. """ - yield From(trollius.sleep(INITIAL_TIMEOUT)) + yield trollius.From(trollius.sleep(INITIAL_TIMEOUT)) while True: # If the component is no longer running or actively building, nothing more to do. @@ -335,7 +335,6 @@ class BuildComponent(BaseComponent): with build_status as status_dict: status_dict['heartbeat'] = int(time.time()) - # Mark the build item. current_job = self._current_job if current_job is not None: @@ -343,11 +342,12 @@ class BuildComponent(BaseComponent): # Check the heartbeat from the worker. logger.debug('Checking heartbeat on realm %s', self.builder_realm) - if self._last_heartbeat and self._last_heartbeat < datetime.datetime.now() - HEARTBEAT_DELTA: + if (self._last_heartbeat and + self._last_heartbeat < datetime.datetime.utcnow() - HEARTBEAT_DELTA): self._timeout() return - yield From(trollius.sleep(HEARTBEAT_TIMEOUT)) + yield trollius.From(trollius.sleep(HEARTBEAT_TIMEOUT)) def _timeout(self): self._set_status(ComponentStatus.TIMED_OUT) diff --git a/buildman/manager/basemanager.py b/buildman/manager/basemanager.py index f71971997..fc9fd70cf 100644 --- a/buildman/manager/basemanager.py +++ b/buildman/manager/basemanager.py @@ -1,3 +1,5 @@ +from trollius import coroutine + class BaseManager(object): """ Base for all worker managers. """ def __init__(self, register_component, unregister_component, job_heartbeat_callback, @@ -26,6 +28,7 @@ class BaseManager(object): """ raise NotImplementedError + @coroutine def schedule(self, build_job, loop): """ Schedules a queue item to be built. Returns True if the item was properly scheduled and False if all workers are busy. @@ -48,8 +51,11 @@ class BaseManager(object): """ raise NotImplementedError + @coroutine def job_completed(self, build_job, job_status, build_component): """ Method invoked once a job_item has completed, in some manner. The job_status will be - one of: incomplete, error, complete. If incomplete, the job should be requeued. + one of: incomplete, error, complete. Implementations of this method should call + self.job_complete_callback with a status of Incomplete if they wish for the job to be + automatically requeued. """ raise NotImplementedError diff --git a/buildman/manager/enterprise.py b/buildman/manager/enterprise.py index 1eedf2790..516464ff3 100644 --- a/buildman/manager/enterprise.py +++ b/buildman/manager/enterprise.py @@ -5,7 +5,7 @@ from buildman.component.basecomponent import BaseComponent from buildman.component.buildcomponent import BuildComponent from buildman.manager.basemanager import BaseManager -from trollius.coroutines import From +from trollius.coroutines import From, Return, coroutine REGISTRATION_REALM = 'registration' logger = logging.getLogger(__name__) @@ -50,14 +50,15 @@ class EnterpriseManager(BaseManager): self.register_component(realm, BuildComponent, token="") return realm + @coroutine def schedule(self, build_job, loop): """ Schedules a build for an Enterprise Registry. 
""" if self.shutting_down or not self.ready_components: - return False + raise Return(False) component = self.ready_components.pop() loop.call_soon(component.start_build, build_job) - return True + raise Return(True) def build_component_ready(self, build_component, loop): self.ready_components.add(build_component) @@ -65,6 +66,7 @@ class EnterpriseManager(BaseManager): def shutdown(self): self.shutting_down = True + @coroutine def job_completed(self, build_job, job_status, build_component): self.job_complete_callback(build_job, job_status) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 68af9de0e..ed2da908e 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -1,12 +1,15 @@ import logging import etcd import uuid +import calendar from datetime import datetime, timedelta +from trollius import From, coroutine, Return from buildman.manager.basemanager import BaseManager from buildman.manager.executor import PopenExecutor, EC2Executor from buildman.component.buildcomponent import BuildComponent +from buildman.asyncutil import AsyncWrapper logger = logging.getLogger(__name__) @@ -32,6 +35,13 @@ class EphemeralBuilderManager(BaseManager): """ Build manager implementation for the Enterprise Registry. """ shutting_down = False + _executors = { + 'popen': PopenExecutor, + 'ec2': EC2Executor, + } + + _etcd_client_klass = etcd.Client + def __init__(self, *args, **kwargs): self._manager_config = None self._etcd_client = None @@ -39,10 +49,6 @@ class EphemeralBuilderManager(BaseManager): self._component_to_job = {} self._component_to_builder = {} - self._executors = { - 'popen': PopenExecutor, - 'ec2': EC2Executor, - } self._executor = None super(EphemeralBuilderManager, self).__init__(*args, **kwargs) @@ -58,9 +64,8 @@ class EphemeralBuilderManager(BaseManager): etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1') etcd_port = self._manager_config.get('ETCD_PORT', 2379) logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port) - self._etcd_client = etcd.Client(host=etcd_host, port=etcd_port) - clear_etcd(self._etcd_client) + self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port)) def setup_time(self): setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300) @@ -71,13 +76,14 @@ class EphemeralBuilderManager(BaseManager): logger.debug('Calling shutdown.') raise NotImplementedError + @coroutine def schedule(self, build_job, loop): - logger.debug('Calling schedule with job: %s', build_job.repo_build.uuid) + logger.debug('Calling schedule with job: %s', build_job.job_details['build_uuid']) # Check if there are worker slots avialable by checking the number of jobs in etcd - allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 2) + allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1) try: - building = self._etcd_client.read(ETCD_BUILDER_PREFIX, recursive=True) + building = yield From(self._etcd_client.read(ETCD_BUILDER_PREFIX, recursive=True)) workers_alive = sum(1 for child in building.children if not child.dir) except KeyError: workers_alive = 0 @@ -87,7 +93,7 @@ class EphemeralBuilderManager(BaseManager): if workers_alive >= allowed_worker_count: logger.info('Too many workers alive, unable to start new worker. 
%s >= %s', workers_alive, allowed_worker_count) - return False + raise Return(False) job_key = self._etcd_job_key(build_job) @@ -97,28 +103,33 @@ class EphemeralBuilderManager(BaseManager): expiration = datetime.utcnow() + timedelta(seconds=self.setup_time()) payload = { - 'expiration': expiration.isoformat(), + 'expiration': calendar.timegm(expiration.timetuple()), } try: - self._etcd_client.write(job_key, payload, prevExist=False) + yield From(self._etcd_client.write(job_key, payload, prevExist=False)) component = self.register_component(realm, BuildComponent, token=token) self._component_to_job[component] = build_job except KeyError: # The job was already taken by someone else, we are probably a retry - logger.warning('Job already exists in etcd, did an old worker die?') - return False + logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?') + raise Return(False) - builder_id = self._executor.start_builder(realm, token) + logger.debug('Starting builder with executor: %s', self._executor) + builder_id = yield From(self._executor.start_builder(realm, token)) self._component_to_builder[component] = builder_id - return True + # Store the builder in etcd associated with the job id + payload['builder_id'] = builder_id + yield From(self._etcd_client.write(job_key, payload, prevExist=True)) + + raise Return(True) def build_component_ready(self, build_component, loop): try: job = self._component_to_job.pop(build_component) - logger.debug('Sending build %s to newly ready component on realm %s', job.repo_build.uuid, - build_component.builder_realm) + logger.debug('Sending build %s to newly ready component on realm %s', + job.job_details['build_uuid'], build_component.builder_realm) loop.call_soon(build_component.start_build, job) except KeyError: logger.warning('Builder is asking for more work, but work already completed') @@ -126,6 +137,7 @@ class EphemeralBuilderManager(BaseManager): def build_component_disposed(self, build_component, timed_out): logger.debug('Calling build_component_disposed.') + @coroutine def job_completed(self, build_job, job_status, build_component): logger.debug('Calling job_completed with status: %s', job_status) @@ -134,12 +146,24 @@ class EphemeralBuilderManager(BaseManager): # Release the lock in etcd job_key = self._etcd_job_key(build_job) - self._etcd_client.delete(job_key) + yield From(self._etcd_client.delete(job_key)) self.job_complete_callback(build_job, job_status) + @coroutine + def _clean_up_old_builder(self, job_key, job_payload): + """ Terminate an old builders once the expiration date has passed. + """ + logger.debug('Cleaning up the old builder for job: %s', job_key) + if 'builder_id' in job_payload: + logger.info('Terminating expired build node.') + yield From(self._executor.stop_builder(job_payload['builder_id'])) + + yield From(self._etcd_client.delete(job_key)) + + @staticmethod def _etcd_job_key(build_job): """ Create a key which is used to track a job in etcd. 
""" - return '{0}{1}'.format(ETCD_BUILDER_PREFIX, build_job.repo_build.uuid) + return '{0}{1}'.format(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid']) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index b35a90c97..82b98ef5c 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -7,6 +7,10 @@ import requests import cachetools from jinja2 import FileSystemLoader, Environment +from trollius import coroutine, From, Return, get_event_loop +from functools import partial + +from buildman.asyncutil import AsyncWrapper logger = logging.getLogger(__name__) @@ -32,12 +36,14 @@ class BuilderExecutor(object): """ Interface which can be plugged into the EphemeralNodeManager to provide a strategy for starting and stopping builders. """ + @coroutine def start_builder(self, realm, token): """ Create a builder with the specified config. Returns a unique id which can be used to manage the builder. """ raise NotImplementedError + @coroutine def stop_builder(self, builder_id): """ Stop a builder which is currently running. """ @@ -74,14 +80,18 @@ class EC2Executor(BuilderExecutor): """ COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt' + def __init__(self, *args, **kwargs): + self._loop = get_event_loop() + super(EC2Executor, self).__init__(*args, **kwargs) + def _get_conn(self): """ Creates an ec2 connection which can be used to manage instances. """ - return boto.ec2.connect_to_region( + return AsyncWrapper(boto.ec2.connect_to_region( self.executor_config['EC2_REGION'], aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'], aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'], - ) + )) @classmethod @cachetools.ttl_cache(ttl=ONE_HOUR) @@ -92,25 +102,24 @@ class EC2Executor(BuilderExecutor): stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')]) return stack_amis[ec2_region] + @coroutine def start_builder(self, realm, token): region = self.executor_config['EC2_REGION'] channel = self.executor_config.get('COREOS_CHANNEL', 'stable') - coreos_ami = self._get_coreos_ami(region, channel) + get_ami_callable = partial(self._get_coreos_ami, region, channel) + coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable)) user_data = self.generate_cloud_config(realm, token, channel, self.manager_public_ip) logger.debug('Generated cloud config: %s', user_data) ec2_conn = self._get_conn() - # class FakeReservation(object): - # def __init__(self): - # self.instances = None - # reservation = FakeReservation() - reservation = ec2_conn.run_instances( + reservation = yield ec2_conn.run_instances( coreos_ami, instance_type=self.executor_config['EC2_INSTANCE_TYPE'], security_groups=self.executor_config['EC2_SECURITY_GROUP_IDS'], key_name=self.executor_config.get('EC2_KEY_NAME', None), user_data=user_data, + instance_initiated_shutdown_behavior='terminate', ) if not reservation.instances: @@ -124,12 +133,13 @@ class EC2Executor(BuilderExecutor): 'Realm': realm, 'Token': token, }) - return launched.id + raise Return(launched.id) + @coroutine def stop_builder(self, builder_id): ec2_conn = self._get_conn() - stopped_instance_ids = [si.id for si in ec2_conn.stop_instances([builder_id], force=True)] - if builder_id not in stopped_instance_ids: + stopped_instances = yield ec2_conn.stop_instances([builder_id], force=True) + if builder_id not in [si.id for si in stopped_instances]: raise ExecutorException('Unable to stop instance: %s' % builder_id) class 
PopenExecutor(BuilderExecutor):
@@ -142,6 +152,7 @@ class PopenExecutor(BuilderExecutor):
 
   """ Executor which uses Popen to fork a quay-builder process.
   """
+  @coroutine
   def start_builder(self, realm, token):
     # Now start a machine for this job, adding the machine id to the etcd information
     logger.debug('Forking process for build')
@@ -162,9 +173,9 @@ class PopenExecutor(BuilderExecutor):
     builder_id = str(uuid.uuid4())
     self._jobs[builder_id] = (spawned, logpipe)
     logger.debug('Builder spawned with id: %s', builder_id)
-    return builder_id
-
+    raise Return(builder_id)
 
+  @coroutine
   def stop_builder(self, builder_id):
     if builder_id not in self._jobs:
       raise ExecutorException('Builder id not being tracked by executor.')
diff --git a/buildman/server.py b/buildman/server.py
index 6f57b6627..66f0010b6 100644
--- a/buildman/server.py
+++ b/buildman/server.py
@@ -154,7 +154,8 @@ class BuilderServer(object):
         self._queue.incomplete(job_item, restore_retry=False)
 
       logger.debug('Build job found. Checking for an available worker.')
-      if self._lifecycle_manager.schedule(build_job, self._loop):
+      scheduled = yield From(self._lifecycle_manager.schedule(build_job, self._loop))
+      if scheduled:
         self._job_count = self._job_count + 1
         logger.debug('Build job scheduled. Running: %s', self._job_count)
       else:
diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml
index ca9c6c16a..e75ce5626 100644
--- a/buildman/templates/cloudconfig.yaml
+++ b/buildman/templates/cloudconfig.yaml
@@ -29,10 +29,10 @@ coreos:
       After=docker.service
 
       [Service]
-      Restart=always
      TimeoutStartSec=600
      TimeoutStopSec=2000
      ExecStartPre=/usr/bin/sudo /bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0"
      ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io
      ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest
      ExecStop=/usr/bin/docker stop quay-builder
+      ExecStopPost=/usr/bin/sudo /bin/sh -xc "/bin/sleep 600; /sbin/shutdown -h now"
diff --git a/endpoints/api/build.py b/endpoints/api/build.py
index e7fdf2f11..506c250da 100644
--- a/endpoints/api/build.py
+++ b/endpoints/api/build.py
@@ -72,8 +72,8 @@ def build_status_view(build_obj, can_write=False):
   # minutes. If not, then the build timed out.
if phase != database.BUILD_PHASE.COMPLETE and phase != database.BUILD_PHASE.ERROR: if status is not None and 'heartbeat' in status and status['heartbeat']: - heartbeat = datetime.datetime.fromtimestamp(status['heartbeat']) - if datetime.datetime.now() - heartbeat > datetime.timedelta(minutes=1): + heartbeat = datetime.datetime.utcfromtimestamp(status['heartbeat']) + if datetime.datetime.utcnow() - heartbeat > datetime.timedelta(minutes=1): phase = database.BUILD_PHASE.INTERNAL_ERROR logger.debug('Can write: %s job_config: %s', can_write, build_obj.job_config) diff --git a/requirements-nover.txt b/requirements-nover.txt index 51cd42e3c..2993895d7 100644 --- a/requirements-nover.txt +++ b/requirements-nover.txt @@ -43,3 +43,4 @@ git+https://github.com/DevTable/avatar-generator.git gipc python-etcd cachetools +mock diff --git a/test/test_buildman.py b/test/test_buildman.py new file mode 100644 index 000000000..0886b671a --- /dev/null +++ b/test/test_buildman.py @@ -0,0 +1,109 @@ +import unittest +import etcd + +from trollius import coroutine, get_event_loop, From, Future +from mock import Mock +from functools import partial + +from buildman.manager.executor import BuilderExecutor +from buildman.manager.ephemeral import EphemeralBuilderManager, ETCD_BUILDER_PREFIX +from buildman.server import BuildJobResult +from buildman.component.buildcomponent import BuildComponent + + +BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead' + + +import logging +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + +def async_test(f): + def wrapper(*args, **kwargs): + coro = coroutine(f) + future = coro(*args, **kwargs) + loop = get_event_loop() + loop.run_until_complete(future) + return wrapper + +class TestEphemeral(unittest.TestCase): + def __init__(self, *args, **kwargs): + self.etcd_client_mock = None + self.test_executor = None + super(TestEphemeral, self).__init__(*args, **kwargs) + + def _create_mock_etcd_client(self, *args, **kwargs): + self.etcd_client_mock = Mock(spec=etcd.Client, name='etcd.Client') + return self.etcd_client_mock + + def _create_mock_executor(self, *args, **kwargs): + def create_completed_future(result=None): + def inner(*args, **kwargs): + new_future = Future() + new_future.set_result(result) + return new_future + return inner + + self.test_executor = Mock(spec=BuilderExecutor) + self.test_executor.start_builder = Mock(side_effect=create_completed_future('123')) + self.test_executor.stop_builder = Mock(side_effect=create_completed_future()) + return self.test_executor + + def _create_build_job(self): + mock_job = Mock() + mock_job.job_details = { + 'build_uuid': BUILD_UUID, + } + return mock_job + + def setUp(self): + EphemeralBuilderManager._executors['test'] = self._create_mock_executor + + self.old_etcd_client_klass = EphemeralBuilderManager._etcd_client_klass + EphemeralBuilderManager._etcd_client_klass = self._create_mock_etcd_client + + self.register_component_callback = Mock() + self.uniregister_component_callback = Mock() + self.job_heartbeat_callback = Mock() + self.job_complete_callback = Mock() + + self.manager = EphemeralBuilderManager( + self.register_component_callback, + self.uniregister_component_callback, + self.job_heartbeat_callback, + self.job_complete_callback, + '127.0.0.1' + ) + + self.manager.initialize({'EXECUTOR': 'test'}) + + def tearDown(self): + del EphemeralBuilderManager._executors['test'] + EphemeralBuilderManager._etcd_client_klass = self.old_etcd_client_klass + + @async_test + def test_schedule_and_complete(self): + 
mock_job = self._create_build_job() + + self.etcd_client_mock.read = Mock(side_effect=KeyError) + test_component = BuildComponent(None) + self.register_component_callback.return_value = test_component + + # Ask for a builder to be scheduled + loop = get_event_loop() + is_scheduled = yield From(self.manager.schedule(mock_job, loop)) + + self.assertTrue(is_scheduled) + + job_key = ETCD_BUILDER_PREFIX + mock_job.job_details['build_uuid'] + self.etcd_client_mock.read.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True) + self.assertEqual(len(self.test_executor.start_builder.call_args_list), 1) + self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], job_key) + self.assertEqual(self.etcd_client_mock.write.call_args_list[1][0][0], job_key) + + self.assertEqual(len(self.register_component_callback.call_args_list), 1) + + yield From(self.manager.job_completed(mock_job, BuildJobResult.COMPLETE, test_component)) + + self.assertEqual(len(self.test_executor.stop_builder.call_args_list), 1) + self.etcd_client_mock.delete.assert_called_once_with(job_key) From 2b6c2a2a50daeccb975e243faf24b11de40b55f3 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Mon, 22 Dec 2014 16:22:07 -0500 Subject: [PATCH 005/127] Improve tests for the ephemeral build manager. --- buildman/manager/ephemeral.py | 72 ++++++++++++++++++++++---------- test/test_buildman.py | 78 ++++++++++++++++++++++++++++------- 2 files changed, 115 insertions(+), 35 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index ed2da908e..80a96d336 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -2,9 +2,11 @@ import logging import etcd import uuid import calendar +import os.path from datetime import datetime, timedelta -from trollius import From, coroutine, Return +from trollius import From, coroutine, Return, async +from concurrent.futures import ThreadPoolExecutor from buildman.manager.basemanager import BaseManager from buildman.manager.executor import PopenExecutor, EC2Executor @@ -16,25 +18,11 @@ logger = logging.getLogger(__name__) ETCD_BUILDER_PREFIX = 'building/' - - -def clear_etcd(client): - """ Debugging method used to clear out the section of etcd we are using to track jobs in flight. - """ - try: - building = client.read(ETCD_BUILDER_PREFIX, recursive=True) - for child in building.leaves: - if not child.dir: - logger.warning('Deleting key: %s', child.key) - client.delete(child.key) - except KeyError: - pass +ETCD_EXPIRE_RESULT = 'expire' class EphemeralBuilderManager(BaseManager): """ Build manager implementation for the Enterprise Registry. """ - shutting_down = False - _executors = { 'popen': PopenExecutor, 'ec2': EC2Executor, @@ -43,7 +31,10 @@ class EphemeralBuilderManager(BaseManager): _etcd_client_klass = etcd.Client def __init__(self, *args, **kwargs): + self._shutting_down = False + self._manager_config = None + self._async_thread_executor = None self._etcd_client = None self._component_to_job = {} @@ -51,8 +42,35 @@ class EphemeralBuilderManager(BaseManager): self._executor = None + self._worker_watch_task = None + super(EphemeralBuilderManager, self).__init__(*args, **kwargs) + def _watch_builders(self): + """ Watch the builders key for expirations. 
+ """ + if not self._shutting_down: + workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True) + workers_future.add_done_callback(self._handle_key_expiration) + logger.debug('Scheduling watch task.') + self._worker_watch_task = async(workers_future) + + def _handle_key_expiration(self, changed_key_future): + """ Handle when a builder expires + """ + if self._worker_watch_task is None or self._worker_watch_task.done(): + self._watch_builders() + + if changed_key_future.cancelled(): + # Due to lack of interest, tomorrow has been cancelled + return + + etcd_result = changed_key_future.result() + if etcd_result.action == ETCD_EXPIRE_RESULT: + # Handle the expiration + logger.debug('Builder expired, clean up the old build node') + async(self._clean_up_old_builder(etcd_result.key, etcd_result._prev_node.value)) + def initialize(self, manager_config): logger.debug('Calling initialize') self._manager_config = manager_config @@ -65,7 +83,11 @@ class EphemeralBuilderManager(BaseManager): etcd_port = self._manager_config.get('ETCD_PORT', 2379) logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port) - self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port)) + self._async_thread_executor = ThreadPoolExecutor(self._manager_config.get('ETCD_WORKERS', 5)) + self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port), + executor=self._async_thread_executor) + + self._watch_builders() def setup_time(self): setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300) @@ -73,8 +95,17 @@ class EphemeralBuilderManager(BaseManager): return setup_time def shutdown(self): - logger.debug('Calling shutdown.') - raise NotImplementedError + logger.debug('Shutting down worker.') + self._shutting_down = True + + if self._worker_watch_task is not None: + logger.debug('Canceling watch task.') + self._worker_watch_task.cancel() + self._worker_watch_task = None + + if self._async_thread_executor is not None: + logger.debug('Shutting down thread pool executor.') + self._async_thread_executor.shutdown() @coroutine def schedule(self, build_job, loop): @@ -161,9 +192,8 @@ class EphemeralBuilderManager(BaseManager): yield From(self._etcd_client.delete(job_key)) - @staticmethod def _etcd_job_key(build_job): """ Create a key which is used to track a job in etcd. 
""" - return '{0}{1}'.format(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid']) + return os.path.join(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid']) diff --git a/test/test_buildman.py b/test/test_buildman.py index 0886b671a..d5a7423e6 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -1,12 +1,15 @@ import unittest import etcd +import os.path +import time -from trollius import coroutine, get_event_loop, From, Future +from trollius import coroutine, get_event_loop, From, Future, sleep from mock import Mock -from functools import partial +from threading import Event from buildman.manager.executor import BuilderExecutor -from buildman.manager.ephemeral import EphemeralBuilderManager, ETCD_BUILDER_PREFIX +from buildman.manager.ephemeral import (EphemeralBuilderManager, ETCD_BUILDER_PREFIX, + ETCD_EXPIRE_RESULT) from buildman.server import BuildJobResult from buildman.component.buildcomponent import BuildComponent @@ -14,10 +17,6 @@ from buildman.component.buildcomponent import BuildComponent BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead' -import logging -logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger(__name__) - def async_test(f): def wrapper(*args, **kwargs): coro = coroutine(f) @@ -29,11 +28,17 @@ def async_test(f): class TestEphemeral(unittest.TestCase): def __init__(self, *args, **kwargs): self.etcd_client_mock = None + self.etcd_wait_event = Event() self.test_executor = None super(TestEphemeral, self).__init__(*args, **kwargs) def _create_mock_etcd_client(self, *args, **kwargs): + def hang_until_event(*args, **kwargs): + time.sleep(.01) # 10ms to simulate network latency + self.etcd_wait_event.wait() + self.etcd_client_mock = Mock(spec=etcd.Client, name='etcd.Client') + self.etcd_client_mock.watch = Mock(side_effect=hang_until_event) return self.etcd_client_mock def _create_mock_executor(self, *args, **kwargs): @@ -61,6 +66,7 @@ class TestEphemeral(unittest.TestCase): self.old_etcd_client_klass = EphemeralBuilderManager._etcd_client_klass EphemeralBuilderManager._etcd_client_klass = self._create_mock_etcd_client + self.etcd_wait_event.clear() self.register_component_callback = Mock() self.uniregister_component_callback = Mock() @@ -77,7 +83,13 @@ class TestEphemeral(unittest.TestCase): self.manager.initialize({'EXECUTOR': 'test'}) + self.mock_job_key = os.path.join(ETCD_BUILDER_PREFIX, BUILD_UUID) + def tearDown(self): + self.etcd_wait_event.set() + + self.manager.shutdown() + del EphemeralBuilderManager._executors['test'] EphemeralBuilderManager._etcd_client_klass = self.old_etcd_client_klass @@ -95,15 +107,53 @@ class TestEphemeral(unittest.TestCase): self.assertTrue(is_scheduled) - job_key = ETCD_BUILDER_PREFIX + mock_job.job_details['build_uuid'] self.etcd_client_mock.read.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True) - self.assertEqual(len(self.test_executor.start_builder.call_args_list), 1) - self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], job_key) - self.assertEqual(self.etcd_client_mock.write.call_args_list[1][0][0], job_key) + self.assertEqual(self.test_executor.start_builder.call_count, 1) + self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key) + self.assertEqual(self.etcd_client_mock.write.call_args_list[1][0][0], self.mock_job_key) - self.assertEqual(len(self.register_component_callback.call_args_list), 1) + self.assertEqual(self.register_component_callback.call_count, 1) yield From(self.manager.job_completed(mock_job, BuildJobResult.COMPLETE, 
test_component)) - self.assertEqual(len(self.test_executor.stop_builder.call_args_list), 1) - self.etcd_client_mock.delete.assert_called_once_with(job_key) + self.assertEqual(self.test_executor.stop_builder.call_count, 1) + self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key) + + @async_test + def test_expiring_worker(self): + # Test that we are watching before anything else happens + self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True) + + # Send a signal to the callback that a worker has expired + expired_result = Mock(sepc=etcd.EtcdResult) + expired_result.action = ETCD_EXPIRE_RESULT + expired_result.key = self.mock_job_key + expired_result._prev_node = Mock(spec=etcd.EtcdResult) + expired_result._prev_node.value = {'builder_id': '1234'} + expired_future = Future() + expired_future.set_result(expired_result) + + self.manager._handle_key_expiration(expired_future) + + yield From(sleep(.01)) + + self.test_executor.stop_builder.assert_called_once_with('1234') + self.assertEqual(self.test_executor.stop_builder.call_count, 1) + + self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key) + + @async_test + def test_change_worker(self): + # Send a signal to the callback that a worker key has been changed + set_result = Mock(sepc=etcd.EtcdResult) + set_result.action = 'set' + set_result.key = self.mock_job_key + set_future = Future() + set_future.set_result(set_result) + + self.manager._handle_key_expiration(set_future) + + yield From(sleep(.01)) + + self.assertEquals(self.test_executor.stop_builder.call_count, 0) + From 34bf92673bfc1654f2a7020ed4b41d5a321a03e2 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Mon, 22 Dec 2014 17:24:44 -0500 Subject: [PATCH 006/127] Add support for adjusting etcd ttl on job_heartbeat. Switch the heartbeat method to a coroutine. --- buildman/component/buildcomponent.py | 2 +- buildman/manager/basemanager.py | 4 +++- buildman/manager/ephemeral.py | 36 ++++++++++++++++++++++++---- buildman/manager/executor.py | 1 + buildman/server.py | 6 ++++- test/test_buildman.py | 26 +++++++++++++++----- 6 files changed, 62 insertions(+), 13 deletions(-) diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index 53b04bf87..fb81e6aa5 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -338,7 +338,7 @@ class BuildComponent(BaseComponent): # Mark the build item. current_job = self._current_job if current_job is not None: - self.parent_manager.job_heartbeat(current_job) + yield trollius.From(self.parent_manager.job_heartbeat(current_job)) # Check the heartbeat from the worker. logger.debug('Checking heartbeat on realm %s', self.builder_realm) diff --git a/buildman/manager/basemanager.py b/buildman/manager/basemanager.py index 76e97e5ac..ee17cf531 100644 --- a/buildman/manager/basemanager.py +++ b/buildman/manager/basemanager.py @@ -3,13 +3,15 @@ from trollius import coroutine class BaseManager(object): """ Base for all worker managers. 
""" def __init__(self, register_component, unregister_component, job_heartbeat_callback, - job_complete_callback, public_ip_address): + job_complete_callback, public_ip_address, heartbeat_period_sec): self.register_component = register_component self.unregister_component = unregister_component self.job_heartbeat_callback = job_heartbeat_callback self.job_complete_callback = job_complete_callback self.public_ip_address = public_ip_address + self.heartbeat_period_sec = heartbeat_period_sec + @coroutine def job_heartbeat(self, build_job): """ Method invoked to tell the manager that a job is still running. This method will be called every few minutes. """ diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 80a96d336..fdc116e5b 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -83,7 +83,8 @@ class EphemeralBuilderManager(BaseManager): etcd_port = self._manager_config.get('ETCD_PORT', 2379) logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port) - self._async_thread_executor = ThreadPoolExecutor(self._manager_config.get('ETCD_WORKERS', 5)) + worker_threads = self._manager_config.get('ETCD_WORKER_THREADS', 5) + self._async_thread_executor = ThreadPoolExecutor(worker_threads) self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port), executor=self._async_thread_executor) @@ -131,14 +132,15 @@ class EphemeralBuilderManager(BaseManager): # First try to take a lock for this job, meaning we will be responsible for its lifeline realm = str(uuid.uuid4()) token = str(uuid.uuid4()) - expiration = datetime.utcnow() + timedelta(seconds=self.setup_time()) + ttl = self.setup_time() + expiration = datetime.utcnow() + timedelta(seconds=ttl) payload = { 'expiration': calendar.timegm(expiration.timetuple()), } try: - yield From(self._etcd_client.write(job_key, payload, prevExist=False)) + yield From(self._etcd_client.write(job_key, payload, prevExist=False, ttl=ttl)) component = self.register_component(realm, BuildComponent, token=token) self._component_to_job[component] = build_job except KeyError: @@ -168,11 +170,14 @@ class EphemeralBuilderManager(BaseManager): def build_component_disposed(self, build_component, timed_out): logger.debug('Calling build_component_disposed.') + # TODO make it so that I don't have to unregister the component if it timed out + self.unregister_component(build_component) + @coroutine def job_completed(self, build_job, job_status, build_component): logger.debug('Calling job_completed with status: %s', job_status) - # Kill he ephmeral builder + # Kill the ephmeral builder self._executor.stop_builder(self._component_to_builder.pop(build_component)) # Release the lock in etcd @@ -181,6 +186,24 @@ class EphemeralBuilderManager(BaseManager): self.job_complete_callback(build_job, job_status) + @coroutine + def job_heartbeat(self, build_job): + # Extend the deadline in etcd + job_key = self._etcd_job_key(build_job) + build_job_response = yield From(self._etcd_client.read(job_key)) + + ttl = self.heartbeat_period_sec * 2 + new_expiration = datetime.utcnow() + timedelta(seconds=ttl) + + payload = { + 'expiration': calendar.timegm(new_expiration.timetuple()), + 'builder_id': build_job_response.value['builder_id'], + } + + yield From(self._etcd_client.write(job_key, payload, ttl=ttl)) + + self.job_heartbeat_callback(build_job) + @coroutine def _clean_up_old_builder(self, job_key, job_payload): """ Terminate an old builders once the expiration date has passed. 
@@ -197,3 +220,8 @@ class EphemeralBuilderManager(BaseManager): """ Create a key which is used to track a job in etcd. """ return os.path.join(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid']) + + def num_workers(self): + """ Return the number of workers we're managing locally. + """ + return len(self._component_to_builder) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index 82b98ef5c..e3a6a4f4a 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -142,6 +142,7 @@ class EC2Executor(BuilderExecutor): if builder_id not in [si.id for si in stopped_instances]: raise ExecutorException('Unable to stop instance: %s' % builder_id) + class PopenExecutor(BuilderExecutor): """ Implementation of BuilderExecutor which uses Popen to fork a quay-builder process. """ diff --git a/buildman/server.py b/buildman/server.py index 576bb3a10..ba9536c1e 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -24,6 +24,8 @@ MINIMUM_JOB_EXTENSION = timedelta(minutes=2) WEBSOCKET_PORT = 8787 CONTROLLER_PORT = 8686 +HEARTBEAT_PERIOD_SEC = 30 + class BuildJobResult(object): """ Build job result enum """ INCOMPLETE = 'incomplete' @@ -52,6 +54,7 @@ class BuilderServer(object): self._job_heartbeat, self._job_complete, manager_public_ip, + HEARTBEAT_PERIOD_SEC, ) self._lifecycle_manager_config = lifecycle_manager_config @@ -140,7 +143,8 @@ class BuilderServer(object): @trollius.coroutine def _work_checker(self): while self._current_status == 'running': - logger.debug('Checking for more work for %d active workers', self._lifecycle_manager.num_workers()) + logger.debug('Checking for more work for %d active workers', + self._lifecycle_manager.num_workers()) job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time()) if job_item is None: logger.debug('No additional work found. 
Going to sleep for %s seconds', WORK_CHECK_TIMEOUT) diff --git a/test/test_buildman.py b/test/test_buildman.py index d5a7423e6..d31539a3d 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -78,11 +78,13 @@ class TestEphemeral(unittest.TestCase): self.uniregister_component_callback, self.job_heartbeat_callback, self.job_complete_callback, - '127.0.0.1' + '127.0.0.1', + 30, ) self.manager.initialize({'EXECUTOR': 'test'}) + self.mock_job = self._create_build_job() self.mock_job_key = os.path.join(ETCD_BUILDER_PREFIX, BUILD_UUID) def tearDown(self): @@ -95,15 +97,13 @@ class TestEphemeral(unittest.TestCase): @async_test def test_schedule_and_complete(self): - mock_job = self._create_build_job() - self.etcd_client_mock.read = Mock(side_effect=KeyError) test_component = BuildComponent(None) self.register_component_callback.return_value = test_component # Ask for a builder to be scheduled loop = get_event_loop() - is_scheduled = yield From(self.manager.schedule(mock_job, loop)) + is_scheduled = yield From(self.manager.schedule(self.mock_job, loop)) self.assertTrue(is_scheduled) @@ -114,7 +114,7 @@ class TestEphemeral(unittest.TestCase): self.assertEqual(self.register_component_callback.call_count, 1) - yield From(self.manager.job_completed(mock_job, BuildJobResult.COMPLETE, test_component)) + yield From(self.manager.job_completed(self.mock_job, BuildJobResult.COMPLETE, test_component)) self.assertEqual(self.test_executor.stop_builder.call_count, 1) self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key) @@ -125,7 +125,7 @@ class TestEphemeral(unittest.TestCase): self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True) # Send a signal to the callback that a worker has expired - expired_result = Mock(sepc=etcd.EtcdResult) + expired_result = Mock(spec=etcd.EtcdResult) expired_result.action = ETCD_EXPIRE_RESULT expired_result.key = self.mock_job_key expired_result._prev_node = Mock(spec=etcd.EtcdResult) @@ -157,3 +157,17 @@ class TestEphemeral(unittest.TestCase): self.assertEquals(self.test_executor.stop_builder.call_count, 0) + @async_test + def test_heartbeat_response(self): + builder_result = Mock(spec=etcd.EtcdResult) + builder_result.value = {'builder_id': '123', 'expiration': '123'} + self.etcd_client_mock.read = Mock(return_value=builder_result) + + yield From(self.manager.job_heartbeat(self.mock_job)) + + # Wait for threads to complete + yield From(sleep(.01)) + + self.job_heartbeat_callback.assert_called_once_with(self.mock_job) + self.assertEqual(self.etcd_client_mock.write.call_count, 1) + self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key) From aac7feb20b36542ea3594aa6343687b602558088 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 11:17:23 -0500 Subject: [PATCH 007/127] Refresh the build_job from the database before we write updates. --- buildman/component/buildcomponent.py | 2 +- buildman/jobutil/buildstatus.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index fb81e6aa5..391f8ffed 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -67,7 +67,7 @@ class BuildComponent(BaseComponent): def start_build(self, build_job): """ Starts a build. 
""" self._current_job = build_job - self._build_status = StatusHandler(self.build_logs, build_job.repo_build) + self._build_status = StatusHandler(self.build_logs, build_job.repo_build.uuid) self._image_info = {} self._set_status(ComponentStatus.BUILDING) diff --git a/buildman/jobutil/buildstatus.py b/buildman/jobutil/buildstatus.py index 68b8cd5e3..b79776c46 100644 --- a/buildman/jobutil/buildstatus.py +++ b/buildman/jobutil/buildstatus.py @@ -1,12 +1,12 @@ from data.database import BUILD_PHASE +from data import model class StatusHandler(object): """ Context wrapper for writing status to build logs. """ - def __init__(self, build_logs, repository_build): + def __init__(self, build_logs, repository_build_uuid): self._current_phase = None - self._repository_build = repository_build - self._uuid = repository_build.uuid + self._uuid = repository_build_uuid self._build_logs = build_logs self._status = { @@ -41,8 +41,11 @@ class StatusHandler(object): self._current_phase = phase self._append_log_message(phase, self._build_logs.PHASE, extra_data) - self._repository_build.phase = phase - self._repository_build.save() + + # Update the repository build with the new phase + repo_build = model.get_repository_build(self._uuid) + repo_build.phase = phase + repo_build.save() return True def __enter__(self): From 055a6b0c377f143038c1413b14f5e0137f143fdd Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 11:18:10 -0500 Subject: [PATCH 008/127] Add a total maximum time that a machine is allowed to stick around before we terminate it more forcefully. --- buildman/manager/ephemeral.py | 13 +++++++++++-- test/test_buildman.py | 7 ++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index fdc116e5b..7d9eacdc2 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -135,8 +135,12 @@ class EphemeralBuilderManager(BaseManager): ttl = self.setup_time() expiration = datetime.utcnow() + timedelta(seconds=ttl) + machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200) + max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration) + payload = { 'expiration': calendar.timegm(expiration.timetuple()), + 'max_expiration': calendar.timegm(max_expiration.timetuple()), } try: @@ -154,7 +158,7 @@ class EphemeralBuilderManager(BaseManager): # Store the builder in etcd associated with the job id payload['builder_id'] = builder_id - yield From(self._etcd_client.write(job_key, payload, prevExist=True)) + yield From(self._etcd_client.write(job_key, payload, prevExist=True, ttl=ttl)) raise Return(True) @@ -192,12 +196,17 @@ class EphemeralBuilderManager(BaseManager): job_key = self._etcd_job_key(build_job) build_job_response = yield From(self._etcd_client.read(job_key)) - ttl = self.heartbeat_period_sec * 2 + max_expiration = datetime.utcfromtimestamp(build_job_response.value['max_expiration']) + max_expiration_remaining = max_expiration - datetime.utcnow() + max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds())) + + ttl = min(self.heartbeat_period_sec * 2, max_expiration_sec) new_expiration = datetime.utcnow() + timedelta(seconds=ttl) payload = { 'expiration': calendar.timegm(new_expiration.timetuple()), 'builder_id': build_job_response.value['builder_id'], + 'max_expiration': build_job_response.value['max_expiration'], } yield From(self._etcd_client.write(job_key, payload, ttl=ttl)) diff --git a/test/test_buildman.py b/test/test_buildman.py index 
d31539a3d..6835cdd49 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -159,8 +159,13 @@ class TestEphemeral(unittest.TestCase): @async_test def test_heartbeat_response(self): + expiration_timestamp = time.time() + 60 builder_result = Mock(spec=etcd.EtcdResult) - builder_result.value = {'builder_id': '123', 'expiration': '123'} + builder_result.value = { + 'builder_id': '123', + 'expiration': expiration_timestamp, + 'max_expiration': expiration_timestamp, + } self.etcd_client_mock.read = Mock(return_value=builder_result) yield From(self.manager.job_heartbeat(self.mock_job)) From 709e571b78ff7b7345b3f1771368e673cc610065 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 12:13:49 -0500 Subject: [PATCH 009/127] Handle read timeouts from etcd when watching a key. --- buildman/manager/ephemeral.py | 7 ++++++- test/test_buildman.py | 13 +++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 7d9eacdc2..39776b60e 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -7,6 +7,7 @@ import os.path from datetime import datetime, timedelta from trollius import From, coroutine, Return, async from concurrent.futures import ThreadPoolExecutor +from urllib3.exceptions import ReadTimeoutError from buildman.manager.basemanager import BaseManager from buildman.manager.executor import PopenExecutor, EC2Executor @@ -65,7 +66,11 @@ class EphemeralBuilderManager(BaseManager): # Due to lack of interest, tomorrow has been cancelled return - etcd_result = changed_key_future.result() + try: + etcd_result = changed_key_future.result() + except ReadTimeoutError: + return + if etcd_result.action == ETCD_EXPIRE_RESULT: # Handle the expiration logger.debug('Builder expired, clean up the old build node') diff --git a/test/test_buildman.py b/test/test_buildman.py index 6835cdd49..9d0f5c1f4 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -6,6 +6,7 @@ import time from trollius import coroutine, get_event_loop, From, Future, sleep from mock import Mock from threading import Event +from urllib3.exceptions import ReadTimeoutError from buildman.manager.executor import BuilderExecutor from buildman.manager.ephemeral import (EphemeralBuilderManager, ETCD_BUILDER_PREFIX, @@ -176,3 +177,15 @@ class TestEphemeral(unittest.TestCase): self.job_heartbeat_callback.assert_called_once_with(self.mock_job) self.assertEqual(self.etcd_client_mock.write.call_count, 1) self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key) + + @async_test + def test_etcd_read_timeout(self): + # Send a signal to the callback that a worker key has been changed + read_timeout_future = Future() + read_timeout_future.set_exception(ReadTimeoutError(None, None, None)) + + self.manager._handle_key_expiration(read_timeout_future) + + yield From(sleep(.01)) + + self.assertEquals(self.test_executor.stop_builder.call_count, 0) From 4e22e22ba12b9c6899d9f49f76838bbec491ae4f Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 14:09:04 -0500 Subject: [PATCH 010/127] We have to serialize our build data before sending it to etc. 
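etcd stores values as plain strings, so the job payload has to be encoded with json.dumps before every write and decoded with json.loads whenever it is read back (including the expired _prev_node value). A minimal sketch of that round trip, using illustrative helper names rather than the exact manager code:

    import json
    import calendar
    from datetime import datetime, timedelta

    def encode_job_payload(builder_id, ttl):
        # etcd only stores strings, so the dict is serialized before the write.
        expiration = datetime.utcnow() + timedelta(seconds=ttl)
        return json.dumps({
            'builder_id': builder_id,
            'expiration': calendar.timegm(expiration.timetuple()),
        })

    def decode_job_payload(etcd_value):
        # Whatever comes back from etcd is a string; loads() restores the dict.
        return json.loads(etcd_value)
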
--- buildman/manager/ephemeral.py | 19 +++++++++++-------- test/test_buildman.py | 7 ++++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 39776b60e..3b467bb23 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -3,6 +3,7 @@ import etcd import uuid import calendar import os.path +import json from datetime import datetime, timedelta from trollius import From, coroutine, Return, async @@ -74,7 +75,8 @@ class EphemeralBuilderManager(BaseManager): if etcd_result.action == ETCD_EXPIRE_RESULT: # Handle the expiration logger.debug('Builder expired, clean up the old build node') - async(self._clean_up_old_builder(etcd_result.key, etcd_result._prev_node.value)) + job_metadata = json.loads(etcd_result._prev_node.value) + async(self._clean_up_old_builder(etcd_result.key, job_metadata)) def initialize(self, manager_config): logger.debug('Calling initialize') @@ -149,7 +151,7 @@ class EphemeralBuilderManager(BaseManager): } try: - yield From(self._etcd_client.write(job_key, payload, prevExist=False, ttl=ttl)) + yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=False, ttl=ttl)) component = self.register_component(realm, BuildComponent, token=token) self._component_to_job[component] = build_job except KeyError: @@ -163,7 +165,7 @@ class EphemeralBuilderManager(BaseManager): # Store the builder in etcd associated with the job id payload['builder_id'] = builder_id - yield From(self._etcd_client.write(job_key, payload, prevExist=True, ttl=ttl)) + yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl)) raise Return(True) @@ -199,9 +201,10 @@ class EphemeralBuilderManager(BaseManager): def job_heartbeat(self, build_job): # Extend the deadline in etcd job_key = self._etcd_job_key(build_job) - build_job_response = yield From(self._etcd_client.read(job_key)) + build_job_metadata_response = yield From(self._etcd_client.read(job_key)) + build_job_metadata = json.loads(build_job_metadata_response.value) - max_expiration = datetime.utcfromtimestamp(build_job_response.value['max_expiration']) + max_expiration = datetime.utcfromtimestamp(build_job_metadata['max_expiration']) max_expiration_remaining = max_expiration - datetime.utcnow() max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds())) @@ -210,11 +213,11 @@ class EphemeralBuilderManager(BaseManager): payload = { 'expiration': calendar.timegm(new_expiration.timetuple()), - 'builder_id': build_job_response.value['builder_id'], - 'max_expiration': build_job_response.value['max_expiration'], + 'builder_id': build_job_metadata['builder_id'], + 'max_expiration': build_job_metadata['max_expiration'], } - yield From(self._etcd_client.write(job_key, payload, ttl=ttl)) + yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=ttl)) self.job_heartbeat_callback(build_job) diff --git a/test/test_buildman.py b/test/test_buildman.py index 9d0f5c1f4..e33adccbd 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -2,6 +2,7 @@ import unittest import etcd import os.path import time +import json from trollius import coroutine, get_event_loop, From, Future, sleep from mock import Mock @@ -130,7 +131,7 @@ class TestEphemeral(unittest.TestCase): expired_result.action = ETCD_EXPIRE_RESULT expired_result.key = self.mock_job_key expired_result._prev_node = Mock(spec=etcd.EtcdResult) - expired_result._prev_node.value = {'builder_id': '1234'} + 
expired_result._prev_node.value = json.dumps({'builder_id': '1234'}) expired_future = Future() expired_future.set_result(expired_result) @@ -162,11 +163,11 @@ class TestEphemeral(unittest.TestCase): def test_heartbeat_response(self): expiration_timestamp = time.time() + 60 builder_result = Mock(spec=etcd.EtcdResult) - builder_result.value = { + builder_result.value = json.dumps({ 'builder_id': '123', 'expiration': expiration_timestamp, 'max_expiration': expiration_timestamp, - } + }) self.etcd_client_mock.read = Mock(return_value=builder_result) yield From(self.manager.job_heartbeat(self.mock_job)) From b2d7fad6676774cf22ce0c3324453f6d8c3ce285 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 14:09:24 -0500 Subject: [PATCH 011/127] Fix a typo with the automatic node shutdown fallback in the ephemeral nodes. --- buildman/templates/cloudconfig.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml index e75ce5626..fc8ec6a4f 100644 --- a/buildman/templates/cloudconfig.yaml +++ b/buildman/templates/cloudconfig.yaml @@ -35,4 +35,4 @@ coreos: ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest ExecStop=/usr/bin/docker stop quay-builder - ExecStopPost=/usr/bin/sudo /bin/sh -xc "/bin/sleep 600; /sbin/shutown -h now" + ExecStopPost=/usr/bin/sudo /bin/sh -xc "/bin/sleep 600; /sbin/shutdown -h now" From 2ed9b3d2437e19c7369dfd83c4eeed71d89dcf14 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 14:54:34 -0500 Subject: [PATCH 012/127] Disable the etcd timeout on watch calls to prevent them from disconnecting the client. --- buildman/manager/ephemeral.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 3b467bb23..ac603b9ce 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -21,6 +21,7 @@ logger = logging.getLogger(__name__) ETCD_BUILDER_PREFIX = 'building/' ETCD_EXPIRE_RESULT = 'expire' +ETCD_DISABLE_TIMEOUT = 0 class EphemeralBuilderManager(BaseManager): @@ -52,7 +53,8 @@ class EphemeralBuilderManager(BaseManager): """ Watch the builders key for expirations. """ if not self._shutting_down: - workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True) + workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True, + timeout=ETCD_DISABLE_TIMEOUT) workers_future.add_done_callback(self._handle_key_expiration) logger.debug('Scheduling watch task.') self._worker_watch_task = async(workers_future) From 723fb27671bd08690f74172f195ff315892a50c4 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 14:54:58 -0500 Subject: [PATCH 013/127] Calls to the ec2 service must be async, and responses must be wrapped as well. 
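boto's EC2 client is synchronous, so each call has to be pushed onto an executor and its future yielded on, otherwise the trollius event loop stalls for the duration of the AWS request; the objects it returns are wrapped the same way before their methods are used. A minimal sketch of the underlying pattern, assuming only the standard trollius APIs rather than the exact executor code:

    from functools import partial
    from trollius import From, Return, coroutine, get_event_loop

    @coroutine
    def call_blocking(blocking_fn, *args, **kwargs):
        # Run a blocking call (e.g. a boto request) on the default thread pool
        # and suspend this coroutine until the result is ready.
        loop = get_event_loop()
        call = partial(blocking_fn, *args, **kwargs)
        result = yield From(loop.run_in_executor(None, call))
        raise Return(result)
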
--- buildman/manager/executor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index e3a6a4f4a..beef14881 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -113,32 +113,32 @@ class EC2Executor(BuilderExecutor): logger.debug('Generated cloud config: %s', user_data) ec2_conn = self._get_conn() - reservation = yield ec2_conn.run_instances( + reservation = yield From(ec2_conn.run_instances( coreos_ami, instance_type=self.executor_config['EC2_INSTANCE_TYPE'], security_groups=self.executor_config['EC2_SECURITY_GROUP_IDS'], key_name=self.executor_config.get('EC2_KEY_NAME', None), user_data=user_data, instance_initiated_shutdown_behavior='terminate', - ) + )) if not reservation.instances: raise ExecutorException('Unable to spawn builder instance.') elif len(reservation.instances) != 1: raise ExecutorException('EC2 started wrong number of instances!') - launched = reservation.instances[0] - launched.add_tags({ + launched = AsyncWrapper(reservation.instances[0]) + yield From(launched.add_tags({ 'Name': 'Quay Ephemeral Builder', 'Realm': realm, 'Token': token, - }) + })) raise Return(launched.id) @coroutine def stop_builder(self, builder_id): ec2_conn = self._get_conn() - stopped_instances = yield ec2_conn.stop_instances([builder_id], force=True) + stopped_instances = yield From(ec2_conn.stop_instances([builder_id], force=True)) if builder_id not in [si.id for si in stopped_instances]: raise ExecutorException('Unable to stop instance: %s' % builder_id) From 2f2a88825d4fdc8c970adb1dfcc8e714d297d626 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 15:35:21 -0500 Subject: [PATCH 014/127] Try using SSD for root volumes. --- buildman/manager/executor.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index beef14881..b2081d581 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -113,6 +113,14 @@ class EC2Executor(BuilderExecutor): logger.debug('Generated cloud config: %s', user_data) ec2_conn = self._get_conn() + + ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType( + size=8, + volume_type='gp2', + delete_on_termination=True, + ) + block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping() + block_devices['/dev/sda1'] = ssd_root_ebs reservation = yield From(ec2_conn.run_instances( coreos_ami, instance_type=self.executor_config['EC2_INSTANCE_TYPE'], @@ -120,6 +128,7 @@ class EC2Executor(BuilderExecutor): key_name=self.executor_config.get('EC2_KEY_NAME', None), user_data=user_data, instance_initiated_shutdown_behavior='terminate', + block_device_map=block_devices, )) if not reservation.instances: From 4a2295373f5e997bbd65eee664c80bd3f1e53c62 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 15:35:34 -0500 Subject: [PATCH 015/127] Fix tests for no timeout watches. 
--- test/test_buildman.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_buildman.py b/test/test_buildman.py index e33adccbd..0d0b6ced2 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -124,7 +124,8 @@ class TestEphemeral(unittest.TestCase): @async_test def test_expiring_worker(self): # Test that we are watching before anything else happens - self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True) + self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True, + timeout=0) # Send a signal to the callback that a worker has expired expired_result = Mock(spec=etcd.EtcdResult) From 8e16fbf59b9530672629ab9959a27bdb84a62e20 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 15:41:58 -0500 Subject: [PATCH 016/127] The root device on CoreOS is /dev/xvda. --- buildman/manager/executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index b2081d581..b80e87922 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -120,7 +120,7 @@ class EC2Executor(BuilderExecutor): delete_on_termination=True, ) block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping() - block_devices['/dev/sda1'] = ssd_root_ebs + block_devices['/dev/xvda'] = ssd_root_ebs reservation = yield From(ec2_conn.run_instances( coreos_ami, instance_type=self.executor_config['EC2_INSTANCE_TYPE'], From ef70432b117f0fa85d9e5e0d840a439c894447be Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 16:04:10 -0500 Subject: [PATCH 017/127] We need to call build_finished async. --- buildman/component/buildcomponent.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index 391f8ffed..726435bcc 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -247,7 +247,7 @@ class BuildComponent(BaseComponent): logger.warning('Build %s failed with message: %s', build_id, error_message) # Mark that the build has finished (in an error state) - self._build_finished(BuildJobResult.ERROR) + trollius.async(self._build_finished(BuildJobResult.ERROR)) def _build_complete(self, result): """ Wraps up a completed build. Handles any errors and calls self._build_finished. """ @@ -255,7 +255,7 @@ class BuildComponent(BaseComponent): # Retrieve the result. This will raise an ApplicationError on any error that occurred. result.result() self._build_status.set_phase(BUILD_PHASE.COMPLETE) - self._build_finished(BuildJobResult.COMPLETE) + trollius.async(self._build_finished(BuildJobResult.COMPLETE)) except ApplicationError as aex: worker_error = WorkerError(aex.error, aex.kwargs.get('base_error')) @@ -265,9 +265,9 @@ class BuildComponent(BaseComponent): # Mark the build as completed. if worker_error.is_internal_error(): - self._build_finished(BuildJobResult.INCOMPLETE) + trollius.async(self._build_finished(BuildJobResult.INCOMPLETE)) else: - self._build_finished(BuildJobResult.ERROR) + trollius.async(self._build_finished(BuildJobResult.ERROR)) @trollius.coroutine def _build_finished(self, job_status): From 3ce64b4a7f5b34cb79fc1882d7654f358ad4f1e1 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 16:12:10 -0500 Subject: [PATCH 018/127] We must yield from stop_builder. 
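In trollius, calling a coroutine function only builds a coroutine object; its body does not run until it is yielded from (or scheduled with async()), so the bare stop_builder() call silently did nothing. A small sketch of the difference, with hypothetical names rather than the real manager code:

    from trollius import From, coroutine, get_event_loop, sleep

    @coroutine
    def stop_builder(builder_id):
        yield From(sleep(0))
        print('stopped %s' % builder_id)

    @coroutine
    def job_completed(builder_id):
        stop_builder(builder_id)              # bug: coroutine object is created but never runs
        yield From(stop_builder(builder_id))  # fix: actually waits for the builder to stop

    get_event_loop().run_until_complete(job_completed('i-1234'))
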
--- buildman/manager/ephemeral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index ac603b9ce..63f03a6b7 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -191,7 +191,7 @@ class EphemeralBuilderManager(BaseManager): logger.debug('Calling job_completed with status: %s', job_status) # Kill the ephmeral builder - self._executor.stop_builder(self._component_to_builder.pop(build_component)) + yield From(self._executor.stop_builder(self._component_to_builder.pop(build_component))) # Release the lock in etcd job_key = self._etcd_job_key(build_job) From cece94e1dad95208303dffe109641729f7056973 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 16:20:42 -0500 Subject: [PATCH 019/127] We want to terminate instances, not stop them. --- buildman/manager/executor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index b80e87922..1a7c4e114 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -147,9 +147,9 @@ class EC2Executor(BuilderExecutor): @coroutine def stop_builder(self, builder_id): ec2_conn = self._get_conn() - stopped_instances = yield From(ec2_conn.stop_instances([builder_id], force=True)) - if builder_id not in [si.id for si in stopped_instances]: - raise ExecutorException('Unable to stop instance: %s' % builder_id) + terminated_instances = yield From(ec2_conn.terminate_instances([builder_id], force=True)) + if builder_id not in [si.id for si in terminated_instances]: + raise ExecutorException('Unable to terminate instance: %s' % builder_id) class PopenExecutor(BuilderExecutor): From 1005c29b6b914d9fa837f4015d4ab31b186e5b89 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 17:08:16 -0500 Subject: [PATCH 020/127] Fix the shutdown command for when the builder terminates itself. --- buildman/templates/cloudconfig.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml index fc8ec6a4f..321e75927 100644 --- a/buildman/templates/cloudconfig.yaml +++ b/buildman/templates/cloudconfig.yaml @@ -31,8 +31,8 @@ coreos: [Service] TimeoutStartSec=600 TimeoutStopSec=2000 - ExecStartPre=/usr/bin/sudo /bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0" + ExecStartPre=/bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0" ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest ExecStop=/usr/bin/docker stop quay-builder - ExecStopPost=/usr/bin/sudo /bin/sh -xc "/bin/sleep 600; /sbin/shutdown -h now" + ExecStopPost=/bin/sh -xc "/bin/sleep 600; /usr/bin/systemctl --no-block poweroff" From ec87e37d8c5f5788e8af231a7d5b6acb8cc471b3 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 17:17:53 -0500 Subject: [PATCH 021/127] EC2 terminate_instances does not take a force flag. 
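In boto's EC2 API only stop_instances accepts a force flag; terminate_instances just takes the list of instance ids and returns the Instance objects that began terminating. An illustrative check of that return value (the region and instance id below are placeholders, not the executor's configuration):

    import boto.ec2

    # Assumes AWS credentials are available in the environment.
    ec2_conn = boto.ec2.connect_to_region('us-east-1')
    terminated = ec2_conn.terminate_instances(['i-0123456789abcdef0'])
    if 'i-0123456789abcdef0' not in [inst.id for inst in terminated]:
        raise Exception('Unable to terminate instance')
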
--- buildman/manager/executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index 1a7c4e114..814b95a5b 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -147,7 +147,7 @@ class EC2Executor(BuilderExecutor): @coroutine def stop_builder(self, builder_id): ec2_conn = self._get_conn() - terminated_instances = yield From(ec2_conn.terminate_instances([builder_id], force=True)) + terminated_instances = yield From(ec2_conn.terminate_instances([builder_id])) if builder_id not in [si.id for si in terminated_instances]: raise ExecutorException('Unable to terminate instance: %s' % builder_id) From ccb19571d635f57fd7c0c22459a7719692759f77 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Tue, 23 Dec 2014 17:42:47 -0500 Subject: [PATCH 022/127] Try lowering the sleep on the shutdown timeout to avoid the service dispatch timeout built into systemd. --- buildman/templates/cloudconfig.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml index 321e75927..3bebde670 100644 --- a/buildman/templates/cloudconfig.yaml +++ b/buildman/templates/cloudconfig.yaml @@ -35,4 +35,4 @@ coreos: ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest ExecStop=/usr/bin/docker stop quay-builder - ExecStopPost=/bin/sh -xc "/bin/sleep 600; /usr/bin/systemctl --no-block poweroff" + ExecStopPost=/bin/sh -xc "/bin/sleep 120; /usr/bin/systemctl --no-block poweroff" From cc70225043f5990d62a3acc3faba83c618d9a7db Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Wed, 31 Dec 2014 11:33:56 -0500 Subject: [PATCH 023/127] Generalize the ephemeral build managers so that any manager may manage a builder spawned by any other manager. 
--- buildman/builder.py | 9 +- buildman/component/buildcomponent.py | 49 ++++++---- buildman/jobutil/buildjob.py | 17 +++- buildman/manager/basemanager.py | 9 +- buildman/manager/enterprise.py | 11 ++- buildman/manager/ephemeral.py | 141 +++++++++++++++++++-------- buildman/manager/executor.py | 14 +-- buildman/server.py | 7 +- buildman/templates/cloudconfig.yaml | 3 +- data/queue.py | 3 +- test/test_buildman.py | 120 +++++++++++++++-------- 11 files changed, 258 insertions(+), 125 deletions(-) diff --git a/buildman/builder.py b/buildman/builder.py index 4e88d3ed7..e1c7a852b 100644 --- a/buildman/builder.py +++ b/buildman/builder.py @@ -41,8 +41,11 @@ def run_build_manager(): if manager_klass is None: return - public_ip = os.environ.get('PUBLIC_IP', '127.0.0.1') - logger.debug('Will pass public IP address %s to builders for websocket connection', public_ip) + manager_hostname = os.environ.get('BUILDMAN_HOSTNAME', + app.config.get('BUILDMAN_HOSTNAME', + app.config['SERVER_HOSTNAME'])) + logger.debug('Will pass buildman hostname %s to builders for websocket connection', + manager_hostname) logger.debug('Starting build manager with lifecycle "%s"', build_manager_config[0]) ssl_context = None @@ -53,7 +56,7 @@ def run_build_manager(): os.path.join(os.environ.get('SSL_CONFIG'), 'ssl.key')) server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs, - user_files, manager_klass, build_manager_config[1], public_ip) + user_files, manager_klass, build_manager_config[1], manager_hostname) server.run('0.0.0.0', ssl=ssl_context) if __name__ == '__main__': diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index 726435bcc..42e6696f2 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -9,6 +9,7 @@ from autobahn.wamp.exception import ApplicationError from buildman.server import BuildJobResult from buildman.component.basecomponent import BaseComponent +from buildman.jobutil.buildjob import BuildJobLoadException from buildman.jobutil.buildpack import BuildPackage, BuildPackageException from buildman.jobutil.buildstatus import StatusHandler from buildman.jobutil.workererror import WorkerError @@ -58,19 +59,20 @@ class BuildComponent(BaseComponent): yield trollius.From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat')) yield trollius.From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage')) - self._set_status(ComponentStatus.WAITING) + yield trollius.From(self._set_status(ComponentStatus.WAITING)) def is_ready(self): """ Determines whether a build component is ready to begin a build. """ return self._component_status == ComponentStatus.RUNNING + @trollius.coroutine def start_build(self, build_job): """ Starts a build. """ self._current_job = build_job self._build_status = StatusHandler(self.build_logs, build_job.repo_build.uuid) self._image_info = {} - self._set_status(ComponentStatus.BUILDING) + yield trollius.From(self._set_status(ComponentStatus.BUILDING)) # Retrieve the job's buildpack. buildpack_url = self.user_files.get_file_url(build_job.repo_build.resource_key, @@ -82,23 +84,27 @@ class BuildComponent(BaseComponent): buildpack = BuildPackage.from_url(buildpack_url) except BuildPackageException as bpe: self._build_failure('Could not retrieve build package', bpe) - return + raise trollius.Return() # Extract the base image information from the Dockerfile. 
parsed_dockerfile = None logger.debug('Parsing dockerfile') - build_config = build_job.build_config + try: + build_config = build_job.build_config + except BuildJobLoadException as irbe: + self._build_failure('Could not load build job information', irbe) + try: parsed_dockerfile = buildpack.parse_dockerfile(build_config.get('build_subdir')) except BuildPackageException as bpe: self._build_failure('Could not find Dockerfile in build package', bpe) - return + raise trollius.Return() image_and_tag_tuple = parsed_dockerfile.get_image_and_tag() if image_and_tag_tuple is None or image_and_tag_tuple[0] is None: self._build_failure('Missing FROM line in Dockerfile') - return + raise trollius.Return() base_image_information = { 'repository': image_and_tag_tuple[0], @@ -147,9 +153,7 @@ class BuildComponent(BaseComponent): logger.debug('Invoking build: %s', self.builder_realm) logger.debug('With Arguments: %s', build_arguments) - return (self - .call("io.quay.builder.build", **build_arguments) - .add_done_callback(self._build_complete)) + self.call("io.quay.builder.build", **build_arguments).add_done_callback(self._build_complete) @staticmethod def _total_completion(statuses, total_images): @@ -276,38 +280,42 @@ class BuildComponent(BaseComponent): self._current_job = None # Set the component back to a running state. - self._set_status(ComponentStatus.RUNNING) + yield trollius.From(self._set_status(ComponentStatus.RUNNING)) @staticmethod def _ping(): """ Ping pong. """ return 'pong' + @trollius.coroutine def _on_ready(self, token, version): if not version in SUPPORTED_WORKER_VERSIONS: - logger.warning('Build component (token "%s") is running an out-of-date version: %s', version) - return False + logger.warning('Build component (token "%s") is running an out-of-date version: %s', token, + version) + raise trollius.Return(False) if self._component_status != 'waiting': logger.warning('Build component (token "%s") is already connected', self.expected_token) - return False + raise trollius.Return(False) if token != self.expected_token: - logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token, token) - return False + logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token, + token) + raise trollius.Return(False) - self._set_status(ComponentStatus.RUNNING) + yield trollius.From(self._set_status(ComponentStatus.RUNNING)) # Start the heartbeat check and updating loop. 
loop = trollius.get_event_loop() loop.create_task(self._heartbeat()) logger.debug('Build worker %s is connected and ready', self.builder_realm) - return True + raise trollius.Return(True) + @trollius.coroutine def _set_status(self, phase): if phase == ComponentStatus.RUNNING: loop = trollius.get_event_loop() - self.parent_manager.build_component_ready(self, loop) + yield trollius.From(self.parent_manager.build_component_ready(self, loop)) self._component_status = phase @@ -344,13 +352,14 @@ class BuildComponent(BaseComponent): logger.debug('Checking heartbeat on realm %s', self.builder_realm) if (self._last_heartbeat and self._last_heartbeat < datetime.datetime.utcnow() - HEARTBEAT_DELTA): - self._timeout() + yield trollius.From(self._timeout()) return yield trollius.From(trollius.sleep(HEARTBEAT_TIMEOUT)) + @trollius.coroutine def _timeout(self): - self._set_status(ComponentStatus.TIMED_OUT) + yield trollius.From(self._set_status(ComponentStatus.TIMED_OUT)) logger.warning('Build component with realm %s has timed out', self.builder_realm) self._dispose(timed_out=True) diff --git a/buildman/jobutil/buildjob.py b/buildman/jobutil/buildjob.py index e92be23a6..c2d2769db 100644 --- a/buildman/jobutil/buildjob.py +++ b/buildman/jobutil/buildjob.py @@ -1,6 +1,9 @@ +import json + +from cachetools import lru_cache + from data import model -import json class BuildJobLoadException(Exception): """ Exception raised if a build job could not be instantiated for some reason. """ @@ -18,14 +21,22 @@ class BuildJob(object): 'Could not parse build queue item config with ID %s' % self.job_details['build_uuid'] ) + @lru_cache(maxsize=1) + def _load_repo_build(self): try: - self.repo_build = model.get_repository_build(self.job_details['build_uuid']) + return model.get_repository_build(self.job_details['build_uuid']) except model.InvalidRepositoryBuildException: raise BuildJobLoadException( 'Could not load repository build with ID %s' % self.job_details['build_uuid']) + @property + def repo_build(self): + return self._load_repo_build() + + @property + def build_config(self): try: - self.build_config = json.loads(self.repo_build.job_config) + return json.loads(self.repo_build.job_config) except ValueError: raise BuildJobLoadException( 'Could not parse repository build job config with ID %s' % self.job_details['build_uuid'] diff --git a/buildman/manager/basemanager.py b/buildman/manager/basemanager.py index ee17cf531..2c57ac095 100644 --- a/buildman/manager/basemanager.py +++ b/buildman/manager/basemanager.py @@ -3,12 +3,12 @@ from trollius import coroutine class BaseManager(object): """ Base for all worker managers. """ def __init__(self, register_component, unregister_component, job_heartbeat_callback, - job_complete_callback, public_ip_address, heartbeat_period_sec): + job_complete_callback, manager_hostname, heartbeat_period_sec): self.register_component = register_component self.unregister_component = unregister_component self.job_heartbeat_callback = job_heartbeat_callback self.job_complete_callback = job_complete_callback - self.public_ip_address = public_ip_address + self.manager_hostname = manager_hostname self.heartbeat_period_sec = heartbeat_period_sec @coroutine @@ -31,7 +31,7 @@ class BaseManager(object): raise NotImplementedError @coroutine - def schedule(self, build_job, loop): + def schedule(self, build_job): """ Schedules a queue item to be built. Returns True if the item was properly scheduled and False if all workers are busy. 
""" @@ -42,7 +42,8 @@ class BaseManager(object): """ raise NotImplementedError - def build_component_ready(self, build_component, loop): + @coroutine + def build_component_ready(self, build_component): """ Method invoked whenever a build component announces itself as ready. """ raise NotImplementedError diff --git a/buildman/manager/enterprise.py b/buildman/manager/enterprise.py index 5a97c0955..d7fdea39a 100644 --- a/buildman/manager/enterprise.py +++ b/buildman/manager/enterprise.py @@ -5,7 +5,7 @@ from buildman.component.basecomponent import BaseComponent from buildman.component.buildcomponent import BuildComponent from buildman.manager.basemanager import BaseManager -from trollius.coroutines import From, Return, coroutine +from trollius import From, Return, coroutine, async REGISTRATION_REALM = 'registration' logger = logging.getLogger(__name__) @@ -51,16 +51,19 @@ class EnterpriseManager(BaseManager): return realm @coroutine - def schedule(self, build_job, loop): + def schedule(self, build_job): """ Schedules a build for an Enterprise Registry. """ if self.shutting_down or not self.ready_components: raise Return(False) component = self.ready_components.pop() - loop.call_soon(component.start_build, build_job) + + yield From(component.start_build(build_job)) + raise Return(True) - def build_component_ready(self, build_component, loop): + @coroutine + def build_component_ready(self, build_component): self.ready_components.add(build_component) def shutdown(self): diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 63f03a6b7..7126ec836 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -13,16 +13,28 @@ from urllib3.exceptions import ReadTimeoutError from buildman.manager.basemanager import BaseManager from buildman.manager.executor import PopenExecutor, EC2Executor from buildman.component.buildcomponent import BuildComponent +from buildman.jobutil.buildjob import BuildJob from buildman.asyncutil import AsyncWrapper +from util.morecollections import AttrDict logger = logging.getLogger(__name__) ETCD_BUILDER_PREFIX = 'building/' -ETCD_EXPIRE_RESULT = 'expire' +ETCD_REALM_PREFIX = 'realm/' ETCD_DISABLE_TIMEOUT = 0 +class EtcdAction(object): + GET = 'get' + SET = 'set' + EXPIRE = 'expire' + UPDATE = 'update' + DELETE = 'delete' + CREATE = 'create' + COMPARE_AND_SWAP = 'compareAndSwap' + COMPARE_AND_DELETE = 'compareAndDelete' + class EphemeralBuilderManager(BaseManager): """ Build manager implementation for the Enterprise Registry. """ @@ -41,52 +53,82 @@ class EphemeralBuilderManager(BaseManager): self._etcd_client = None self._component_to_job = {} + self._job_uuid_to_component = {} self._component_to_builder = {} self._executor = None - self._worker_watch_task = None + # Map of etcd keys being watched to the tasks watching them + self._watch_tasks = {} super(EphemeralBuilderManager, self).__init__(*args, **kwargs) - def _watch_builders(self): - """ Watch the builders key for expirations. 
- """ + def _watch_etcd(self, etcd_key, change_callback, recursive=True): + watch_task_key = (etcd_key, recursive) + def callback_wrapper(changed_key_future): + + if watch_task_key not in self._watch_tasks or self._watch_tasks[watch_task_key].done(): + self._watch_etcd(etcd_key, change_callback) + + if changed_key_future.cancelled(): + # Due to lack of interest, tomorrow has been cancelled + return + + try: + etcd_result = changed_key_future.result() + except ReadTimeoutError: + return + + change_callback(etcd_result) + if not self._shutting_down: - workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True, - timeout=ETCD_DISABLE_TIMEOUT) - workers_future.add_done_callback(self._handle_key_expiration) - logger.debug('Scheduling watch task.') - self._worker_watch_task = async(workers_future) + watch_future = self._etcd_client.watch(etcd_key, recursive=recursive, + timeout=ETCD_DISABLE_TIMEOUT) + watch_future.add_done_callback(callback_wrapper) + logger.debug('Scheduling watch of key: %s%s', etcd_key, '/*' if recursive else '') + self._watch_tasks[watch_task_key] = async(watch_future) - def _handle_key_expiration(self, changed_key_future): - """ Handle when a builder expires - """ - if self._worker_watch_task is None or self._worker_watch_task.done(): - self._watch_builders() - - if changed_key_future.cancelled(): - # Due to lack of interest, tomorrow has been cancelled - return - - try: - etcd_result = changed_key_future.result() - except ReadTimeoutError: - return - - if etcd_result.action == ETCD_EXPIRE_RESULT: + def _handle_builder_expiration(self, etcd_result): + if etcd_result.action == EtcdAction.EXPIRE: # Handle the expiration logger.debug('Builder expired, clean up the old build node') job_metadata = json.loads(etcd_result._prev_node.value) async(self._clean_up_old_builder(etcd_result.key, job_metadata)) + def _handle_realm_change(self, etcd_result): + if etcd_result.action == EtcdAction.SET: + # We must listen on the realm created by ourselves or another worker + realm_spec = json.loads(etcd_result.value) + component = self.register_component(realm_spec['realm'], BuildComponent, + token=realm_spec['token']) + build_job = BuildJob(AttrDict(realm_spec['job_queue_item'])) + self._component_to_job[component] = build_job + self._component_to_builder[component] = realm_spec['builder_id'] + self._job_uuid_to_component[build_job.job_details['build_uuid']] = component + + elif etcd_result.action == EtcdAction.DELETE or etcd_result.action == EtcdAction.EXPIRE: + # We must stop listening for new connections on the specified realm, if we did not get the + # connection + realm_spec = json.loads(etcd_result._prev_node.value) + build_job = BuildJob(AttrDict(realm_spec['job_queue_item'])) + component = self._job_uuid_to_component.pop(build_job.job_details['build_uuid'], None) + if component is not None: + # We were not the manager which the worker connected to, remove the bookkeeping for it + logger.debug('Unregistering unused component on realm: %s', realm_spec['realm']) + del self._component_to_job[component] + del self._component_to_builder[component] + self.unregister_component(component) + + else: + logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key) + def initialize(self, manager_config): logger.debug('Calling initialize') self._manager_config = manager_config executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor) self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}), - 
self.public_ip_address) + self.manager_hostname) etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1') etcd_port = self._manager_config.get('ETCD_PORT', 2379) @@ -97,7 +139,8 @@ class EphemeralBuilderManager(BaseManager): self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port), executor=self._async_thread_executor) - self._watch_builders() + self._watch_etcd(ETCD_BUILDER_PREFIX, self._handle_builder_expiration) + self._watch_etcd(ETCD_REALM_PREFIX, self._handle_realm_change) def setup_time(self): setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300) @@ -108,17 +151,17 @@ class EphemeralBuilderManager(BaseManager): logger.debug('Shutting down worker.') self._shutting_down = True - if self._worker_watch_task is not None: - logger.debug('Canceling watch task.') - self._worker_watch_task.cancel() - self._worker_watch_task = None + for (etcd_key, _), task in self._watch_tasks.items(): + if not task.done(): + logger.debug('Canceling watch task for %s', etcd_key) + task.cancel() if self._async_thread_executor is not None: logger.debug('Shutting down thread pool executor.') self._async_thread_executor.shutdown() @coroutine - def schedule(self, build_job, loop): + def schedule(self, build_job): logger.debug('Calling schedule with job: %s', build_job.job_details['build_uuid']) # Check if there are worker slots avialable by checking the number of jobs in etcd @@ -154,8 +197,6 @@ class EphemeralBuilderManager(BaseManager): try: yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=False, ttl=ttl)) - component = self.register_component(realm, BuildComponent, token=token) - self._component_to_job[component] = build_job except KeyError: # The job was already taken by someone else, we are probably a retry logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?') @@ -163,20 +204,38 @@ class EphemeralBuilderManager(BaseManager): logger.debug('Starting builder with executor: %s', self._executor) builder_id = yield From(self._executor.start_builder(realm, token)) - self._component_to_builder[component] = builder_id # Store the builder in etcd associated with the job id payload['builder_id'] = builder_id yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl)) + # Store the realm spec which will allow any manager to accept this builder when it connects + realm_spec = json.dumps({ + 'realm': realm, + 'token': token, + 'builder_id': builder_id, + 'job_queue_item': build_job.job_item, + }) + try: + yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False, + ttl=ttl)) + except KeyError: + logger.error('Realm already exists in etcd. 
UUID collision or something is very very wrong.') + raise Return(False) + raise Return(True) - def build_component_ready(self, build_component, loop): + @coroutine + def build_component_ready(self, build_component): try: + # Clean up the bookkeeping for allowing any manager to take the job job = self._component_to_job.pop(build_component) + del self._job_uuid_to_component[job.job_details['build_uuid']] + yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm))) + logger.debug('Sending build %s to newly ready component on realm %s', job.job_details['build_uuid'], build_component.builder_realm) - loop.call_soon(build_component.start_build, job) + yield From(build_component.start_build(job)) except KeyError: logger.warning('Builder is asking for more work, but work already completed') @@ -240,6 +299,12 @@ class EphemeralBuilderManager(BaseManager): """ return os.path.join(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid']) + @staticmethod + def _etcd_realm_key(realm): + """ Create a key which is used to track an incoming connection on a realm. + """ + return os.path.join(ETCD_REALM_PREFIX, realm) + def num_workers(self): """ Return the number of workers we're managing locally. """ diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index 814b95a5b..c4b38366d 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -29,9 +29,9 @@ class ExecutorException(Exception): class BuilderExecutor(object): - def __init__(self, executor_config, manager_public_ip): + def __init__(self, executor_config, manager_hostname): self.executor_config = executor_config - self.manager_public_ip = manager_public_ip + self.manager_hostname = manager_hostname """ Interface which can be plugged into the EphemeralNodeManager to provide a strategy for starting and stopping builders. @@ -52,7 +52,7 @@ class BuilderExecutor(object): def get_manager_websocket_url(self): return 'ws://{0}:' - def generate_cloud_config(self, realm, token, coreos_channel, manager_ip, + def generate_cloud_config(self, realm, token, coreos_channel, manager_hostname, quay_username=None, quay_password=None, etcd_token=None): if quay_username is None: quay_username = self.executor_config['QUAY_USERNAME'] @@ -69,7 +69,7 @@ class BuilderExecutor(object): quay_username=quay_username, quay_password=quay_password, etcd_token=etcd_token, - manager_ip=manager_ip, + manager_hostname=manager_hostname, coreos_channel=coreos_channel, ) @@ -108,7 +108,7 @@ class EC2Executor(BuilderExecutor): channel = self.executor_config.get('COREOS_CHANNEL', 'stable') get_ami_callable = partial(self._get_coreos_ami, region, channel) coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable)) - user_data = self.generate_cloud_config(realm, token, channel, self.manager_public_ip) + user_data = self.generate_cloud_config(realm, token, channel, self.manager_hostname) logger.debug('Generated cloud config: %s', user_data) @@ -155,10 +155,10 @@ class EC2Executor(BuilderExecutor): class PopenExecutor(BuilderExecutor): """ Implementation of BuilderExecutor which uses Popen to fork a quay-builder process. """ - def __init__(self, executor_config, manager_public_ip): + def __init__(self, executor_config, manager_hostname): self._jobs = {} - super(PopenExecutor, self).__init__(executor_config, manager_public_ip) + super(PopenExecutor, self).__init__(executor_config, manager_hostname) """ Executor which uses Popen to fork a quay-builder process. 
""" diff --git a/buildman/server.py b/buildman/server.py index ba9536c1e..e1175f718 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -37,7 +37,7 @@ class BuilderServer(object): controller. """ def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass, - lifecycle_manager_config, manager_public_ip): + lifecycle_manager_config, manager_hostname): self._loop = None self._current_status = 'starting' self._current_components = [] @@ -53,7 +53,7 @@ class BuilderServer(object): self._unregister_component, self._job_heartbeat, self._job_complete, - manager_public_ip, + manager_hostname, HEARTBEAT_PERIOD_SEC, ) self._lifecycle_manager_config = lifecycle_manager_config @@ -158,7 +158,7 @@ class BuilderServer(object): self._queue.incomplete(job_item, restore_retry=False) logger.debug('Build job found. Checking for an avaliable worker.') - scheduled = yield From(self._lifecycle_manager.schedule(build_job, self._loop)) + scheduled = yield From(self._lifecycle_manager.schedule(build_job)) if scheduled: self._job_count = self._job_count + 1 logger.debug('Build job scheduled. Running: %s', self._job_count) @@ -168,7 +168,6 @@ class BuilderServer(object): yield From(trollius.sleep(WORK_CHECK_TIMEOUT)) - @trollius.coroutine def _initialize(self, loop, host, ssl=None): self._loop = loop diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml index 3bebde670..d6ae3aeca 100644 --- a/buildman/templates/cloudconfig.yaml +++ b/buildman/templates/cloudconfig.yaml @@ -6,7 +6,7 @@ write_files: content: | REALM={{ realm }} TOKEN={{ token }} - ENDPOINT=wss://buildman.quay.io:8787 + SERVER=wss://{{ manager_hostname }} coreos: update: @@ -31,7 +31,6 @@ coreos: [Service] TimeoutStartSec=600 TimeoutStopSec=2000 - ExecStartPre=/bin/sh -xc "echo '{{ manager_ip }} buildman.quay.io' >> /etc/hosts; exit 0" ExecStartPre=/usr/bin/docker login -u {{ quay_username }} -p {{ quay_password }} -e unused quay.io ExecStart=/usr/bin/docker run --rm --net=host --name quay-builder --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock quay.io/coreos/registry-build-worker:latest ExecStop=/usr/bin/docker stop quay-builder diff --git a/data/queue.py b/data/queue.py index 5c720eed2..865511519 100644 --- a/data/queue.py +++ b/data/queue.py @@ -78,7 +78,8 @@ class WorkQueue(object): def get(self, processing_time=300): """ Get an available item and mark it as unavailable for the default of five - minutes. + minutes. The result of this method must always be composed of simple + python objects which are JSON serializable for network portability reasons. 
""" now = datetime.utcnow() diff --git a/test/test_buildman.py b/test/test_buildman.py index 0d0b6ced2..f10ba473e 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -4,19 +4,20 @@ import os.path import time import json -from trollius import coroutine, get_event_loop, From, Future, sleep +from trollius import coroutine, get_event_loop, From, Future, sleep, Return from mock import Mock from threading import Event from urllib3.exceptions import ReadTimeoutError from buildman.manager.executor import BuilderExecutor from buildman.manager.ephemeral import (EphemeralBuilderManager, ETCD_BUILDER_PREFIX, - ETCD_EXPIRE_RESULT) + ETCD_REALM_PREFIX, EtcdAction) from buildman.server import BuildJobResult from buildman.component.buildcomponent import BuildComponent BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead' +REALM_ID = '1234-realm' def async_test(f): @@ -43,17 +44,17 @@ class TestEphemeral(unittest.TestCase): self.etcd_client_mock.watch = Mock(side_effect=hang_until_event) return self.etcd_client_mock - def _create_mock_executor(self, *args, **kwargs): - def create_completed_future(result=None): - def inner(*args, **kwargs): - new_future = Future() - new_future.set_result(result) - return new_future - return inner + def _create_completed_future(self, result=None): + def inner(*args, **kwargs): + new_future = Future() + new_future.set_result(result) + return new_future + return inner + def _create_mock_executor(self, *args, **kwargs): self.test_executor = Mock(spec=BuilderExecutor) - self.test_executor.start_builder = Mock(side_effect=create_completed_future('123')) - self.test_executor.stop_builder = Mock(side_effect=create_completed_future()) + self.test_executor.start_builder = Mock(side_effect=self._create_completed_future('123')) + self.test_executor.stop_builder = Mock(side_effect=self._create_completed_future()) return self.test_executor def _create_build_job(self): @@ -61,6 +62,10 @@ class TestEphemeral(unittest.TestCase): mock_job.job_details = { 'build_uuid': BUILD_UUID, } + mock_job.job_item = { + 'body': json.dumps(mock_job.job_details), + 'id': 1, + } return mock_job def setUp(self): @@ -71,13 +76,13 @@ class TestEphemeral(unittest.TestCase): self.etcd_wait_event.clear() self.register_component_callback = Mock() - self.uniregister_component_callback = Mock() + self.unregister_component_callback = Mock() self.job_heartbeat_callback = Mock() self.job_complete_callback = Mock() self.manager = EphemeralBuilderManager( self.register_component_callback, - self.uniregister_component_callback, + self.unregister_component_callback, self.job_heartbeat_callback, self.job_complete_callback, '127.0.0.1', @@ -97,15 +102,19 @@ class TestEphemeral(unittest.TestCase): del EphemeralBuilderManager._executors['test'] EphemeralBuilderManager._etcd_client_klass = self.old_etcd_client_klass - @async_test - def test_schedule_and_complete(self): + @coroutine + def _setup_job_for_managers(self): + # Test that we are watching the realm location before anything else happens + self.etcd_client_mock.watch.assert_any_call(ETCD_REALM_PREFIX, recursive=True, timeout=0) + self.etcd_client_mock.read = Mock(side_effect=KeyError) - test_component = BuildComponent(None) + test_component = Mock(spec=BuildComponent) + test_component.builder_realm = REALM_ID + test_component.start_build = Mock(side_effect=self._create_completed_future()) self.register_component_callback.return_value = test_component # Ask for a builder to be scheduled - loop = get_event_loop() - is_scheduled = yield 
From(self.manager.schedule(self.mock_job, loop)) + is_scheduled = yield From(self.manager.schedule(self.mock_job)) self.assertTrue(is_scheduled) @@ -114,29 +123,76 @@ class TestEphemeral(unittest.TestCase): self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key) self.assertEqual(self.etcd_client_mock.write.call_args_list[1][0][0], self.mock_job_key) + # Right now the job is not registered with any managers because etcd has not accepted the job + self.assertEqual(self.register_component_callback.call_count, 0) + + realm_created = Mock(spec=etcd.EtcdResult) + realm_created.action = EtcdAction.SET + realm_created.key = os.path.join(ETCD_REALM_PREFIX, REALM_ID) + realm_created.value = json.dumps({ + 'realm': REALM_ID, + 'token': 'beef', + 'builder_id': '123', + 'job_queue_item': self.mock_job.job_item, + }) + + self.manager._handle_realm_change(realm_created) + self.assertEqual(self.register_component_callback.call_count, 1) + raise Return(test_component) + + @async_test + def test_schedule_and_complete(self): + # Test that a job is properly registered with all of the managers + test_component = yield From(self._setup_job_for_managers()) + + # Take the job ourselves + yield From(self.manager.build_component_ready(test_component)) + + self.etcd_client_mock.delete.assert_called_once_with(os.path.join(ETCD_REALM_PREFIX, REALM_ID)) + self.etcd_client_mock.delete.reset_mock() + + # Finish the job yield From(self.manager.job_completed(self.mock_job, BuildJobResult.COMPLETE, test_component)) self.assertEqual(self.test_executor.stop_builder.call_count, 1) self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key) + @async_test + def test_another_manager_takes_job(self): + # Prepare a job to be taken by another manager + test_component = yield From(self._setup_job_for_managers()) + + realm_deleted = Mock(spec=etcd.EtcdResult) + realm_deleted.action = EtcdAction.DELETE + realm_deleted.key = os.path.join(ETCD_REALM_PREFIX, REALM_ID) + + realm_deleted._prev_node = Mock(spec=etcd.EtcdResult) + realm_deleted._prev_node.value = json.dumps({ + 'realm': REALM_ID, + 'token': 'beef', + 'builder_id': '123', + 'job_queue_item': self.mock_job.job_item, + }) + + self.manager._handle_realm_change(realm_deleted) + + self.unregister_component_callback.assert_called_once_with(test_component) + @async_test def test_expiring_worker(self): # Test that we are watching before anything else happens - self.etcd_client_mock.watch.assert_called_once_with(ETCD_BUILDER_PREFIX, recursive=True, - timeout=0) + self.etcd_client_mock.watch.assert_any_call(ETCD_BUILDER_PREFIX, recursive=True, timeout=0) # Send a signal to the callback that a worker has expired expired_result = Mock(spec=etcd.EtcdResult) - expired_result.action = ETCD_EXPIRE_RESULT + expired_result.action = EtcdAction.EXPIRE expired_result.key = self.mock_job_key expired_result._prev_node = Mock(spec=etcd.EtcdResult) expired_result._prev_node.value = json.dumps({'builder_id': '1234'}) - expired_future = Future() - expired_future.set_result(expired_result) - self.manager._handle_key_expiration(expired_future) + self.manager._handle_builder_expiration(expired_result) yield From(sleep(.01)) @@ -151,10 +207,8 @@ class TestEphemeral(unittest.TestCase): set_result = Mock(sepc=etcd.EtcdResult) set_result.action = 'set' set_result.key = self.mock_job_key - set_future = Future() - set_future.set_result(set_result) - self.manager._handle_key_expiration(set_future) + self.manager._handle_builder_expiration(set_result) yield 
From(sleep(.01)) @@ -179,15 +233,3 @@ class TestEphemeral(unittest.TestCase): self.job_heartbeat_callback.assert_called_once_with(self.mock_job) self.assertEqual(self.etcd_client_mock.write.call_count, 1) self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key) - - @async_test - def test_etcd_read_timeout(self): - # Send a signal to the callback that a worker key has been changed - read_timeout_future = Future() - read_timeout_future.set_exception(ReadTimeoutError(None, None, None)) - - self.manager._handle_key_expiration(read_timeout_future) - - yield From(sleep(.01)) - - self.assertEquals(self.test_executor.stop_builder.call_count, 0) From a9839021af9554b86b7ab46ff7665ad7b9ce71ad Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Wed, 31 Dec 2014 11:46:02 -0500 Subject: [PATCH 024/127] When the etcd key tracking realms is first created the action is create, not set. --- buildman/manager/ephemeral.py | 2 +- test/test_buildman.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 7126ec836..9ab10fd15 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -96,7 +96,7 @@ class EphemeralBuilderManager(BaseManager): async(self._clean_up_old_builder(etcd_result.key, job_metadata)) def _handle_realm_change(self, etcd_result): - if etcd_result.action == EtcdAction.SET: + if etcd_result.action == EtcdAction.CREATE: # We must listen on the realm created by ourselves or another worker realm_spec = json.loads(etcd_result.value) component = self.register_component(realm_spec['realm'], BuildComponent, diff --git a/test/test_buildman.py b/test/test_buildman.py index f10ba473e..a9029c22a 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -127,7 +127,7 @@ class TestEphemeral(unittest.TestCase): self.assertEqual(self.register_component_callback.call_count, 0) realm_created = Mock(spec=etcd.EtcdResult) - realm_created.action = EtcdAction.SET + realm_created.action = EtcdAction.CREATE realm_created.key = os.path.join(ETCD_REALM_PREFIX, REALM_ID) realm_created.value = json.dumps({ 'realm': REALM_ID, From b33ee1a474010d473d5900dc7d6ff86274d21fce Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Mon, 5 Jan 2015 11:21:36 -0500 Subject: [PATCH 025/127] Register existing builders to watch their expirations. 
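
Without this, a manager that restarts (or a second manager that comes up later)
ignores realm records that were already written to etcd by an earlier scheduling
pass, leaving those builders orphaned. In rough terms, the recovery added below
replays every realm record under the realm prefix at initialize time; this sketch
uses the same names that appear in the diff that follows, with error handling
omitted:

    @coroutine
    def _register_existing_realms(self):
        # Realm records are written by whichever manager scheduled the job;
        # replaying them lets this manager accept the builder when it connects.
        all_realms = yield From(self._etcd_client.read(ETCD_REALM_PREFIX, recursive=True))
        for realm in all_realms.children:
            if not realm.dir:
                self._register_realm(json.loads(realm.value))
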
--- buildman/manager/ephemeral.py | 42 ++++++++++++++++++++--------------- test/test_buildman.py | 2 -- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 9ab10fd15..701465c3f 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -93,18 +93,16 @@ class EphemeralBuilderManager(BaseManager): # Handle the expiration logger.debug('Builder expired, clean up the old build node') job_metadata = json.loads(etcd_result._prev_node.value) - async(self._clean_up_old_builder(etcd_result.key, job_metadata)) + + if 'builder_id' in job_metadata: + logger.info('Terminating expired build node.') + async(self._executor.stop_builder(job_metadata['builder_id'])) def _handle_realm_change(self, etcd_result): if etcd_result.action == EtcdAction.CREATE: # We must listen on the realm created by ourselves or another worker realm_spec = json.loads(etcd_result.value) - component = self.register_component(realm_spec['realm'], BuildComponent, - token=realm_spec['token']) - build_job = BuildJob(AttrDict(realm_spec['job_queue_item'])) - self._component_to_job[component] = build_job - self._component_to_builder[component] = realm_spec['builder_id'] - self._job_uuid_to_component[build_job.job_details['build_uuid']] = component + self._register_realm(realm_spec) elif etcd_result.action == EtcdAction.DELETE or etcd_result.action == EtcdAction.EXPIRE: # We must stop listening for new connections on the specified realm, if we did not get the @@ -122,6 +120,22 @@ class EphemeralBuilderManager(BaseManager): else: logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key) + def _register_realm(self, realm_spec): + logger.debug('Registering realm with manager: %s', realm_spec['realm']) + component = self.register_component(realm_spec['realm'], BuildComponent, + token=realm_spec['token']) + build_job = BuildJob(AttrDict(realm_spec['job_queue_item'])) + self._component_to_job[component] = build_job + self._component_to_builder[component] = realm_spec['builder_id'] + self._job_uuid_to_component[build_job.job_details['build_uuid']] = component + + @coroutine + def _register_existing_realms(self): + all_realms = yield From(self._etcd_client.read(ETCD_REALM_PREFIX, recursive=True)) + for realm in all_realms.children: + if not realm.dir: + self._register_realm(json.loads(realm.value)) + def initialize(self, manager_config): logger.debug('Calling initialize') self._manager_config = manager_config @@ -142,6 +156,9 @@ class EphemeralBuilderManager(BaseManager): self._watch_etcd(ETCD_BUILDER_PREFIX, self._handle_builder_expiration) self._watch_etcd(ETCD_REALM_PREFIX, self._handle_realm_change) + # Load components for all realms currently known to the cluster + async(self._register_existing_realms()) + def setup_time(self): setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300) logger.debug('Returning setup_time: %s', setup_time) @@ -282,17 +299,6 @@ class EphemeralBuilderManager(BaseManager): self.job_heartbeat_callback(build_job) - @coroutine - def _clean_up_old_builder(self, job_key, job_payload): - """ Terminate an old builders once the expiration date has passed. 
- """ - logger.debug('Cleaning up the old builder for job: %s', job_key) - if 'builder_id' in job_payload: - logger.info('Terminating expired build node.') - yield From(self._executor.stop_builder(job_payload['builder_id'])) - - yield From(self._etcd_client.delete(job_key)) - @staticmethod def _etcd_job_key(build_job): """ Create a key which is used to track a job in etcd. diff --git a/test/test_buildman.py b/test/test_buildman.py index a9029c22a..89658f65d 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -199,8 +199,6 @@ class TestEphemeral(unittest.TestCase): self.test_executor.stop_builder.assert_called_once_with('1234') self.assertEqual(self.test_executor.stop_builder.call_count, 1) - self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key) - @async_test def test_change_worker(self): # Send a signal to the callback that a worker key has been changed From 320ae63ccdbc96fa68ee2e4d8073be039027e861 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Mon, 5 Jan 2015 12:23:54 -0500 Subject: [PATCH 026/127] Handle the case where there are no realms registered. --- buildman/manager/ephemeral.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 701465c3f..aa9bb7193 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -131,10 +131,14 @@ class EphemeralBuilderManager(BaseManager): @coroutine def _register_existing_realms(self): - all_realms = yield From(self._etcd_client.read(ETCD_REALM_PREFIX, recursive=True)) - for realm in all_realms.children: - if not realm.dir: - self._register_realm(json.loads(realm.value)) + try: + all_realms = yield From(self._etcd_client.read(ETCD_REALM_PREFIX, recursive=True)) + for realm in all_realms.children: + if not realm.dir: + self._register_realm(json.loads(realm.value)) + except KeyError: + # no realms have been registered yet + pass def initialize(self, manager_config): logger.debug('Calling initialize') From f58b09a0647f28ecda0b4753ce553d22539338c4 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Mon, 5 Jan 2015 13:08:25 -0500 Subject: [PATCH 027/127] Remove the loop argument from the call to build_component_ready. 
--- buildman/component/buildcomponent.py | 3 +-- buildman/manager/ephemeral.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index 42e6696f2..c31d7aafe 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -314,8 +314,7 @@ class BuildComponent(BaseComponent): @trollius.coroutine def _set_status(self, phase): if phase == ComponentStatus.RUNNING: - loop = trollius.get_event_loop() - yield trollius.From(self.parent_manager.build_component_ready(self, loop)) + yield trollius.From(self.parent_manager.build_component_ready(self)) self._component_status = phase diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index aa9bb7193..c7a084888 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -66,7 +66,6 @@ class EphemeralBuilderManager(BaseManager): def _watch_etcd(self, etcd_key, change_callback, recursive=True): watch_task_key = (etcd_key, recursive) def callback_wrapper(changed_key_future): - if watch_task_key not in self._watch_tasks or self._watch_tasks[watch_task_key].done(): self._watch_etcd(etcd_key, change_callback) From 803796271651d590c697235c713c6b01bea65bd1 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Mon, 5 Jan 2015 14:44:54 -0500 Subject: [PATCH 028/127] Change the severity of a log message which is actually expected in the happy case. --- buildman/manager/ephemeral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index c7a084888..07c773a59 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -257,7 +257,7 @@ class EphemeralBuilderManager(BaseManager): job.job_details['build_uuid'], build_component.builder_realm) yield From(build_component.start_build(job)) except KeyError: - logger.warning('Builder is asking for more work, but work already completed') + logger.debug('Builder is asking for more work, but work already completed') def build_component_disposed(self, build_component, timed_out): logger.debug('Calling build_component_disposed.') From dd7664328c3e3f673fef8521140769f6d8c720df Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Mon, 5 Jan 2015 15:09:03 -0500 Subject: [PATCH 029/127] Make the build manager ports configurable. --- buildman/builder.py | 12 +++++++++++- buildman/server.py | 17 +++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/buildman/builder.py b/buildman/builder.py index e1c7a852b..467ac2f6d 100644 --- a/buildman/builder.py +++ b/buildman/builder.py @@ -20,6 +20,9 @@ BUILD_MANAGERS = { EXTERNALLY_MANAGED = 'external' +DEFAULT_WEBSOCKET_PORT = 8787 +DEFAULT_CONTROLLER_PORT = 8686 + def run_build_manager(): if not features.BUILD_SUPPORT: logger.debug('Building is disabled. 
Please enable the feature flag') @@ -44,6 +47,13 @@ def run_build_manager(): manager_hostname = os.environ.get('BUILDMAN_HOSTNAME', app.config.get('BUILDMAN_HOSTNAME', app.config['SERVER_HOSTNAME'])) + websocket_port = int(os.environ.get('BUILDMAN_WEBSOCKET_PORT', + app.config.get('BUILDMAN_WEBSOCKET_PORT', + DEFAULT_WEBSOCKET_PORT))) + controller_port = int(os.environ.get('BUILDMAN_CONTROLLER_PORT', + app.config.get('BUILDMAN_CONTROLLER_PORT', + DEFAULT_CONTROLLER_PORT))) + logger.debug('Will pass buildman hostname %s to builders for websocket connection', manager_hostname) @@ -57,7 +67,7 @@ def run_build_manager(): server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs, user_files, manager_klass, build_manager_config[1], manager_hostname) - server.run('0.0.0.0', ssl=ssl_context) + server.run('0.0.0.0', websocket_port, controller_port, ssl=ssl_context) if __name__ == '__main__': logging.basicConfig(level=logging.DEBUG) diff --git a/buildman/server.py b/buildman/server.py index e1175f718..369f90313 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -21,9 +21,6 @@ TIMEOUT_PERIOD_MINUTES = 20 JOB_TIMEOUT_SECONDS = 300 MINIMUM_JOB_EXTENSION = timedelta(minutes=2) -WEBSOCKET_PORT = 8787 -CONTROLLER_PORT = 8686 - HEARTBEAT_PERIOD_SEC = 30 class BuildJobResult(object): @@ -73,16 +70,16 @@ class BuilderServer(object): self._controller_app = controller_app - def run(self, host, ssl=None): + def run(self, host, websocket_port, controller_port, ssl=None): logger.debug('Initializing the lifecycle manager') self._lifecycle_manager.initialize(self._lifecycle_manager_config) logger.debug('Initializing all members of the event loop') loop = trollius.get_event_loop() - trollius.Task(self._initialize(loop, host, ssl)) + trollius.Task(self._initialize(loop, host, websocket_port, controller_port, ssl)) - logger.debug('Starting server on port %s, with controller on port %s', WEBSOCKET_PORT, - CONTROLLER_PORT) + logger.debug('Starting server on port %s, with controller on port %s', websocket_port, + controller_port) try: loop.run_forever() except KeyboardInterrupt: @@ -169,7 +166,7 @@ class BuilderServer(object): yield From(trollius.sleep(WORK_CHECK_TIMEOUT)) @trollius.coroutine - def _initialize(self, loop, host, ssl=None): + def _initialize(self, loop, host, websocket_port, controller_port, ssl=None): self._loop = loop # Create the WAMP server. @@ -177,8 +174,8 @@ class BuilderServer(object): transport_factory.setProtocolOptions(failByDrop=True) # Initialize the controller server and the WAMP server - create_wsgi_server(self._controller_app, loop=loop, host=host, port=CONTROLLER_PORT, ssl=ssl) - yield From(loop.create_server(transport_factory, host, WEBSOCKET_PORT, ssl=ssl)) + create_wsgi_server(self._controller_app, loop=loop, host=host, port=controller_port, ssl=ssl) + yield From(loop.create_server(transport_factory, host, websocket_port, ssl=ssl)) # Initialize the work queue checker. yield From(self._work_checker()) From fc757fecada1c2db0b413bfcd4e87d1e866ee8ba Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Mon, 5 Jan 2015 15:35:14 -0500 Subject: [PATCH 030/127] Tag the EC2 instances with the build uuid. 
--- buildman/manager/ephemeral.py | 5 +++-- buildman/manager/executor.py | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 07c773a59..6abd10a5c 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -182,7 +182,8 @@ class EphemeralBuilderManager(BaseManager): @coroutine def schedule(self, build_job): - logger.debug('Calling schedule with job: %s', build_job.job_details['build_uuid']) + build_uuid = build_job.job_details['build_uuid'] + logger.debug('Calling schedule with job: %s', build_uuid) # Check if there are worker slots avialable by checking the number of jobs in etcd allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1) @@ -223,7 +224,7 @@ class EphemeralBuilderManager(BaseManager): raise Return(False) logger.debug('Starting builder with executor: %s', self._executor) - builder_id = yield From(self._executor.start_builder(realm, token)) + builder_id = yield From(self._executor.start_builder(realm, token, build_uuid)) # Store the builder in etcd associated with the job id payload['builder_id'] = builder_id diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index c4b38366d..c122a89fc 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -37,7 +37,7 @@ class BuilderExecutor(object): starting and stopping builders. """ @coroutine - def start_builder(self, realm, token): + def start_builder(self, realm, token, build_uuid): """ Create a builder with the specified config. Returns a unique id which can be used to manage the builder. """ @@ -103,7 +103,7 @@ class EC2Executor(BuilderExecutor): return stack_amis[ec2_region] @coroutine - def start_builder(self, realm, token): + def start_builder(self, realm, token, build_uuid): region = self.executor_config['EC2_REGION'] channel = self.executor_config.get('COREOS_CHANNEL', 'stable') get_ami_callable = partial(self._get_coreos_ami, region, channel) @@ -141,6 +141,7 @@ class EC2Executor(BuilderExecutor): 'Name': 'Quay Ephemeral Builder', 'Realm': realm, 'Token': token, + 'BuildUUID': build_uuid, })) raise Return(launched.id) @@ -163,7 +164,7 @@ class PopenExecutor(BuilderExecutor): """ Executor which uses Popen to fork a quay-builder process. """ @coroutine - def start_builder(self, realm, token): + def start_builder(self, realm, token, build_uuid): # Now start a machine for this job, adding the machine id to the etcd information logger.debug('Forking process for build') import subprocess From f268a5d66114724c21bbe0046fb86233431f0373 Mon Sep 17 00:00:00 2001 From: Jimmy Zelinskie Date: Tue, 13 Jan 2015 11:02:08 -0500 Subject: [PATCH 031/127] Fix twitter-view once and for all! One image URL was broken and it was accidentally using the avatar directive, so the class has been changed to 'twitter-avatar' and made explicit. 
--- static/css/quay.css | 2 +- static/directives/twitter-view.html | 2 +- static/partials/landing-normal.html | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/static/css/quay.css b/static/css/quay.css index 4df625503..08438df08 100644 --- a/static/css/quay.css +++ b/static/css/quay.css @@ -1667,7 +1667,7 @@ i.toggle-icon:hover { padding-left: 70px; } -.landing-page .twitter-tweet .avatar img { +.landing-page .twitter-tweet .twitter-avatar img { border-radius: 4px; border: 2px solid rgb(70, 70, 70); width: 50px; diff --git a/static/directives/twitter-view.html b/static/directives/twitter-view.html index e78776ea1..68e30f366 100644 --- a/static/directives/twitter-view.html +++ b/static/directives/twitter-view.html @@ -4,7 +4,7 @@

- + {{ authorName }} (@{{authorUser}}) {{ messageDate }} diff --git a/static/partials/landing-normal.html b/static/partials/landing-normal.html index 0a0dedc3a..274b56ac0 100644 --- a/static/partials/landing-normal.html +++ b/static/partials/landing-normal.html @@ -207,7 +207,7 @@
  • -
  • From 3bf5e93f0694d7975f23246956c74452748219ab Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 3 Feb 2015 16:06:23 -0500 Subject: [PATCH 119/127] Remove log statement --- buildman/component/buildcomponent.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index ecd215209..90f2d9199 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -204,8 +204,6 @@ class BuildComponent(BaseComponent): except ValueError: pass - logger.debug('Got log message: %s: (%s) => (%s)', phase, json_data, docker_data) - # Extract the current status message (if any). fully_unwrapped = '' keys_to_extract = ['error', 'status', 'stream'] From a1938593a967d12c9658671827f8b9aa11417ff5 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 3 Feb 2015 16:29:47 -0500 Subject: [PATCH 120/127] Better handling of retries on build errors --- buildman/jobutil/workererror.py | 3 ++- buildman/server.py | 4 +--- endpoints/common.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/buildman/jobutil/workererror.py b/buildman/jobutil/workererror.py index 8271976e4..f0301fac4 100644 --- a/buildman/jobutil/workererror.py +++ b/buildman/jobutil/workererror.py @@ -25,7 +25,8 @@ class WorkerError(object): }, 'io.quay.builder.internalerror': { - 'message': 'An internal error occurred while building. Please submit a ticket.' + 'message': 'An internal error occurred while building. Please submit a ticket.', + 'is_internal': True }, 'io.quay.builder.buildrunerror': { diff --git a/buildman/server.py b/buildman/server.py index d8e096135..7b10995b4 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -125,9 +125,7 @@ class BuilderServer(object): def _job_complete(self, build_job, job_status): if job_status == BuildJobResult.INCOMPLETE: - self._queue.incomplete(build_job.job_item, restore_retry=True, retry_after=30) - elif job_status == BuildJobResult.ERROR: - self._queue.incomplete(build_job.job_item, restore_retry=False) + self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30) else: self._queue.complete(build_job.job_item) diff --git a/endpoints/common.py b/endpoints/common.py index 6b89fb6bd..ec6090202 100644 --- a/endpoints/common.py +++ b/endpoints/common.py @@ -211,7 +211,7 @@ def start_build(repository, dockerfile_id, tags, build_name, subdir, manual, dockerfile_build_queue.put([repository.namespace_user.username, repository.name], json.dumps({ 'build_uuid': build_request.uuid, 'pull_credentials': model.get_pull_credentials(pull_robot_name) if pull_robot_name else None - }), retries_remaining=2) + }), retries_remaining=3) # Add the build to the repo's log. metadata = { From ec4f77fa7e3e46d0e6e8991aa425e3977637a0d9 Mon Sep 17 00:00:00 2001 From: Alex Malinovich Date: Tue, 3 Feb 2015 13:42:22 -0800 Subject: [PATCH 121/127] Fix date in ToS --- templates/tos.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/tos.html b/templates/tos.html index 65c97a069..35d13b81e 100644 --- a/templates/tos.html +++ b/templates/tos.html @@ -28,7 +28,7 @@ {% block body_content %}

    CoreOS Terms of Service

-    Last Revised: February 2, 2015
+    Last Revised: February 3, 2015

    These Quay.io Terms of Service (these “Terms”) apply to the features and functions provided by CoreOS, Inc. (“CoreOS,” “our,” or “we”) via quay.io (the “Site”) (collectively, the “Services”). By accessing or using the Services, you agree to be bound by these Terms. If you do not agree to these Terms, do not use any of the Services. The “Effective Date” of these Terms is the date you first access any of the Services.

    If you are accessing the Services in your capacity as an employee, consultant or agent of a company (or other entity), you represent that you are an employee, consultant or agent of such company (or other entity) and you have the authority to agree (and be legally bound) on behalf of such company (or other entity) to all of the terms and conditions of these Terms.

    From 4355e07f9f553f8ac59c95494e7971ad482c2b23 Mon Sep 17 00:00:00 2001 From: Alex Malinovich Date: Tue, 3 Feb 2015 17:40:58 -0800 Subject: [PATCH 122/127] Fix date in ToS again --- templates/tos.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/tos.html b/templates/tos.html index 35d13b81e..ede476da9 100644 --- a/templates/tos.html +++ b/templates/tos.html @@ -28,7 +28,7 @@ {% block body_content %}

    CoreOS Terms of Service

-    Last Revised: February 3, 2015
+    Last Revised: February 4, 2015

    These Quay.io Terms of Service (these “Terms”) apply to the features and functions provided by CoreOS, Inc. (“CoreOS,” “our,” or “we”) via quay.io (the “Site”) (collectively, the “Services”). By accessing or using the Services, you agree to be bound by these Terms. If you do not agree to these Terms, do not use any of the Services. The “Effective Date” of these Terms is the date you first access any of the Services.

    If you are accessing the Services in your capacity as an employee, consultant or agent of a company (or other entity), you represent that you are an employee, consultant or agent of such company (or other entity) and you have the authority to agree (and be legally bound) on behalf of such company (or other entity) to all of the terms and conditions of these Terms.

    From 9ffb53cd4769d0dc4b3ba8cf69dee00875bad6b5 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 3 Feb 2015 21:05:18 -0500 Subject: [PATCH 123/127] Add support for v2 of the build worker, which performs the Dockerfile parsing on its own. Note that this version is backwards compatible with v1-beta of the build worker, so it should be pushed first. Also note that this version is temporary until such time as we get the caching branches merged. --- buildman/component/buildcomponent.py | 85 ++++++++++++++++------------ buildman/jobutil/workererror.py | 5 ++ 2 files changed, 55 insertions(+), 35 deletions(-) diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index 90f2d9199..c1fb41a02 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -20,7 +20,7 @@ HEARTBEAT_DELTA = datetime.timedelta(seconds=30) HEARTBEAT_TIMEOUT = 10 INITIAL_TIMEOUT = 25 -SUPPORTED_WORKER_VERSIONS = ['0.1-beta'] +SUPPORTED_WORKER_VERSIONS = ['0.1-beta', '0.2'] logger = logging.getLogger(__name__) @@ -46,6 +46,7 @@ class BuildComponent(BaseComponent): self._current_job = None self._build_status = None self._image_info = None + self._worker_version = None BaseComponent.__init__(self, config, **kwargs) @@ -68,6 +69,9 @@ class BuildComponent(BaseComponent): @trollius.coroutine def start_build(self, build_job): """ Starts a build. """ + logger.debug('Starting build for component %s (worker version: %s)', + self.builder_realm, self._worker_version) + self._current_job = build_job self._build_status = StatusHandler(self.build_logs, build_job.repo_build.uuid) self._image_info = {} @@ -77,46 +81,55 @@ class BuildComponent(BaseComponent): # Send the notification that the build has started. build_job.send_notification('build_start') - # Retrieve the job's buildpack. - buildpack_url = self.user_files.get_file_url(build_job.repo_build.resource_key, - requires_cors=False) - - logger.debug('Retrieving build package: %s', buildpack_url) - buildpack = None - try: - buildpack = BuildPackage.from_url(buildpack_url) - except BuildPackageException as bpe: - self._build_failure('Could not retrieve build package', bpe) - raise trollius.Return() - - # Extract the base image information from the Dockerfile. - parsed_dockerfile = None - logger.debug('Parsing dockerfile') - + # Parse the build configuration. try: build_config = build_job.build_config except BuildJobLoadException as irbe: self._build_failure('Could not load build job information', irbe) - try: - parsed_dockerfile = buildpack.parse_dockerfile(build_config.get('build_subdir')) - except BuildPackageException as bpe: - self._build_failure('Could not find Dockerfile in build package', bpe) - raise trollius.Return() + base_image_information = {} + buildpack_url = self.user_files.get_file_url(build_job.repo_build.resource_key, + requires_cors=False) - image_and_tag_tuple = parsed_dockerfile.get_image_and_tag() - if image_and_tag_tuple is None or image_and_tag_tuple[0] is None: - self._build_failure('Missing FROM line in Dockerfile') - raise trollius.Return() + # TODO(jschorr): Remove as soon as the fleet has been transitioned to 0.2. + if self._worker_version == '0.1-beta': + # Retrieve the job's buildpack. 
+ logger.debug('Retrieving build package: %s', buildpack_url) + buildpack = None + try: + buildpack = BuildPackage.from_url(buildpack_url) + except BuildPackageException as bpe: + self._build_failure('Could not retrieve build package', bpe) + raise trollius.Return() - base_image_information = { - 'repository': image_and_tag_tuple[0], - 'tag': image_and_tag_tuple[1] - } + # Extract the base image information from the Dockerfile. + parsed_dockerfile = None + logger.debug('Parsing dockerfile') - # Extract the number of steps from the Dockerfile. - with self._build_status as status_dict: - status_dict['total_commands'] = len(parsed_dockerfile.commands) + try: + parsed_dockerfile = buildpack.parse_dockerfile(build_config.get('build_subdir')) + except BuildPackageException as bpe: + self._build_failure('Could not find Dockerfile in build package', bpe) + raise trollius.Return() + + image_and_tag_tuple = parsed_dockerfile.get_image_and_tag() + if image_and_tag_tuple is None or image_and_tag_tuple[0] is None: + self._build_failure('Missing FROM line in Dockerfile') + raise trollius.Return() + + base_image_information = { + 'repository': image_and_tag_tuple[0], + 'tag': image_and_tag_tuple[1] + } + + # Extract the number of steps from the Dockerfile. + with self._build_status as status_dict: + status_dict['total_commands'] = len(parsed_dockerfile.commands) + else: + # TODO(jschorr): This is a HACK to make sure the progress bar (sort of) continues working + # until such time as we have the caching code in place. + with self._build_status as status_dict: + status_dict['total_commands'] = 25 # Add the pull robot information, if any. if build_job.pull_credentials: @@ -136,8 +149,8 @@ class BuildComponent(BaseComponent): # push_token: The token to use to push the built image. # tag_names: The name(s) of the tag(s) for the newly built image. # base_image: The image name and credentials to use to conduct the base image pull. - # repository: The repository to pull. - # tag: The tag to pull. + # repository: The repository to pull (DEPRECATED 0.2) + # tag: The tag to pull (DEPRECATED in 0.2) # username: The username for pulling the base image (if any). # password: The password for pulling the base image (if any). build_arguments = { @@ -299,6 +312,8 @@ class BuildComponent(BaseComponent): @trollius.coroutine def _on_ready(self, token, version): + self._worker_version = version + if not version in SUPPORTED_WORKER_VERSIONS: logger.warning('Build component (token "%s") is running an out-of-date version: %s', token, version) diff --git a/buildman/jobutil/workererror.py b/buildman/jobutil/workererror.py index f0301fac4..c7100360b 100644 --- a/buildman/jobutil/workererror.py +++ b/buildman/jobutil/workererror.py @@ -19,6 +19,11 @@ class WorkerError(object): 'is_internal': True }, + 'io.quay.builder.dockerfileissue': { + 'message': 'Could not find or parse Dockerfile', + 'show_base_error': True + }, + 'io.quay.builder.cannotpullbaseimage': { 'message': 'Could not pull base image', 'show_base_error': True From 5b8d65991e1b338bf8a6769905e2d2e8ff7f02c8 Mon Sep 17 00:00:00 2001 From: Jake Moshenko Date: Wed, 4 Feb 2015 11:58:58 -0500 Subject: [PATCH 124/127] Update the space on the builder nodes because its cheap. 
--- buildman/manager/executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildman/manager/executor.py b/buildman/manager/executor.py index 11f2a71c8..92641c6ce 100644 --- a/buildman/manager/executor.py +++ b/buildman/manager/executor.py @@ -112,7 +112,7 @@ class EC2Executor(BuilderExecutor): ec2_conn = self._get_conn() ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType( - size=8, + size=32, volume_type='gp2', delete_on_termination=True, ) From 925cd1f3789ec056d578b2386eeef3b2a3e36ddd Mon Sep 17 00:00:00 2001 From: Alex Malinovich Date: Wed, 4 Feb 2015 16:54:47 -0800 Subject: [PATCH 125/127] Fix date in ToS again. Again. --- templates/tos.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/tos.html b/templates/tos.html index ede476da9..b86128256 100644 --- a/templates/tos.html +++ b/templates/tos.html @@ -28,7 +28,7 @@ {% block body_content %}

    CoreOS Terms of Service

-    Last Revised: February 4, 2015
+    Last Revised: February 5, 2015

    These Quay.io Terms of Service (these “Terms”) apply to the features and functions provided by CoreOS, Inc. (“CoreOS,” “our,” or “we”) via quay.io (the “Site”) (collectively, the “Services”). By accessing or using the Services, you agree to be bound by these Terms. If you do not agree to these Terms, do not use any of the Services. The “Effective Date” of these Terms is the date you first access any of the Services.

    If you are accessing the Services in your capacity as an employee, consultant or agent of a company (or other entity), you represent that you are an employee, consultant or agent of such company (or other entity) and you have the authority to agree (and be legally bound) on behalf of such company (or other entity) to all of the terms and conditions of these Terms.

    From 5fedd74399411683b09e0c89211b49584fc9fa6e Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 4 Feb 2015 21:31:26 -0500 Subject: [PATCH 126/127] Remove Jake's key --- buildman/templates/cloudconfig.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml index 97958dc1a..4972e07ca 100644 --- a/buildman/templates/cloudconfig.yaml +++ b/buildman/templates/cloudconfig.yaml @@ -1,7 +1,6 @@ #cloud-config ssh_authorized_keys: -- ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQClk9Xh+wHZ3Iw7r2sQwyhpN2qMd7JEZ8ved9J+4cfJ8QKjZkecXVm5mEV5z92cZgUlM3Tr+4VjVWDtAuEQVyGi44I3vNjeRW8WOJ60JkcM/u+aAauN0Ep6+8WCH5iq+uFE9kRwmCLW28bpG1lOaL4MdsP/mY7fzgyYct3F5aZhHOHY2MvLrg54Hqa7MhuGjuW+sy6WQrhwowiNLUxpRdJ6TyIqQ5ukbtfE1W4OOMNZruHOELPw/uGMS1GClP/JzEyHpGblNhEKtmQXu89t0zRki6SerRnR3d18/s8JImGzRhaG560r3bHCvs1HZH9jZh2+p5cN0/3T7Dn0QSDwgAfJ Jake's Mac - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCC0m+hVmyR3vn/xoxJe9+atRWBxSK+YXgyufNVDMcb7H00Jfnc341QH3kDVYZamUbhVh/nyc2RP7YbnZR5zORFtgOaNSdkMYrPozzBvxjnvSUokkCCWbLqXDHvIKiR12r+UTSijPJE/Yk702Mb2ejAFuae1C3Ec+qKAoOCagDjpQ3THyb5oaKE7VPHdwCWjWIQLRhC+plu77ObhoXIFJLD13gCi01L/rp4mYVCxIc2lX5A8rkK+bZHnIZwWUQ4t8SIjWxIaUo0FE7oZ83nKuNkYj5ngmLHQLY23Nx2WhE9H6NBthUpik9SmqQPtVYbhIG+bISPoH9Xs8CLrFb0VRjz Joey's Mac - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCo6FhAP7mFFOAzM91gtaKW7saahtaN4lur42FMMztz6aqUycIltCmvxo+3FmrXgCG30maMNU36Vm1+9QRtVQEd+eRuoIWP28t+8MT01Fh4zPuE2Wca3pOHSNo3X81FfWJLzmwEHiQKs9HPQqUhezR9PcVWVkbMyAzw85c0UycGmHGFNb0UiRd9HFY6XbgbxhZv/mvKLZ99xE3xkOzS1PNsdSNvjUKwZR7pSUPqNS5S/1NXyR4GhFTU24VPH/bTATOv2ATH+PSzsZ7Qyz9UHj38tKC+ALJHEDJ4HXGzobyOUP78cHGZOfCB5FYubq0zmOudAjKIAhwI8XTFvJ2DX1P3 jimmyzelinskie From c7c5377285b6d3d6bc0e9161cc8df12227695467 Mon Sep 17 00:00:00 2001 From: Jimmy Zelinskie Date: Thu, 5 Feb 2015 12:51:02 -0500 Subject: [PATCH 127/127] Add my key back to the ephemeral builder machines. --- buildman/templates/cloudconfig.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/buildman/templates/cloudconfig.yaml b/buildman/templates/cloudconfig.yaml index 4972e07ca..13e6894bf 100644 --- a/buildman/templates/cloudconfig.yaml +++ b/buildman/templates/cloudconfig.yaml @@ -3,6 +3,7 @@ ssh_authorized_keys: - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCC0m+hVmyR3vn/xoxJe9+atRWBxSK+YXgyufNVDMcb7H00Jfnc341QH3kDVYZamUbhVh/nyc2RP7YbnZR5zORFtgOaNSdkMYrPozzBvxjnvSUokkCCWbLqXDHvIKiR12r+UTSijPJE/Yk702Mb2ejAFuae1C3Ec+qKAoOCagDjpQ3THyb5oaKE7VPHdwCWjWIQLRhC+plu77ObhoXIFJLD13gCi01L/rp4mYVCxIc2lX5A8rkK+bZHnIZwWUQ4t8SIjWxIaUo0FE7oZ83nKuNkYj5ngmLHQLY23Nx2WhE9H6NBthUpik9SmqQPtVYbhIG+bISPoH9Xs8CLrFb0VRjz Joey's Mac - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCo6FhAP7mFFOAzM91gtaKW7saahtaN4lur42FMMztz6aqUycIltCmvxo+3FmrXgCG30maMNU36Vm1+9QRtVQEd+eRuoIWP28t+8MT01Fh4zPuE2Wca3pOHSNo3X81FfWJLzmwEHiQKs9HPQqUhezR9PcVWVkbMyAzw85c0UycGmHGFNb0UiRd9HFY6XbgbxhZv/mvKLZ99xE3xkOzS1PNsdSNvjUKwZR7pSUPqNS5S/1NXyR4GhFTU24VPH/bTATOv2ATH+PSzsZ7Qyz9UHj38tKC+ALJHEDJ4HXGzobyOUP78cHGZOfCB5FYubq0zmOudAjKIAhwI8XTFvJ2DX1P3 jimmyzelinskie +- ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDNvw8qo9m8np7yQ/Smv/oklM8bo8VyNRZriGYBDuolWDL/mZpYCQnZJXphQo7RFdNABYistikjJlBuuwUohLf2uSq0iKoFa2TgwI43wViWzvuzU4nA02/ITD5BZdmWAFNyIoqeB50Ol4qUgDwLAZ+7Kv7uCi6chcgr9gTi99jY3GHyZjrMiXMHGVGi+FExFuzhVC2drKjbz5q6oRfQeLtNfG4psl5GU3MQU6FkX4fgoCx0r9R48/b7l4+TT7pWblJQiRfeldixu6308vyoTUEHasdkU3/X0OTaGz/h5XqTKnGQc6stvvoED3w+L3QFp0H5Z8sZ9stSsitmCBrmbcKZ jakemoshenko write_files: - path: /root/overrides.list