initial import for Open Source 🎉
parent 1898c361f3
commit 9c0dd3b722
2048 changed files with 218743 additions and 0 deletions
0
buildman/manager/__init__.py
Normal file
71
buildman/manager/basemanager.py
Normal file
@@ -0,0 +1,71 @@
from trollius import coroutine


class BaseManager(object):
  """ Base for all worker managers. """

  def __init__(self, register_component, unregister_component, job_heartbeat_callback,
               job_complete_callback, manager_hostname, heartbeat_period_sec):
    self.register_component = register_component
    self.unregister_component = unregister_component
    self.job_heartbeat_callback = job_heartbeat_callback
    self.job_complete_callback = job_complete_callback
    self.manager_hostname = manager_hostname
    self.heartbeat_period_sec = heartbeat_period_sec

  @coroutine
  def job_heartbeat(self, build_job):
    """ Method invoked to tell the manager that a job is still running. This method will be called
        every few minutes. """
    self.job_heartbeat_callback(build_job)

  def overall_setup_time(self):
    """ Returns the number of seconds that the build system should wait before allowing the job
        to be picked up again after called 'schedule'.
    """
    raise NotImplementedError

  def shutdown(self):
    """ Indicates that the build controller server is in a shutdown state and that no new jobs
        or workers should be performed. Existing workers should be cleaned up once their jobs
        have completed
    """
    raise NotImplementedError

  @coroutine
  def schedule(self, build_job):
    """ Schedules a queue item to be built. Returns a 2-tuple with (True, None) if the item was
        properly scheduled and (False, a retry timeout in seconds) if all workers are busy or an
        error occurs.
    """
    raise NotImplementedError

  def initialize(self, manager_config):
    """ Runs any initialization code for the manager. Called once the server is in a ready state.
    """
    raise NotImplementedError

  @coroutine
  def build_component_ready(self, build_component):
    """ Method invoked whenever a build component announces itself as ready.
    """
    raise NotImplementedError

  def build_component_disposed(self, build_component, timed_out):
    """ Method invoked whenever a build component has been disposed. The timed_out boolean indicates
        whether the component's heartbeat timed out.
    """
    raise NotImplementedError

  @coroutine
  def job_completed(self, build_job, job_status, build_component):
    """ Method invoked once a job_item has completed, in some manner. The job_status will be
        one of: incomplete, error, complete. Implementations of this method should call coroutine
        self.job_complete_callback with a status of Incomplete if they wish for the job to be
        automatically requeued.
    """
    raise NotImplementedError

  def num_workers(self):
    """ Returns the number of active build workers currently registered. This includes those
        that are currently busy and awaiting more work.
    """
    raise NotImplementedError
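As an illustration of the contract above (not part of this commit): a minimal sketch of a manager implementation, assuming the buildman package and trollius are importable. The class name, the lambda callbacks, and the hostname value are hypothetical stand-ins for what the build controller server normally wires in.

from trollius import From, Return, coroutine, sleep

from buildman.manager.basemanager import BaseManager


class NoopManager(BaseManager):
  """ Hypothetical manager that never accepts work; shows the expected return shapes. """

  def initialize(self, manager_config):
    pass

  def overall_setup_time(self):
    return 60  # seconds to wait before a 'scheduled' job may be picked up again

  @coroutine
  def schedule(self, build_job):
    yield From(sleep(0))     # stand-in for real async work
    raise Return(False, 30)  # contract: (True, None) on success, (False, retry_seconds) otherwise

  def shutdown(self):
    pass

  def num_workers(self):
    return 0


# Hypothetical wiring; in the real server these callbacks come from the build controller.
manager = NoopManager(lambda realm, klass, **kwargs: None,       # register_component
                      lambda component: None,                    # unregister_component
                      lambda build_job: None,                    # job_heartbeat_callback
                      lambda build_job, status, *a, **kw: None,  # job_complete_callback
                      'buildman.example.invalid',                # manager_hostname
                      30)                                        # heartbeat_period_sec
manager.initialize({})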
27
buildman/manager/buildcanceller.py
Normal file
@@ -0,0 +1,27 @@
import logging

from buildman.manager.orchestrator_canceller import OrchestratorCanceller
from buildman.manager.noop_canceller import NoopCanceller

logger = logging.getLogger(__name__)

CANCELLERS = {'ephemeral': OrchestratorCanceller}


class BuildCanceller(object):
  """ A class to manage cancelling a build """

  def __init__(self, app=None):
    self.build_manager_config = app.config.get('BUILD_MANAGER') if app is not None else None
    if app is None or self.build_manager_config is None:
      self.handler = NoopCanceller()
    else:
      self.handler = None

  def try_cancel_build(self, uuid):
    """ A method to kill a running build """
    if self.handler is None:
      canceller = CANCELLERS.get(self.build_manager_config[0], NoopCanceller)
      self.handler = canceller(self.build_manager_config[1])

    return self.handler.try_cancel_build(uuid)
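For illustration (not part of this commit): BuildCanceller reads BUILD_MANAGER from the app config and indexes it as a pair, so the value is assumed to be (kind, config); the handler is resolved lazily on the first cancel attempt. The app stand-in and the config values below are hypothetical.

from buildman.manager.buildcanceller import BuildCanceller


class FakeApp(object):
  # Minimal stand-in for the Flask app; only config.get('BUILD_MANAGER') is used here.
  # A real deployment would carry the orchestrator connection settings (not shown in
  # this excerpt) in the second element of the pair.
  config = {'BUILD_MANAGER': ('ephemeral', {})}


canceller = BuildCanceller(FakeApp())
# The first call resolves CANCELLERS['ephemeral'] (OrchestratorCanceller); unknown kinds
# fall back to NoopCanceller. Returns whether the cancellation message was written.
canceller.try_cancel_build('hypothetical-build-uuid')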
92
buildman/manager/enterprise.py
Normal file
@@ -0,0 +1,92 @@
import logging
import uuid

from buildman.component.basecomponent import BaseComponent
from buildman.component.buildcomponent import BuildComponent
from buildman.manager.basemanager import BaseManager

from trollius import From, Return, coroutine

REGISTRATION_REALM = 'registration'
RETRY_TIMEOUT = 5
logger = logging.getLogger(__name__)


class DynamicRegistrationComponent(BaseComponent):
  """ Component session that handles dynamic registration of the builder components. """

  def onConnect(self):
    self.join(REGISTRATION_REALM)

  def onJoin(self, details):
    logger.debug('Registering registration method')
    yield From(self.register(self._worker_register, u'io.quay.buildworker.register'))

  def _worker_register(self):
    realm = self.parent_manager.add_build_component()
    logger.debug('Registering new build component+worker with realm %s', realm)
    return realm

  def kind(self):
    return 'registration'


class EnterpriseManager(BaseManager):
  """ Build manager implementation for the Enterprise Registry. """

  def __init__(self, *args, **kwargs):
    self.ready_components = set()
    self.all_components = set()
    self.shutting_down = False

    super(EnterpriseManager, self).__init__(*args, **kwargs)

  def initialize(self, manager_config):
    # Add a component which is used by build workers for dynamic registration. Unlike
    # production, build workers in enterprise are long-lived and register dynamically.
    self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)

  def overall_setup_time(self):
    # Builders are already registered, so the setup time should be essentially instant. We therefore
    # only return a minute here.
    return 60

  def add_build_component(self):
    """ Adds a new build component for an Enterprise Registry. """
    # Generate a new unique realm ID for the build worker.
    realm = str(uuid.uuid4())
    new_component = self.register_component(realm, BuildComponent, token="")
    self.all_components.add(new_component)
    return realm

  @coroutine
  def schedule(self, build_job):
    """ Schedules a build for an Enterprise Registry. """
    if self.shutting_down or not self.ready_components:
      raise Return(False, RETRY_TIMEOUT)

    component = self.ready_components.pop()

    yield From(component.start_build(build_job))

    raise Return(True, None)

  @coroutine
  def build_component_ready(self, build_component):
    self.ready_components.add(build_component)

  def shutdown(self):
    self.shutting_down = True

  @coroutine
  def job_completed(self, build_job, job_status, build_component):
    yield From(self.job_complete_callback(build_job, job_status))

  def build_component_disposed(self, build_component, timed_out):
    self.all_components.remove(build_component)
    if build_component in self.ready_components:
      self.ready_components.remove(build_component)

    self.unregister_component(build_component)

  def num_workers(self):
    return len(self.all_components)
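The server that drives these managers is not part of this excerpt; as a sketch of how a caller might consume the 2-tuple that schedule() returns, with a hypothetical driver function and requeue callback:

from trollius import From, Return, coroutine


@coroutine
def try_schedule(manager, build_job, requeue):
  """ Hypothetical driver: ask the manager to schedule a job, requeueing it when busy. """
  scheduled, retry_timeout = yield From(manager.schedule(build_job))
  if not scheduled:
    # EnterpriseManager reports (False, RETRY_TIMEOUT) while shutting down or when no
    # registered component is ready; the job goes back on the queue for that many seconds.
    requeue(build_job, retry_timeout)
  raise Return(scheduled)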
710
buildman/manager/ephemeral.py
Normal file
@@ -0,0 +1,710 @@
import logging
import uuid
import calendar
import json
import time

from collections import namedtuple
from datetime import datetime, timedelta
from six import iteritems

from trollius import From, coroutine, Return, async, sleep

from app import metric_queue
from buildman.orchestrator import (orchestrator_from_config, KeyEvent,
                                   OrchestratorError, OrchestratorConnectionError,
                                   ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor, KubernetesExecutor
from buildman.component.buildcomponent import BuildComponent
from buildman.jobutil.buildjob import BuildJob
from buildman.server import BuildJobResult
from util import slash_join
from util.morecollections import AttrDict


logger = logging.getLogger(__name__)


JOB_PREFIX = 'building/'
LOCK_PREFIX = 'lock/'
REALM_PREFIX = 'realm/'
CANCEL_PREFIX = 'cancel/'
METRIC_PREFIX = 'metric/'

CANCELED_LOCK_PREFIX = slash_join(LOCK_PREFIX, 'job-cancelled')
EXPIRED_LOCK_PREFIX = slash_join(LOCK_PREFIX, 'job-expired')

EPHEMERAL_API_TIMEOUT = 20
EPHEMERAL_SETUP_TIMEOUT = 500

RETRY_IMMEDIATELY_SLEEP_DURATION = 0
TOO_MANY_WORKERS_SLEEP_DURATION = 10


BuildInfo = namedtuple('BuildInfo', ['component', 'build_job', 'execution_id', 'executor_name'])


class EphemeralBuilderManager(BaseManager):
  """ Build manager implementation for the Enterprise Registry. """

  EXECUTORS = {
    'popen': PopenExecutor,
    'ec2': EC2Executor,
    'kubernetes': KubernetesExecutor,
  }

  def __init__(self, *args, **kwargs):
    super(EphemeralBuilderManager, self).__init__(*args, **kwargs)

    self._shutting_down = False

    self._manager_config = None
    self._orchestrator = None

    # The registered executors available for running jobs, in order.
    self._ordered_executors = []

    # The registered executors, mapped by their unique name.
    self._executor_name_to_executor = {}

    # Map from builder component to its associated job.
    self._component_to_job = {}

    # Map from build UUID to a BuildInfo tuple with information about the build.
    self._build_uuid_to_info = {}

  def overall_setup_time(self):
    return EPHEMERAL_SETUP_TIMEOUT

  @coroutine
  def _mark_job_incomplete(self, build_job, build_info):
    """ Marks a job as incomplete, in response to a failure to start or a timeout. """
    executor_name = build_info.executor_name
    execution_id = build_info.execution_id

    logger.warning('Build executor failed to successfully boot with execution id %s',
                   execution_id)

    # Take a lock to ensure that only one manager reports the build as incomplete for this
    # execution.
    lock_key = slash_join(self._expired_lock_prefix, build_job.build_uuid, execution_id)
    acquired_lock = yield From(self._orchestrator.lock(lock_key))
    if acquired_lock:
      try:
        # Clean up the bookkeeping for the job.
        yield From(self._orchestrator.delete_key(self._job_key(build_job)))
      except KeyError:
        logger.debug('Could not delete job key %s; might have been removed already',
                     build_job.build_uuid)

      logger.error('[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s',
                   build_job.build_uuid, executor_name, execution_id)
      yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
                                            update_phase=True))
    else:
      logger.debug('Did not get lock for job-expiration for job %s', build_job.build_uuid)

  @coroutine
  def _job_callback(self, key_change):
    """
    This is the callback invoked when keys related to jobs are changed.
    It ignores all events related to the creation of new jobs.
    Deletes or expirations cause checks to ensure they've been properly marked as completed.

    :param key_change: the event and value produced by a key changing in the orchestrator
    :type key_change: :class:`KeyChange`
    """
    if key_change.event in (KeyEvent.CREATE, KeyEvent.SET):
      raise Return()

    elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
      # Handle the expiration/deletion.
      job_metadata = json.loads(key_change.value)
      build_job = BuildJob(AttrDict(job_metadata['job_queue_item']))
      logger.debug('Got "%s" of job %s', key_change.event, build_job.build_uuid)

      # Get the build info.
      build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
      if build_info is None:
        logger.debug('No build info for "%s" job %s (%s); probably already deleted by this manager',
                     key_change.event, build_job.build_uuid, job_metadata)
        raise Return()

      if key_change.event != KeyEvent.EXPIRE:
        # If the etcd action was not an expiration, then it was already deleted by some manager and
        # the execution was therefore already shutdown. All that's left is to remove the build info.
        self._build_uuid_to_info.pop(build_job.build_uuid, None)
        raise Return()

      logger.debug('got expiration for job %s with metadata: %s', build_job.build_uuid,
                   job_metadata)

      if not job_metadata.get('had_heartbeat', False):
        # If we have not yet received a heartbeat, then the node failed to boot in some way.
        # We mark the job as incomplete here.
        yield From(self._mark_job_incomplete(build_job, build_info))

      # Finally, we terminate the build execution for the job. We don't do this under a lock as
      # terminating a node is an atomic operation; better to make sure it is terminated than not.
      logger.info('Terminating expired build executor for job %s with execution id %s',
                  build_job.build_uuid, build_info.execution_id)
      yield From(self.kill_builder_executor(build_job.build_uuid))
    else:
      logger.warning('Unexpected KeyEvent (%s) on job key: %s', key_change.event, key_change.key)


  @coroutine
  def _realm_callback(self, key_change):
    logger.debug('realm callback for key: %s', key_change.key)
    if key_change.event == KeyEvent.CREATE:
      # Listen on the realm created by ourselves or another worker.
      realm_spec = json.loads(key_change.value)
      self._register_realm(realm_spec)

    elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
      # Stop listening for new connections on the realm, if we did not get the connection.
      realm_spec = json.loads(key_change.value)
      realm_id = realm_spec['realm']

      build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
      build_uuid = build_job.build_uuid

      logger.debug('Realm key %s for build %s was %s', realm_id, build_uuid, key_change.event)
      build_info = self._build_uuid_to_info.get(build_uuid, None)
      if build_info is not None:
        # Pop off the component and if we find one, then the build has not connected to this
        # manager, so we can safely unregister its component.
        component = self._component_to_job.pop(build_info.component, None)
        if component is not None:
          # We were not the manager which the worker connected to, remove the bookkeeping for it
          logger.debug('Unregistering unused component for build %s', build_uuid)
          self.unregister_component(build_info.component)

      # If the realm has expired, then perform cleanup of the executor.
      if key_change.event == KeyEvent.EXPIRE:
        execution_id = realm_spec.get('execution_id', None)
        executor_name = realm_spec.get('executor_name', 'EC2Executor')

        # Cleanup the job, since it never started.
        logger.debug('Job %s for incomplete marking: %s', build_uuid, build_info)
        if build_info is not None:
          yield From(self._mark_job_incomplete(build_job, build_info))

        # Cleanup the executor.
        logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s',
                    realm_id, build_uuid, executor_name, execution_id)
        yield From(self.terminate_executor(executor_name, execution_id))

    else:
      logger.warning('Unexpected action (%s) on realm key: %s', key_change.event, key_change.key)


  def _register_realm(self, realm_spec):
    logger.debug('Got call to register realm %s with manager', realm_spec['realm'])

    # Create the build information block for the registered realm.
    build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
    execution_id = realm_spec.get('execution_id', None)
    executor_name = realm_spec.get('executor_name', 'EC2Executor')

    logger.debug('Registering realm %s with manager: %s', realm_spec['realm'], realm_spec)
    component = self.register_component(realm_spec['realm'], BuildComponent,
                                        token=realm_spec['token'])

    build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id,
                           executor_name=executor_name)

    self._component_to_job[component] = build_job
    self._build_uuid_to_info[build_job.build_uuid] = build_info

    logger.debug('Registered realm %s with manager', realm_spec['realm'])
    return component

  @property
  def registered_executors(self):
    return self._ordered_executors

  @coroutine
  def _register_existing_realms(self):
    try:
      all_realms = yield From(self._orchestrator.get_prefixed_keys(self._realm_prefix))

      # Register all existing realms found.
      encountered = {self._register_realm(json.loads(realm_data))
                     for _realm, realm_data in all_realms}

      # Remove any components not encountered so we can clean up.
      for component, job in list(iteritems(self._component_to_job)):
        if component not in encountered:
          self._component_to_job.pop(component, None)
          self._build_uuid_to_info.pop(job.build_uuid, None)

    except KeyError:
      pass

  def _load_executor(self, executor_kind_name, executor_config):
    executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name)
    if executor_klass is None:
      logger.error('Unknown executor %s; skipping install', executor_kind_name)
      return

    executor = executor_klass(executor_config, self.manager_hostname)
    if executor.name in self._executor_name_to_executor:
      raise Exception('Executor with name %s already registered' % executor.name)

    self._ordered_executors.append(executor)
    self._executor_name_to_executor[executor.name] = executor

  def _config_prefix(self, key):
    if self._manager_config.get('ORCHESTRATOR') is None:
      return key

    prefix = self._manager_config.get('ORCHESTRATOR_PREFIX', '')
    return slash_join(prefix, key).lstrip('/') + '/'

  @property
  def _job_prefix(self):
    return self._config_prefix(JOB_PREFIX)

  @property
  def _realm_prefix(self):
    return self._config_prefix(REALM_PREFIX)

  @property
  def _cancel_prefix(self):
    return self._config_prefix(CANCEL_PREFIX)

  @property
  def _metric_prefix(self):
    return self._config_prefix(METRIC_PREFIX)

  @property
  def _expired_lock_prefix(self):
    return self._config_prefix(EXPIRED_LOCK_PREFIX)

  @property
  def _canceled_lock_prefix(self):
    return self._config_prefix(CANCELED_LOCK_PREFIX)

  def _metric_key(self, realm):
    """
    Create a key which is used to track a job in the Orchestrator.

    :param realm: realm for the build
    :type realm: str
    :returns: key used to track jobs
    :rtype: str
    """
    return slash_join(self._metric_prefix, realm)

  def _job_key(self, build_job):
    """
    Creates a key which is used to track a job in the Orchestrator.

    :param build_job: unique job identifier for a build
    :type build_job: str
    :returns: key used to track the job
    :rtype: str
    """
    return slash_join(self._job_prefix, build_job.job_details['build_uuid'])

  def _realm_key(self, realm):
    """
    Create a key which is used to track an incoming connection on a realm.

    :param realm: realm for the build
    :type realm: str
    :returns: key used to track the connection to the realm
    :rtype: str
    """
    return slash_join(self._realm_prefix, realm)


  def initialize(self, manager_config):
    logger.debug('Calling initialize')
    self._manager_config = manager_config

    # Note: Executor config can be defined either as a single block of EXECUTOR_CONFIG (old style)
    # or as a new set of executor configurations, with the order determining how we fallback. We
    # check for both here to ensure backwards compatibility.
    if manager_config.get('EXECUTORS'):
      for executor_config in manager_config['EXECUTORS']:
        self._load_executor(executor_config.get('EXECUTOR'), executor_config)
    else:
      self._load_executor(manager_config.get('EXECUTOR'), manager_config.get('EXECUTOR_CONFIG'))

    logger.debug('calling orchestrator_from_config')
    self._orchestrator = orchestrator_from_config(manager_config)

    logger.debug('setting on_key_change callbacks for job, cancel, realm')
    self._orchestrator.on_key_change(self._job_prefix, self._job_callback)
    self._orchestrator.on_key_change(self._cancel_prefix, self._cancel_callback)
    self._orchestrator.on_key_change(self._realm_prefix, self._realm_callback,
                                     restarter=self._register_existing_realms)

    # Load components for all realms currently known to the cluster
    async(self._register_existing_realms())

  def shutdown(self):
    logger.debug('Shutting down worker.')
    if self._orchestrator is not None:
      self._orchestrator.shutdown()

  @coroutine
  def schedule(self, build_job):
    build_uuid = build_job.job_details['build_uuid']
    logger.debug('Calling schedule with job: %s', build_uuid)

    # Check if there are worker slots available by checking the number of jobs in the orchestrator
    allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
    try:
      active_jobs = yield From(self._orchestrator.get_prefixed_keys(self._job_prefix))
      workers_alive = len(active_jobs)
    except KeyError:
      workers_alive = 0
    except OrchestratorConnectionError:
      logger.exception('Could not read job count from orchestrator for job due to orchestrator being down')
      raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
    except OrchestratorError:
      logger.exception('Exception when reading job count from orchestrator for job: %s', build_uuid)
      raise Return(False, RETRY_IMMEDIATELY_SLEEP_DURATION)

    logger.debug('Total jobs (scheduling job %s): %s', build_uuid, workers_alive)

    if workers_alive >= allowed_worker_count:
      logger.info('Too many workers alive, unable to start new worker for build job: %s. %s >= %s',
                  build_uuid, workers_alive, allowed_worker_count)
      raise Return(False, TOO_MANY_WORKERS_SLEEP_DURATION)

    job_key = self._job_key(build_job)

    # First try to take a lock for this job, meaning we will be responsible for its lifeline
    realm = str(uuid.uuid4())
    token = str(uuid.uuid4())
    nonce = str(uuid.uuid4())

    machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
    max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)

    payload = {
      'max_expiration': calendar.timegm(max_expiration.timetuple()),
      'nonce': nonce,
      'had_heartbeat': False,
      'job_queue_item': build_job.job_item,
    }

    lock_payload = json.dumps(payload)
    logger.debug('Writing key for job %s with expiration in %s seconds', build_uuid,
                 EPHEMERAL_SETUP_TIMEOUT)

    try:
      yield From(self._orchestrator.set_key(job_key, lock_payload, overwrite=False,
                                            expiration=EPHEMERAL_SETUP_TIMEOUT))
    except KeyError:
      logger.warning('Job: %s already exists in orchestrator, timeout may be misconfigured',
                     build_uuid)
      raise Return(False, EPHEMERAL_API_TIMEOUT)
    except OrchestratorConnectionError:
      logger.exception('Exception when writing job %s to orchestrator; could not connect',
                       build_uuid)
      raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
    except OrchestratorError:
      logger.exception('Exception when writing job %s to orchestrator', build_uuid)
      raise Return(False, RETRY_IMMEDIATELY_SLEEP_DURATION)

    # Got a lock, now lets boot the job via one of the registered executors.
    started_with_executor = None
    execution_id = None

    logger.debug("Registered executors are: %s", [ex.name for ex in self._ordered_executors])
    for executor in self._ordered_executors:
      # Check if we can use this executor based on its whitelist, by namespace.
      namespace = build_job.namespace
      if not executor.allowed_for_namespace(namespace):
        logger.debug('Job %s (namespace: %s) cannot use executor %s', build_uuid, namespace,
                     executor.name)
        continue

      # Check if we can use this executor based on the retries remaining.
      if executor.minimum_retry_threshold > build_job.retries_remaining:
        metric_queue.builder_fallback.Inc()
        logger.debug('Job %s cannot use executor %s as it is below retry threshold %s (retry #%s)',
                     build_uuid, executor.name, executor.minimum_retry_threshold,
                     build_job.retries_remaining)
        continue

      logger.debug('Starting builder for job %s with selected executor: %s', build_uuid,
                   executor.name)

      try:
        execution_id = yield From(executor.start_builder(realm, token, build_uuid))
      except:
        try:
          metric_queue.build_start_failure.Inc(labelvalues=[executor.name])
          metric_queue.put_deprecated(('ExecutorFailure-%s' % executor.name), 1, unit='Count')
        except:
          logger.exception('Exception when writing failure metric for execution %s for job %s',
                           execution_id, build_uuid)

        logger.exception('Exception when starting builder for job: %s', build_uuid)
        continue

      try:
        metric_queue.build_start_success.Inc(labelvalues=[executor.name])
      except:
        logger.exception('Exception when writing success metric for execution %s for job %s',
                         execution_id, build_uuid)

      try:
        metric_queue.ephemeral_build_workers.Inc()
      except:
        logger.exception('Exception when writing start metrics for execution %s for job %s',
                         execution_id, build_uuid)

      started_with_executor = executor

      # Break out of the loop now that we've started a builder successfully.
      break

    # If we didn't start the job, cleanup and return it to the queue.
    if started_with_executor is None:
      logger.error('Could not start ephemeral worker for build %s', build_uuid)

      # Delete the associated build job record.
      yield From(self._orchestrator.delete_key(job_key))
      raise Return(False, EPHEMERAL_API_TIMEOUT)

    # Job was started!
    logger.debug('Started execution with ID %s for job: %s with executor: %s',
                 execution_id, build_uuid, started_with_executor.name)

    # Store metric data
    metric_spec = json.dumps({
      'executor_name': started_with_executor.name,
      'start_time': time.time(),
    })

    try:
      yield From(self._orchestrator.set_key(self._metric_key(realm), metric_spec, overwrite=False,
                                            expiration=machine_max_expiration + 10))
    except KeyError:
      logger.error('Realm %s already exists in orchestrator for job %s ' +
                   'UUID collision or something is very very wrong.', realm, build_uuid)
    except OrchestratorError:
      logger.exception('Exception when writing realm %s to orchestrator for job %s',
                       realm, build_uuid)

    # Store the realm spec which will allow any manager to accept this builder when it connects
    realm_spec = json.dumps({
      'realm': realm,
      'token': token,
      'execution_id': execution_id,
      'executor_name': started_with_executor.name,
      'job_queue_item': build_job.job_item,
    })

    try:
      setup_time = started_with_executor.setup_time or self.overall_setup_time()
      logger.debug('Writing job key for job %s using executor %s with ID %s and ttl %s', build_uuid,
                   started_with_executor.name, execution_id, setup_time)
      yield From(self._orchestrator.set_key(self._realm_key(realm), realm_spec,
                                            expiration=setup_time))
    except OrchestratorConnectionError:
      logger.exception('Exception when writing realm %s to orchestrator for job %s',
                       realm, build_uuid)
      raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
    except OrchestratorError:
      logger.exception('Exception when writing realm %s to orchestrator for job %s',
                       realm, build_uuid)
      raise Return(False, setup_time)

    logger.debug('Builder spawn complete for job %s using executor %s with ID %s ',
                 build_uuid, started_with_executor.name, execution_id)
    raise Return(True, None)

  @coroutine
  def build_component_ready(self, build_component):
    logger.debug('Got component ready for component with realm %s', build_component.builder_realm)

    # Pop off the job for the component.
    # We do so before we send out the watch below, as it will also remove this mapping.
    job = self._component_to_job.pop(build_component, None)
    if job is None:
      # This will occur once the build finishes, so no need to worry about it.
      # We log in case it happens outside of the expected flow.
      logger.debug('Could not find job for the build component on realm %s; component is ready',
                   build_component.builder_realm)
      raise Return()

    # Start the build job.
    logger.debug('Sending build %s to newly ready component on realm %s',
                 job.build_uuid, build_component.builder_realm)
    yield From(build_component.start_build(job))

    yield From(self._write_duration_metric(metric_queue.builder_time_to_build,
                                           build_component.builder_realm))

    # Clean up the bookkeeping for allowing any manager to take the job.
    try:
      yield From(self._orchestrator.delete_key(self._realm_key(build_component.builder_realm)))
    except KeyError:
      logger.warning('Could not delete realm key %s', build_component.builder_realm)

  def build_component_disposed(self, build_component, timed_out):
    logger.debug('Calling build_component_disposed.')
    self.unregister_component(build_component)

  @coroutine
  def job_completed(self, build_job, job_status, build_component):
    logger.debug('Calling job_completed for job %s with status: %s',
                 build_job.build_uuid, job_status)

    yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))

    # Mark the job as completed. Since this is being invoked from the component, we don't need
    # to ask for the phase to be updated as well.
    build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
    executor_name = build_info.executor_name if build_info else None
    yield From(self.job_complete_callback(build_job, job_status, executor_name, update_phase=False))

    # Kill the ephemeral builder.
    yield From(self.kill_builder_executor(build_job.build_uuid))

    # Delete the build job from the orchestrator.
    try:
      job_key = self._job_key(build_job)
      yield From(self._orchestrator.delete_key(job_key))
    except KeyError:
      logger.debug('Builder is asking for job to be removed, but work already completed')
    except OrchestratorConnectionError:
      logger.exception('Could not remove job key as orchestrator is not available')
      yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
      raise Return()

    # Delete the metric from the orchestrator.
    try:
      metric_key = self._metric_key(build_component.builder_realm)
      yield From(self._orchestrator.delete_key(metric_key))
    except KeyError:
      logger.debug('Builder is asking for metric to be removed, but key not found')
    except OrchestratorConnectionError:
      logger.exception('Could not remove metric key as orchestrator is not available')
      yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
      raise Return()

    logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)

  @coroutine
  def kill_builder_executor(self, build_uuid):
    logger.info('Starting termination of executor for job %s', build_uuid)
    build_info = self._build_uuid_to_info.pop(build_uuid, None)
    if build_info is None:
      logger.debug('Build information not found for build %s; skipping termination', build_uuid)
      raise Return()

    # Remove the build's component.
    self._component_to_job.pop(build_info.component, None)

    # Stop the build node/executor itself.
    yield From(self.terminate_executor(build_info.executor_name, build_info.execution_id))

  @coroutine
  def terminate_executor(self, executor_name, execution_id):
    executor = self._executor_name_to_executor.get(executor_name)
    if executor is None:
      logger.error('Could not find registered executor %s', executor_name)
      raise Return()

    # Terminate the executor's execution.
    logger.info('Terminating executor %s with execution id %s', executor_name, execution_id)
    yield From(executor.stop_builder(execution_id))

  @coroutine
  def job_heartbeat(self, build_job):
    """
    :param build_job: the identifier for the build
    :type build_job: str
    """
    self.job_heartbeat_callback(build_job)
    self._extend_job_in_orchestrator(build_job)

  @coroutine
  def _extend_job_in_orchestrator(self, build_job):
    try:
      job_data = yield From(self._orchestrator.get_key(self._job_key(build_job)))
    except KeyError:
      logger.info('Job %s no longer exists in the orchestrator', build_job.build_uuid)
      raise Return()
    except OrchestratorConnectionError:
      logger.exception('failed to connect when attempted to extend job')
      raise Return()

    build_job_metadata = json.loads(job_data)

    max_expiration = datetime.utcfromtimestamp(build_job_metadata['max_expiration'])
    max_expiration_remaining = max_expiration - datetime.utcnow()
    max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds()))

    ttl = min(self.heartbeat_period_sec * 2, max_expiration_sec)
    payload = {
      'job_queue_item': build_job.job_item,
      'max_expiration': build_job_metadata['max_expiration'],
      'had_heartbeat': True,
    }

    try:
      yield From(self._orchestrator.set_key(self._job_key(build_job), json.dumps(payload),
                                            expiration=ttl))
    except OrchestratorConnectionError:
      logger.exception('Could not update heartbeat for job as the orchestrator is not available')
      yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))

  @coroutine
  def _write_duration_metric(self, metric, realm):
    """
    :returns: True if the metric was written, otherwise False
    :rtype: bool
    """
    try:
      metric_data = yield From(self._orchestrator.get_key(self._metric_key(realm)))
      parsed_metric_data = json.loads(metric_data)
      start_time = parsed_metric_data['start_time']
      metric.Observe(time.time() - start_time,
                     labelvalues=[parsed_metric_data.get('executor_name', 'unknown')])
    except Exception:
      logger.exception("Could not write metric for realm %s", realm)

  def num_workers(self):
    """
    The number of workers we're managing locally.

    :returns: the number of the workers locally managed
    :rtype: int
    """
    return len(self._component_to_job)


  @coroutine
  def _cancel_callback(self, key_change):
    if key_change.event not in (KeyEvent.CREATE, KeyEvent.SET):
      raise Return()

    build_uuid = key_change.value
    build_info = self._build_uuid_to_info.get(build_uuid, None)
    if build_info is None:
      logger.debug('No build info for "%s" job %s', key_change.event, build_uuid)
      raise Return(False)

    lock_key = slash_join(self._canceled_lock_prefix,
                          build_uuid, build_info.execution_id)
    lock_acquired = yield From(self._orchestrator.lock(lock_key))
    if lock_acquired:
      builder_realm = build_info.component.builder_realm
      yield From(self.kill_builder_executor(build_uuid))
      yield From(self._orchestrator.delete_key(self._realm_key(builder_realm)))
      yield From(self._orchestrator.delete_key(self._metric_key(builder_realm)))
      yield From(self._orchestrator.delete_key(slash_join(self._job_prefix, build_uuid)))

    # This is outside the lock so we can un-register the component wherever it is registered to.
    yield From(build_info.component.cancel_build())
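A sketch (not part of this commit) of the orchestrator key layout the manager builds from these prefixes, using a hypothetical ORCHESTRATOR_PREFIX and hypothetical build/realm IDs; config_prefix below mirrors EphemeralBuilderManager._config_prefix and uses the same slash_join helper the file imports.

from util import slash_join

JOB_PREFIX = 'building/'
REALM_PREFIX = 'realm/'


def config_prefix(manager_config, key):
  # Mirrors EphemeralBuilderManager._config_prefix: a no-op without ORCHESTRATOR,
  # otherwise the configured prefix is joined onto the key.
  if manager_config.get('ORCHESTRATOR') is None:
    return key
  prefix = manager_config.get('ORCHESTRATOR_PREFIX', '')
  return slash_join(prefix, key).lstrip('/') + '/'


config = {'ORCHESTRATOR': {}, 'ORCHESTRATOR_PREFIX': 'buildman'}

# Per-build job key: written by schedule() with the EPHEMERAL_SETUP_TIMEOUT TTL and
# refreshed by job_heartbeat(); its expiration triggers the _job_callback cleanup path.
job_key = slash_join(config_prefix(config, JOB_PREFIX), 'some-build-uuid')

# Realm key: lets whichever manager the worker connects to claim the build; its
# expiration triggers the _realm_callback cleanup of the executor.
realm_key = slash_join(config_prefix(config, REALM_PREFIX), 'some-realm-uuid')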
37
buildman/manager/etcd_canceller.py
Normal file
@@ -0,0 +1,37 @@
import logging
import etcd

logger = logging.getLogger(__name__)


class EtcdCanceller(object):
  """ A class that sends a message to etcd to cancel a build """

  def __init__(self, config):
    etcd_host = config.get('ETCD_HOST', '127.0.0.1')
    etcd_port = config.get('ETCD_PORT', 2379)
    etcd_ca_cert = config.get('ETCD_CA_CERT', None)
    etcd_auth = config.get('ETCD_CERT_AND_KEY', None)
    if etcd_auth is not None:
      etcd_auth = tuple(etcd_auth)

    etcd_protocol = 'http' if etcd_auth is None else 'https'
    logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
    self._cancel_prefix = config.get('ETCD_CANCEL_PREFIX', 'cancel/')
    self._etcd_client = etcd.Client(
      host=etcd_host,
      port=etcd_port,
      cert=etcd_auth,
      ca_cert=etcd_ca_cert,
      protocol=etcd_protocol,
      read_timeout=5)

  def try_cancel_build(self, build_uuid):
    """ Writes etcd message to cancel build_uuid. """
    logger.info("Cancelling build %s", build_uuid)
    try:
      self._etcd_client.write("{}{}".format(self._cancel_prefix, build_uuid), build_uuid, ttl=60)
      return True
    except etcd.EtcdException:
      logger.exception("Failed to write to etcd client %s", build_uuid)
      return False
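For illustration (not part of this commit), the cancellation round trip with hypothetical endpoint and build ID: EtcdCanceller writes the cancel key with a 60-second TTL; on the manager side, the watch on the cancel prefix fires EphemeralBuilderManager._cancel_callback, which takes the job-cancelled lock and tears the build down.

from buildman.manager.etcd_canceller import EtcdCanceller

canceller = EtcdCanceller({
  'ETCD_HOST': 'etcd.example.invalid',  # hypothetical endpoint
  'ETCD_PORT': 2379,
  'ETCD_CANCEL_PREFIX': 'cancel/',
})

# Writes 'cancel/<build_uuid>' and returns True on success, False on an etcd error.
canceller.try_cancel_build('hypothetical-build-uuid')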
560
buildman/manager/executor.py
Normal file
560
buildman/manager/executor.py
Normal file
|
@ -0,0 +1,560 @@
|
|||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import threading
|
||||
import uuid
|
||||
|
||||
from functools import partial
|
||||
|
||||
import boto.ec2
|
||||
import cachetools.func
|
||||
import requests
|
||||
import trollius
|
||||
|
||||
from container_cloud_config import CloudConfigContext
|
||||
from jinja2 import FileSystemLoader, Environment
|
||||
from trollius import coroutine, From, Return, get_event_loop
|
||||
|
||||
import release
|
||||
|
||||
from buildman.asyncutil import AsyncWrapper
|
||||
from app import metric_queue, app
|
||||
from util.metrics.metricqueue import duration_collector_async
|
||||
from _init import ROOT_DIR
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
ONE_HOUR = 60*60
|
||||
|
||||
_TAG_RETRY_COUNT = 3 # Number of times to retry adding tags.
|
||||
_TAG_RETRY_SLEEP = 2 # Number of seconds to wait between tag retries.
|
||||
|
||||
ENV = Environment(loader=FileSystemLoader(os.path.join(ROOT_DIR, "buildman/templates")))
|
||||
TEMPLATE = ENV.get_template('cloudconfig.yaml')
|
||||
CloudConfigContext().populate_jinja_environment(ENV)
|
||||
|
||||
class ExecutorException(Exception):
|
||||
""" Exception raised when there is a problem starting or stopping a builder.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class BuilderExecutor(object):
|
||||
def __init__(self, executor_config, manager_hostname):
|
||||
""" Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
|
||||
starting and stopping builders.
|
||||
"""
|
||||
self.executor_config = executor_config
|
||||
self.manager_hostname = manager_hostname
|
||||
|
||||
default_websocket_scheme = 'wss' if app.config['PREFERRED_URL_SCHEME'] == 'https' else 'ws'
|
||||
self.websocket_scheme = executor_config.get("WEBSOCKET_SCHEME", default_websocket_scheme)
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
""" Name returns the unique name for this executor. """
|
||||
return self.executor_config.get('NAME') or self.__class__.__name__
|
||||
|
||||
@property
|
||||
def setup_time(self):
|
||||
""" Returns the amount of time (in seconds) to wait for the execution to start for the build.
|
||||
If None, the manager's default will be used.
|
||||
"""
|
||||
return self.executor_config.get('SETUP_TIME')
|
||||
|
||||
@coroutine
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
""" Create a builder with the specified config. Returns a unique id which can be used to manage
|
||||
the builder.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, builder_id):
|
||||
""" Stop a builder which is currently running.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def allowed_for_namespace(self, namespace):
|
||||
""" Returns true if this executor can be used for builds in the given namespace. """
|
||||
|
||||
# Check for an explicit namespace whitelist.
|
||||
namespace_whitelist = self.executor_config.get('NAMESPACE_WHITELIST')
|
||||
if namespace_whitelist is not None and namespace in namespace_whitelist:
|
||||
return True
|
||||
|
||||
# Check for a staged rollout percentage. If found, we hash the namespace and, if it is found
|
||||
# in the first X% of the character space, we allow this executor to be used.
|
||||
staged_rollout = self.executor_config.get('STAGED_ROLLOUT')
|
||||
if staged_rollout is not None:
|
||||
bucket = int(hashlib.sha256(namespace).hexdigest()[-2:], 16)
|
||||
return bucket < (256 * staged_rollout)
|
||||
|
||||
# If there are no restrictions in place, we are free to use this executor.
|
||||
return staged_rollout is None and namespace_whitelist is None
|
||||
|
||||
@property
|
||||
def minimum_retry_threshold(self):
|
||||
""" Returns the minimum number of retries required for this executor to be used or 0 if
|
||||
none. """
|
||||
return self.executor_config.get('MINIMUM_RETRY_THRESHOLD', 0)
|
||||
|
||||
def generate_cloud_config(self, realm, token, build_uuid, coreos_channel,
|
||||
manager_hostname, quay_username=None,
|
||||
quay_password=None):
|
||||
if quay_username is None:
|
||||
quay_username = self.executor_config['QUAY_USERNAME']
|
||||
|
||||
if quay_password is None:
|
||||
quay_password = self.executor_config['QUAY_PASSWORD']
|
||||
|
||||
return TEMPLATE.render(
|
||||
realm=realm,
|
||||
token=token,
|
||||
build_uuid=build_uuid,
|
||||
quay_username=quay_username,
|
||||
quay_password=quay_password,
|
||||
manager_hostname=manager_hostname,
|
||||
websocket_scheme=self.websocket_scheme,
|
||||
coreos_channel=coreos_channel,
|
||||
worker_image=self.executor_config.get('WORKER_IMAGE', 'quay.io/coreos/registry-build-worker'),
|
||||
worker_tag=self.executor_config['WORKER_TAG'],
|
||||
logentries_token=self.executor_config.get('LOGENTRIES_TOKEN', None),
|
||||
volume_size=self.executor_config.get('VOLUME_SIZE', '42G'),
|
||||
max_lifetime_s=self.executor_config.get('MAX_LIFETIME_S', 10800),
|
||||
ssh_authorized_keys=self.executor_config.get('SSH_AUTHORIZED_KEYS', []),
|
||||
)
|
||||
|
||||
|
||||
class EC2Executor(BuilderExecutor):
|
||||
""" Implementation of BuilderExecutor which uses libcloud to start machines on a variety of cloud
|
||||
providers.
|
||||
"""
|
||||
COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt'
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._loop = get_event_loop()
|
||||
super(EC2Executor, self).__init__(*args, **kwargs)
|
||||
|
||||
def _get_conn(self):
|
||||
""" Creates an ec2 connection which can be used to manage instances.
|
||||
"""
|
||||
return AsyncWrapper(boto.ec2.connect_to_region(
|
||||
self.executor_config['EC2_REGION'],
|
||||
aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'],
|
||||
aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'],
|
||||
))
|
||||
|
||||
@classmethod
|
||||
@cachetools.func.ttl_cache(ttl=ONE_HOUR)
|
||||
def _get_coreos_ami(cls, ec2_region, coreos_channel):
|
||||
""" Retrieve the CoreOS AMI id from the canonical listing.
|
||||
"""
|
||||
stack_list_string = requests.get(EC2Executor.COREOS_STACK_URL % coreos_channel).text
|
||||
stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
|
||||
return stack_amis[ec2_region]
|
||||
|
||||
@coroutine
|
||||
@duration_collector_async(metric_queue.builder_time_to_start, ['ec2'])
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
region = self.executor_config['EC2_REGION']
|
||||
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
|
||||
|
||||
coreos_ami = self.executor_config.get('COREOS_AMI', None)
|
||||
if coreos_ami is None:
|
||||
get_ami_callable = partial(self._get_coreos_ami, region, channel)
|
||||
coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
|
||||
|
||||
user_data = self.generate_cloud_config(realm, token, build_uuid, channel, self.manager_hostname)
|
||||
logger.debug('Generated cloud config for build %s: %s', build_uuid, user_data)
|
||||
|
||||
ec2_conn = self._get_conn()
|
||||
|
||||
ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType(
|
||||
size=int(self.executor_config.get('BLOCK_DEVICE_SIZE', 48)),
|
||||
volume_type='gp2',
|
||||
delete_on_termination=True,
|
||||
)
|
||||
block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping()
|
||||
block_devices['/dev/xvda'] = ssd_root_ebs
|
||||
|
||||
interfaces = None
|
||||
if self.executor_config.get('EC2_VPC_SUBNET_ID', None) is not None:
|
||||
interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
|
||||
subnet_id=self.executor_config['EC2_VPC_SUBNET_ID'],
|
||||
groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
|
||||
associate_public_ip_address=True,
|
||||
)
|
||||
interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
|
||||
|
||||
try:
|
||||
reservation = yield From(ec2_conn.run_instances(
|
||||
coreos_ami,
|
||||
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
|
||||
key_name=self.executor_config.get('EC2_KEY_NAME', None),
|
||||
user_data=user_data,
|
||||
instance_initiated_shutdown_behavior='terminate',
|
||||
block_device_map=block_devices,
|
||||
network_interfaces=interfaces,
|
||||
))
|
||||
except boto.exception.EC2ResponseError as ec2e:
|
||||
logger.exception('Unable to spawn builder instance')
|
||||
metric_queue.ephemeral_build_worker_failure.Inc()
|
||||
raise ec2e
|
||||
|
||||
if not reservation.instances:
|
||||
raise ExecutorException('Unable to spawn builder instance.')
|
||||
elif len(reservation.instances) != 1:
|
||||
raise ExecutorException('EC2 started wrong number of instances!')
|
||||
|
||||
launched = AsyncWrapper(reservation.instances[0])
|
||||
|
||||
# Sleep a few seconds to wait for AWS to spawn the instance.
|
||||
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
|
||||
|
||||
# Tag the instance with its metadata.
|
||||
for i in range(0, _TAG_RETRY_COUNT):
|
||||
try:
|
||||
yield From(launched.add_tags({
|
||||
'Name': 'Quay Ephemeral Builder',
|
||||
'Realm': realm,
|
||||
'Token': token,
|
||||
'BuildUUID': build_uuid,
|
||||
}))
|
||||
except boto.exception.EC2ResponseError as ec2e:
|
||||
if ec2e.error_code == 'InvalidInstanceID.NotFound':
|
||||
if i < _TAG_RETRY_COUNT - 1:
|
||||
logger.warning('Failed to write EC2 tags for instance %s for build %s (attempt #%s)',
|
||||
launched.id, build_uuid, i)
|
||||
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
|
||||
continue
|
||||
|
||||
raise ExecutorException('Unable to find builder instance.')
|
||||
|
||||
logger.exception('Failed to write EC2 tags (attempt #%s)', i)
|
||||
|
||||
logger.debug('Machine with ID %s started for build %s', launched.id, build_uuid)
|
||||
raise Return(launched.id)
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, builder_id):
|
||||
try:
|
||||
ec2_conn = self._get_conn()
|
||||
terminated_instances = yield From(ec2_conn.terminate_instances([builder_id]))
|
||||
except boto.exception.EC2ResponseError as ec2e:
|
||||
if ec2e.error_code == 'InvalidInstanceID.NotFound':
|
||||
logger.debug('Instance %s already terminated', builder_id)
|
||||
return
|
||||
|
||||
logger.exception('Exception when trying to terminate instance %s', builder_id)
|
||||
raise
|
||||
|
||||
if builder_id not in [si.id for si in terminated_instances]:
|
||||
raise ExecutorException('Unable to terminate instance: %s' % builder_id)
|
||||
|
||||
|
||||
class PopenExecutor(BuilderExecutor):
|
||||
""" Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
|
||||
"""
|
||||
def __init__(self, executor_config, manager_hostname):
|
||||
self._jobs = {}
|
||||
|
||||
super(PopenExecutor, self).__init__(executor_config, manager_hostname)
|
||||
|
||||
""" Executor which uses Popen to fork a quay-builder process.
|
||||
"""
|
||||
@coroutine
|
||||
@duration_collector_async(metric_queue.builder_time_to_start, ['fork'])
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
# Now start a machine for this job, adding the machine id to the etcd information
|
||||
logger.debug('Forking process for build')
|
||||
|
||||
ws_host = os.environ.get("BUILDMAN_WS_HOST", "localhost")
|
||||
ws_port = os.environ.get("BUILDMAN_WS_PORT", "8787")
|
||||
builder_env = {
|
||||
'TOKEN': token,
|
||||
'REALM': realm,
|
||||
'ENDPOINT': 'ws://%s:%s' % (ws_host, ws_port),
|
||||
'DOCKER_TLS_VERIFY': os.environ.get('DOCKER_TLS_VERIFY', ''),
|
||||
'DOCKER_CERT_PATH': os.environ.get('DOCKER_CERT_PATH', ''),
|
||||
'DOCKER_HOST': os.environ.get('DOCKER_HOST', ''),
|
||||
'PATH': "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||
}
|
||||
|
||||
logpipe = LogPipe(logging.INFO)
|
||||
spawned = subprocess.Popen(os.environ.get('BUILDER_BINARY_LOCATION',
|
||||
'/usr/local/bin/quay-builder'),
|
||||
stdout=logpipe,
|
||||
stderr=logpipe,
|
||||
env=builder_env)
|
||||
|
||||
builder_id = str(uuid.uuid4())
|
||||
self._jobs[builder_id] = (spawned, logpipe)
|
||||
logger.debug('Builder spawned with id: %s', builder_id)
|
||||
raise Return(builder_id)
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, builder_id):
|
||||
if builder_id not in self._jobs:
|
||||
raise ExecutorException('Builder id not being tracked by executor.')
|
||||
|
||||
logger.debug('Killing builder with id: %s', builder_id)
|
||||
spawned, logpipe = self._jobs[builder_id]
|
||||
|
||||
if spawned.poll() is None:
|
||||
spawned.kill()
|
||||
logpipe.close()
|
||||
|
||||
|
||||
class KubernetesExecutor(BuilderExecutor):
|
||||
""" Executes build jobs by creating Kubernetes jobs which run a qemu-kvm virtual
|
||||
machine in a pod """
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(KubernetesExecutor, self).__init__(*args, **kwargs)
|
||||
self._loop = get_event_loop()
|
||||
self.namespace = self.executor_config.get('BUILDER_NAMESPACE', 'builder')
|
||||
self.image = self.executor_config.get('BUILDER_VM_CONTAINER_IMAGE',
|
||||
'quay.io/quay/quay-builder-qemu-coreos:stable')
|
||||
|
||||
@coroutine
|
||||
def _request(self, method, path, **kwargs):
|
||||
request_options = dict(kwargs)
|
||||
|
||||
tls_cert = self.executor_config.get('K8S_API_TLS_CERT')
|
||||
tls_key = self.executor_config.get('K8S_API_TLS_KEY')
|
||||
tls_ca = self.executor_config.get('K8S_API_TLS_CA')
|
||||
service_account_token = self.executor_config.get('SERVICE_ACCOUNT_TOKEN')
|
||||
|
||||
if 'timeout' not in request_options:
|
||||
request_options['timeout'] = self.executor_config.get("K8S_API_TIMEOUT", 20)
|
||||
|
||||
if service_account_token:
|
||||
scheme = 'https'
|
||||
request_options['headers'] = {'Authorization': 'Bearer ' + service_account_token}
|
||||
logger.debug('Using service account token for Kubernetes authentication')
|
||||
elif tls_cert and tls_key:
|
||||
scheme = 'https'
|
||||
request_options['cert'] = (tls_cert, tls_key)
|
||||
logger.debug('Using tls certificate and key for Kubernetes authentication')
|
||||
if tls_ca:
|
||||
request_options['verify'] = tls_ca
|
||||
else:
|
||||
scheme = 'http'
|
||||
|
||||
server = self.executor_config.get('K8S_API_SERVER', 'localhost:8080')
|
||||
url = '%s://%s%s' % (scheme, server, path)
|
||||
|
||||
logger.debug('Executor config: %s', self.executor_config)
|
||||
logger.debug('Kubernetes request: %s %s: %s', method, url, request_options)
|
||||
res = requests.request(method, url, **request_options)
|
||||
logger.debug('Kubernetes response: %s: %s', res.status_code, res.text)
|
||||
raise Return(res)
|
||||
|
||||
def _jobs_path(self):
|
||||
return '/apis/batch/v1/namespaces/%s/jobs' % self.namespace
|
||||
|
||||
def _job_path(self, build_uuid):
|
||||
return '%s/%s' % (self._jobs_path(), build_uuid)
|
||||
|
||||
def _kubernetes_distribution(self):
|
||||
return self.executor_config.get('KUBERNETES_DISTRIBUTION', 'basic').lower()
|
||||
|
||||
def _is_basic_kubernetes_distribution(self):
|
||||
return self._kubernetes_distribution() == 'basic'
|
||||
|
||||
def _is_openshift_kubernetes_distribution(self):
|
||||
return self._kubernetes_distribution() == 'openshift'
|
||||
|
||||
def _build_job_container_resources(self):
|
||||
# Minimum acceptable free resources for this container to "fit" in a quota
|
||||
# These may be lower than the absolute limits if the cluster is knowingly
|
||||
# oversubscribed by some amount.
|
||||
container_requests = {
|
||||
'memory' : self.executor_config.get('CONTAINER_MEMORY_REQUEST', '3968Mi'),
|
||||
}
|
||||
|
||||
container_limits = {
|
||||
'memory' : self.executor_config.get('CONTAINER_MEMORY_LIMITS', '5120Mi'),
|
||||
'cpu' : self.executor_config.get('CONTAINER_CPU_LIMITS', '1000m'),
|
||||
}
|
||||
|
||||
resources = {
|
||||
'requests': container_requests,
|
||||
}
|
||||
|
||||
if self._is_openshift_kubernetes_distribution():
|
||||
resources['requests']['cpu'] = self.executor_config.get('CONTAINER_CPU_REQUEST', '500m')
|
||||
resources['limits'] = container_limits
|
||||
|
||||
return resources

  def _build_job_containers(self, user_data):
    vm_memory_limit = self.executor_config.get('VM_MEMORY_LIMIT', '4G')
    vm_volume_size = self.executor_config.get('VOLUME_SIZE', '32G')

    container = {
      'name': 'builder',
      'imagePullPolicy': 'IfNotPresent',
      'image': self.image,
      'securityContext': {'privileged': True},
      'env': [
        {'name': 'USERDATA', 'value': user_data},
        {'name': 'VM_MEMORY', 'value': vm_memory_limit},
        {'name': 'VM_VOLUME_SIZE', 'value': vm_volume_size},
      ],
      'resources': self._build_job_container_resources(),
    }

    if self._is_basic_kubernetes_distribution():
      container['volumeMounts'] = [{'name': 'secrets-mask', 'mountPath': '/var/run/secrets/kubernetes.io/serviceaccount'}]

    return container

  def _job_resource(self, build_uuid, user_data, coreos_channel='stable'):
    image_pull_secret_name = self.executor_config.get('IMAGE_PULL_SECRET_NAME', 'builder')
    service_account = self.executor_config.get('SERVICE_ACCOUNT_NAME', 'quay-builder-sa')
    node_selector_label_key = self.executor_config.get('NODE_SELECTOR_LABEL_KEY',
                                                       'beta.kubernetes.io/instance-type')
    node_selector_label_value = self.executor_config.get('NODE_SELECTOR_LABEL_VALUE', '')

    node_selector = {
      node_selector_label_key: node_selector_label_value
    }

    release_sha = release.GIT_HEAD or 'none'
    if ' ' in release_sha:
      release_sha = 'HEAD'

    job_resource = {
      'apiVersion': 'batch/v1',
      'kind': 'Job',
      'metadata': {
        'namespace': self.namespace,
        'generateName': build_uuid + '-',
        'labels': {
          'build': build_uuid,
          'time': datetime.datetime.now().strftime('%Y-%m-%d-%H'),
          'manager': socket.gethostname(),
          'quay-sha': release_sha,
        },
      },
      'spec': {
        'activeDeadlineSeconds': self.executor_config.get('MAXIMUM_JOB_TIME', 7200),
        'template': {
          'metadata': {
            'labels': {
              'build': build_uuid,
              'time': datetime.datetime.now().strftime('%Y-%m-%d-%H'),
              'manager': socket.gethostname(),
              'quay-sha': release_sha,
            },
          },
          'spec': {
            'imagePullSecrets': [{'name': image_pull_secret_name}],
            'restartPolicy': 'Never',
            'dnsPolicy': 'Default',
            'containers': [self._build_job_containers(user_data)],
          },
        },
      },
    }

    if self._is_openshift_kubernetes_distribution():
      # Setting `automountServiceAccountToken` to false prevents automounting API credentials
      # for the service account.
      job_resource['spec']['template']['spec']['automountServiceAccountToken'] = False

      # Use a dedicated service account that has no authorization to any resources.
      job_resource['spec']['template']['spec']['serviceAccount'] = service_account

      # Setting `enableServiceLinks` to false prevents information about other services from
      # being injected into the pod's environment variables. The pod has no visibility into
      # other services on the cluster.
      job_resource['spec']['template']['spec']['enableServiceLinks'] = False

      if node_selector_label_value.strip() != '':
        job_resource['spec']['template']['spec']['nodeSelector'] = node_selector

    if self._is_basic_kubernetes_distribution():
      # This volume is a hack to mask the token for the namespace's default service account,
      # which is placed in a file mounted under
      # `/var/run/secrets/kubernetes.io/serviceaccount` in all pods. There's currently no
      # other way to just disable the service account at either the pod or namespace level.
      #
      # https://github.com/kubernetes/kubernetes/issues/16779
      #
      job_resource['spec']['template']['spec']['volumes'] = [{'name': 'secrets-mask', 'emptyDir': {'medium': 'Memory'}}]

    return job_resource
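
  # For orientation (an abridged, illustrative sketch of what the dict above serializes to,
  # using the default values; not guaranteed output):
  #
  #   apiVersion: batch/v1
  #   kind: Job
  #   metadata: {generateName: <build_uuid>-, namespace: builder}
  #   spec:
  #     activeDeadlineSeconds: 7200
  #     template:
  #       spec: {restartPolicy: Never, dnsPolicy: Default, containers: [<builder container>]}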

  @coroutine
  @duration_collector_async(metric_queue.builder_time_to_start, ['k8s'])
  def start_builder(self, realm, token, build_uuid):
    # Generate the job resource.
    channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
    user_data = self.generate_cloud_config(realm, token, build_uuid, channel, self.manager_hostname)
    resource = self._job_resource(build_uuid, user_data, channel)
    logger.debug('Using Kubernetes Distribution: %s', self._kubernetes_distribution())
    logger.debug('Generated kubernetes resource:\n%s', resource)

    # Schedule the job on the cluster.
    create_job = yield From(self._request('POST', self._jobs_path(), json=resource))
    if int(create_job.status_code / 100) != 2:
      raise ExecutorException('Failed to create job: %s: %s: %s' %
                              (build_uuid, create_job.status_code, create_job.text))

    job = create_job.json()
    raise Return(job['metadata']['name'])
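
  # Note (added for orientation, not in the original commit): because the resource uses
  # `generateName`, the API server assigns the final Job name, which is why the name is read
  # back from the creation response above rather than derived from the build UUID alone.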

  @coroutine
  def stop_builder(self, builder_id):
    pods_path = '/api/v1/namespaces/%s/pods' % self.namespace

    # Delete the job itself.
    try:
      yield From(self._request('DELETE', self._job_path(builder_id)))
    except Exception:
      logger.exception('Failed to send delete job call for job %s', builder_id)

    # Delete the pod(s) for the job.
    selectorString = 'job-name=%s' % builder_id
    try:
      yield From(self._request('DELETE', pods_path, params=dict(labelSelector=selectorString)))
    except Exception:
      logger.exception('Failed to send delete pod call for job %s', builder_id)


class LogPipe(threading.Thread):
  """ Adapted from http://codereview.stackexchange.com/a/17959 """
  def __init__(self, level):
    """ Set up the object with a log level, create the pipe and start the reader thread. """
    threading.Thread.__init__(self)
    self.daemon = False
    self.level = level
    self.fd_read, self.fd_write = os.pipe()
    self.pipe_reader = os.fdopen(self.fd_read)
    self.start()

  def fileno(self):
    """ Return the write file descriptor of the pipe. """
    return self.fd_write

  def run(self):
    """ Run the thread, logging everything read from the pipe. """
    for line in iter(self.pipe_reader.readline, ''):
      logging.log(self.level, line.strip('\n'))

    self.pipe_reader.close()

  def close(self):
    """ Close the write end of the pipe. """
    os.close(self.fd_write)
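

def _log_pipe_usage_example():
  """ Illustrative sketch only, not part of the original commit: shows how a LogPipe is
      typically handed to a subprocess so the child's output is re-logged through the
      standard logging module. The echo command is an arbitrary placeholder. """
  import subprocess  # local import to keep this sketch self-contained

  out_pipe = LogPipe(logging.INFO)
  try:
    # subprocess uses out_pipe.fileno() (the pipe's write end) as the child's stdout.
    subprocess.check_call(['echo', 'hello from the builder'], stdout=out_pipe)
  finally:
    # Closing the write end lets the reader thread see EOF and shut down cleanly.
    out_pipe.close()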

8
buildman/manager/noop_canceller.py
Normal file
@@ -0,0 +1,8 @@
class NoopCanceller(object):
  """ A canceller that cannot cancel a build. """
  def __init__(self, config=None):
    pass

  def try_cancel_build(self, uuid):
    """ Does nothing and always fails to cancel the build. """
    return False
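
# Illustrative usage sketch (not part of the original commit): the canceller interface is
# duck-typed, so a NoopCanceller can stand in anywhere a canceller is expected, e.g.:
#
#   canceller = NoopCanceller()
#   assert canceller.try_cancel_build('some-build-uuid') is False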

26
buildman/manager/orchestrator_canceller.py
Normal file
@@ -0,0 +1,26 @@
import logging

from buildman.orchestrator import orchestrator_from_config, OrchestratorError
from util import slash_join


logger = logging.getLogger(__name__)


CANCEL_PREFIX = 'cancel/'


class OrchestratorCanceller(object):
  """ An asynchronous way to cancel a build with any Orchestrator. """
  def __init__(self, config):
    self._orchestrator = orchestrator_from_config(config, canceller_only=True)

  def try_cancel_build(self, build_uuid):
    logger.info('Cancelling build %s', build_uuid)
    cancel_key = slash_join(CANCEL_PREFIX, build_uuid)
    try:
      self._orchestrator.set_key_sync(cancel_key, build_uuid, expiration=60)
      return True
    except OrchestratorError:
      logger.exception('Failed to write cancel action to the orchestrator for build %s', build_uuid)
      return False
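
# Illustrative note (an assumption, not shown in this commit): a consumer on the build-manager
# side is expected to watch keys under CANCEL_PREFIX through the same orchestrator and stop the
# matching build when one appears; the 60-second expiration keeps stale cancel markers from
# lingering if no consumer picks them up.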