# quay/buildman/manager/ephemeral.py
import logging
import etcd
import uuid
import calendar
import json
import os.path

from datetime import datetime, timedelta
from trollius import From, coroutine, Return, async
from concurrent.futures import ThreadPoolExecutor

from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor
from buildman.component.buildcomponent import BuildComponent
from buildman.asyncutil import AsyncWrapper

logger = logging.getLogger(__name__)

ETCD_BUILDER_PREFIX = 'building/'
ETCD_EXPIRE_RESULT = 'expire'
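
# Jobs are tracked under ETCD_BUILDER_PREFIX with one key per build (see
# _etcd_job_key, e.g. 'building/<build_uuid>'). Each key holds a JSON payload
# with an 'expiration' timestamp and, once a build node has been started, the
# 'builder_id' of that node.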

class EphemeralBuilderManager(BaseManager):
  """ Build manager implementation for the Enterprise Registry. """

  _executors = {
    'popen': PopenExecutor,
    'ec2': EC2Executor,
  }

  _etcd_client_klass = etcd.Client

  def __init__(self, *args, **kwargs):
    self._shutting_down = False
    self._manager_config = None
    self._async_thread_executor = None
    self._etcd_client = None

    self._component_to_job = {}
    self._component_to_builder = {}

    self._executor = None
    self._worker_watch_task = None

    super(EphemeralBuilderManager, self).__init__(*args, **kwargs)

  def _watch_builders(self):
    """ Watch the builders key for expirations. """
    if not self._shutting_down:
      workers_future = self._etcd_client.watch(ETCD_BUILDER_PREFIX, recursive=True)
      workers_future.add_done_callback(self._handle_key_expiration)

      logger.debug('Scheduling watch task.')
      self._worker_watch_task = async(workers_future)
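
  # Note: a python-etcd watch resolves at most once per call, so the expiration
  # callback below re-arms the watch before handling the event it received.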

  def _handle_key_expiration(self, changed_key_future):
    """ Handle when a builder expires. """
    if self._worker_watch_task is None or self._worker_watch_task.done():
      self._watch_builders()

    if changed_key_future.cancelled():
      # Due to lack of interest, tomorrow has been cancelled
      return

    etcd_result = changed_key_future.result()
    if etcd_result.action == ETCD_EXPIRE_RESULT:
      # Handle the expiration by tearing down the build node recorded in the
      # expired key's previous value
      logger.debug('Builder expired, clean up the old build node')
      async(self._clean_up_old_builder(etcd_result.key, json.loads(etcd_result._prev_node.value)))

  def initialize(self, manager_config):
    logger.debug('Calling initialize')
    self._manager_config = manager_config

    executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor)
    self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}),
                                    self.public_ip_address)

    etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
    etcd_port = self._manager_config.get('ETCD_PORT', 2379)
    logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)

    worker_threads = self._manager_config.get('ETCD_WORKER_THREADS', 5)
    self._async_thread_executor = ThreadPoolExecutor(worker_threads)
    self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port),
                                     executor=self._async_thread_executor)

    self._watch_builders()

  def setup_time(self):
    setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
    logger.debug('Returning setup_time: %s', setup_time)
    return setup_time

  def shutdown(self):
    logger.debug('Shutting down worker.')
    self._shutting_down = True

    if self._worker_watch_task is not None:
      logger.debug('Canceling watch task.')
      self._worker_watch_task.cancel()
      self._worker_watch_task = None

    if self._async_thread_executor is not None:
      logger.debug('Shutting down thread pool executor.')
      self._async_thread_executor.shutdown()

  @coroutine
  def schedule(self, build_job, loop):
    logger.debug('Calling schedule with job: %s', build_job.job_details['build_uuid'])

    # Check if there are worker slots available by checking the number of jobs in etcd
    allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
    try:
      building = yield From(self._etcd_client.read(ETCD_BUILDER_PREFIX, recursive=True))
      workers_alive = sum(1 for child in building.children if not child.dir)
    except KeyError:
      workers_alive = 0

    logger.debug('Total jobs: %s', workers_alive)

    if workers_alive >= allowed_worker_count:
      logger.info('Too many workers alive, unable to start new worker. %s >= %s', workers_alive,
                  allowed_worker_count)
      raise Return(False)

    job_key = self._etcd_job_key(build_job)

    # First try to take a lock for this job, meaning we will be responsible for its lifeline
    realm = str(uuid.uuid4())
    token = str(uuid.uuid4())
    ttl = self.setup_time()
    expiration = datetime.utcnow() + timedelta(seconds=ttl)

    payload = {
      'expiration': calendar.timegm(expiration.timetuple()),
    }

    try:
      yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=False, ttl=ttl))
      component = self.register_component(realm, BuildComponent, token=token)
      self._component_to_job[component] = build_job
    except KeyError:
      # The job was already taken by someone else, we are probably a retry
      logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?')
      raise Return(False)

    logger.debug('Starting builder with executor: %s', self._executor)
    builder_id = yield From(self._executor.start_builder(realm, token))
    self._component_to_builder[component] = builder_id

    # Store the builder in etcd associated with the job id; rewrite with the
    # same TTL so the setup deadline still applies
    payload['builder_id'] = builder_id
    yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl))

    raise Return(True)
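
  # Lifecycle of the job key written above: schedule() creates it with a TTL of
  # setup_time(), job_heartbeat() below keeps refreshing that TTL, and if the
  # heartbeats stop the key expires in etcd, which fires _handle_key_expiration
  # and ultimately _clean_up_old_builder for the orphaned build node.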

  def build_component_ready(self, build_component, loop):
    try:
      job = self._component_to_job.pop(build_component)
      logger.debug('Sending build %s to newly ready component on realm %s',
                   job.job_details['build_uuid'], build_component.builder_realm)
      loop.call_soon(build_component.start_build, job)
    except KeyError:
      logger.warning('Builder is asking for more work, but work already completed')

  def build_component_disposed(self, build_component, timed_out):
    logger.debug('Calling build_component_disposed.')

    # TODO make it so that I don't have to unregister the component if it timed out
    self.unregister_component(build_component)

  @coroutine
  def job_completed(self, build_job, job_status, build_component):
    logger.debug('Calling job_completed with status: %s', job_status)

    # Kill the ephemeral builder
    yield From(self._executor.stop_builder(self._component_to_builder.pop(build_component)))

    # Release the lock in etcd
    job_key = self._etcd_job_key(build_job)
    yield From(self._etcd_client.delete(job_key))

    self.job_complete_callback(build_job, job_status)

  @coroutine
  def job_heartbeat(self, build_job):
    # Extend the deadline in etcd
    job_key = self._etcd_job_key(build_job)
    build_job_response = yield From(self._etcd_client.read(job_key))

    ttl = self.heartbeat_period_sec * 2
    new_expiration = datetime.utcnow() + timedelta(seconds=ttl)

    payload = {
      'expiration': calendar.timegm(new_expiration.timetuple()),
      'builder_id': json.loads(build_job_response.value)['builder_id'],
    }

    yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=ttl))

    self.job_heartbeat_callback(build_job)

  @coroutine
  def _clean_up_old_builder(self, job_key, job_payload):
    """ Terminate an old builder once its expiration date has passed. """
    logger.debug('Cleaning up the old builder for job: %s', job_key)
    if 'builder_id' in job_payload:
      logger.info('Terminating expired build node.')
      yield From(self._executor.stop_builder(job_payload['builder_id']))

    yield From(self._etcd_client.delete(job_key))

  @staticmethod
  def _etcd_job_key(build_job):
    """ Create a key which is used to track a job in etcd. """
    return os.path.join(ETCD_BUILDER_PREFIX, build_job.job_details['build_uuid'])

  def num_workers(self):
    """ Return the number of workers we're managing locally. """
    return len(self._component_to_builder)
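

# --- Example (not part of the manager) --------------------------------------
# A minimal sketch of the atomic-create locking pattern that schedule() relies
# on, shown against a plain synchronous python-etcd client. It assumes, as the
# except clauses above do, that a write with prevExist=False raises KeyError
# when the key already exists. The function name and the `client` argument are
# illustrative only, e.g.:
#
#   client = etcd.Client(host='127.0.0.1', port=2379)
#   acquired = example_acquire_job_lock(client, 'some-build-uuid')
def example_acquire_job_lock(client, build_uuid, ttl=300):
  """ Try to take the per-job lock; returns True if the caller now owns the job. """
  job_key = os.path.join(ETCD_BUILDER_PREFIX, build_uuid)
  expiration = datetime.utcnow() + timedelta(seconds=ttl)
  payload = {
    'expiration': calendar.timegm(expiration.timetuple()),
  }

  try:
    # prevExist=False makes this an atomic create: exactly one caller wins, and
    # the TTL guarantees the lock disappears if the winner dies silently.
    client.write(job_key, json.dumps(payload), prevExist=False, ttl=ttl)
    return True
  except KeyError:
    # Another manager already holds the lock for this build.
    return False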