Kubernetes build worker

Colin Hom 2015-11-20 15:32:32 -05:00 committed by Joseph Schorr
parent 3044f8ecbd
commit bc13333f20
7 changed files with 255 additions and 34 deletions


@@ -12,7 +12,7 @@ from urllib3.exceptions import ReadTimeoutError, ProtocolError
 from app import metric_queue
 from buildman.manager.basemanager import BaseManager
-from buildman.manager.executor import PopenExecutor, EC2Executor
+from buildman.manager.executor import PopenExecutor, EC2Executor, KubernetesExecutor
 from buildman.component.buildcomponent import BuildComponent
 from buildman.jobutil.buildjob import BuildJob
 from buildman.asyncutil import AsyncWrapper
@@ -24,9 +24,14 @@ logger = logging.getLogger(__name__)
 ETCD_MAX_WATCH_TIMEOUT = 30
 EC2_API_TIMEOUT = 20
 RETRY_IMMEDIATELY_TIMEOUT = 0
+DEFAULT_EPHEMERAL_API_TIMEOUT = 20
+
+EXECUTORS = {
+  'popen': PopenExecutor,
+  'ec2': EC2Executor,
+  'kubernetes': KubernetesExecutor,
+}

 class EtcdAction(object):
   GET = 'get'
@@ -41,10 +46,6 @@ class EtcdAction(object)
 class EphemeralBuilderManager(BaseManager):
   """ Build manager implementation for the Enterprise Registry. """

-  _executors = {
-    'popen': PopenExecutor,
-    'ec2': EC2Executor,
-  }

   _etcd_client_klass = etcd.Client
@@ -61,8 +62,9 @@ class EphemeralBuilderManager(BaseManager):
     self._component_to_job = {}
     self._job_uuid_to_component = {}
     self._component_to_builder = {}
+    self._job_to_executor = {}

-    self._executor = None
+    self._executors = []

     # Map of etcd keys being watched to the tasks watching them
     self._watch_tasks = {}
@@ -159,8 +161,7 @@
       self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE)

       logger.info('Terminating expired build node: %s', builder_id)
-      yield From(self._executor.stop_builder(builder_id))
+      yield From(self._job_to_executor[builder_id].stop_builder(builder_id))

   def _handle_realm_change(self, etcd_result):
     if etcd_result is None:
@@ -228,9 +229,12 @@
     logger.debug('Calling initialize')
     self._manager_config = manager_config

-    executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor)
-    self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}),
-                                    self.manager_hostname)
+    # TODO(jschorr): We need to make this backwards compatible with existing config, as well as test(s)
+    for config in manager_config.get('EXECUTORS', []):
+      executor_klass = EXECUTORS.get(config['EXECUTOR'])
+      executor_config = config.get('CONFIG', {})
+      executor_config.update(manager_config.get('EXECUTOR_CONFIG', {}))
+      self._executors.append(executor_klass(executor_config, self.manager_hostname))

     etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
     etcd_port = self._manager_config.get('ETCD_PORT', 2379)
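
For reference, a manager config in the new shape might look like the sketch below. The EXECUTORS, EXECUTOR, CONFIG, and EXECUTOR_CONFIG keys come from the loop above, and the EXECUTOR values must match keys of the module-level EXECUTORS registry ('popen', 'ec2', 'kubernetes'); the concrete entries shown are hypothetical. Note that the legacy top-level EXECUTOR_CONFIG is merged over (and so overrides) each entry's CONFIG, and executors are tried in declaration order when a job is scheduled.

# Hypothetical config sketch for the new multi-executor support.
manager_config = {
  'EXECUTORS': [
    {'EXECUTOR': 'kubernetes', 'CONFIG': {}},  # per-executor settings (contents hypothetical)
    {'EXECUTOR': 'ec2', 'CONFIG': {}},         # tried second if kubernetes fails to start
  ],
  'EXECUTOR_CONFIG': {},  # legacy shared settings, merged over each CONFIG
  'ETCD_HOST': '127.0.0.1',
  'ETCD_PORT': 2379,
}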
@@ -259,6 +263,7 @@
                                 restarter=self._register_existing_realms)

     self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'locks/')
+    self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT', DEFAULT_EPHEMERAL_API_TIMEOUT)

     # Load components for all realms currently known to the cluster
     async(self._register_existing_realms())
@@ -326,25 +331,35 @@
     try:
       yield From(self._etcd_client.write(job_key, lock_payload, prevExist=False,
-                                         ttl=EC2_API_TIMEOUT))
+                                         ttl=self._ephemeral_api_timeout))
     except (KeyError, etcd.EtcdKeyError):
       # The job was already taken by someone else, we are probably a retry
       logger.error('Job: %s already exists in etcd, timeout may be misconfigured', build_uuid)
-      raise Return(False, EC2_API_TIMEOUT)
+      raise Return(False, self._ephemeral_api_timeout)
     except etcd.EtcdException:
       logger.exception('Exception when writing job %s to etcd', build_uuid)
       raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)

-    executor_type = self._executor.__class__.__name__
-    logger.debug('Starting builder for job: %s with executor: %s', build_uuid, executor_type)
+    started = False
+    logger.debug("executors are: %s", self._executors)
+    for executor in self._executors:
+      # TODO(jschorr): gate on whitelist logic
+      executor_type = executor.__class__.__name__
+      logger.debug('Starting builder for job: %s with executor: %s', build_uuid, executor_type)

-    try:
-      builder_id = yield From(self._executor.start_builder(realm, token, build_uuid))
-      metric_queue.put_deprecated('EC2BuilderStarted', 1, unit='Count')
-      metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid])
-    except:
-      logger.exception('Exception when starting builder for job: %s', build_uuid)
-      raise Return(False, EC2_API_TIMEOUT)
+      try:
+        builder_id = yield From(executor.start_builder(realm, token, build_uuid))
+        metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count')
+        metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid])
+        started = True
+        break
+      except:
+        logger.exception('Exception when starting builder for job: %s', build_uuid)
+        continue
+
+    if not started:
+      logger.error('Could not start any ephemeral workers.')
+      raise Return(False, self._ephemeral_api_timeout)

     # Store the builder in etcd associated with the job id
     try:
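
Stripped of the etcd locking and coroutine plumbing, the scheduling change above boils down to first-success failover across the configured executors. A minimal sketch:

# Minimal sketch of the failover loop (executor objects assumed to expose
# start_builder as in the diff above; error handling simplified).
def start_with_fallback(executors, realm, token, build_uuid):
  for executor in executors:
    try:
      return executor.start_builder(realm, token, build_uuid)  # builder id on success
    except Exception:
      continue  # this executor failed; fall through to the next one
  raise RuntimeError('Could not start any ephemeral workers.')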
@@ -353,7 +368,7 @@
                                          ttl=setup_time))
     except etcd.EtcdException:
       logger.exception('Exception when writing job %s to etcd', build_uuid)
-      raise Return(False, EC2_API_TIMEOUT)
+      raise Return(False, self._ephemeral_api_timeout)

     # Store the realm spec which will allow any manager to accept this builder when it connects
     realm_spec = json.dumps({
@@ -373,6 +388,7 @@
     except etcd.EtcdException:
       logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
       raise Return(False, setup_time)

+    self._job_to_executor[builder_id] = executor

     raise Return(True, None)
@@ -399,7 +415,9 @@
     logger.debug('Calling job_completed with status: %s', job_status)

     # Kill the ephemeral builder
-    yield From(self._executor.stop_builder(self._component_to_builder.pop(build_component)))
+    builder_id = self._component_to_builder.pop(build_component)
+    yield From(self._job_to_executor[builder_id].stop_builder(builder_id))
+    del self._job_to_executor[builder_id]

     # Release the lock in etcd
     job_key = self._etcd_job_key(build_job)
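
Taken together, the hunks above introduce a builder-to-executor mapping (named _job_to_executor, though keyed by builder id) so that expiration and completion can stop each builder through whichever executor started it. A minimal sketch of that bookkeeping, with the surrounding manager reduced to a stub:

# Sketch of the _job_to_executor bookkeeping introduced in this commit.
class ManagerSketch(object):
  def __init__(self):
    self._job_to_executor = {}  # builder_id -> executor that started it

  def builder_started(self, builder_id, executor):
    self._job_to_executor[builder_id] = executor

  def builder_finished(self, builder_id):
    # On completion: stop via the owning executor, then drop the entry.
    executor = self._job_to_executor.pop(builder_id)
    executor.stop_builder(builder_id)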