Kubernetes build worker
This commit is contained in:
parent
3044f8ecbd
commit
bc13333f20
7 changed files with 255 additions and 34 deletions
|
@ -12,7 +12,7 @@ from urllib3.exceptions import ReadTimeoutError, ProtocolError
|
|||
|
||||
from app import metric_queue
|
||||
from buildman.manager.basemanager import BaseManager
|
||||
from buildman.manager.executor import PopenExecutor, EC2Executor
|
||||
from buildman.manager.executor import PopenExecutor, EC2Executor, KubernetesExecutor
|
||||
from buildman.component.buildcomponent import BuildComponent
|
||||
from buildman.jobutil.buildjob import BuildJob
|
||||
from buildman.asyncutil import AsyncWrapper
|
||||
|
@ -24,9 +24,14 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
ETCD_MAX_WATCH_TIMEOUT = 30
|
||||
EC2_API_TIMEOUT = 20
|
||||
RETRY_IMMEDIATELY_TIMEOUT = 0
|
||||
DEFAULT_EPHEMERAL_API_TIMEOUT = 20
|
||||
|
||||
EXECUTORS = {
|
||||
'popen': PopenExecutor,
|
||||
'ec2': EC2Executor,
|
||||
'kubernetes': KubernetesExecutor,
|
||||
}
|
||||
|
||||
class EtcdAction(object):
|
||||
GET = 'get'
|
||||
|
@ -41,10 +46,6 @@ class EtcdAction(object):
|
|||
|
||||
class EphemeralBuilderManager(BaseManager):
|
||||
""" Build manager implementation for the Enterprise Registry. """
|
||||
_executors = {
|
||||
'popen': PopenExecutor,
|
||||
'ec2': EC2Executor,
|
||||
}
|
||||
|
||||
_etcd_client_klass = etcd.Client
|
||||
|
||||
|
@ -61,8 +62,9 @@ class EphemeralBuilderManager(BaseManager):
|
|||
self._component_to_job = {}
|
||||
self._job_uuid_to_component = {}
|
||||
self._component_to_builder = {}
|
||||
self._job_to_executor = {}
|
||||
|
||||
self._executor = None
|
||||
self._executors = []
|
||||
|
||||
# Map of etcd keys being watched to the tasks watching them
|
||||
self._watch_tasks = {}
|
||||
|
@ -159,8 +161,7 @@ class EphemeralBuilderManager(BaseManager):
|
|||
self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE)
|
||||
|
||||
logger.info('Terminating expired build node: %s', builder_id)
|
||||
yield From(self._executor.stop_builder(builder_id))
|
||||
|
||||
yield From(self._job_to_executor[builder_id].stop_builder(builder_id))
|
||||
|
||||
def _handle_realm_change(self, etcd_result):
|
||||
if etcd_result is None:
|
||||
|
@ -228,9 +229,12 @@ class EphemeralBuilderManager(BaseManager):
|
|||
logger.debug('Calling initialize')
|
||||
self._manager_config = manager_config
|
||||
|
||||
executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor)
|
||||
self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}),
|
||||
self.manager_hostname)
|
||||
# TODO(jschorr): We need to make this backwards compatible with existing config, and add test(s) for it
|
||||
for config in manager_config.get('EXECUTORS', []):
|
||||
executor_klass = EXECUTORS.get(config['EXECUTOR'])
|
||||
executor_config = config.get('CONFIG', {})
|
||||
executor_config.update(manager_config.get('EXECUTOR_CONFIG', {}))
|
||||
self._executors.append(executor_klass(executor_config, self.manager_hostname))
|
||||
|
||||
etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
|
||||
etcd_port = self._manager_config.get('ETCD_PORT', 2379)
|
||||
|
@ -259,6 +263,7 @@ class EphemeralBuilderManager(BaseManager):
|
|||
restarter=self._register_existing_realms)
|
||||
|
||||
self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'locks/')
|
||||
self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT', DEFAULT_EPHEMERAL_API_TIMEOUT)
|
||||
|
||||
# Load components for all realms currently known to the cluster
|
||||
async(self._register_existing_realms())
|
||||
|
@ -326,25 +331,35 @@ class EphemeralBuilderManager(BaseManager):
|
|||
|
||||
try:
|
||||
yield From(self._etcd_client.write(job_key, lock_payload, prevExist=False,
|
||||
ttl=EC2_API_TIMEOUT))
|
||||
ttl=self._ephemeral_api_timeout))
|
||||
except (KeyError, etcd.EtcdKeyError):
|
||||
# The job was already taken by someone else, we are probably a retry
|
||||
logger.error('Job: %s already exists in etcd, timeout may be misconfigured', build_uuid)
|
||||
raise Return(False, EC2_API_TIMEOUT)
|
||||
raise Return(False, self._ephemeral_api_timeout)
|
||||
except etcd.EtcdException:
|
||||
logger.exception('Exception when writing job %s to etcd', build_uuid)
|
||||
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
|
||||
|
||||
executor_type = self._executor.__class__.__name__
|
||||
logger.debug('Starting builder for job: %s with executor: %s', build_uuid, executor_type)
|
||||
started = False
|
||||
logger.debug("executors are: %s", self._executors)
|
||||
for executor in self._executors:
|
||||
# TODO(jschorr): gate on whitelist logic
|
||||
executor_type = executor.__class__.__name__
|
||||
logger.debug('Starting builder for job: %s with executor: %s', build_uuid, executor_type)
|
||||
|
||||
try:
|
||||
builder_id = yield From(self._executor.start_builder(realm, token, build_uuid))
|
||||
metric_queue.put_deprecated('EC2BuilderStarted', 1, unit='Count')
|
||||
metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid])
|
||||
except:
|
||||
logger.exception('Exception when starting builder for job: %s', build_uuid)
|
||||
raise Return(False, EC2_API_TIMEOUT)
|
||||
try:
|
||||
builder_id = yield From(executor.start_builder(realm, token, build_uuid))
|
||||
metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count')
|
||||
metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid])
|
||||
started = True
|
||||
break
|
||||
except:
|
||||
logger.exception('Exception when starting builder for job: %s', build_uuid)
|
||||
continue
|
||||
|
||||
if not started:
|
||||
logger.error('Could not start any ephemeral workers.')
|
||||
raise Return(False, self._ephemeral_api_timeout)
|
||||
|
||||
# Store the builder in etcd associated with the job id
|
||||
try:
|
||||
|
@ -353,7 +368,7 @@ class EphemeralBuilderManager(BaseManager):
|
|||
ttl=setup_time))
|
||||
except etcd.EtcdException:
|
||||
logger.exception('Exception when writing job %s to etcd', build_uuid)
|
||||
raise Return(False, EC2_API_TIMEOUT)
|
||||
raise Return(False, self._ephemeral_api_timeout)
|
||||
|
||||
# Store the realm spec which will allow any manager to accept this builder when it connects
|
||||
realm_spec = json.dumps({
|
||||
|
@ -373,6 +388,7 @@ class EphemeralBuilderManager(BaseManager):
|
|||
except etcd.EtcdException:
|
||||
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
|
||||
raise Return(False, setup_time)
|
||||
self._job_to_executor[builder_id] = executor
|
||||
|
||||
raise Return(True, None)
|
||||
|
||||
|
@ -399,7 +415,9 @@ class EphemeralBuilderManager(BaseManager):
|
|||
logger.debug('Calling job_completed with status: %s', job_status)
|
||||
|
||||
# Kill the ephemeral builder
|
||||
yield From(self._executor.stop_builder(self._component_to_builder.pop(build_component)))
|
||||
builder_id = self._component_to_builder.pop(build_component)
|
||||
yield From(self._job_to_executor[builder_id].stop_builder(builder_id))
|
||||
del self._job_to_executor[builder_id]
|
||||
|
||||
# Release the lock in etcd
|
||||
job_key = self._etcd_job_key(build_job)
|
||||
|
|
Reference in a new issue