initial import for Open Source 🎉
parent 1898c361f3
commit 9c0dd3b722
2048 changed files with 218743 additions and 0 deletions
0
buildman/manager/__init__.py
Normal file
71
buildman/manager/basemanager.py
Normal file
@@ -0,0 +1,71 @@
from trollius import coroutine


class BaseManager(object):
  """ Base for all worker managers. """

  def __init__(self, register_component, unregister_component, job_heartbeat_callback,
               job_complete_callback, manager_hostname, heartbeat_period_sec):
    self.register_component = register_component
    self.unregister_component = unregister_component
    self.job_heartbeat_callback = job_heartbeat_callback
    self.job_complete_callback = job_complete_callback
    self.manager_hostname = manager_hostname
    self.heartbeat_period_sec = heartbeat_period_sec

  @coroutine
  def job_heartbeat(self, build_job):
    """ Method invoked to tell the manager that a job is still running. This method will be called
        every few minutes. """
    self.job_heartbeat_callback(build_job)

  def overall_setup_time(self):
    """ Returns the number of seconds that the build system should wait before allowing the job
        to be picked up again after called 'schedule'.
    """
    raise NotImplementedError

  def shutdown(self):
    """ Indicates that the build controller server is in a shutdown state and that no new jobs
        or workers should be performed. Existing workers should be cleaned up once their jobs
        have completed
    """
    raise NotImplementedError

  @coroutine
  def schedule(self, build_job):
    """ Schedules a queue item to be built. Returns a 2-tuple with (True, None) if the item was
        properly scheduled and (False, a retry timeout in seconds) if all workers are busy or an
        error occurs.
    """
    raise NotImplementedError

  def initialize(self, manager_config):
    """ Runs any initialization code for the manager. Called once the server is in a ready state.
    """
    raise NotImplementedError

  @coroutine
  def build_component_ready(self, build_component):
    """ Method invoked whenever a build component announces itself as ready.
    """
    raise NotImplementedError

  def build_component_disposed(self, build_component, timed_out):
    """ Method invoked whenever a build component has been disposed. The timed_out boolean indicates
        whether the component's heartbeat timed out.
    """
    raise NotImplementedError

  @coroutine
  def job_completed(self, build_job, job_status, build_component):
    """ Method invoked once a job_item has completed, in some manner. The job_status will be
        one of: incomplete, error, complete. Implementations of this method should call coroutine
        self.job_complete_callback with a status of Incomplete if they wish for the job to be
        automatically requeued.
    """
    raise NotImplementedError

  def num_workers(self):
    """ Returns the number of active build workers currently registered. This includes those
        that are currently busy and awaiting more work.
    """
    raise NotImplementedError
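As an illustration of the contract above (not part of this commit): a minimal sketch of a manager implementation, assuming the buildman package and trollius are importable. The class name, the lambda callbacks, and the hostname value are hypothetical stand-ins for what the build controller server normally wires in.

from trollius import From, Return, coroutine, sleep

from buildman.manager.basemanager import BaseManager


class NoopManager(BaseManager):
  """ Hypothetical manager that never accepts work; shows the expected return shapes. """

  def initialize(self, manager_config):
    pass

  def overall_setup_time(self):
    return 60  # seconds to wait before a 'scheduled' job may be picked up again

  @coroutine
  def schedule(self, build_job):
    yield From(sleep(0))     # stand-in for real async work
    raise Return(False, 30)  # contract: (True, None) on success, (False, retry_seconds) otherwise

  def shutdown(self):
    pass

  def num_workers(self):
    return 0


# Hypothetical wiring; in the real server these callbacks come from the build controller.
manager = NoopManager(lambda realm, klass, **kwargs: None,       # register_component
                      lambda component: None,                    # unregister_component
                      lambda build_job: None,                    # job_heartbeat_callback
                      lambda build_job, status, *a, **kw: None,  # job_complete_callback
                      'buildman.example.invalid',                # manager_hostname
                      30)                                        # heartbeat_period_sec
manager.initialize({})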
27
buildman/manager/buildcanceller.py
Normal file
@@ -0,0 +1,27 @@
import logging

from buildman.manager.orchestrator_canceller import OrchestratorCanceller
from buildman.manager.noop_canceller import NoopCanceller

logger = logging.getLogger(__name__)

CANCELLERS = {'ephemeral': OrchestratorCanceller}


class BuildCanceller(object):
  """ A class to manage cancelling a build """

  def __init__(self, app=None):
    self.build_manager_config = app.config.get('BUILD_MANAGER') if app is not None else None
    if app is None or self.build_manager_config is None:
      self.handler = NoopCanceller()
    else:
      self.handler = None

  def try_cancel_build(self, uuid):
    """ A method to kill a running build """
    if self.handler is None:
      canceller = CANCELLERS.get(self.build_manager_config[0], NoopCanceller)
      self.handler = canceller(self.build_manager_config[1])

    return self.handler.try_cancel_build(uuid)
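For illustration (not part of this commit): BuildCanceller reads BUILD_MANAGER from the app config and indexes it as a pair, so the value is assumed to be (kind, config); the handler is resolved lazily on the first cancel attempt. The app stand-in and the config values below are hypothetical.

from buildman.manager.buildcanceller import BuildCanceller


class FakeApp(object):
  # Minimal stand-in for the Flask app; only config.get('BUILD_MANAGER') is used here.
  # A real deployment would carry the orchestrator connection settings (not shown in
  # this excerpt) in the second element of the pair.
  config = {'BUILD_MANAGER': ('ephemeral', {})}


canceller = BuildCanceller(FakeApp())
# The first call resolves CANCELLERS['ephemeral'] (OrchestratorCanceller); unknown kinds
# fall back to NoopCanceller. Returns whether the cancellation message was written.
canceller.try_cancel_build('hypothetical-build-uuid')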
92
buildman/manager/enterprise.py
Normal file
@@ -0,0 +1,92 @@
import logging
import uuid

from buildman.component.basecomponent import BaseComponent
from buildman.component.buildcomponent import BuildComponent
from buildman.manager.basemanager import BaseManager

from trollius import From, Return, coroutine

REGISTRATION_REALM = 'registration'
RETRY_TIMEOUT = 5
logger = logging.getLogger(__name__)


class DynamicRegistrationComponent(BaseComponent):
  """ Component session that handles dynamic registration of the builder components. """

  def onConnect(self):
    self.join(REGISTRATION_REALM)

  def onJoin(self, details):
    logger.debug('Registering registration method')
    yield From(self.register(self._worker_register, u'io.quay.buildworker.register'))

  def _worker_register(self):
    realm = self.parent_manager.add_build_component()
    logger.debug('Registering new build component+worker with realm %s', realm)
    return realm

  def kind(self):
    return 'registration'


class EnterpriseManager(BaseManager):
  """ Build manager implementation for the Enterprise Registry. """

  def __init__(self, *args, **kwargs):
    self.ready_components = set()
    self.all_components = set()
    self.shutting_down = False

    super(EnterpriseManager, self).__init__(*args, **kwargs)

  def initialize(self, manager_config):
    # Add a component which is used by build workers for dynamic registration. Unlike
    # production, build workers in enterprise are long-lived and register dynamically.
    self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)

  def overall_setup_time(self):
    # Builders are already registered, so the setup time should be essentially instant. We therefore
    # only return a minute here.
    return 60

  def add_build_component(self):
    """ Adds a new build component for an Enterprise Registry. """
    # Generate a new unique realm ID for the build worker.
    realm = str(uuid.uuid4())
    new_component = self.register_component(realm, BuildComponent, token="")
    self.all_components.add(new_component)
    return realm

  @coroutine
  def schedule(self, build_job):
    """ Schedules a build for an Enterprise Registry. """
    if self.shutting_down or not self.ready_components:
      raise Return(False, RETRY_TIMEOUT)

    component = self.ready_components.pop()

    yield From(component.start_build(build_job))

    raise Return(True, None)

  @coroutine
  def build_component_ready(self, build_component):
    self.ready_components.add(build_component)

  def shutdown(self):
    self.shutting_down = True

  @coroutine
  def job_completed(self, build_job, job_status, build_component):
    yield From(self.job_complete_callback(build_job, job_status))

  def build_component_disposed(self, build_component, timed_out):
    self.all_components.remove(build_component)
    if build_component in self.ready_components:
      self.ready_components.remove(build_component)

    self.unregister_component(build_component)

  def num_workers(self):
    return len(self.all_components)
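The server that drives these managers is not part of this excerpt; as a sketch of how a caller might consume the 2-tuple that schedule() returns, with a hypothetical driver function and requeue callback:

from trollius import From, Return, coroutine


@coroutine
def try_schedule(manager, build_job, requeue):
  """ Hypothetical driver: ask the manager to schedule a job, requeueing it when busy. """
  scheduled, retry_timeout = yield From(manager.schedule(build_job))
  if not scheduled:
    # EnterpriseManager reports (False, RETRY_TIMEOUT) while shutting down or when no
    # registered component is ready; the job goes back on the queue for that many seconds.
    requeue(build_job, retry_timeout)
  raise Return(scheduled)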
710
buildman/manager/ephemeral.py
Normal file
@@ -0,0 +1,710 @@
import logging
import uuid
import calendar
import json
import time

from collections import namedtuple
from datetime import datetime, timedelta
from six import iteritems

from trollius import From, coroutine, Return, async, sleep

from app import metric_queue
from buildman.orchestrator import (orchestrator_from_config, KeyEvent,
                                   OrchestratorError, OrchestratorConnectionError,
                                   ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor, KubernetesExecutor
from buildman.component.buildcomponent import BuildComponent
from buildman.jobutil.buildjob import BuildJob
from buildman.server import BuildJobResult
from util import slash_join
from util.morecollections import AttrDict


logger = logging.getLogger(__name__)


JOB_PREFIX = 'building/'
LOCK_PREFIX = 'lock/'
REALM_PREFIX = 'realm/'
CANCEL_PREFIX = 'cancel/'
METRIC_PREFIX = 'metric/'

CANCELED_LOCK_PREFIX = slash_join(LOCK_PREFIX, 'job-cancelled')
EXPIRED_LOCK_PREFIX = slash_join(LOCK_PREFIX, 'job-expired')

EPHEMERAL_API_TIMEOUT = 20
EPHEMERAL_SETUP_TIMEOUT = 500

RETRY_IMMEDIATELY_SLEEP_DURATION = 0
TOO_MANY_WORKERS_SLEEP_DURATION = 10


BuildInfo = namedtuple('BuildInfo', ['component', 'build_job', 'execution_id', 'executor_name'])


class EphemeralBuilderManager(BaseManager):
  """ Build manager implementation for the Enterprise Registry. """

  EXECUTORS = {
    'popen': PopenExecutor,
    'ec2': EC2Executor,
    'kubernetes': KubernetesExecutor,
  }

  def __init__(self, *args, **kwargs):
    super(EphemeralBuilderManager, self).__init__(*args, **kwargs)

    self._shutting_down = False

    self._manager_config = None
    self._orchestrator = None

    # The registered executors available for running jobs, in order.
    self._ordered_executors = []

    # The registered executors, mapped by their unique name.
    self._executor_name_to_executor = {}

    # Map from builder component to its associated job.
    self._component_to_job = {}

    # Map from build UUID to a BuildInfo tuple with information about the build.
    self._build_uuid_to_info = {}

  def overall_setup_time(self):
    return EPHEMERAL_SETUP_TIMEOUT

  @coroutine
  def _mark_job_incomplete(self, build_job, build_info):
    """ Marks a job as incomplete, in response to a failure to start or a timeout. """
    executor_name = build_info.executor_name
    execution_id = build_info.execution_id

    logger.warning('Build executor failed to successfully boot with execution id %s',
                   execution_id)

    # Take a lock to ensure that only one manager reports the build as incomplete for this
    # execution.
    lock_key = slash_join(self._expired_lock_prefix, build_job.build_uuid, execution_id)
    acquired_lock = yield From(self._orchestrator.lock(lock_key))
    if acquired_lock:
      try:
        # Clean up the bookkeeping for the job.
        yield From(self._orchestrator.delete_key(self._job_key(build_job)))
      except KeyError:
        logger.debug('Could not delete job key %s; might have been removed already',
                     build_job.build_uuid)

      logger.error('[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s',
                   build_job.build_uuid, executor_name, execution_id)
      yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
                                            update_phase=True))
    else:
      logger.debug('Did not get lock for job-expiration for job %s', build_job.build_uuid)

  @coroutine
  def _job_callback(self, key_change):
    """
    This is the callback invoked when keys related to jobs are changed.
    It ignores all events related to the creation of new jobs.
    Deletes or expirations cause checks to ensure they've been properly marked as completed.

    :param key_change: the event and value produced by a key changing in the orchestrator
    :type key_change: :class:`KeyChange`
    """
    if key_change.event in (KeyEvent.CREATE, KeyEvent.SET):
      raise Return()

    elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
      # Handle the expiration/deletion.
      job_metadata = json.loads(key_change.value)
      build_job = BuildJob(AttrDict(job_metadata['job_queue_item']))
      logger.debug('Got "%s" of job %s', key_change.event, build_job.build_uuid)

      # Get the build info.
      build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
      if build_info is None:
        logger.debug('No build info for "%s" job %s (%s); probably already deleted by this manager',
                     key_change.event, build_job.build_uuid, job_metadata)
        raise Return()

      if key_change.event != KeyEvent.EXPIRE:
        # If the etcd action was not an expiration, then it was already deleted by some manager and
        # the execution was therefore already shutdown. All that's left is to remove the build info.
        self._build_uuid_to_info.pop(build_job.build_uuid, None)
        raise Return()

      logger.debug('got expiration for job %s with metadata: %s', build_job.build_uuid,
                   job_metadata)

      if not job_metadata.get('had_heartbeat', False):
        # If we have not yet received a heartbeat, then the node failed to boot in some way.
        # We mark the job as incomplete here.
        yield From(self._mark_job_incomplete(build_job, build_info))

      # Finally, we terminate the build execution for the job. We don't do this under a lock as
      # terminating a node is an atomic operation; better to make sure it is terminated than not.
      logger.info('Terminating expired build executor for job %s with execution id %s',
                  build_job.build_uuid, build_info.execution_id)
      yield From(self.kill_builder_executor(build_job.build_uuid))
    else:
      logger.warning('Unexpected KeyEvent (%s) on job key: %s', key_change.event, key_change.key)


  @coroutine
  def _realm_callback(self, key_change):
    logger.debug('realm callback for key: %s', key_change.key)
    if key_change.event == KeyEvent.CREATE:
      # Listen on the realm created by ourselves or another worker.
      realm_spec = json.loads(key_change.value)
      self._register_realm(realm_spec)

    elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
      # Stop listening for new connections on the realm, if we did not get the connection.
      realm_spec = json.loads(key_change.value)
      realm_id = realm_spec['realm']

      build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
      build_uuid = build_job.build_uuid

      logger.debug('Realm key %s for build %s was %s', realm_id, build_uuid, key_change.event)
      build_info = self._build_uuid_to_info.get(build_uuid, None)
      if build_info is not None:
        # Pop off the component and if we find one, then the build has not connected to this
        # manager, so we can safely unregister its component.
        component = self._component_to_job.pop(build_info.component, None)
        if component is not None:
          # We were not the manager which the worker connected to, remove the bookkeeping for it
          logger.debug('Unregistering unused component for build %s', build_uuid)
          self.unregister_component(build_info.component)

      # If the realm has expired, then perform cleanup of the executor.
      if key_change.event == KeyEvent.EXPIRE:
        execution_id = realm_spec.get('execution_id', None)
        executor_name = realm_spec.get('executor_name', 'EC2Executor')

        # Cleanup the job, since it never started.
        logger.debug('Job %s for incomplete marking: %s', build_uuid, build_info)
        if build_info is not None:
          yield From(self._mark_job_incomplete(build_job, build_info))

        # Cleanup the executor.
        logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s',
                    realm_id, build_uuid, executor_name, execution_id)
        yield From(self.terminate_executor(executor_name, execution_id))

    else:
      logger.warning('Unexpected action (%s) on realm key: %s', key_change.event, key_change.key)


  def _register_realm(self, realm_spec):
    logger.debug('Got call to register realm %s with manager', realm_spec['realm'])

    # Create the build information block for the registered realm.
    build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
    execution_id = realm_spec.get('execution_id', None)
    executor_name = realm_spec.get('executor_name', 'EC2Executor')

    logger.debug('Registering realm %s with manager: %s', realm_spec['realm'], realm_spec)
    component = self.register_component(realm_spec['realm'], BuildComponent,
                                        token=realm_spec['token'])

    build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id,
                           executor_name=executor_name)

    self._component_to_job[component] = build_job
    self._build_uuid_to_info[build_job.build_uuid] = build_info

    logger.debug('Registered realm %s with manager', realm_spec['realm'])
    return component

  @property
  def registered_executors(self):
    return self._ordered_executors

  @coroutine
  def _register_existing_realms(self):
    try:
      all_realms = yield From(self._orchestrator.get_prefixed_keys(self._realm_prefix))

      # Register all existing realms found.
      encountered = {self._register_realm(json.loads(realm_data))
                     for _realm, realm_data in all_realms}

      # Remove any components not encountered so we can clean up.
      for component, job in list(iteritems(self._component_to_job)):
        if component not in encountered:
          self._component_to_job.pop(component, None)
          self._build_uuid_to_info.pop(job.build_uuid, None)

    except KeyError:
      pass

  def _load_executor(self, executor_kind_name, executor_config):
    executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name)
    if executor_klass is None:
      logger.error('Unknown executor %s; skipping install', executor_kind_name)
      return

    executor = executor_klass(executor_config, self.manager_hostname)
    if executor.name in self._executor_name_to_executor:
      raise Exception('Executor with name %s already registered' % executor.name)

    self._ordered_executors.append(executor)
    self._executor_name_to_executor[executor.name] = executor

  def _config_prefix(self, key):
    if self._manager_config.get('ORCHESTRATOR') is None:
      return key

    prefix = self._manager_config.get('ORCHESTRATOR_PREFIX', '')
    return slash_join(prefix, key).lstrip('/') + '/'

  @property
  def _job_prefix(self):
    return self._config_prefix(JOB_PREFIX)

  @property
  def _realm_prefix(self):
    return self._config_prefix(REALM_PREFIX)

  @property
  def _cancel_prefix(self):
    return self._config_prefix(CANCEL_PREFIX)

  @property
  def _metric_prefix(self):
    return self._config_prefix(METRIC_PREFIX)

  @property
  def _expired_lock_prefix(self):
    return self._config_prefix(EXPIRED_LOCK_PREFIX)

  @property
  def _canceled_lock_prefix(self):
    return self._config_prefix(CANCELED_LOCK_PREFIX)

  def _metric_key(self, realm):
    """
    Create a key which is used to track a job in the Orchestrator.

    :param realm: realm for the build
    :type realm: str
    :returns: key used to track jobs
    :rtype: str
    """
    return slash_join(self._metric_prefix, realm)

  def _job_key(self, build_job):
    """
    Creates a key which is used to track a job in the Orchestrator.

    :param build_job: unique job identifier for a build
    :type build_job: str
    :returns: key used to track the job
    :rtype: str
    """
    return slash_join(self._job_prefix, build_job.job_details['build_uuid'])

  def _realm_key(self, realm):
    """
    Create a key which is used to track an incoming connection on a realm.

    :param realm: realm for the build
    :type realm: str
    :returns: key used to track the connection to the realm
    :rtype: str
    """
    return slash_join(self._realm_prefix, realm)


  def initialize(self, manager_config):
    logger.debug('Calling initialize')
    self._manager_config = manager_config

    # Note: Executor config can be defined either as a single block of EXECUTOR_CONFIG (old style)
    # or as a new set of executor configurations, with the order determining how we fallback. We
    # check for both here to ensure backwards compatibility.
    if manager_config.get('EXECUTORS'):
      for executor_config in manager_config['EXECUTORS']:
        self._load_executor(executor_config.get('EXECUTOR'), executor_config)
    else:
      self._load_executor(manager_config.get('EXECUTOR'), manager_config.get('EXECUTOR_CONFIG'))

    logger.debug('calling orchestrator_from_config')
    self._orchestrator = orchestrator_from_config(manager_config)

    logger.debug('setting on_key_change callbacks for job, cancel, realm')
    self._orchestrator.on_key_change(self._job_prefix, self._job_callback)
    self._orchestrator.on_key_change(self._cancel_prefix, self._cancel_callback)
    self._orchestrator.on_key_change(self._realm_prefix, self._realm_callback,
                                     restarter=self._register_existing_realms)

    # Load components for all realms currently known to the cluster
    async(self._register_existing_realms())

  def shutdown(self):
    logger.debug('Shutting down worker.')
    if self._orchestrator is not None:
      self._orchestrator.shutdown()

  @coroutine
  def schedule(self, build_job):
    build_uuid = build_job.job_details['build_uuid']
    logger.debug('Calling schedule with job: %s', build_uuid)

    # Check if there are worker slots available by checking the number of jobs in the orchestrator
    allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
    try:
      active_jobs = yield From(self._orchestrator.get_prefixed_keys(self._job_prefix))
      workers_alive = len(active_jobs)
    except KeyError:
      workers_alive = 0
    except OrchestratorConnectionError:
      logger.exception('Could not read job count from orchestrator for job due to orchestrator being down')
      raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
    except OrchestratorError:
      logger.exception('Exception when reading job count from orchestrator for job: %s', build_uuid)
      raise Return(False, RETRY_IMMEDIATELY_SLEEP_DURATION)

    logger.debug('Total jobs (scheduling job %s): %s', build_uuid, workers_alive)

    if workers_alive >= allowed_worker_count:
      logger.info('Too many workers alive, unable to start new worker for build job: %s. %s >= %s',
                  build_uuid, workers_alive, allowed_worker_count)
      raise Return(False, TOO_MANY_WORKERS_SLEEP_DURATION)

    job_key = self._job_key(build_job)

    # First try to take a lock for this job, meaning we will be responsible for its lifeline
    realm = str(uuid.uuid4())
    token = str(uuid.uuid4())
    nonce = str(uuid.uuid4())

    machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
    max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)

    payload = {
      'max_expiration': calendar.timegm(max_expiration.timetuple()),
      'nonce': nonce,
      'had_heartbeat': False,
      'job_queue_item': build_job.job_item,
    }

    lock_payload = json.dumps(payload)
    logger.debug('Writing key for job %s with expiration in %s seconds', build_uuid,
                 EPHEMERAL_SETUP_TIMEOUT)

    try:
      yield From(self._orchestrator.set_key(job_key, lock_payload, overwrite=False,
                                            expiration=EPHEMERAL_SETUP_TIMEOUT))
    except KeyError:
      logger.warning('Job: %s already exists in orchestrator, timeout may be misconfigured',
                     build_uuid)
      raise Return(False, EPHEMERAL_API_TIMEOUT)
    except OrchestratorConnectionError:
      logger.exception('Exception when writing job %s to orchestrator; could not connect',
                       build_uuid)
      raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
    except OrchestratorError:
      logger.exception('Exception when writing job %s to orchestrator', build_uuid)
      raise Return(False, RETRY_IMMEDIATELY_SLEEP_DURATION)

    # Got a lock, now lets boot the job via one of the registered executors.
    started_with_executor = None
    execution_id = None

    logger.debug("Registered executors are: %s", [ex.name for ex in self._ordered_executors])
    for executor in self._ordered_executors:
      # Check if we can use this executor based on its whitelist, by namespace.
      namespace = build_job.namespace
      if not executor.allowed_for_namespace(namespace):
        logger.debug('Job %s (namespace: %s) cannot use executor %s', build_uuid, namespace,
                     executor.name)
        continue

      # Check if we can use this executor based on the retries remaining.
      if executor.minimum_retry_threshold > build_job.retries_remaining:
        metric_queue.builder_fallback.Inc()
        logger.debug('Job %s cannot use executor %s as it is below retry threshold %s (retry #%s)',
                     build_uuid, executor.name, executor.minimum_retry_threshold,
                     build_job.retries_remaining)
        continue

      logger.debug('Starting builder for job %s with selected executor: %s', build_uuid,
                   executor.name)

      try:
        execution_id = yield From(executor.start_builder(realm, token, build_uuid))
      except:
        try:
          metric_queue.build_start_failure.Inc(labelvalues=[executor.name])
          metric_queue.put_deprecated(('ExecutorFailure-%s' % executor.name), 1, unit='Count')
        except:
          logger.exception('Exception when writing failure metric for execution %s for job %s',
                           execution_id, build_uuid)

        logger.exception('Exception when starting builder for job: %s', build_uuid)
        continue

      try:
        metric_queue.build_start_success.Inc(labelvalues=[executor.name])
      except:
        logger.exception('Exception when writing success metric for execution %s for job %s',
                         execution_id, build_uuid)

      try:
        metric_queue.ephemeral_build_workers.Inc()
      except:
        logger.exception('Exception when writing start metrics for execution %s for job %s',
                         execution_id, build_uuid)

      started_with_executor = executor

      # Break out of the loop now that we've started a builder successfully.
      break

    # If we didn't start the job, cleanup and return it to the queue.
    if started_with_executor is None:
      logger.error('Could not start ephemeral worker for build %s', build_uuid)

      # Delete the associated build job record.
      yield From(self._orchestrator.delete_key(job_key))
      raise Return(False, EPHEMERAL_API_TIMEOUT)

    # Job was started!
    logger.debug('Started execution with ID %s for job: %s with executor: %s',
                 execution_id, build_uuid, started_with_executor.name)

    # Store metric data
    metric_spec = json.dumps({
      'executor_name': started_with_executor.name,
      'start_time': time.time(),
    })

    try:
      yield From(self._orchestrator.set_key(self._metric_key(realm), metric_spec, overwrite=False,
                                            expiration=machine_max_expiration + 10))
    except KeyError:
      logger.error('Realm %s already exists in orchestrator for job %s ' +
                   'UUID collision or something is very very wrong.', realm, build_uuid)
    except OrchestratorError:
      logger.exception('Exception when writing realm %s to orchestrator for job %s',
                       realm, build_uuid)

    # Store the realm spec which will allow any manager to accept this builder when it connects
    realm_spec = json.dumps({
      'realm': realm,
      'token': token,
      'execution_id': execution_id,
      'executor_name': started_with_executor.name,
      'job_queue_item': build_job.job_item,
    })

    try:
      setup_time = started_with_executor.setup_time or self.overall_setup_time()
      logger.debug('Writing job key for job %s using executor %s with ID %s and ttl %s', build_uuid,
                   started_with_executor.name, execution_id, setup_time)
      yield From(self._orchestrator.set_key(self._realm_key(realm), realm_spec,
                                            expiration=setup_time))
    except OrchestratorConnectionError:
      logger.exception('Exception when writing realm %s to orchestrator for job %s',
                       realm, build_uuid)
      raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
    except OrchestratorError:
      logger.exception('Exception when writing realm %s to orchestrator for job %s',
                       realm, build_uuid)
      raise Return(False, setup_time)

    logger.debug('Builder spawn complete for job %s using executor %s with ID %s ',
                 build_uuid, started_with_executor.name, execution_id)
    raise Return(True, None)

  @coroutine
  def build_component_ready(self, build_component):
    logger.debug('Got component ready for component with realm %s', build_component.builder_realm)

    # Pop off the job for the component.
    # We do so before we send out the watch below, as it will also remove this mapping.
    job = self._component_to_job.pop(build_component, None)
    if job is None:
      # This will occur once the build finishes, so no need to worry about it.
      # We log in case it happens outside of the expected flow.
      logger.debug('Could not find job for the build component on realm %s; component is ready',
                   build_component.builder_realm)
      raise Return()

    # Start the build job.
    logger.debug('Sending build %s to newly ready component on realm %s',
                 job.build_uuid, build_component.builder_realm)
    yield From(build_component.start_build(job))

    yield From(self._write_duration_metric(metric_queue.builder_time_to_build,
                                           build_component.builder_realm))

    # Clean up the bookkeeping for allowing any manager to take the job.
    try:
      yield From(self._orchestrator.delete_key(self._realm_key(build_component.builder_realm)))
    except KeyError:
      logger.warning('Could not delete realm key %s', build_component.builder_realm)

  def build_component_disposed(self, build_component, timed_out):
    logger.debug('Calling build_component_disposed.')
    self.unregister_component(build_component)

  @coroutine
  def job_completed(self, build_job, job_status, build_component):
    logger.debug('Calling job_completed for job %s with status: %s',
                 build_job.build_uuid, job_status)

    yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))

    # Mark the job as completed. Since this is being invoked from the component, we don't need
    # to ask for the phase to be updated as well.
    build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
    executor_name = build_info.executor_name if build_info else None
    yield From(self.job_complete_callback(build_job, job_status, executor_name, update_phase=False))

    # Kill the ephemeral builder.
    yield From(self.kill_builder_executor(build_job.build_uuid))

    # Delete the build job from the orchestrator.
    try:
      job_key = self._job_key(build_job)
      yield From(self._orchestrator.delete_key(job_key))
    except KeyError:
      logger.debug('Builder is asking for job to be removed, but work already completed')
    except OrchestratorConnectionError:
      logger.exception('Could not remove job key as orchestrator is not available')
      yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
      raise Return()

    # Delete the metric from the orchestrator.
    try:
      metric_key = self._metric_key(build_component.builder_realm)
      yield From(self._orchestrator.delete_key(metric_key))
    except KeyError:
      logger.debug('Builder is asking for metric to be removed, but key not found')
    except OrchestratorConnectionError:
      logger.exception('Could not remove metric key as orchestrator is not available')
      yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
      raise Return()

    logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)

  @coroutine
  def kill_builder_executor(self, build_uuid):
    logger.info('Starting termination of executor for job %s', build_uuid)
    build_info = self._build_uuid_to_info.pop(build_uuid, None)
    if build_info is None:
      logger.debug('Build information not found for build %s; skipping termination', build_uuid)
      raise Return()

    # Remove the build's component.
    self._component_to_job.pop(build_info.component, None)

    # Stop the build node/executor itself.
    yield From(self.terminate_executor(build_info.executor_name, build_info.execution_id))

  @coroutine
  def terminate_executor(self, executor_name, execution_id):
    executor = self._executor_name_to_executor.get(executor_name)
    if executor is None:
      logger.error('Could not find registered executor %s', executor_name)
      raise Return()

    # Terminate the executor's execution.
    logger.info('Terminating executor %s with execution id %s', executor_name, execution_id)
    yield From(executor.stop_builder(execution_id))

  @coroutine
  def job_heartbeat(self, build_job):
    """
    :param build_job: the identifier for the build
    :type build_job: str
    """
    self.job_heartbeat_callback(build_job)
    self._extend_job_in_orchestrator(build_job)

  @coroutine
  def _extend_job_in_orchestrator(self, build_job):
    try:
      job_data = yield From(self._orchestrator.get_key(self._job_key(build_job)))
    except KeyError:
      logger.info('Job %s no longer exists in the orchestrator', build_job.build_uuid)
      raise Return()
    except OrchestratorConnectionError:
      logger.exception('failed to connect when attempted to extend job')
      raise Return()

    build_job_metadata = json.loads(job_data)

    max_expiration = datetime.utcfromtimestamp(build_job_metadata['max_expiration'])
    max_expiration_remaining = max_expiration - datetime.utcnow()
    max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds()))

    ttl = min(self.heartbeat_period_sec * 2, max_expiration_sec)
    payload = {
      'job_queue_item': build_job.job_item,
      'max_expiration': build_job_metadata['max_expiration'],
      'had_heartbeat': True,
    }

    try:
      yield From(self._orchestrator.set_key(self._job_key(build_job), json.dumps(payload),
                                            expiration=ttl))
    except OrchestratorConnectionError:
      logger.exception('Could not update heartbeat for job as the orchestrator is not available')
      yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))

  @coroutine
  def _write_duration_metric(self, metric, realm):
    """
    :returns: True if the metric was written, otherwise False
    :rtype: bool
    """
    try:
      metric_data = yield From(self._orchestrator.get_key(self._metric_key(realm)))
      parsed_metric_data = json.loads(metric_data)
      start_time = parsed_metric_data['start_time']
      metric.Observe(time.time() - start_time,
                     labelvalues=[parsed_metric_data.get('executor_name', 'unknown')])
    except Exception:
      logger.exception("Could not write metric for realm %s", realm)

  def num_workers(self):
    """
    The number of workers we're managing locally.

    :returns: the number of the workers locally managed
    :rtype: int
    """
    return len(self._component_to_job)


  @coroutine
  def _cancel_callback(self, key_change):
    if key_change.event not in (KeyEvent.CREATE, KeyEvent.SET):
      raise Return()

    build_uuid = key_change.value
    build_info = self._build_uuid_to_info.get(build_uuid, None)
    if build_info is None:
      logger.debug('No build info for "%s" job %s', key_change.event, build_uuid)
      raise Return(False)

    lock_key = slash_join(self._canceled_lock_prefix,
                          build_uuid, build_info.execution_id)
    lock_acquired = yield From(self._orchestrator.lock(lock_key))
    if lock_acquired:
      builder_realm = build_info.component.builder_realm
      yield From(self.kill_builder_executor(build_uuid))
      yield From(self._orchestrator.delete_key(self._realm_key(builder_realm)))
      yield From(self._orchestrator.delete_key(self._metric_key(builder_realm)))
      yield From(self._orchestrator.delete_key(slash_join(self._job_prefix, build_uuid)))

    # This is outside the lock so we can un-register the component wherever it is registered to.
    yield From(build_info.component.cancel_build())
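A sketch (not part of this commit) of the orchestrator key layout the manager builds from these prefixes, using a hypothetical ORCHESTRATOR_PREFIX and hypothetical build/realm IDs; config_prefix below mirrors EphemeralBuilderManager._config_prefix and uses the same slash_join helper the file imports.

from util import slash_join

JOB_PREFIX = 'building/'
REALM_PREFIX = 'realm/'


def config_prefix(manager_config, key):
  # Mirrors EphemeralBuilderManager._config_prefix: a no-op without ORCHESTRATOR,
  # otherwise the configured prefix is joined onto the key.
  if manager_config.get('ORCHESTRATOR') is None:
    return key
  prefix = manager_config.get('ORCHESTRATOR_PREFIX', '')
  return slash_join(prefix, key).lstrip('/') + '/'


config = {'ORCHESTRATOR': {}, 'ORCHESTRATOR_PREFIX': 'buildman'}

# Per-build job key: written by schedule() with the EPHEMERAL_SETUP_TIMEOUT TTL and
# refreshed by job_heartbeat(); its expiration triggers the _job_callback cleanup path.
job_key = slash_join(config_prefix(config, JOB_PREFIX), 'some-build-uuid')

# Realm key: lets whichever manager the worker connects to claim the build; its
# expiration triggers the _realm_callback cleanup of the executor.
realm_key = slash_join(config_prefix(config, REALM_PREFIX), 'some-realm-uuid')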
37
buildman/manager/etcd_canceller.py
Normal file
@@ -0,0 +1,37 @@
import logging
import etcd

logger = logging.getLogger(__name__)


class EtcdCanceller(object):
  """ A class that sends a message to etcd to cancel a build """

  def __init__(self, config):
    etcd_host = config.get('ETCD_HOST', '127.0.0.1')
    etcd_port = config.get('ETCD_PORT', 2379)
    etcd_ca_cert = config.get('ETCD_CA_CERT', None)
    etcd_auth = config.get('ETCD_CERT_AND_KEY', None)
    if etcd_auth is not None:
      etcd_auth = tuple(etcd_auth)

    etcd_protocol = 'http' if etcd_auth is None else 'https'
    logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
    self._cancel_prefix = config.get('ETCD_CANCEL_PREFIX', 'cancel/')
    self._etcd_client = etcd.Client(
      host=etcd_host,
      port=etcd_port,
      cert=etcd_auth,
      ca_cert=etcd_ca_cert,
      protocol=etcd_protocol,
      read_timeout=5)

  def try_cancel_build(self, build_uuid):
    """ Writes etcd message to cancel build_uuid. """
    logger.info("Cancelling build %s", build_uuid)
    try:
      self._etcd_client.write("{}{}".format(self._cancel_prefix, build_uuid), build_uuid, ttl=60)
      return True
    except etcd.EtcdException:
      logger.exception("Failed to write to etcd client %s", build_uuid)
      return False
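For illustration (not part of this commit), the cancellation round trip with hypothetical endpoint and build ID: EtcdCanceller writes the cancel key with a 60-second TTL; on the manager side, the watch on the cancel prefix fires EphemeralBuilderManager._cancel_callback, which takes the job-cancelled lock and tears the build down.

from buildman.manager.etcd_canceller import EtcdCanceller

canceller = EtcdCanceller({
  'ETCD_HOST': 'etcd.example.invalid',  # hypothetical endpoint
  'ETCD_PORT': 2379,
  'ETCD_CANCEL_PREFIX': 'cancel/',
})

# Writes 'cancel/<build_uuid>' and returns True on success, False on an etcd error.
canceller.try_cancel_build('hypothetical-build-uuid')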
560
buildman/manager/executor.py
Normal file
560
buildman/manager/executor.py
Normal file
|
@ -0,0 +1,560 @@
|
|||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import threading
|
||||
import uuid
|
||||
|
||||
from functools import partial
|
||||
|
||||
import boto.ec2
|
||||
import cachetools.func
|
||||
import requests
|
||||
import trollius
|
||||
|
||||
from container_cloud_config import CloudConfigContext
|
||||
from jinja2 import FileSystemLoader, Environment
|
||||
from trollius import coroutine, From, Return, get_event_loop
|
||||
|
||||
import release
|
||||
|
||||
from buildman.asyncutil import AsyncWrapper
|
||||
from app import metric_queue, app
|
||||
from util.metrics.metricqueue import duration_collector_async
|
||||
from _init import ROOT_DIR
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
ONE_HOUR = 60*60
|
||||
|
||||
_TAG_RETRY_COUNT = 3 # Number of times to retry adding tags.
|
||||
_TAG_RETRY_SLEEP = 2 # Number of seconds to wait between tag retries.
|
||||
|
||||
ENV = Environment(loader=FileSystemLoader(os.path.join(ROOT_DIR, "buildman/templates")))
|
||||
TEMPLATE = ENV.get_template('cloudconfig.yaml')
|
||||
CloudConfigContext().populate_jinja_environment(ENV)
|
||||
|
||||
class ExecutorException(Exception):
|
||||
""" Exception raised when there is a problem starting or stopping a builder.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class BuilderExecutor(object):
|
||||
def __init__(self, executor_config, manager_hostname):
|
||||
""" Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
|
||||
starting and stopping builders.
|
||||
"""
|
||||
self.executor_config = executor_config
|
||||
self.manager_hostname = manager_hostname
|
||||
|
||||
default_websocket_scheme = 'wss' if app.config['PREFERRED_URL_SCHEME'] == 'https' else 'ws'
|
||||
self.websocket_scheme = executor_config.get("WEBSOCKET_SCHEME", default_websocket_scheme)
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
""" Name returns the unique name for this executor. """
|
||||
return self.executor_config.get('NAME') or self.__class__.__name__
|
||||
|
||||
@property
|
||||
def setup_time(self):
|
||||
""" Returns the amount of time (in seconds) to wait for the execution to start for the build.
|
||||
If None, the manager's default will be used.
|
||||
"""
|
||||
return self.executor_config.get('SETUP_TIME')
|
||||
|
||||
@coroutine
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
""" Create a builder with the specified config. Returns a unique id which can be used to manage
|
||||
the builder.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, builder_id):
|
||||
""" Stop a builder which is currently running.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def allowed_for_namespace(self, namespace):
|
||||
""" Returns true if this executor can be used for builds in the given namespace. """
|
||||
|
||||
# Check for an explicit namespace whitelist.
|
||||
namespace_whitelist = self.executor_config.get('NAMESPACE_WHITELIST')
|
||||
if namespace_whitelist is not None and namespace in namespace_whitelist:
|
||||
return True
|
||||
|
||||
# Check for a staged rollout percentage. If found, we hash the namespace and, if it is found
|
||||
# in the first X% of the character space, we allow this executor to be used.
|
||||
staged_rollout = self.executor_config.get('STAGED_ROLLOUT')
|
||||
if staged_rollout is not None:
|
||||
bucket = int(hashlib.sha256(namespace).hexdigest()[-2:], 16)
|
||||
return bucket < (256 * staged_rollout)
|
||||
|
||||
# If there are no restrictions in place, we are free to use this executor.
|
||||
return staged_rollout is None and namespace_whitelist is None
|
||||
|
||||
@property
|
||||
def minimum_retry_threshold(self):
|
||||
""" Returns the minimum number of retries required for this executor to be used or 0 if
|
||||
none. """
|
||||
return self.executor_config.get('MINIMUM_RETRY_THRESHOLD', 0)
|
||||
|
||||
def generate_cloud_config(self, realm, token, build_uuid, coreos_channel,
|
||||
manager_hostname, quay_username=None,
|
||||
quay_password=None):
|
||||
if quay_username is None:
|
||||
quay_username = self.executor_config['QUAY_USERNAME']
|
||||
|
||||
if quay_password is None:
|
||||
quay_password = self.executor_config['QUAY_PASSWORD']
|
||||
|
||||
return TEMPLATE.render(
|
||||
realm=realm,
|
||||
token=token,
|
||||
build_uuid=build_uuid,
|
||||
quay_username=quay_username,
|
||||
quay_password=quay_password,
|
||||
manager_hostname=manager_hostname,
|
||||
websocket_scheme=self.websocket_scheme,
|
||||
coreos_channel=coreos_channel,
|
||||
worker_image=self.executor_config.get('WORKER_IMAGE', 'quay.io/coreos/registry-build-worker'),
|
||||
worker_tag=self.executor_config['WORKER_TAG'],
|
||||
logentries_token=self.executor_config.get('LOGENTRIES_TOKEN', None),
|
||||
volume_size=self.executor_config.get('VOLUME_SIZE', '42G'),
|
||||
max_lifetime_s=self.executor_config.get('MAX_LIFETIME_S', 10800),
|
||||
ssh_authorized_keys=self.executor_config.get('SSH_AUTHORIZED_KEYS', []),
|
||||
)
|
||||
|
||||
|
||||
class EC2Executor(BuilderExecutor):
|
||||
""" Implementation of BuilderExecutor which uses libcloud to start machines on a variety of cloud
|
||||
providers.
|
||||
"""
|
||||
COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt'
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._loop = get_event_loop()
|
||||
super(EC2Executor, self).__init__(*args, **kwargs)
|
||||
|
||||
def _get_conn(self):
|
||||
""" Creates an ec2 connection which can be used to manage instances.
|
||||
"""
|
||||
return AsyncWrapper(boto.ec2.connect_to_region(
|
||||
self.executor_config['EC2_REGION'],
|
||||
aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'],
|
||||
aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'],
|
||||
))
|
||||
|
||||
@classmethod
|
||||
@cachetools.func.ttl_cache(ttl=ONE_HOUR)
|
||||
def _get_coreos_ami(cls, ec2_region, coreos_channel):
|
||||
""" Retrieve the CoreOS AMI id from the canonical listing.
|
||||
"""
|
||||
stack_list_string = requests.get(EC2Executor.COREOS_STACK_URL % coreos_channel).text
|
||||
stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
|
||||
return stack_amis[ec2_region]
|
||||
|
||||
@coroutine
|
||||
@duration_collector_async(metric_queue.builder_time_to_start, ['ec2'])
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
region = self.executor_config['EC2_REGION']
|
||||
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
|
||||
|
||||
coreos_ami = self.executor_config.get('COREOS_AMI', None)
|
||||
if coreos_ami is None:
|
||||
get_ami_callable = partial(self._get_coreos_ami, region, channel)
|
||||
coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
|
||||
|
||||
user_data = self.generate_cloud_config(realm, token, build_uuid, channel, self.manager_hostname)
|
||||
logger.debug('Generated cloud config for build %s: %s', build_uuid, user_data)
|
||||
|
||||
ec2_conn = self._get_conn()
|
||||
|
||||
ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType(
|
||||
size=int(self.executor_config.get('BLOCK_DEVICE_SIZE', 48)),
|
||||
volume_type='gp2',
|
||||
delete_on_termination=True,
|
||||
)
|
||||
block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping()
|
||||
block_devices['/dev/xvda'] = ssd_root_ebs
|
||||
|
||||
interfaces = None
|
||||
if self.executor_config.get('EC2_VPC_SUBNET_ID', None) is not None:
|
||||
interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
|
||||
subnet_id=self.executor_config['EC2_VPC_SUBNET_ID'],
|
||||
groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
|
||||
associate_public_ip_address=True,
|
||||
)
|
||||
interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
|
||||
|
||||
try:
|
||||
reservation = yield From(ec2_conn.run_instances(
|
||||
coreos_ami,
|
||||
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
|
||||
key_name=self.executor_config.get('EC2_KEY_NAME', None),
|
||||
user_data=user_data,
|
||||
instance_initiated_shutdown_behavior='terminate',
|
||||
block_device_map=block_devices,
|
||||
network_interfaces=interfaces,
|
||||
))
|
||||
except boto.exception.EC2ResponseError as ec2e:
|
||||
logger.exception('Unable to spawn builder instance')
|
||||
metric_queue.ephemeral_build_worker_failure.Inc()
|
||||
raise ec2e
|
||||
|
||||
if not reservation.instances:
|
||||
raise ExecutorException('Unable to spawn builder instance.')
|
||||
elif len(reservation.instances) != 1:
|
||||
raise ExecutorException('EC2 started wrong number of instances!')
|
||||
|
||||
launched = AsyncWrapper(reservation.instances[0])
|
||||
|
||||
# Sleep a few seconds to wait for AWS to spawn the instance.
|
||||
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
|
||||
|
||||
# Tag the instance with its metadata.
|
||||
for i in range(0, _TAG_RETRY_COUNT):
|
||||
try:
|
||||
yield From(launched.add_tags({
|
||||
'Name': 'Quay Ephemeral Builder',
|
||||
'Realm': realm,
|
||||
'Token': token,
|
||||
'BuildUUID': build_uuid,
|
||||
}))
|
||||
except boto.exception.EC2ResponseError as ec2e:
|
||||
if ec2e.error_code == 'InvalidInstanceID.NotFound':
|
||||
if i < _TAG_RETRY_COUNT - 1:
|
||||
logger.warning('Failed to write EC2 tags for instance %s for build %s (attempt #%s)',
|
||||
launched.id, build_uuid, i)
|
||||
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
|
||||
continue
|
||||
|
||||
raise ExecutorException('Unable to find builder instance.')
|
||||
|
||||
logger.exception('Failed to write EC2 tags (attempt #%s)', i)
|
||||
|
||||
logger.debug('Machine with ID %s started for build %s', launched.id, build_uuid)
|
||||
raise Return(launched.id)
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, builder_id):
|
||||
try:
|
||||
ec2_conn = self._get_conn()
|
||||
terminated_instances = yield From(ec2_conn.terminate_instances([builder_id]))
|
||||
except boto.exception.EC2ResponseError as ec2e:
|
||||
if ec2e.error_code == 'InvalidInstanceID.NotFound':
|
||||
logger.debug('Instance %s already terminated', builder_id)
|
||||
return
|
||||
|
||||
logger.exception('Exception when trying to terminate instance %s', builder_id)
|
||||
raise
|
||||
|
||||
if builder_id not in [si.id for si in terminated_instances]:
|
||||
raise ExecutorException('Unable to terminate instance: %s' % builder_id)
|
||||
|
||||
|
||||
class PopenExecutor(BuilderExecutor):
|
||||
""" Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
|
||||
"""
|
||||
def __init__(self, executor_config, manager_hostname):
|
||||
self._jobs = {}
|
||||
|
||||
super(PopenExecutor, self).__init__(executor_config, manager_hostname)
|
||||
|
||||
""" Executor which uses Popen to fork a quay-builder process.
|
||||
"""
|
||||
@coroutine
|
||||
@duration_collector_async(metric_queue.builder_time_to_start, ['fork'])
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
# Now start a machine for this job, adding the machine id to the etcd information
|
||||
logger.debug('Forking process for build')
|
||||
|
||||
ws_host = os.environ.get("BUILDMAN_WS_HOST", "localhost")
|
||||
ws_port = os.environ.get("BUILDMAN_WS_PORT", "8787")
|
||||
builder_env = {
|
||||
'TOKEN': token,
|
||||
'REALM': realm,
|
||||
'ENDPOINT': 'ws://%s:%s' % (ws_host, ws_port),
|
||||
'DOCKER_TLS_VERIFY': os.environ.get('DOCKER_TLS_VERIFY', ''),
|
||||
'DOCKER_CERT_PATH': os.environ.get('DOCKER_CERT_PATH', ''),
|
||||
'DOCKER_HOST': os.environ.get('DOCKER_HOST', ''),
|
||||
'PATH': "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||
}
|
||||
|
||||
logpipe = LogPipe(logging.INFO)
|
||||
spawned = subprocess.Popen(os.environ.get('BUILDER_BINARY_LOCATION',
|
||||
'/usr/local/bin/quay-builder'),
|
||||
stdout=logpipe,
|
||||
stderr=logpipe,
|
||||
env=builder_env)
|
||||
|
||||
builder_id = str(uuid.uuid4())
|
||||
self._jobs[builder_id] = (spawned, logpipe)
|
||||
logger.debug('Builder spawned with id: %s', builder_id)
|
||||
raise Return(builder_id)
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, builder_id):
|
||||
if builder_id not in self._jobs:
|
||||
raise ExecutorException('Builder id not being tracked by executor.')
|
||||
|
||||
logger.debug('Killing builder with id: %s', builder_id)
|
||||
spawned, logpipe = self._jobs[builder_id]
|
||||
|
||||
if spawned.poll() is None:
|
||||
spawned.kill()
|
||||
logpipe.close()
|
||||
|
||||
|
||||
class KubernetesExecutor(BuilderExecutor):
|
||||
""" Executes build jobs by creating Kubernetes jobs which run a qemu-kvm virtual
|
||||
machine in a pod """
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(KubernetesExecutor, self).__init__(*args, **kwargs)
|
||||
self._loop = get_event_loop()
|
||||
self.namespace = self.executor_config.get('BUILDER_NAMESPACE', 'builder')
|
||||
self.image = self.executor_config.get('BUILDER_VM_CONTAINER_IMAGE',
|
||||
'quay.io/quay/quay-builder-qemu-coreos:stable')
|
||||
|
||||
@coroutine
|
||||
def _request(self, method, path, **kwargs):
|
||||
request_options = dict(kwargs)
|
||||
|
||||
tls_cert = self.executor_config.get('K8S_API_TLS_CERT')
|
||||
tls_key = self.executor_config.get('K8S_API_TLS_KEY')
|
||||
tls_ca = self.executor_config.get('K8S_API_TLS_CA')
|
||||
service_account_token = self.executor_config.get('SERVICE_ACCOUNT_TOKEN')
|
||||
|
||||
if 'timeout' not in request_options:
|
||||
request_options['timeout'] = self.executor_config.get("K8S_API_TIMEOUT", 20)
|
||||
|
||||
if service_account_token:
|
||||
scheme = 'https'
|
||||
request_options['headers'] = {'Authorization': 'Bearer ' + service_account_token}
|
||||
logger.debug('Using service account token for Kubernetes authentication')
|
||||
elif tls_cert and tls_key:
|
||||
scheme = 'https'
|
||||
request_options['cert'] = (tls_cert, tls_key)
|
||||
logger.debug('Using tls certificate and key for Kubernetes authentication')
|
||||
if tls_ca:
|
||||
request_options['verify'] = tls_ca
|
||||
else:
|
||||
scheme = 'http'
|
||||
|
||||
server = self.executor_config.get('K8S_API_SERVER', 'localhost:8080')
|
||||
url = '%s://%s%s' % (scheme, server, path)
|
||||
|
||||
logger.debug('Executor config: %s', self.executor_config)
|
||||
logger.debug('Kubernetes request: %s %s: %s', method, url, request_options)
|
||||
res = requests.request(method, url, **request_options)
|
||||
logger.debug('Kubernetes response: %s: %s', res.status_code, res.text)
|
||||
raise Return(res)
|
||||
|
||||
def _jobs_path(self):
|
||||
return '/apis/batch/v1/namespaces/%s/jobs' % self.namespace
|
||||
|
||||
def _job_path(self, build_uuid):
|
||||
return '%s/%s' % (self._jobs_path(), build_uuid)
|
||||
|
||||
def _kubernetes_distribution(self):
|
||||
return self.executor_config.get('KUBERNETES_DISTRIBUTION', 'basic').lower()
|
||||
|
||||
def _is_basic_kubernetes_distribution(self):
|
||||
return self._kubernetes_distribution() == 'basic'
|
||||
|
||||
def _is_openshift_kubernetes_distribution(self):
|
||||
return self._kubernetes_distribution() == 'openshift'
|
||||
|
||||
def _build_job_container_resources(self):
|
||||
# Minimum acceptable free resources for this container to "fit" in a quota
|
||||
# These may be lower than the absolute limits if the cluster is knowingly
|
||||
# oversubscribed by some amount.
|
||||
container_requests = {
|
||||
'memory' : self.executor_config.get('CONTAINER_MEMORY_REQUEST', '3968Mi'),
|
||||
}
|
||||
|
||||
container_limits = {
|
||||
'memory' : self.executor_config.get('CONTAINER_MEMORY_LIMITS', '5120Mi'),
|
||||
'cpu' : self.executor_config.get('CONTAINER_CPU_LIMITS', '1000m'),
|
||||
}
|
||||
|
||||
resources = {
|
||||
'requests': container_requests,
|
||||
}
|
||||
|
||||
if self._is_openshift_kubernetes_distribution():
|
||||
resources['requests']['cpu'] = self.executor_config.get('CONTAINER_CPU_REQUEST', '500m')
|
||||
resources['limits'] = container_limits
|
||||
|
||||
return resources

  def _build_job_containers(self, user_data):
    vm_memory_limit = self.executor_config.get('VM_MEMORY_LIMIT', '4G')
    vm_volume_size = self.executor_config.get('VOLUME_SIZE', '32G')

    container = {
      'name': 'builder',
      'imagePullPolicy': 'IfNotPresent',
      'image': self.image,
      'securityContext': {'privileged': True},
      'env': [
        {'name': 'USERDATA', 'value': user_data},
        {'name': 'VM_MEMORY', 'value': vm_memory_limit},
        {'name': 'VM_VOLUME_SIZE', 'value': vm_volume_size},
      ],
      'resources': self._build_job_container_resources(),
    }

    if self._is_basic_kubernetes_distribution():
      container['volumeMounts'] = [{'name': 'secrets-mask', 'mountPath': '/var/run/secrets/kubernetes.io/serviceaccount'}]

    return container

  def _job_resource(self, build_uuid, user_data, coreos_channel='stable'):
    image_pull_secret_name = self.executor_config.get('IMAGE_PULL_SECRET_NAME', 'builder')
    service_account = self.executor_config.get('SERVICE_ACCOUNT_NAME', 'quay-builder-sa')
    node_selector_label_key = self.executor_config.get('NODE_SELECTOR_LABEL_KEY',
                                                       'beta.kubernetes.io/instance-type')
    node_selector_label_value = self.executor_config.get('NODE_SELECTOR_LABEL_VALUE', '')

    node_selector = {
      node_selector_label_key: node_selector_label_value
    }

    release_sha = release.GIT_HEAD or 'none'
    if ' ' in release_sha:
      release_sha = 'HEAD'

    job_resource = {
      'apiVersion': 'batch/v1',
      'kind': 'Job',
      'metadata': {
        'namespace': self.namespace,
        'generateName': build_uuid + '-',
        'labels': {
          'build': build_uuid,
          'time': datetime.datetime.now().strftime('%Y-%m-%d-%H'),
          'manager': socket.gethostname(),
          'quay-sha': release_sha,
        },
      },
      'spec': {
        'activeDeadlineSeconds': self.executor_config.get('MAXIMUM_JOB_TIME', 7200),
        'template': {
          'metadata': {
            'labels': {
              'build': build_uuid,
              'time': datetime.datetime.now().strftime('%Y-%m-%d-%H'),
              'manager': socket.gethostname(),
              'quay-sha': release_sha,
            },
          },
          'spec': {
            'imagePullSecrets': [{'name': image_pull_secret_name}],
            'restartPolicy': 'Never',
            'dnsPolicy': 'Default',
            'containers': [self._build_job_containers(user_data)],
          },
        },
      },
    }

    if self._is_openshift_kubernetes_distribution():
      # Setting `automountServiceAccountToken` to false prevents automounting API credentials
      # for the service account.
      job_resource['spec']['template']['spec']['automountServiceAccountToken'] = False

      # Use a dedicated service account that has no authorization to any resources.
      job_resource['spec']['template']['spec']['serviceAccount'] = service_account

      # Setting `enableServiceLinks` to false prevents information about other services from
      # being injected into the pod's environment variables. The pod has no visibility into
      # other services on the cluster.
      job_resource['spec']['template']['spec']['enableServiceLinks'] = False

      if node_selector_label_value.strip() != '':
        job_resource['spec']['template']['spec']['nodeSelector'] = node_selector

    if self._is_basic_kubernetes_distribution():
      # This volume is a hack to mask the token for the namespace's default service account,
      # which is placed in a file mounted under
      # `/var/run/secrets/kubernetes.io/serviceaccount` in all pods. There's currently no
      # other way to just disable the service account at either the pod or namespace level.
      #
      # https://github.com/kubernetes/kubernetes/issues/16779
      #
      job_resource['spec']['template']['spec']['volumes'] = [{'name': 'secrets-mask', 'emptyDir': {'medium': 'Memory'}}]

    return job_resource
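
  # For orientation (an abridged, illustrative sketch of what the dict above serializes to,
  # using the default values; not guaranteed output):
  #
  #   apiVersion: batch/v1
  #   kind: Job
  #   metadata: {generateName: <build_uuid>-, namespace: builder}
  #   spec:
  #     activeDeadlineSeconds: 7200
  #     template:
  #       spec: {restartPolicy: Never, dnsPolicy: Default, containers: [<builder container>]}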

  @coroutine
  @duration_collector_async(metric_queue.builder_time_to_start, ['k8s'])
  def start_builder(self, realm, token, build_uuid):
    # Generate the job resource.
    channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
    user_data = self.generate_cloud_config(realm, token, build_uuid, channel, self.manager_hostname)
    resource = self._job_resource(build_uuid, user_data, channel)
    logger.debug('Using Kubernetes Distribution: %s', self._kubernetes_distribution())
    logger.debug('Generated kubernetes resource:\n%s', resource)

    # Schedule the job on the cluster.
    create_job = yield From(self._request('POST', self._jobs_path(), json=resource))
    if int(create_job.status_code / 100) != 2:
      raise ExecutorException('Failed to create job: %s: %s: %s' %
                              (build_uuid, create_job.status_code, create_job.text))

    job = create_job.json()
    raise Return(job['metadata']['name'])
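
  # Note (added for orientation, not in the original commit): because the resource uses
  # `generateName`, the API server assigns the final Job name, which is why the name is read
  # back from the creation response above rather than derived from the build UUID alone.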

  @coroutine
  def stop_builder(self, builder_id):
    pods_path = '/api/v1/namespaces/%s/pods' % self.namespace

    # Delete the job itself.
    try:
      yield From(self._request('DELETE', self._job_path(builder_id)))
    except Exception:
      logger.exception('Failed to send delete job call for job %s', builder_id)

    # Delete the pod(s) for the job.
    selectorString = 'job-name=%s' % builder_id
    try:
      yield From(self._request('DELETE', pods_path, params=dict(labelSelector=selectorString)))
    except Exception:
      logger.exception('Failed to send delete pod call for job %s', builder_id)


class LogPipe(threading.Thread):
  """ Adapted from http://codereview.stackexchange.com/a/17959 """
  def __init__(self, level):
    """ Set up the object with a log level, create the pipe and start the reader thread. """
    threading.Thread.__init__(self)
    self.daemon = False
    self.level = level
    self.fd_read, self.fd_write = os.pipe()
    self.pipe_reader = os.fdopen(self.fd_read)
    self.start()

  def fileno(self):
    """ Return the write file descriptor of the pipe. """
    return self.fd_write

  def run(self):
    """ Run the thread, logging everything read from the pipe. """
    for line in iter(self.pipe_reader.readline, ''):
      logging.log(self.level, line.strip('\n'))

    self.pipe_reader.close()

  def close(self):
    """ Close the write end of the pipe. """
    os.close(self.fd_write)
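

def _log_pipe_usage_example():
  """ Illustrative sketch only, not part of the original commit: shows how a LogPipe is
      typically handed to a subprocess so the child's output is re-logged through the
      standard logging module. The echo command is an arbitrary placeholder. """
  import subprocess  # local import to keep this sketch self-contained

  out_pipe = LogPipe(logging.INFO)
  try:
    # subprocess uses out_pipe.fileno() (the pipe's write end) as the child's stdout.
    subprocess.check_call(['echo', 'hello from the builder'], stdout=out_pipe)
  finally:
    # Closing the write end lets the reader thread see EOF and shut down cleanly.
    out_pipe.close()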

8
buildman/manager/noop_canceller.py
Normal file
@@ -0,0 +1,8 @@
class NoopCanceller(object):
  """ A canceller that cannot cancel a build. """
  def __init__(self, config=None):
    pass

  def try_cancel_build(self, uuid):
    """ Does nothing and always fails to cancel the build. """
    return False
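
# Illustrative usage sketch (not part of the original commit): the canceller interface is
# duck-typed, so a NoopCanceller can stand in anywhere a canceller is expected, e.g.:
#
#   canceller = NoopCanceller()
#   assert canceller.try_cancel_build('some-build-uuid') is False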

26
buildman/manager/orchestrator_canceller.py
Normal file
@@ -0,0 +1,26 @@
import logging

from buildman.orchestrator import orchestrator_from_config, OrchestratorError
from util import slash_join


logger = logging.getLogger(__name__)


CANCEL_PREFIX = 'cancel/'


class OrchestratorCanceller(object):
  """ An asynchronous way to cancel a build with any Orchestrator. """
  def __init__(self, config):
    self._orchestrator = orchestrator_from_config(config, canceller_only=True)

  def try_cancel_build(self, build_uuid):
    logger.info('Cancelling build %s', build_uuid)
    cancel_key = slash_join(CANCEL_PREFIX, build_uuid)
    try:
      self._orchestrator.set_key_sync(cancel_key, build_uuid, expiration=60)
      return True
    except OrchestratorError:
      logger.exception('Failed to write cancel action to the orchestrator for build %s', build_uuid)
      return False
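
# Illustrative note (an assumption, not shown in this commit): a consumer on the build-manager
# side is expected to watch keys under CANCEL_PREFIX through the same orchestrator and stop the
# matching build when one appears; the 60-second expiration keeps stale cancel markers from
# lingering if no consumer picks them up.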