initial import for Open Source 🎉

This commit is contained in:
Jimmy Zelinskie 2019-11-12 11:09:47 -05:00
parent 1898c361f3
commit 9c0dd3b722
2048 changed files with 218743 additions and 0 deletions

View file

@@ -0,0 +1,71 @@
from trollius import coroutine
class BaseManager(object):
""" Base for all worker managers. """
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
job_complete_callback, manager_hostname, heartbeat_period_sec):
self.register_component = register_component
self.unregister_component = unregister_component
self.job_heartbeat_callback = job_heartbeat_callback
self.job_complete_callback = job_complete_callback
self.manager_hostname = manager_hostname
self.heartbeat_period_sec = heartbeat_period_sec
@coroutine
def job_heartbeat(self, build_job):
""" Method invoked to tell the manager that a job is still running. This method will be called
every few minutes. """
self.job_heartbeat_callback(build_job)
def overall_setup_time(self):
""" Returns the number of seconds that the build system should wait before allowing the job
to be picked up again after called 'schedule'.
"""
raise NotImplementedError
def shutdown(self):
""" Indicates that the build controller server is in a shutdown state and that no new jobs
or workers should be performed. Existing workers should be cleaned up once their jobs
have completed
"""
raise NotImplementedError
@coroutine
def schedule(self, build_job):
""" Schedules a queue item to be built. Returns a 2-tuple with (True, None) if the item was
properly scheduled and (False, a retry timeout in seconds) if all workers are busy or an
error occurs.
"""
raise NotImplementedError
def initialize(self, manager_config):
""" Runs any initialization code for the manager. Called once the server is in a ready state.
"""
raise NotImplementedError
@coroutine
def build_component_ready(self, build_component):
""" Method invoked whenever a build component announces itself as ready.
"""
raise NotImplementedError
def build_component_disposed(self, build_component, timed_out):
""" Method invoked whenever a build component has been disposed. The timed_out boolean indicates
whether the component's heartbeat timed out.
"""
raise NotImplementedError
@coroutine
def job_completed(self, build_job, job_status, build_component):
""" Method invoked once a job_item has completed, in some manner. The job_status will be
one of: incomplete, error, complete. Implementations of this method should call coroutine
self.job_complete_callback with a status of Incomplete if they wish for the job to be
automatically requeued.
"""
raise NotImplementedError
def num_workers(self):
""" Returns the number of active build workers currently registered. This includes those
that are currently busy and awaiting more work.
"""
raise NotImplementedError
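
A minimal sketch (not part of the import) of how a concrete manager can satisfy this interface; the class name and its single-slot scheduling policy are illustrative assumptions:

from trollius import coroutine, Return

class SingleSlotManager(BaseManager):
  """ Hypothetical manager that accepts one job at a time. """
  def initialize(self, manager_config):
    self._busy = False

  def overall_setup_time(self):
    return 60

  @coroutine
  def schedule(self, build_job):
    if self._busy:
      raise Return(False, 30)  # the single worker is busy: retry in 30 seconds
    self._busy = True
    raise Return(True, None)

  def num_workers(self):
    return 1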

View file

@@ -0,0 +1,27 @@
import logging
from buildman.manager.orchestrator_canceller import OrchestratorCanceller
from buildman.manager.noop_canceller import NoopCanceller
logger = logging.getLogger(__name__)
CANCELLERS = {'ephemeral': OrchestratorCanceller}
class BuildCanceller(object):
""" A class to manage cancelling a build """
def __init__(self, app=None):
# Check for a missing app before reading its config; reading first would raise an
# AttributeError whenever app is None.
self.build_manager_config = app.config.get('BUILD_MANAGER') if app is not None else None
if self.build_manager_config is None:
self.handler = NoopCanceller()
else:
self.handler = None
def try_cancel_build(self, uuid):
""" A method to kill a running build """
if self.handler is None:
canceller = CANCELLERS.get(self.build_manager_config[0], NoopCanceller)
self.handler = canceller(self.build_manager_config[1])
return self.handler.try_cancel_build(uuid)
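
A brief usage sketch (illustrative, not from the import): app is assumed to be a Flask-style object whose config maps BUILD_MANAGER to a ('ephemeral', {...}) pair, matching the tuple indexing above.

canceller = BuildCanceller(app)  # falls back to NoopCanceller without an app/config
if not canceller.try_cancel_build('some-build-uuid'):
  logger.warning('Unable to cancel build')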

View file

@@ -0,0 +1,92 @@
import logging
import uuid
from buildman.component.basecomponent import BaseComponent
from buildman.component.buildcomponent import BuildComponent
from buildman.manager.basemanager import BaseManager
from trollius import From, Return, coroutine
REGISTRATION_REALM = 'registration'
RETRY_TIMEOUT = 5
logger = logging.getLogger(__name__)
class DynamicRegistrationComponent(BaseComponent):
""" Component session that handles dynamic registration of the builder components. """
def onConnect(self):
self.join(REGISTRATION_REALM)
def onJoin(self, details):
logger.debug('Registering registration method')
yield From(self.register(self._worker_register, u'io.quay.buildworker.register'))
def _worker_register(self):
realm = self.parent_manager.add_build_component()
logger.debug('Registering new build component+worker with realm %s', realm)
return realm
def kind(self):
return 'registration'
class EnterpriseManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
def __init__(self, *args, **kwargs):
self.ready_components = set()
self.all_components = set()
self.shutting_down = False
super(EnterpriseManager, self).__init__(*args, **kwargs)
def initialize(self, manager_config):
# Add a component which is used by build workers for dynamic registration. Unlike
# production, build workers in enterprise are long-lived and register dynamically.
self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
def overall_setup_time(self):
# Builders are already registered, so the setup time should be essentially instant. We therefore
# only return a minute here.
return 60
def add_build_component(self):
""" Adds a new build component for an Enterprise Registry. """
# Generate a new unique realm ID for the build worker.
realm = str(uuid.uuid4())
new_component = self.register_component(realm, BuildComponent, token="")
self.all_components.add(new_component)
return realm
@coroutine
def schedule(self, build_job):
""" Schedules a build for an Enterprise Registry. """
if self.shutting_down or not self.ready_components:
raise Return(False, RETRY_TIMEOUT)
component = self.ready_components.pop()
yield From(component.start_build(build_job))
raise Return(True, None)
@coroutine
def build_component_ready(self, build_component):
self.ready_components.add(build_component)
def shutdown(self):
self.shutting_down = True
@coroutine
def job_completed(self, build_job, job_status, build_component):
yield From(self.job_complete_callback(build_job, job_status))
def build_component_disposed(self, build_component, timed_out):
self.all_components.remove(build_component)
if build_component in self.ready_components:
self.ready_components.remove(build_component)
self.unregister_component(build_component)
def num_workers(self):
return len(self.all_components)
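
How a caller might consume the (scheduled, retry_timeout) contract of schedule(); a hedged sketch, with try_schedule and its retry handling being assumptions rather than code from the import:

from trollius import coroutine, From

@coroutine
def try_schedule(manager, build_job):
  # schedule() resolves to (True, None) on success, or (False, retry_timeout_sec)
  # when all workers are busy or the manager is shutting down.
  scheduled, retry_timeout = yield From(manager.schedule(build_job))
  if not scheduled:
    logger.info('Build not scheduled; retry in %s seconds', retry_timeout)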

View file

@@ -0,0 +1,710 @@
import logging
import uuid
import calendar
import json
import time
from collections import namedtuple
from datetime import datetime, timedelta
from six import iteritems
from trollius import From, coroutine, Return, async, sleep
from app import metric_queue
from buildman.orchestrator import (orchestrator_from_config, KeyEvent,
OrchestratorError, OrchestratorConnectionError,
ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor, KubernetesExecutor
from buildman.component.buildcomponent import BuildComponent
from buildman.jobutil.buildjob import BuildJob
from buildman.server import BuildJobResult
from util import slash_join
from util.morecollections import AttrDict
logger = logging.getLogger(__name__)
JOB_PREFIX = 'building/'
LOCK_PREFIX = 'lock/'
REALM_PREFIX = 'realm/'
CANCEL_PREFIX = 'cancel/'
METRIC_PREFIX = 'metric/'
CANCELED_LOCK_PREFIX = slash_join(LOCK_PREFIX, 'job-cancelled')
EXPIRED_LOCK_PREFIX = slash_join(LOCK_PREFIX, 'job-expired')
EPHEMERAL_API_TIMEOUT = 20
EPHEMERAL_SETUP_TIMEOUT = 500
RETRY_IMMEDIATELY_SLEEP_DURATION = 0
TOO_MANY_WORKERS_SLEEP_DURATION = 10
BuildInfo = namedtuple('BuildInfo', ['component', 'build_job', 'execution_id', 'executor_name'])
class EphemeralBuilderManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
EXECUTORS = {
'popen': PopenExecutor,
'ec2': EC2Executor,
'kubernetes': KubernetesExecutor,
}
def __init__(self, *args, **kwargs):
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
self._shutting_down = False
self._manager_config = None
self._orchestrator = None
# The registered executors available for running jobs, in order.
self._ordered_executors = []
# The registered executors, mapped by their unique name.
self._executor_name_to_executor = {}
# Map from builder component to its associated job.
self._component_to_job = {}
# Map from build UUID to a BuildInfo tuple with information about the build.
self._build_uuid_to_info = {}
def overall_setup_time(self):
return EPHEMERAL_SETUP_TIMEOUT
@coroutine
def _mark_job_incomplete(self, build_job, build_info):
""" Marks a job as incomplete, in response to a failure to start or a timeout. """
executor_name = build_info.executor_name
execution_id = build_info.execution_id
logger.warning('Build executor failed to successfully boot with execution id %s',
execution_id)
# Take a lock to ensure that only one manager reports the build as incomplete for this
# execution.
lock_key = slash_join(self._expired_lock_prefix, build_job.build_uuid, execution_id)
acquired_lock = yield From(self._orchestrator.lock(lock_key))
if acquired_lock:
try:
# Clean up the bookkeeping for the job.
yield From(self._orchestrator.delete_key(self._job_key(build_job)))
except KeyError:
logger.debug('Could not delete job key %s; might have been removed already',
build_job.build_uuid)
logger.error('[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s',
build_job.build_uuid, executor_name, execution_id)
yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
update_phase=True))
else:
logger.debug('Did not get lock for job-expiration for job %s', build_job.build_uuid)
@coroutine
def _job_callback(self, key_change):
"""
This is the callback invoked when keys related to jobs are changed.
It ignores all events related to the creation of new jobs.
Deletes or expirations cause checks to ensure they've been properly marked as completed.
:param key_change: the event and value produced by a key changing in the orchestrator
:type key_change: :class:`KeyChange`
"""
if key_change.event in (KeyEvent.CREATE, KeyEvent.SET):
raise Return()
elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
# Handle the expiration/deletion.
job_metadata = json.loads(key_change.value)
build_job = BuildJob(AttrDict(job_metadata['job_queue_item']))
logger.debug('Got "%s" of job %s', key_change.event, build_job.build_uuid)
# Get the build info.
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
if build_info is None:
logger.debug('No build info for "%s" job %s (%s); probably already deleted by this manager',
key_change.event, build_job.build_uuid, job_metadata)
raise Return()
if key_change.event != KeyEvent.EXPIRE:
# If the etcd action was not an expiration, then it was already deleted by some manager and
# the execution was therefore already shutdown. All that's left is to remove the build info.
self._build_uuid_to_info.pop(build_job.build_uuid, None)
raise Return()
logger.debug('got expiration for job %s with metadata: %s', build_job.build_uuid,
job_metadata)
if not job_metadata.get('had_heartbeat', False):
# If we have not yet received a heartbeat, then the node failed to boot in some way.
# We mark the job as incomplete here.
yield From(self._mark_job_incomplete(build_job, build_info))
# Finally, we terminate the build execution for the job. We don't do this under a lock as
# terminating a node is an atomic operation; better to make sure it is terminated than not.
logger.info('Terminating expired build executor for job %s with execution id %s',
build_job.build_uuid, build_info.execution_id)
yield From(self.kill_builder_executor(build_job.build_uuid))
else:
logger.warning('Unexpected KeyEvent (%s) on job key: %s', key_change.event, key_change.key)
@coroutine
def _realm_callback(self, key_change):
logger.debug('realm callback for key: %s', key_change.key)
if key_change.event == KeyEvent.CREATE:
# Listen on the realm created by ourselves or another manager.
realm_spec = json.loads(key_change.value)
self._register_realm(realm_spec)
elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
# Stop listening for new connections on the realm, if we did not get the connection.
realm_spec = json.loads(key_change.value)
realm_id = realm_spec['realm']
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
build_uuid = build_job.build_uuid
logger.debug('Realm key %s for build %s was %s', realm_id, build_uuid, key_change.event)
build_info = self._build_uuid_to_info.get(build_uuid, None)
if build_info is not None:
# Pop off the job for the component. If we find one, the build never connected to
# this manager, so we can safely unregister its component.
job = self._component_to_job.pop(build_info.component, None)
if job is not None:
# We were not the manager to which the worker connected; remove the bookkeeping for it.
logger.debug('Unregistering unused component for build %s', build_uuid)
self.unregister_component(build_info.component)
# If the realm has expired, then perform cleanup of the executor.
if key_change.event == KeyEvent.EXPIRE:
execution_id = realm_spec.get('execution_id', None)
executor_name = realm_spec.get('executor_name', 'EC2Executor')
# Cleanup the job, since it never started.
logger.debug('Job %s for incomplete marking: %s', build_uuid, build_info)
if build_info is not None:
yield From(self._mark_job_incomplete(build_job, build_info))
# Cleanup the executor.
logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s',
realm_id, build_uuid, executor_name, execution_id)
yield From(self.terminate_executor(executor_name, execution_id))
else:
logger.warning('Unexpected action (%s) on realm key: %s', key_change.event, key_change.key)
def _register_realm(self, realm_spec):
logger.debug('Got call to register realm %s with manager', realm_spec['realm'])
# Create the build information block for the registered realm.
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
execution_id = realm_spec.get('execution_id', None)
executor_name = realm_spec.get('executor_name', 'EC2Executor')
logger.debug('Registering realm %s with manager: %s', realm_spec['realm'], realm_spec)
component = self.register_component(realm_spec['realm'], BuildComponent,
token=realm_spec['token'])
build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id,
executor_name=executor_name)
self._component_to_job[component] = build_job
self._build_uuid_to_info[build_job.build_uuid] = build_info
logger.debug('Registered realm %s with manager', realm_spec['realm'])
return component
@property
def registered_executors(self):
return self._ordered_executors
@coroutine
def _register_existing_realms(self):
try:
all_realms = yield From(self._orchestrator.get_prefixed_keys(self._realm_prefix))
# Register all existing realms found.
encountered = {self._register_realm(json.loads(realm_data))
for _realm, realm_data in all_realms}
# Remove any components not encountered so we can clean up.
# Iterate over a snapshot, since the mapping is mutated inside the loop.
for component, job in list(iteritems(self._component_to_job)):
if component not in encountered:
self._component_to_job.pop(component, None)
self._build_uuid_to_info.pop(job.build_uuid, None)
except KeyError:
pass
def _load_executor(self, executor_kind_name, executor_config):
executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name)
if executor_klass is None:
logger.error('Unknown executor %s; skipping install', executor_kind_name)
return
executor = executor_klass(executor_config, self.manager_hostname)
if executor.name in self._executor_name_to_executor:
raise Exception('Executor with name %s already registered' % executor.name)
self._ordered_executors.append(executor)
self._executor_name_to_executor[executor.name] = executor
def _config_prefix(self, key):
if self._manager_config.get('ORCHESTRATOR') is None:
return key
prefix = self._manager_config.get('ORCHESTRATOR_PREFIX', '')
return slash_join(prefix, key).lstrip('/') + '/'
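# Illustrative (assumed) example: with ORCHESTRATOR_PREFIX = 'quay/' and JOB_PREFIX =
# 'building/', this yields roughly 'quay/building/'; without an ORCHESTRATOR config block
# the key is returned unchanged.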
@property
def _job_prefix(self):
return self._config_prefix(JOB_PREFIX)
@property
def _realm_prefix(self):
return self._config_prefix(REALM_PREFIX)
@property
def _cancel_prefix(self):
return self._config_prefix(CANCEL_PREFIX)
@property
def _metric_prefix(self):
return self._config_prefix(METRIC_PREFIX)
@property
def _expired_lock_prefix(self):
return self._config_prefix(EXPIRED_LOCK_PREFIX)
@property
def _canceled_lock_prefix(self):
return self._config_prefix(CANCELED_LOCK_PREFIX)
def _metric_key(self, realm):
"""
Creates a key which is used to track metrics for a job in the Orchestrator.
:param realm: realm for the build
:type realm: str
:returns: key used to track jobs
:rtype: str
"""
return slash_join(self._metric_prefix, realm)
def _job_key(self, build_job):
"""
Creates a key which is used to track a job in the Orchestrator.
:param build_job: the build job being tracked
:type build_job: BuildJob
:returns: key used to track the job
:rtype: str
"""
return slash_join(self._job_prefix, build_job.job_details['build_uuid'])
def _realm_key(self, realm):
"""
Create a key which is used to track an incoming connection on a realm.
:param realm: realm for the build
:type realm: str
:returns: key used to track the connection to the realm
:rtype: str
"""
return slash_join(self._realm_prefix, realm)
def initialize(self, manager_config):
logger.debug('Calling initialize')
self._manager_config = manager_config
# Note: Executor config can be defined either as a single block of EXECUTOR_CONFIG (old style)
# or as a new set of executor configurations, with the order determining how we fallback. We
# check for both here to ensure backwards compatibility.
if manager_config.get('EXECUTORS'):
for executor_config in manager_config['EXECUTORS']:
self._load_executor(executor_config.get('EXECUTOR'), executor_config)
else:
self._load_executor(manager_config.get('EXECUTOR'), manager_config.get('EXECUTOR_CONFIG'))
logger.debug('calling orchestrator_from_config')
self._orchestrator = orchestrator_from_config(manager_config)
logger.debug('setting on_key_change callbacks for job, cancel, realm')
self._orchestrator.on_key_change(self._job_prefix, self._job_callback)
self._orchestrator.on_key_change(self._cancel_prefix, self._cancel_callback)
self._orchestrator.on_key_change(self._realm_prefix, self._realm_callback,
restarter=self._register_existing_realms)
# Load components for all realms currently known to the cluster
async(self._register_existing_realms())
def shutdown(self):
logger.debug('Shutting down worker.')
if self._orchestrator is not None:
self._orchestrator.shutdown()
@coroutine
def schedule(self, build_job):
build_uuid = build_job.job_details['build_uuid']
logger.debug('Calling schedule with job: %s', build_uuid)
# Check if there are worker slots available by checking the number of jobs in the orchestrator
allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
try:
active_jobs = yield From(self._orchestrator.get_prefixed_keys(self._job_prefix))
workers_alive = len(active_jobs)
except KeyError:
workers_alive = 0
except OrchestratorConnectionError:
logger.exception('Could not read job count from orchestrator for job %s; orchestrator is unavailable', build_uuid)
raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
except OrchestratorError:
logger.exception('Exception when reading job count from orchestrator for job: %s', build_uuid)
raise Return(False, RETRY_IMMEDIATELY_SLEEP_DURATION)
logger.debug('Total jobs (scheduling job %s): %s', build_uuid, workers_alive)
if workers_alive >= allowed_worker_count:
logger.info('Too many workers alive, unable to start new worker for build job: %s. %s >= %s',
build_uuid, workers_alive, allowed_worker_count)
raise Return(False, TOO_MANY_WORKERS_SLEEP_DURATION)
job_key = self._job_key(build_job)
# First try to take a lock for this job, meaning we will be responsible for its lifecycle.
realm = str(uuid.uuid4())
token = str(uuid.uuid4())
nonce = str(uuid.uuid4())
machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)
payload = {
'max_expiration': calendar.timegm(max_expiration.timetuple()),
'nonce': nonce,
'had_heartbeat': False,
'job_queue_item': build_job.job_item,
}
lock_payload = json.dumps(payload)
logger.debug('Writing key for job %s with expiration in %s seconds', build_uuid,
EPHEMERAL_SETUP_TIMEOUT)
try:
yield From(self._orchestrator.set_key(job_key, lock_payload, overwrite=False,
expiration=EPHEMERAL_SETUP_TIMEOUT))
except KeyError:
logger.warning('Job: %s already exists in orchestrator, timeout may be misconfigured',
build_uuid)
raise Return(False, EPHEMERAL_API_TIMEOUT)
except OrchestratorConnectionError:
logger.exception('Exception when writing job %s to orchestrator; could not connect',
build_uuid)
raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
except OrchestratorError:
logger.exception('Exception when writing job %s to orchestrator', build_uuid)
raise Return(False, RETRY_IMMEDIATELY_SLEEP_DURATION)
# Got a lock, now let's boot the job via one of the registered executors.
started_with_executor = None
execution_id = None
logger.debug("Registered executors are: %s", [ex.name for ex in self._ordered_executors])
for executor in self._ordered_executors:
# Check if we can use this executor based on its whitelist, by namespace.
namespace = build_job.namespace
if not executor.allowed_for_namespace(namespace):
logger.debug('Job %s (namespace: %s) cannot use executor %s', build_uuid, namespace,
executor.name)
continue
# Check if we can use this executor based on the retries remaining.
if executor.minimum_retry_threshold > build_job.retries_remaining:
metric_queue.builder_fallback.Inc()
logger.debug('Job %s cannot use executor %s as it is below retry threshold %s (retry #%s)',
build_uuid, executor.name, executor.minimum_retry_threshold,
build_job.retries_remaining)
continue
logger.debug('Starting builder for job %s with selected executor: %s', build_uuid,
executor.name)
try:
execution_id = yield From(executor.start_builder(realm, token, build_uuid))
except Exception:
try:
metric_queue.build_start_failure.Inc(labelvalues=[executor.name])
metric_queue.put_deprecated(('ExecutorFailure-%s' % executor.name), 1, unit='Count')
except Exception:
logger.exception('Exception when writing failure metric for execution %s for job %s',
execution_id, build_uuid)
logger.exception('Exception when starting builder for job: %s', build_uuid)
continue
try:
metric_queue.build_start_success.Inc(labelvalues=[executor.name])
except Exception:
logger.exception('Exception when writing success metric for execution %s for job %s',
execution_id, build_uuid)
try:
metric_queue.ephemeral_build_workers.Inc()
except Exception:
logger.exception('Exception when writing start metrics for execution %s for job %s',
execution_id, build_uuid)
started_with_executor = executor
# Break out of the loop now that we've started a builder successfully.
break
# If we didn't start the job, cleanup and return it to the queue.
if started_with_executor is None:
logger.error('Could not start ephemeral worker for build %s', build_uuid)
# Delete the associated build job record.
yield From(self._orchestrator.delete_key(job_key))
raise Return(False, EPHEMERAL_API_TIMEOUT)
# Job was started!
logger.debug('Started execution with ID %s for job: %s with executor: %s',
execution_id, build_uuid, started_with_executor.name)
# Store metric data
metric_spec = json.dumps({
'executor_name': started_with_executor.name,
'start_time': time.time(),
})
try:
yield From(self._orchestrator.set_key(self._metric_key(realm), metric_spec, overwrite=False,
expiration=machine_max_expiration + 10))
except KeyError:
logger.error('Realm %s already exists in orchestrator for job %s; ' +
'UUID collision or something is very, very wrong.', realm, build_uuid)
except OrchestratorError:
logger.exception('Exception when writing realm %s to orchestrator for job %s',
realm, build_uuid)
# Store the realm spec which will allow any manager to accept this builder when it connects
realm_spec = json.dumps({
'realm': realm,
'token': token,
'execution_id': execution_id,
'executor_name': started_with_executor.name,
'job_queue_item': build_job.job_item,
})
try:
setup_time = started_with_executor.setup_time or self.overall_setup_time()
logger.debug('Writing job key for job %s using executor %s with ID %s and ttl %s', build_uuid,
started_with_executor.name, execution_id, setup_time)
yield From(self._orchestrator.set_key(self._realm_key(realm), realm_spec,
expiration=setup_time))
except OrchestratorConnectionError:
logger.exception('Exception when writing realm %s to orchestrator for job %s',
realm, build_uuid)
raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
except OrchestratorError:
logger.exception('Exception when writing realm %s to orchestrator for job %s',
realm, build_uuid)
raise Return(False, setup_time)
logger.debug('Builder spawn complete for job %s using executor %s with ID %s ',
build_uuid, started_with_executor.name, execution_id)
raise Return(True, None)
@coroutine
def build_component_ready(self, build_component):
logger.debug('Got component ready for component with realm %s', build_component.builder_realm)
# Pop off the job for the component.
# We do so before we send out the watch below, as it will also remove this mapping.
job = self._component_to_job.pop(build_component, None)
if job is None:
# This will occur once the build finishes, so no need to worry about it.
# We log in case it happens outside of the expected flow.
logger.debug('Could not find job for the build component on realm %s; component is ready',
build_component.builder_realm)
raise Return()
# Start the build job.
logger.debug('Sending build %s to newly ready component on realm %s',
job.build_uuid, build_component.builder_realm)
yield From(build_component.start_build(job))
yield From(self._write_duration_metric(metric_queue.builder_time_to_build,
build_component.builder_realm))
# Clean up the bookkeeping for allowing any manager to take the job.
try:
yield From(self._orchestrator.delete_key(self._realm_key(build_component.builder_realm)))
except KeyError:
logger.warning('Could not delete realm key %s', build_component.builder_realm)
def build_component_disposed(self, build_component, timed_out):
logger.debug('Calling build_component_disposed.')
self.unregister_component(build_component)
@coroutine
def job_completed(self, build_job, job_status, build_component):
logger.debug('Calling job_completed for job %s with status: %s',
build_job.build_uuid, job_status)
yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))
# Mark the job as completed. Since this is being invoked from the component, we don't need
# to ask for the phase to be updated as well.
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
executor_name = build_info.executor_name if build_info else None
yield From(self.job_complete_callback(build_job, job_status, executor_name, update_phase=False))
# Kill the ephemeral builder.
yield From(self.kill_builder_executor(build_job.build_uuid))
# Delete the build job from the orchestrator.
try:
job_key = self._job_key(build_job)
yield From(self._orchestrator.delete_key(job_key))
except KeyError:
logger.debug('Builder is asking for job to be removed, but work already completed')
except OrchestratorConnectionError:
logger.exception('Could not remove job key as orchestrator is not available')
yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
raise Return()
# Delete the metric from the orchestrator.
try:
metric_key = self._metric_key(build_component.builder_realm)
yield From(self._orchestrator.delete_key(metric_key))
except KeyError:
logger.debug('Builder is asking for metric to be removed, but key not found')
except OrchestratorConnectionError:
logger.exception('Could not remove metric key as orchestrator is not available')
yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
raise Return()
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
@coroutine
def kill_builder_executor(self, build_uuid):
logger.info('Starting termination of executor for job %s', build_uuid)
build_info = self._build_uuid_to_info.pop(build_uuid, None)
if build_info is None:
logger.debug('Build information not found for build %s; skipping termination', build_uuid)
raise Return()
# Remove the build's component.
self._component_to_job.pop(build_info.component, None)
# Stop the build node/executor itself.
yield From(self.terminate_executor(build_info.executor_name, build_info.execution_id))
@coroutine
def terminate_executor(self, executor_name, execution_id):
executor = self._executor_name_to_executor.get(executor_name)
if executor is None:
logger.error('Could not find registered executor %s', executor_name)
raise Return()
# Terminate the executor's execution.
logger.info('Terminating executor %s with execution id %s', executor_name, execution_id)
yield From(executor.stop_builder(execution_id))
@coroutine
def job_heartbeat(self, build_job):
"""
:param build_job: the identifier for the build
:type build_job: str
"""
self.job_heartbeat_callback(build_job)
self._extend_job_in_orchestrator(build_job)
@coroutine
def _extend_job_in_orchestrator(self, build_job):
try:
job_data = yield From(self._orchestrator.get_key(self._job_key(build_job)))
except KeyError:
logger.info('Job %s no longer exists in the orchestrator', build_job.build_uuid)
raise Return()
except OrchestratorConnectionError:
logger.exception('Failed to connect to the orchestrator when attempting to extend job %s',
build_job.build_uuid)
raise Return()
build_job_metadata = json.loads(job_data)
max_expiration = datetime.utcfromtimestamp(build_job_metadata['max_expiration'])
max_expiration_remaining = max_expiration - datetime.utcnow()
max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds()))
ttl = min(self.heartbeat_period_sec * 2, max_expiration_sec)
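# For example, with heartbeat_period_sec=30 and 600 seconds remaining before
# max_expiration, the refreshed TTL is min(60, 600) == 60, so the job key survives
# roughly two missed heartbeats before expiring.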
payload = {
'job_queue_item': build_job.job_item,
'max_expiration': build_job_metadata['max_expiration'],
'had_heartbeat': True,
}
try:
yield From(self._orchestrator.set_key(self._job_key(build_job), json.dumps(payload),
expiration=ttl))
except OrchestratorConnectionError:
logger.exception('Could not update heartbeat for job as the orchestrator is not available')
yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
@coroutine
def _write_duration_metric(self, metric, realm):
"""
Records the elapsed time for the given metric; errors are logged and swallowed.
"""
try:
metric_data = yield From(self._orchestrator.get_key(self._metric_key(realm)))
parsed_metric_data = json.loads(metric_data)
start_time = parsed_metric_data['start_time']
metric.Observe(time.time() - start_time,
labelvalues=[parsed_metric_data.get('executor_name',
'unknown')])
except Exception:
logger.exception("Could not write metric for realm %s", realm)
def num_workers(self):
"""
The number of workers we're managing locally.
:returns: the number of locally managed workers
:rtype: int
"""
return len(self._component_to_job)
@coroutine
def _cancel_callback(self, key_change):
if key_change.event not in (KeyEvent.CREATE, KeyEvent.SET):
raise Return()
build_uuid = key_change.value
build_info = self._build_uuid_to_info.get(build_uuid, None)
if build_info is None:
logger.debug('No build info for "%s" job %s', key_change.event, build_uuid)
raise Return(False)
lock_key = slash_join(self._canceled_lock_prefix,
build_uuid, build_info.execution_id)
lock_acquired = yield From(self._orchestrator.lock(lock_key))
if lock_acquired:
builder_realm = build_info.component.builder_realm
yield From(self.kill_builder_executor(build_uuid))
yield From(self._orchestrator.delete_key(self._realm_key(builder_realm)))
yield From(self._orchestrator.delete_key(self._metric_key(builder_realm)))
yield From(self._orchestrator.delete_key(slash_join(self._job_prefix, build_uuid)))
# This is done outside the lock so that the component is unregistered on whichever manager it is registered with.
yield From(build_info.component.cancel_build())
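
The executor fallback described in initialize() and schedule() is driven by a config shaped roughly like the following sketch; the key values (and the exact shape of the ORCHESTRATOR block) are illustrative assumptions, not defaults from this import:

manager_config = {
  'ORCHESTRATOR_PREFIX': 'buildman/',
  'ORCHESTRATOR': {'MEM_CONFIG': None},  # backend selection; exact shape is an assumption
  'ALLOWED_WORKER_COUNT': 10,
  'MACHINE_MAX_TIME': 7200,
  'EXECUTORS': [
    # Tried in registration order; a job falls through to the next executor when a
    # namespace whitelist or retry threshold rules the current one out.
    {'EXECUTOR': 'ec2', 'NAME': 'prod-ec2', 'MINIMUM_RETRY_THRESHOLD': 1},
    {'EXECUTOR': 'kubernetes', 'NAME': 'fallback-k8s'},
  ],
}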

View file

@@ -0,0 +1,37 @@
import logging
import etcd
logger = logging.getLogger(__name__)
class EtcdCanceller(object):
""" A class that sends a message to etcd to cancel a build """
def __init__(self, config):
etcd_host = config.get('ETCD_HOST', '127.0.0.1')
etcd_port = config.get('ETCD_PORT', 2379)
etcd_ca_cert = config.get('ETCD_CA_CERT', None)
etcd_auth = config.get('ETCD_CERT_AND_KEY', None)
if etcd_auth is not None:
etcd_auth = tuple(etcd_auth)
etcd_protocol = 'http' if etcd_auth is None else 'https'
logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
self._cancel_prefix = config.get('ETCD_CANCEL_PREFIX', 'cancel/')
self._etcd_client = etcd.Client(
host=etcd_host,
port=etcd_port,
cert=etcd_auth,
ca_cert=etcd_ca_cert,
protocol=etcd_protocol,
read_timeout=5)
def try_cancel_build(self, build_uuid):
""" Writes etcd message to cancel build_uuid. """
logger.info("Cancelling build %s".format(build_uuid))
try:
self._etcd_client.write("{}{}".format(self._cancel_prefix, build_uuid), build_uuid, ttl=60)
return True
except etcd.EtcdException:
logger.exception("Failed to write to etcd client %s", build_uuid)
return False
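
A minimal usage sketch, assuming a reachable etcd on localhost; the config keys mirror those read in __init__ above:

canceller = EtcdCanceller({'ETCD_HOST': '127.0.0.1', 'ETCD_PORT': 2379})
canceller.try_cancel_build('some-build-uuid')  # writes 'cancel/some-build-uuid' with a 60s TTL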

View file

@@ -0,0 +1,560 @@
import datetime
import hashlib
import logging
import os
import socket
import subprocess
import threading
import uuid
from functools import partial
import boto.ec2
import cachetools.func
import requests
import trollius
from container_cloud_config import CloudConfigContext
from jinja2 import FileSystemLoader, Environment
from trollius import coroutine, From, Return, get_event_loop
import release
from buildman.asyncutil import AsyncWrapper
from app import metric_queue, app
from util.metrics.metricqueue import duration_collector_async
from _init import ROOT_DIR
logger = logging.getLogger(__name__)
ONE_HOUR = 60*60
_TAG_RETRY_COUNT = 3 # Number of times to retry adding tags.
_TAG_RETRY_SLEEP = 2 # Number of seconds to wait between tag retries.
ENV = Environment(loader=FileSystemLoader(os.path.join(ROOT_DIR, "buildman/templates")))
TEMPLATE = ENV.get_template('cloudconfig.yaml')
CloudConfigContext().populate_jinja_environment(ENV)
class ExecutorException(Exception):
""" Exception raised when there is a problem starting or stopping a builder.
"""
pass
class BuilderExecutor(object):
def __init__(self, executor_config, manager_hostname):
""" Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
starting and stopping builders.
"""
self.executor_config = executor_config
self.manager_hostname = manager_hostname
default_websocket_scheme = 'wss' if app.config['PREFERRED_URL_SCHEME'] == 'https' else 'ws'
self.websocket_scheme = executor_config.get("WEBSOCKET_SCHEME", default_websocket_scheme)
@property
def name(self):
""" Name returns the unique name for this executor. """
return self.executor_config.get('NAME') or self.__class__.__name__
@property
def setup_time(self):
""" Returns the amount of time (in seconds) to wait for the execution to start for the build.
If None, the manager's default will be used.
"""
return self.executor_config.get('SETUP_TIME')
@coroutine
def start_builder(self, realm, token, build_uuid):
""" Create a builder with the specified config. Returns a unique id which can be used to manage
the builder.
"""
raise NotImplementedError
@coroutine
def stop_builder(self, builder_id):
""" Stop a builder which is currently running.
"""
raise NotImplementedError
def allowed_for_namespace(self, namespace):
""" Returns true if this executor can be used for builds in the given namespace. """
# Check for an explicit namespace whitelist.
namespace_whitelist = self.executor_config.get('NAMESPACE_WHITELIST')
if namespace_whitelist is not None and namespace in namespace_whitelist:
return True
# Check for a staged rollout percentage. If found, we hash the namespace and, if it is found
# in the first X% of the character space, we allow this executor to be used.
staged_rollout = self.executor_config.get('STAGED_ROLLOUT')
if staged_rollout is not None:
bucket = int(hashlib.sha256(namespace).hexdigest()[-2:], 16)
return bucket < (256 * staged_rollout)
# If there are no restrictions in place, we are free to use this executor.
return staged_rollout is None and namespace_whitelist is None
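# Illustrative math: the bucket is the last byte of sha256(namespace), i.e. a value in
# [0, 255]. With STAGED_ROLLOUT = 0.25, namespaces hashing below 64 (== 256 * 0.25) are
# allowed, giving a stable ~25% slice of namespaces to this executor.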
@property
def minimum_retry_threshold(self):
""" Returns the minimum number of retries required for this executor to be used or 0 if
none. """
return self.executor_config.get('MINIMUM_RETRY_THRESHOLD', 0)
def generate_cloud_config(self, realm, token, build_uuid, coreos_channel,
manager_hostname, quay_username=None,
quay_password=None):
if quay_username is None:
quay_username = self.executor_config['QUAY_USERNAME']
if quay_password is None:
quay_password = self.executor_config['QUAY_PASSWORD']
return TEMPLATE.render(
realm=realm,
token=token,
build_uuid=build_uuid,
quay_username=quay_username,
quay_password=quay_password,
manager_hostname=manager_hostname,
websocket_scheme=self.websocket_scheme,
coreos_channel=coreos_channel,
worker_image=self.executor_config.get('WORKER_IMAGE', 'quay.io/coreos/registry-build-worker'),
worker_tag=self.executor_config['WORKER_TAG'],
logentries_token=self.executor_config.get('LOGENTRIES_TOKEN', None),
volume_size=self.executor_config.get('VOLUME_SIZE', '42G'),
max_lifetime_s=self.executor_config.get('MAX_LIFETIME_S', 10800),
ssh_authorized_keys=self.executor_config.get('SSH_AUTHORIZED_KEYS', []),
)
class EC2Executor(BuilderExecutor):
""" Implementation of BuilderExecutor which uses libcloud to start machines on a variety of cloud
providers.
"""
COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt'
def __init__(self, *args, **kwargs):
self._loop = get_event_loop()
super(EC2Executor, self).__init__(*args, **kwargs)
def _get_conn(self):
""" Creates an ec2 connection which can be used to manage instances.
"""
return AsyncWrapper(boto.ec2.connect_to_region(
self.executor_config['EC2_REGION'],
aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'],
aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'],
))
@classmethod
@cachetools.func.ttl_cache(ttl=ONE_HOUR)
def _get_coreos_ami(cls, ec2_region, coreos_channel):
""" Retrieve the CoreOS AMI id from the canonical listing.
"""
stack_list_string = requests.get(EC2Executor.COREOS_STACK_URL % coreos_channel).text
stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
return stack_amis[ec2_region]
@coroutine
@duration_collector_async(metric_queue.builder_time_to_start, ['ec2'])
def start_builder(self, realm, token, build_uuid):
region = self.executor_config['EC2_REGION']
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
coreos_ami = self.executor_config.get('COREOS_AMI', None)
if coreos_ami is None:
get_ami_callable = partial(self._get_coreos_ami, region, channel)
coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
user_data = self.generate_cloud_config(realm, token, build_uuid, channel, self.manager_hostname)
logger.debug('Generated cloud config for build %s: %s', build_uuid, user_data)
ec2_conn = self._get_conn()
ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType(
size=int(self.executor_config.get('BLOCK_DEVICE_SIZE', 48)),
volume_type='gp2',
delete_on_termination=True,
)
block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping()
block_devices['/dev/xvda'] = ssd_root_ebs
interfaces = None
if self.executor_config.get('EC2_VPC_SUBNET_ID', None) is not None:
interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
subnet_id=self.executor_config['EC2_VPC_SUBNET_ID'],
groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
associate_public_ip_address=True,
)
interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
try:
reservation = yield From(ec2_conn.run_instances(
coreos_ami,
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
key_name=self.executor_config.get('EC2_KEY_NAME', None),
user_data=user_data,
instance_initiated_shutdown_behavior='terminate',
block_device_map=block_devices,
network_interfaces=interfaces,
))
except boto.exception.EC2ResponseError as ec2e:
logger.exception('Unable to spawn builder instance')
metric_queue.ephemeral_build_worker_failure.Inc()
raise ec2e
if not reservation.instances:
raise ExecutorException('Unable to spawn builder instance.')
elif len(reservation.instances) != 1:
raise ExecutorException('EC2 started wrong number of instances!')
launched = AsyncWrapper(reservation.instances[0])
# Sleep a few seconds to wait for AWS to spawn the instance.
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
# Tag the instance with its metadata.
for i in range(0, _TAG_RETRY_COUNT):
try:
yield From(launched.add_tags({
'Name': 'Quay Ephemeral Builder',
'Realm': realm,
'Token': token,
'BuildUUID': build_uuid,
}))
break
except boto.exception.EC2ResponseError as ec2e:
if ec2e.error_code == 'InvalidInstanceID.NotFound':
if i < _TAG_RETRY_COUNT - 1:
logger.warning('Failed to write EC2 tags for instance %s for build %s (attempt #%s)',
launched.id, build_uuid, i)
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
continue
raise ExecutorException('Unable to find builder instance.')
logger.exception('Failed to write EC2 tags (attempt #%s)', i)
logger.debug('Machine with ID %s started for build %s', launched.id, build_uuid)
raise Return(launched.id)
@coroutine
def stop_builder(self, builder_id):
try:
ec2_conn = self._get_conn()
terminated_instances = yield From(ec2_conn.terminate_instances([builder_id]))
except boto.exception.EC2ResponseError as ec2e:
if ec2e.error_code == 'InvalidInstanceID.NotFound':
logger.debug('Instance %s already terminated', builder_id)
return
logger.exception('Exception when trying to terminate instance %s', builder_id)
raise
if builder_id not in [si.id for si in terminated_instances]:
raise ExecutorException('Unable to terminate instance: %s' % builder_id)
class PopenExecutor(BuilderExecutor):
""" Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
"""
def __init__(self, executor_config, manager_hostname):
self._jobs = {}
super(PopenExecutor, self).__init__(executor_config, manager_hostname)
""" Executor which uses Popen to fork a quay-builder process.
"""
@coroutine
@duration_collector_async(metric_queue.builder_time_to_start, ['fork'])
def start_builder(self, realm, token, build_uuid):
# Fork a local quay-builder process for this job.
logger.debug('Forking process for build')
ws_host = os.environ.get("BUILDMAN_WS_HOST", "localhost")
ws_port = os.environ.get("BUILDMAN_WS_PORT", "8787")
builder_env = {
'TOKEN': token,
'REALM': realm,
'ENDPOINT': 'ws://%s:%s' % (ws_host, ws_port),
'DOCKER_TLS_VERIFY': os.environ.get('DOCKER_TLS_VERIFY', ''),
'DOCKER_CERT_PATH': os.environ.get('DOCKER_CERT_PATH', ''),
'DOCKER_HOST': os.environ.get('DOCKER_HOST', ''),
'PATH': "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
}
logpipe = LogPipe(logging.INFO)
spawned = subprocess.Popen(os.environ.get('BUILDER_BINARY_LOCATION',
'/usr/local/bin/quay-builder'),
stdout=logpipe,
stderr=logpipe,
env=builder_env)
builder_id = str(uuid.uuid4())
self._jobs[builder_id] = (spawned, logpipe)
logger.debug('Builder spawned with id: %s', builder_id)
raise Return(builder_id)
@coroutine
def stop_builder(self, builder_id):
if builder_id not in self._jobs:
raise ExecutorException('Builder id not being tracked by executor.')
logger.debug('Killing builder with id: %s', builder_id)
spawned, logpipe = self._jobs[builder_id]
if spawned.poll() is None:
spawned.kill()
logpipe.close()
class KubernetesExecutor(BuilderExecutor):
""" Executes build jobs by creating Kubernetes jobs which run a qemu-kvm virtual
machine in a pod """
def __init__(self, *args, **kwargs):
super(KubernetesExecutor, self).__init__(*args, **kwargs)
self._loop = get_event_loop()
self.namespace = self.executor_config.get('BUILDER_NAMESPACE', 'builder')
self.image = self.executor_config.get('BUILDER_VM_CONTAINER_IMAGE',
'quay.io/quay/quay-builder-qemu-coreos:stable')
@coroutine
def _request(self, method, path, **kwargs):
request_options = dict(kwargs)
tls_cert = self.executor_config.get('K8S_API_TLS_CERT')
tls_key = self.executor_config.get('K8S_API_TLS_KEY')
tls_ca = self.executor_config.get('K8S_API_TLS_CA')
service_account_token = self.executor_config.get('SERVICE_ACCOUNT_TOKEN')
if 'timeout' not in request_options:
request_options['timeout'] = self.executor_config.get("K8S_API_TIMEOUT", 20)
if service_account_token:
scheme = 'https'
request_options['headers'] = {'Authorization': 'Bearer ' + service_account_token}
logger.debug('Using service account token for Kubernetes authentication')
elif tls_cert and tls_key:
scheme = 'https'
request_options['cert'] = (tls_cert, tls_key)
logger.debug('Using tls certificate and key for Kubernetes authentication')
if tls_ca:
request_options['verify'] = tls_ca
else:
scheme = 'http'
server = self.executor_config.get('K8S_API_SERVER', 'localhost:8080')
url = '%s://%s%s' % (scheme, server, path)
logger.debug('Executor config: %s', self.executor_config)
logger.debug('Kubernetes request: %s %s: %s', method, url, request_options)
res = requests.request(method, url, **request_options)
logger.debug('Kubernetes response: %s: %s', res.status_code, res.text)
raise Return(res)
def _jobs_path(self):
return '/apis/batch/v1/namespaces/%s/jobs' % self.namespace
def _job_path(self, build_uuid):
return '%s/%s' % (self._jobs_path(), build_uuid)
def _kubernetes_distribution(self):
return self.executor_config.get('KUBERNETES_DISTRIBUTION', 'basic').lower()
def _is_basic_kubernetes_distribution(self):
return self._kubernetes_distribution() == 'basic'
def _is_openshift_kubernetes_distribution(self):
return self._kubernetes_distribution() == 'openshift'
def _build_job_container_resources(self):
# Minimum acceptable free resources for this container to "fit" in a quota
# These may be lower than the absolute limits if the cluster is knowingly
# oversubscribed by some amount.
container_requests = {
'memory' : self.executor_config.get('CONTAINER_MEMORY_REQUEST', '3968Mi'),
}
container_limits = {
'memory' : self.executor_config.get('CONTAINER_MEMORY_LIMITS', '5120Mi'),
'cpu' : self.executor_config.get('CONTAINER_CPU_LIMITS', '1000m'),
}
resources = {
'requests': container_requests,
}
if self._is_openshift_kubernetes_distribution():
resources['requests']['cpu'] = self.executor_config.get('CONTAINER_CPU_REQUEST', '500m')
resources['limits'] = container_limits
return resources
def _build_job_containers(self, user_data):
vm_memory_limit = self.executor_config.get('VM_MEMORY_LIMIT', '4G')
vm_volume_size = self.executor_config.get('VOLUME_SIZE', '32G')
container = {
'name': 'builder',
'imagePullPolicy': 'IfNotPresent',
'image': self.image,
'securityContext': {'privileged': True},
'env': [
{'name': 'USERDATA', 'value': user_data},
{'name': 'VM_MEMORY', 'value': vm_memory_limit},
{'name': 'VM_VOLUME_SIZE', 'value': vm_volume_size},
],
'resources': self._build_job_container_resources(),
}
if self._is_basic_kubernetes_distribution():
container['volumeMounts'] = [{'name': 'secrets-mask', 'mountPath': '/var/run/secrets/kubernetes.io/serviceaccount'}]
return container
def _job_resource(self, build_uuid, user_data, coreos_channel='stable'):
image_pull_secret_name = self.executor_config.get('IMAGE_PULL_SECRET_NAME', 'builder')
service_account = self.executor_config.get('SERVICE_ACCOUNT_NAME', 'quay-builder-sa')
node_selector_label_key = self.executor_config.get('NODE_SELECTOR_LABEL_KEY', 'beta.kubernetes.io/instance-type')
node_selector_label_value = self.executor_config.get('NODE_SELECTOR_LABEL_VALUE', '')
node_selector = {
node_selector_label_key : node_selector_label_value
}
release_sha = release.GIT_HEAD or 'none'
if ' ' in release_sha:
release_sha = 'HEAD'
job_resource = {
'apiVersion': 'batch/v1',
'kind': 'Job',
'metadata': {
'namespace': self.namespace,
'generateName': build_uuid + '-',
'labels': {
'build': build_uuid,
'time': datetime.datetime.now().strftime('%Y-%m-%d-%H'),
'manager': socket.gethostname(),
'quay-sha': release_sha,
},
},
'spec' : {
'activeDeadlineSeconds': self.executor_config.get('MAXIMUM_JOB_TIME', 7200),
'template': {
'metadata': {
'labels': {
'build': build_uuid,
'time': datetime.datetime.now().strftime('%Y-%m-%d-%H'),
'manager': socket.gethostname(),
'quay-sha': release_sha,
},
},
'spec': {
'imagePullSecrets': [{ 'name': image_pull_secret_name }],
'restartPolicy': 'Never',
'dnsPolicy': 'Default',
'containers': [self._build_job_containers(user_data)],
},
},
},
}
if self._is_openshift_kubernetes_distribution():
# Setting `automountServiceAccountToken` to false will prevent automounting API credentials for a service account.
job_resource['spec']['template']['spec']['automountServiceAccountToken'] = False
# Use dedicated service account that has no authorization to any resources.
job_resource['spec']['template']['spec']['serviceAccount'] = service_account
# Setting `enableServiceLinks` to false prevents information about other services from being injected into pod's
# environment variables. Pod has no visibility into other services on the cluster.
job_resource['spec']['template']['spec']['enableServiceLinks'] = False
if node_selector_label_value.strip() != '':
job_resource['spec']['template']['spec']['nodeSelector'] = node_selector
if self._is_basic_kubernetes_distribution():
# This volume is a hack to mask the token for the namespace's
# default service account, which is placed in a file mounted under
# `/var/run/secrets/kubernetes.io/serviceaccount` in all pods.
# There's currently no other way to just disable the service
# account at either the pod or namespace level.
#
# https://github.com/kubernetes/kubernetes/issues/16779
#
job_resource['spec']['template']['spec']['volumes'] = [{'name': 'secrets-mask', 'emptyDir': {'medium': 'Memory'}}]
return job_resource
@coroutine
@duration_collector_async(metric_queue.builder_time_to_start, ['k8s'])
def start_builder(self, realm, token, build_uuid):
# generate resource
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
user_data = self.generate_cloud_config(realm, token, build_uuid, channel, self.manager_hostname)
resource = self._job_resource(build_uuid, user_data, channel)
logger.debug('Using Kubernetes Distribution: %s', self._kubernetes_distribution())
logger.debug('Generated kubernetes resource:\n%s', resource)
# schedule
create_job = yield From(self._request('POST', self._jobs_path(), json=resource))
if int(create_job.status_code / 100) != 2:
raise ExecutorException('Failed to create job: %s: %s: %s' %
(build_uuid, create_job.status_code, create_job.text))
job = create_job.json()
raise Return(job['metadata']['name'])
@coroutine
def stop_builder(self, builder_id):
pods_path = '/api/v1/namespaces/%s/pods' % self.namespace
# Delete the job itself.
try:
yield From(self._request('DELETE', self._job_path(builder_id)))
except Exception:
logger.exception('Failed to send delete job call for job %s', builder_id)
# Delete the pod(s) for the job.
selectorString = "job-name=%s" % builder_id
try:
yield From(self._request('DELETE', pods_path, params=dict(labelSelector=selectorString)))
except Exception:
logger.exception("Failed to send delete pod call for job %s", builder_id)
class LogPipe(threading.Thread):
""" Adapted from http://codereview.stackexchange.com/a/17959
"""
def __init__(self, level):
"""Setup the object with a logger and a loglevel
and start the thread
"""
threading.Thread.__init__(self)
self.daemon = False
self.level = level
self.fd_read, self.fd_write = os.pipe()
self.pipe_reader = os.fdopen(self.fd_read)
self.start()
def fileno(self):
"""Return the write file descriptor of the pipe
"""
return self.fd_write
def run(self):
"""Run the thread, logging everything.
"""
for line in iter(self.pipe_reader.readline, ''):
logging.log(self.level, line.strip('\n'))
self.pipe_reader.close()
def close(self):
"""Close the write end of the pipe.
"""
os.close(self.fd_write)
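
A standalone sketch of how LogPipe is consumed as a child process's stdout/stderr, mirroring its use in PopenExecutor above (the echo command is a stand-in for quay-builder):

import logging
import subprocess

logging.basicConfig(level=logging.INFO)
pipe = LogPipe(logging.INFO)
proc = subprocess.Popen(['echo', 'hello from the builder'], stdout=pipe, stderr=pipe)
proc.wait()
pipe.close()  # close the write end; the reader thread exits at EOF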

View file

@@ -0,0 +1,8 @@
class NoopCanceller(object):
""" A class that can not cancel a build """
def __init__(self, config=None):
pass
def try_cancel_build(self, uuid):
""" Does nothing and fails to cancel build. """
return False

View file

@@ -0,0 +1,26 @@
import logging
from buildman.orchestrator import orchestrator_from_config, OrchestratorError
from util import slash_join
logger = logging.getLogger(__name__)
CANCEL_PREFIX = 'cancel/'
class OrchestratorCanceller(object):
""" An asynchronous way to cancel a build with any Orchestrator. """
def __init__(self, config):
self._orchestrator = orchestrator_from_config(config, canceller_only=True)
def try_cancel_build(self, build_uuid):
logger.info('Cancelling build %s', build_uuid)
cancel_key = slash_join(CANCEL_PREFIX, build_uuid)
try:
self._orchestrator.set_key_sync(cancel_key, build_uuid, expiration=60)
return True
except OrchestratorError:
logger.exception('Failed to write cancel action to the orchestrator for build %s', build_uuid)
return False
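
Finally, a usage sketch tying the canceller to the cancel/ prefix watched by EphemeralBuilderManager._cancel_callback; the config block shape is an assumption:

# Any orchestrator backend accepted by orchestrator_from_config works here.
canceller = OrchestratorCanceller({'ORCHESTRATOR': {'MEM_CONFIG': None}})
# Writes 'cancel/<uuid>' with a 60s expiration; any manager watching the
# cancel/ prefix picks it up via its _cancel_callback.
canceller.try_cancel_build('some-build-uuid')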