initial import for Open Source 🎉

This commit is contained in:
Jimmy Zelinskie 2019-11-12 11:09:47 -05:00
parent 1898c361f3
commit 9c0dd3b722
2048 changed files with 218743 additions and 0 deletions

buildman/MAINTAINERS Normal file

@@ -0,0 +1,2 @@
Charlton Austin <charlton.austin@coreos.com> (@charltonaustin)
Joseph Schorr <joseph.schorr@coreos.com> (@josephschorr)

buildman/__init__.py Normal file

buildman/asyncutil.py Normal file

@@ -0,0 +1,42 @@
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from trollius import get_event_loop, coroutine
def wrap_with_threadpool(obj, worker_threads=1):
"""
Wraps a class in an async executor so that it can be safely used in an event loop like trollius.
"""
async_executor = ThreadPoolExecutor(worker_threads)
return AsyncWrapper(obj, executor=async_executor), async_executor
class AsyncWrapper(object):
""" Wrapper class which will transform a syncronous library to one that can be used with
trollius coroutines.
"""
def __init__(self, delegate, loop=None, executor=None):
self._loop = loop if loop is not None else get_event_loop()
self._delegate = delegate
self._executor = executor
def __getattr__(self, attrib):
delegate_attr = getattr(self._delegate, attrib)
if not callable(delegate_attr):
return delegate_attr
def wrapper(*args, **kwargs):
""" Wraps the delegate_attr with primitives that will transform sync calls to ones shelled
out to a thread pool.
"""
callable_delegate_attr = partial(delegate_attr, *args, **kwargs)
return self._loop.run_in_executor(self._executor, callable_delegate_attr)
return wrapper
@coroutine
def __call__(self, *args, **kwargs):
callable_delegate_attr = partial(self._delegate, *args, **kwargs)
return self._loop.run_in_executor(self._executor, callable_delegate_attr)

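For orientation, here is a minimal sketch of how wrap_with_threadpool might be used from a trollius coroutine. SlowClient and its fetch method are hypothetical stand-ins for a synchronous library; only wrap_with_threadpool and AsyncWrapper come from the file above.

from trollius import From, coroutine, get_event_loop

from buildman.asyncutil import wrap_with_threadpool


class SlowClient(object):
    """ Hypothetical synchronous client used only for illustration. """
    def fetch(self, key):
        return 'value-for-%s' % key


@coroutine
def lookup(key):
    # Wrap the synchronous client; its method calls are shelled out to the thread pool
    # and come back as futures that can be yielded on.
    async_client, executor = wrap_with_threadpool(SlowClient(), worker_threads=2)
    try:
        result = yield From(async_client.fetch(key))
    finally:
        executor.shutdown(wait=False)
    print(result)


if __name__ == '__main__':
    get_event_loop().run_until_complete(lookup('some-key'))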
buildman/builder.py Normal file

@@ -0,0 +1,97 @@
import logging
import logging.config
import os
import time
import socket
import features
from app import app, userfiles as user_files, build_logs, dockerfile_build_queue
from util.log import logfile_path
from buildman.manager.enterprise import EnterpriseManager
from buildman.manager.ephemeral import EphemeralBuilderManager
from buildman.server import BuilderServer
from trollius import SSLContext
from raven.handlers.logging import SentryHandler
from raven.conf import setup_logging
logger = logging.getLogger(__name__)
BUILD_MANAGERS = {
'enterprise': EnterpriseManager,
'ephemeral': EphemeralBuilderManager,
}
EXTERNALLY_MANAGED = 'external'
DEFAULT_WEBSOCKET_PORT = 8787
DEFAULT_CONTROLLER_PORT = 8686
LOG_FORMAT = "%(asctime)s [%(process)d] [%(levelname)s] [%(name)s] %(message)s"
def run_build_manager():
if not features.BUILD_SUPPORT:
logger.debug('Building is disabled. Please enable the feature flag')
while True:
time.sleep(1000)
return
if app.config.get('REGISTRY_STATE', 'normal') == 'readonly':
logger.debug('Building is disabled while in read-only mode.')
while True:
time.sleep(1000)
return
build_manager_config = app.config.get('BUILD_MANAGER')
if build_manager_config is None:
return
# If the build system is externally managed, then we just sleep this process.
if build_manager_config[0] == EXTERNALLY_MANAGED:
logger.debug('Builds are externally managed.')
while True:
time.sleep(1000)
return
logger.debug('Asking to start build manager with lifecycle "%s"', build_manager_config[0])
manager_klass = BUILD_MANAGERS.get(build_manager_config[0])
if manager_klass is None:
return
manager_hostname = os.environ.get('BUILDMAN_HOSTNAME',
app.config.get('BUILDMAN_HOSTNAME',
app.config['SERVER_HOSTNAME']))
websocket_port = int(os.environ.get('BUILDMAN_WEBSOCKET_PORT',
app.config.get('BUILDMAN_WEBSOCKET_PORT',
DEFAULT_WEBSOCKET_PORT)))
controller_port = int(os.environ.get('BUILDMAN_CONTROLLER_PORT',
app.config.get('BUILDMAN_CONTROLLER_PORT',
DEFAULT_CONTROLLER_PORT)))
logger.debug('Will pass buildman hostname %s to builders for websocket connection',
manager_hostname)
logger.debug('Starting build manager with lifecycle "%s"', build_manager_config[0])
ssl_context = None
if os.environ.get('SSL_CONFIG'):
logger.debug('Loading SSL cert and key')
ssl_context = SSLContext()
ssl_context.load_cert_chain(os.path.join(os.environ.get('SSL_CONFIG'), 'ssl.cert'),
os.path.join(os.environ.get('SSL_CONFIG'), 'ssl.key'))
server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs,
user_files, manager_klass, build_manager_config[1], manager_hostname)
server.run('0.0.0.0', websocket_port, controller_port, ssl=ssl_context)
if __name__ == '__main__':
logging.config.fileConfig(logfile_path(debug=True), disable_existing_loggers=False)
logging.getLogger('peewee').setLevel(logging.WARN)
logging.getLogger('boto').setLevel(logging.WARN)
if app.config.get('EXCEPTION_LOG_TYPE', 'FakeSentry') == 'Sentry':
buildman_name = '%s:buildman' % socket.gethostname()
setup_logging(SentryHandler(app.config.get('SENTRY_DSN', ''), name=buildman_name,
level=logging.ERROR))
run_build_manager()

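The BUILD_MANAGER setting read above is a two-item sequence: a lifecycle name looked up in BUILD_MANAGERS (or 'external' to make this process sleep) and a config block handed to the chosen manager. A hedged illustration of such a value follows; the nested keys are only examples of options that EphemeralBuilderManager.initialize (later in this commit) understands, not a complete schema.

# Illustrative only; consult the manager implementations for the accepted keys.
BUILD_MANAGER = ('ephemeral', {
    'ALLOWED_WORKER_COUNT': 1,
    'ORCHESTRATOR_PREFIX': 'buildman/',
    'EXECUTORS': [
        {'EXECUTOR': 'ec2'},         # tried first
        {'EXECUTOR': 'kubernetes'},  # fallback executor
    ],
})

# run_build_manager() then resolves:
#   manager_klass  = BUILD_MANAGERS.get(BUILD_MANAGER[0])  # EphemeralBuilderManager
#   manager config = BUILD_MANAGER[1]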
buildman/component/basecomponent.py Normal file

@@ -0,0 +1,13 @@
from autobahn.asyncio.wamp import ApplicationSession
class BaseComponent(ApplicationSession):
""" Base class for all registered component sessions in the server. """
def __init__(self, config, **kwargs):
ApplicationSession.__init__(self, config)
self.server = None
self.parent_manager = None
self.build_logs = None
self.user_files = None
def kind(self):
raise NotImplementedError

buildman/component/buildcomponent.py Normal file

@@ -0,0 +1,539 @@
import datetime
import os
import time
import logging
import json
import trollius
from autobahn.wamp.exception import ApplicationError
from trollius import From, Return
from active_migration import ActiveDataMigration, ERTMigrationFlags
from buildman.server import BuildJobResult
from buildman.component.basecomponent import BaseComponent
from buildman.component.buildparse import extract_current_step
from buildman.jobutil.buildjob import BuildJobLoadException
from buildman.jobutil.buildstatus import StatusHandler
from buildman.jobutil.workererror import WorkerError
from app import app
from data.database import BUILD_PHASE, UseThenDisconnect
from data.model import InvalidRepositoryBuildException
from data.registry_model import registry_model
from util import slash_join
HEARTBEAT_DELTA = datetime.timedelta(seconds=60)
BUILD_HEARTBEAT_DELAY = datetime.timedelta(seconds=30)
HEARTBEAT_TIMEOUT = 10
INITIAL_TIMEOUT = 25
SUPPORTED_WORKER_VERSIONS = ['0.3']
# Label which marks a manifest with its source build ID.
INTERNAL_LABEL_BUILD_UUID = 'quay.build.uuid'
logger = logging.getLogger(__name__)
class ComponentStatus(object):
""" ComponentStatus represents the possible states of a component. """
JOINING = 'joining'
WAITING = 'waiting'
RUNNING = 'running'
BUILDING = 'building'
TIMED_OUT = 'timeout'
class BuildComponent(BaseComponent):
""" An application session component which conducts one (or more) builds. """
def __init__(self, config, realm=None, token=None, **kwargs):
self.expected_token = token
self.builder_realm = realm
self.parent_manager = None
self.registry_hostname = None
self._component_status = ComponentStatus.JOINING
self._last_heartbeat = None
self._current_job = None
self._build_status = None
self._image_info = None
self._worker_version = None
BaseComponent.__init__(self, config, **kwargs)
def kind(self):
return 'builder'
def onConnect(self):
self.join(self.builder_realm)
@trollius.coroutine
def onJoin(self, details):
logger.debug('Registering methods and listeners for component %s', self.builder_realm)
yield From(self.register(self._on_ready, u'io.quay.buildworker.ready'))
yield From(self.register(self._determine_cache_tag, u'io.quay.buildworker.determinecachetag'))
yield From(self.register(self._ping, u'io.quay.buildworker.ping'))
yield From(self.register(self._on_log_message, u'io.quay.builder.logmessagesynchronously'))
yield From(self.subscribe(self._on_heartbeat, u'io.quay.builder.heartbeat'))
yield From(self._set_status(ComponentStatus.WAITING))
@trollius.coroutine
def start_build(self, build_job):
""" Starts a build. """
if self._component_status not in (ComponentStatus.WAITING, ComponentStatus.RUNNING):
logger.debug('Could not start build for component %s (build %s, worker version: %s): %s',
self.builder_realm, build_job.repo_build.uuid, self._worker_version,
self._component_status)
raise Return()
logger.debug('Starting build for component %s (build %s, worker version: %s)',
self.builder_realm, build_job.repo_build.uuid, self._worker_version)
self._current_job = build_job
self._build_status = StatusHandler(self.build_logs, build_job.repo_build.uuid)
self._image_info = {}
yield From(self._set_status(ComponentStatus.BUILDING))
# Send the notification that the build has started.
build_job.send_notification('build_start')
# Parse the build configuration.
try:
build_config = build_job.build_config
except BuildJobLoadException as irbe:
yield From(self._build_failure('Could not load build job information', irbe))
raise Return()
base_image_information = {}
# Add the pull robot information, if any.
if build_job.pull_credentials:
base_image_information['username'] = build_job.pull_credentials.get('username', '')
base_image_information['password'] = build_job.pull_credentials.get('password', '')
# Retrieve the repository's fully qualified name.
repo = build_job.repo_build.repository
repository_name = repo.namespace_user.username + '/' + repo.name
# Parse the build queue item into build arguments.
# build_package: URL to the build package to download and untar/unzip.
# defaults to empty string to avoid requiring a pointer on the builder.
# sub_directory: The location within the build package of the Dockerfile and the build context.
# repository: The repository for which this build is occurring.
# registry: The registry for which this build is occurring (e.g. 'quay.io').
# pull_token: The token to use when pulling the cache for building.
# push_token: The token to use to push the built image.
# tag_names: The name(s) of the tag(s) for the newly built image.
# base_image: The image name and credentials to use to conduct the base image pull.
# username: The username for pulling the base image (if any).
# password: The password for pulling the base image (if any).
context, dockerfile_path = self.extract_dockerfile_args(build_config)
build_arguments = {
'build_package': build_job.get_build_package_url(self.user_files),
'context': context,
'dockerfile_path': dockerfile_path,
'repository': repository_name,
'registry': self.registry_hostname,
'pull_token': build_job.repo_build.access_token.get_code(),
'push_token': build_job.repo_build.access_token.get_code(),
'tag_names': build_config.get('docker_tags', ['latest']),
'base_image': base_image_information,
}
# If the trigger has a private key, it's using git, thus we should add
# git data to the build args.
# url: url used to clone the git repository
# sha: the sha1 identifier of the commit to check out
# private_key: the key used to get read access to the git repository
# TODO(remove-unenc): Remove legacy field.
private_key = None
if build_job.repo_build.trigger is not None and \
build_job.repo_build.trigger.secure_private_key is not None:
private_key = build_job.repo_build.trigger.secure_private_key.decrypt()
if ActiveDataMigration.has_flag(ERTMigrationFlags.READ_OLD_FIELDS) and \
private_key is None and \
build_job.repo_build.trigger is not None:
private_key = build_job.repo_build.trigger.private_key
if private_key is not None:
build_arguments['git'] = {
'url': build_config['trigger_metadata'].get('git_url', ''),
'sha': BuildComponent._commit_sha(build_config),
'private_key': private_key or '',
}
# If the build args have no buildpack, mark it as a failure before sending
# it to a builder instance.
if not build_arguments['build_package'] and not build_arguments.get('git'):
logger.error('%s: insufficient build args: %s',
self._current_job.repo_build.uuid, build_arguments)
yield From(self._build_failure('Insufficient build arguments. No buildpack available.'))
raise Return()
# Invoke the build.
logger.debug('Invoking build: %s', self.builder_realm)
logger.debug('With Arguments: %s', build_arguments)
def build_complete_callback(result):
""" This function is used to execute a coroutine as the callback. """
trollius.ensure_future(self._build_complete(result))
self.call("io.quay.builder.build", **build_arguments).add_done_callback(build_complete_callback)
# Set the heartbeat for the future. If the builder never receives the build call,
# then this will cause a timeout after 30 seconds. We know the builder has registered
# by this point, so it makes sense to have a timeout.
self._last_heartbeat = datetime.datetime.utcnow() + BUILD_HEARTBEAT_DELAY
@staticmethod
def extract_dockerfile_args(build_config):
dockerfile_path = build_config.get('build_subdir', '')
context = build_config.get('context', '')
if not (dockerfile_path == '' or context == ''):
# This should not happen and can be removed when we centralize validating build_config
dockerfile_abspath = slash_join('', dockerfile_path)
if ".." in os.path.relpath(dockerfile_abspath, context):
return os.path.split(dockerfile_path)
dockerfile_path = os.path.relpath(dockerfile_abspath, context)
return context, dockerfile_path
@staticmethod
def _commit_sha(build_config):
""" Determines whether the metadata is using an old schema or not and returns the commit. """
commit_sha = build_config['trigger_metadata'].get('commit', '')
old_commit_sha = build_config['trigger_metadata'].get('commit_sha', '')
return commit_sha or old_commit_sha
@staticmethod
def name_and_path(subdir):
""" Returns the dockerfile path and name """
if subdir.endswith("/"):
subdir += "Dockerfile"
elif not subdir.endswith("Dockerfile"):
subdir += "/Dockerfile"
return os.path.split(subdir)
@staticmethod
def _total_completion(statuses, total_images):
""" Returns the current amount completion relative to the total completion of a build. """
percentage_with_sizes = float(len(statuses.values())) / total_images
sent_bytes = sum([status['current'] for status in statuses.values()])
total_bytes = sum([status['total'] for status in statuses.values()])
return float(sent_bytes) / total_bytes * percentage_with_sizes
@staticmethod
def _process_pushpull_status(status_dict, current_phase, docker_data, images):
""" Processes the status of a push or pull by updating the provided status_dict and images. """
if not docker_data:
return
num_images = 0
status_completion_key = ''
if current_phase == 'pushing':
status_completion_key = 'push_completion'
num_images = status_dict['total_commands']
elif current_phase == 'pulling':
status_completion_key = 'pull_completion'
elif current_phase == 'priming-cache':
status_completion_key = 'cache_completion'
else:
return
if 'progressDetail' in docker_data and 'id' in docker_data:
image_id = docker_data['id']
detail = docker_data['progressDetail']
if 'current' in detail and 'total' in detail:
images[image_id] = detail
status_dict[status_completion_key] = \
BuildComponent._total_completion(images, max(len(images), num_images))
@trollius.coroutine
def _on_log_message(self, phase, json_data):
""" Tails log messages and updates the build status. """
# Update the heartbeat.
self._last_heartbeat = datetime.datetime.utcnow()
# Parse any of the JSON data logged.
log_data = {}
if json_data:
try:
log_data = json.loads(json_data)
except ValueError:
pass
# Extract the current status message (if any).
fully_unwrapped = ''
keys_to_extract = ['error', 'status', 'stream']
for key in keys_to_extract:
if key in log_data:
fully_unwrapped = log_data[key]
break
# Determine if this is a step string.
current_step = None
current_status_string = str(fully_unwrapped.encode('utf-8'))
if current_status_string and phase == BUILD_PHASE.BUILDING:
current_step = extract_current_step(current_status_string)
# Parse and update the phase and the status_dict. The status dictionary contains
# the pull/push progress, as well as the current step index.
with self._build_status as status_dict:
try:
changed_phase = yield From(self._build_status.set_phase(phase, log_data.get('status_data')))
if changed_phase:
logger.debug('Build %s has entered a new phase: %s', self.builder_realm, phase)
elif self._current_job.repo_build.phase == BUILD_PHASE.CANCELLED:
build_id = self._current_job.repo_build.uuid
logger.debug('Trying to move cancelled build into phase: %s with id: %s', phase, build_id)
raise Return(False)
except InvalidRepositoryBuildException:
build_id = self._current_job.repo_build.uuid
logger.warning('Build %s was not found; repo was probably deleted', build_id)
raise Return(False)
BuildComponent._process_pushpull_status(status_dict, phase, log_data, self._image_info)
# If the current message represents the beginning of a new step, then update the
# current command index.
if current_step is not None:
status_dict['current_command'] = current_step
# If the json data contains an error, then something went wrong with a push or pull.
if 'error' in log_data:
yield From(self._build_status.set_error(log_data['error']))
if current_step is not None:
yield From(self._build_status.set_command(current_status_string))
elif phase == BUILD_PHASE.BUILDING:
yield From(self._build_status.append_log(current_status_string))
raise Return(True)
@trollius.coroutine
def _determine_cache_tag(self, command_comments, base_image_name, base_image_tag, base_image_id):
with self._build_status as status_dict:
status_dict['total_commands'] = len(command_comments) + 1
logger.debug('Checking cache on realm %s. Base image: %s:%s (%s)', self.builder_realm,
base_image_name, base_image_tag, base_image_id)
tag_found = self._current_job.determine_cached_tag(base_image_id, command_comments)
raise Return(tag_found or '')
@trollius.coroutine
def _build_failure(self, error_message, exception=None):
""" Handles and logs a failed build. """
yield From(self._build_status.set_error(error_message, {
'internal_error': str(exception) if exception else None
}))
build_id = self._current_job.repo_build.uuid
logger.warning('Build %s failed with message: %s', build_id, error_message)
# Mark that the build has finished (in an error state)
yield From(self._build_finished(BuildJobResult.ERROR))
@trollius.coroutine
def _build_complete(self, result):
""" Wraps up a completed build. Handles any errors and calls self._build_finished. """
build_id = self._current_job.repo_build.uuid
try:
# Retrieve the result. This will raise an ApplicationError on any error that occurred.
result_value = result.result()
kwargs = {}
# Note: If we are hitting an older builder that didn't return ANY map data, then the result
# value will be a bool instead of a proper CallResult object.
# Therefore: we have a try-except guard here to ensure we don't hit this pitfall.
try:
kwargs = result_value.kwresults
except AttributeError:
pass
try:
yield From(self._build_status.set_phase(BUILD_PHASE.COMPLETE))
except InvalidRepositoryBuildException:
logger.warning('Build %s was not found; repo was probably deleted', build_id)
raise Return()
yield From(self._build_finished(BuildJobResult.COMPLETE))
# Label the pushed manifests with the build metadata.
manifest_digests = kwargs.get('digests') or []
repository = registry_model.lookup_repository(self._current_job.namespace,
self._current_job.repo_name)
if repository is not None:
for digest in manifest_digests:
with UseThenDisconnect(app.config):
manifest = registry_model.lookup_manifest_by_digest(repository, digest,
require_available=True)
if manifest is None:
continue
registry_model.create_manifest_label(manifest, INTERNAL_LABEL_BUILD_UUID,
build_id, 'internal', 'text/plain')
# Send the notification that the build has completed successfully.
self._current_job.send_notification('build_success',
image_id=kwargs.get('image_id'),
manifest_digests=manifest_digests)
except ApplicationError as aex:
worker_error = WorkerError(aex.error, aex.kwargs.get('base_error'))
# Write the error to the log.
yield From(self._build_status.set_error(worker_error.public_message(),
worker_error.extra_data(),
internal_error=worker_error.is_internal_error(),
requeued=self._current_job.has_retries_remaining()))
# Send the notification that the build has failed.
self._current_job.send_notification('build_failure',
error_message=worker_error.public_message())
# Mark the build as completed.
if worker_error.is_internal_error():
logger.exception('[BUILD INTERNAL ERROR: Remote] Build ID: %s: %s', build_id,
worker_error.public_message())
yield From(self._build_finished(BuildJobResult.INCOMPLETE))
else:
logger.debug('Got remote failure exception for build %s: %s', build_id, aex)
yield From(self._build_finished(BuildJobResult.ERROR))
# Remove the current job.
self._current_job = None
@trollius.coroutine
def _build_finished(self, job_status):
""" Alerts the parent that a build has completed and sets the status back to running. """
yield From(self.parent_manager.job_completed(self._current_job, job_status, self))
# Set the component back to a running state.
yield From(self._set_status(ComponentStatus.RUNNING))
@staticmethod
def _ping():
""" Ping pong. """
return 'pong'
@trollius.coroutine
def _on_ready(self, token, version):
logger.debug('On ready called (token "%s")', token)
self._worker_version = version
if version not in SUPPORTED_WORKER_VERSIONS:
logger.warning('Build component (token "%s") is running an out-of-date version: %s', token,
version)
raise Return(False)
if self._component_status != ComponentStatus.WAITING:
logger.warning('Build component (token "%s") is already connected', self.expected_token)
raise Return(False)
if token != self.expected_token:
logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token,
token)
raise Return(False)
yield From(self._set_status(ComponentStatus.RUNNING))
# Start the heartbeat check and updating loop.
loop = trollius.get_event_loop()
loop.create_task(self._heartbeat())
logger.debug('Build worker %s is connected and ready', self.builder_realm)
raise Return(True)
@trollius.coroutine
def _set_status(self, phase):
if phase == ComponentStatus.RUNNING:
yield From(self.parent_manager.build_component_ready(self))
self._component_status = phase
def _on_heartbeat(self):
""" Updates the last known heartbeat. """
if self._component_status == ComponentStatus.TIMED_OUT:
return
logger.debug('Got heartbeat on realm %s', self.builder_realm)
self._last_heartbeat = datetime.datetime.utcnow()
@trollius.coroutine
def _heartbeat(self):
""" Coroutine that runs every HEARTBEAT_TIMEOUT seconds, both checking the worker's heartbeat
and updating the heartbeat in the build status dictionary (if applicable). This allows
the build system to catch crashes from either end.
"""
yield From(trollius.sleep(INITIAL_TIMEOUT))
while True:
# If the component is no longer running or actively building, nothing more to do.
if (self._component_status != ComponentStatus.RUNNING and
self._component_status != ComponentStatus.BUILDING):
raise Return()
# If there is an active build, write the heartbeat to its status.
if self._build_status is not None:
with self._build_status as status_dict:
status_dict['heartbeat'] = int(time.time())
# Mark the build item.
current_job = self._current_job
if current_job is not None:
yield From(self.parent_manager.job_heartbeat(current_job))
# Check the heartbeat from the worker.
logger.debug('Checking heartbeat on realm %s', self.builder_realm)
if (self._last_heartbeat and
self._last_heartbeat < datetime.datetime.utcnow() - HEARTBEAT_DELTA):
logger.debug('Heartbeat on realm %s has expired: %s', self.builder_realm,
self._last_heartbeat)
yield From(self._timeout())
raise Return()
logger.debug('Heartbeat on realm %s is valid: %s (%s).', self.builder_realm,
self._last_heartbeat, self._component_status)
yield From(trollius.sleep(HEARTBEAT_TIMEOUT))
@trollius.coroutine
def _timeout(self):
if self._component_status == ComponentStatus.TIMED_OUT:
raise Return()
yield From(self._set_status(ComponentStatus.TIMED_OUT))
logger.warning('Build component with realm %s has timed out', self.builder_realm)
# If we still have a running job, then it has not completed and we need to tell the parent
# manager.
if self._current_job is not None:
yield From(self._build_status.set_error('Build worker timed out', internal_error=True,
requeued=self._current_job.has_retries_remaining()))
build_id = self._current_job.build_uuid
logger.error('[BUILD INTERNAL ERROR: Timeout] Build ID: %s', build_id)
yield From(self.parent_manager.job_completed(self._current_job,
BuildJobResult.INCOMPLETE,
self))
# Unregister the current component so that it cannot be invoked again.
self.parent_manager.build_component_disposed(self, True)
# Remove the job reference.
self._current_job = None
@trollius.coroutine
def cancel_build(self):
self.parent_manager.build_component_disposed(self, True)
self._current_job = None
yield From(self._set_status(ComponentStatus.RUNNING))

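A small sketch of the trigger_metadata schema handling in BuildComponent._commit_sha: newer payloads carry the commit under 'commit', older ones under 'commit_sha', and the newer key wins when both are present. The dictionaries below are made-up examples.

from buildman.component.buildcomponent import BuildComponent

# Newer-style trigger metadata.
assert BuildComponent._commit_sha({'trigger_metadata': {'commit': 'abc123'}}) == 'abc123'

# Older-style trigger metadata.
assert BuildComponent._commit_sha({'trigger_metadata': {'commit_sha': 'def456'}}) == 'def456'

# When both are present, the newer 'commit' key takes precedence.
assert BuildComponent._commit_sha(
    {'trigger_metadata': {'commit': 'abc123', 'commit_sha': 'def456'}}) == 'abc123'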
buildman/component/buildparse.py Normal file

@@ -0,0 +1,15 @@
import re
def extract_current_step(current_status_string):
""" Attempts to extract the current step numeric identifier from the given status string. Returns the step
number or None if none.
"""
# Older format: `Step 12 :`
# Newer format: `Step 4/13 :`
step_increment = re.search(r'Step ([0-9]+)/([0-9]+) :', current_status_string)
if step_increment:
return int(step_increment.group(1))
step_increment = re.search(r'Step ([0-9]+) :', current_status_string)
if step_increment:
return int(step_increment.group(1))


@@ -0,0 +1,36 @@
import pytest
from buildman.component.buildcomponent import BuildComponent
@pytest.mark.parametrize('input,expected_path,expected_file', [
("", "/", "Dockerfile"),
("/", "/", "Dockerfile"),
("/Dockerfile", "/", "Dockerfile"),
("/server.Dockerfile", "/", "server.Dockerfile"),
("/somepath", "/somepath", "Dockerfile"),
("/somepath/", "/somepath", "Dockerfile"),
("/somepath/Dockerfile", "/somepath", "Dockerfile"),
("/somepath/server.Dockerfile", "/somepath", "server.Dockerfile"),
("/somepath/some_other_path", "/somepath/some_other_path", "Dockerfile"),
("/somepath/some_other_path/", "/somepath/some_other_path", "Dockerfile"),
("/somepath/some_other_path/Dockerfile", "/somepath/some_other_path", "Dockerfile"),
("/somepath/some_other_path/server.Dockerfile", "/somepath/some_other_path", "server.Dockerfile"),
])
def test_path_is_dockerfile(input, expected_path, expected_file):
actual_path, actual_file = BuildComponent.name_and_path(input)
assert actual_path == expected_path
assert actual_file == expected_file
@pytest.mark.parametrize('build_config,context,dockerfile_path', [
({}, '', ''),
({'build_subdir': '/builddir/Dockerfile'}, '', '/builddir/Dockerfile'),
({'context': '/builddir'}, '/builddir', ''),
({'context': '/builddir', 'build_subdir': '/builddir/Dockerfile'}, '/builddir', 'Dockerfile'),
({'context': '/some_other_dir/Dockerfile', 'build_subdir': '/builddir/Dockerfile'}, '/builddir', 'Dockerfile'),
({'context': '/', 'build_subdir':'Dockerfile'}, '/', 'Dockerfile')
])
def test_extract_dockerfile_args(build_config, context, dockerfile_path):
actual_context, actual_dockerfile_path = BuildComponent.extract_dockerfile_args(build_config)
assert context == actual_context
assert dockerfile_path == actual_dockerfile_path


@@ -0,0 +1,16 @@
import pytest
from buildman.component.buildparse import extract_current_step
@pytest.mark.parametrize('input,expected_step', [
("", None),
("Step a :", None),
("Step 1 :", 1),
("Step 1 : ", 1),
("Step 1/2 : ", 1),
("Step 2/17 : ", 2),
("Step 4/13 : ARG somearg=foo", 4),
])
def test_extract_current_step(input, expected_step):
assert extract_current_step(input) == expected_step

buildman/enums.py Normal file

@@ -0,0 +1,21 @@
from data.database import BUILD_PHASE
class BuildJobResult(object):
""" Build job result enum """
INCOMPLETE = 'incomplete'
COMPLETE = 'complete'
ERROR = 'error'
class BuildServerStatus(object):
""" Build server status enum """
STARTING = 'starting'
RUNNING = 'running'
SHUTDOWN = 'shutting_down'
EXCEPTION = 'exception'
RESULT_PHASES = {
BuildJobResult.INCOMPLETE: BUILD_PHASE.INTERNAL_ERROR,
BuildJobResult.COMPLETE: BUILD_PHASE.COMPLETE,
BuildJobResult.ERROR: BUILD_PHASE.ERROR,
}

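A quick sketch of how the RESULT_PHASES table translates a job result into the database build phase, presumably used when persisting a finished job's state.

from buildman.enums import BuildJobResult, RESULT_PHASES
from data.database import BUILD_PHASE

# Map a job result onto the phase stored for the repository build.
assert RESULT_PHASES[BuildJobResult.COMPLETE] == BUILD_PHASE.COMPLETE
assert RESULT_PHASES[BuildJobResult.ERROR] == BUILD_PHASE.ERROR
assert RESULT_PHASES[BuildJobResult.INCOMPLETE] == BUILD_PHASE.INTERNAL_ERROR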
buildman/jobutil/buildjob.py Normal file

@@ -0,0 +1,183 @@
import json
import logging
from app import app
from cachetools.func import lru_cache
from notifications import spawn_notification
from data import model
from data.registry_model import registry_model
from data.registry_model.datatypes import RepositoryReference
from data.database import UseThenDisconnect
from util.morecollections import AttrDict
logger = logging.getLogger(__name__)
class BuildJobLoadException(Exception):
""" Exception raised if a build job could not be instantiated for some reason. """
pass
class BuildJob(object):
""" Represents a single in-progress build job. """
def __init__(self, job_item):
self.job_item = job_item
try:
self.job_details = json.loads(job_item.body)
self.build_notifier = BuildJobNotifier(self.build_uuid)
except ValueError:
raise BuildJobLoadException(
'Could not parse build queue item config: %s' % job_item.body
)
@property
def retries_remaining(self):
return self.job_item.retries_remaining
def has_retries_remaining(self):
return self.job_item.retries_remaining > 0
def send_notification(self, kind, error_message=None, image_id=None, manifest_digests=None):
self.build_notifier.send_notification(kind, error_message, image_id, manifest_digests)
@lru_cache(maxsize=1)
def _load_repo_build(self):
with UseThenDisconnect(app.config):
try:
return model.build.get_repository_build(self.build_uuid)
except model.InvalidRepositoryBuildException:
raise BuildJobLoadException(
'Could not load repository build with ID %s' % self.build_uuid)
@property
def build_uuid(self):
""" Returns the unique UUID for this build job. """
return self.job_details['build_uuid']
@property
def namespace(self):
""" Returns the namespace under which this build is running. """
return self.repo_build.repository.namespace_user.username
@property
def repo_name(self):
""" Returns the name of the repository under which this build is running. """
return self.repo_build.repository.name
@property
def repo_build(self):
return self._load_repo_build()
def get_build_package_url(self, user_files):
""" Returns the URL of the build package for this build, if any or empty string if none. """
archive_url = self.build_config.get('archive_url', None)
if archive_url:
return archive_url
if not self.repo_build.resource_key:
return ''
return user_files.get_file_url(self.repo_build.resource_key, '127.0.0.1', requires_cors=False)
@property
def pull_credentials(self):
""" Returns the pull credentials for this job, or None if none. """
return self.job_details.get('pull_credentials')
@property
def build_config(self):
try:
return json.loads(self.repo_build.job_config)
except ValueError:
raise BuildJobLoadException(
'Could not parse repository build job config with ID %s' % self.job_details['build_uuid']
)
def determine_cached_tag(self, base_image_id=None, cache_comments=None):
""" Returns the tag to pull to prime the cache or None if none. """
cached_tag = self._determine_cached_tag_by_tag()
logger.debug('Determined cached tag %s for %s: %s', cached_tag, base_image_id, cache_comments)
return cached_tag
def _determine_cached_tag_by_tag(self):
""" Determines the cached tag by looking for one of the tags being built, and seeing if it
exists in the repository. This is a fallback for when no comment information is available.
"""
with UseThenDisconnect(app.config):
tags = self.build_config.get('docker_tags', ['latest'])
repository = RepositoryReference.for_repo_obj(self.repo_build.repository)
matching_tag = registry_model.find_matching_tag(repository, tags)
if matching_tag is not None:
return matching_tag.name
most_recent_tag = registry_model.get_most_recent_tag(repository)
if most_recent_tag is not None:
return most_recent_tag.name
return None
class BuildJobNotifier(object):
""" A class for sending notifications to a job that only relies on the build_uuid """
def __init__(self, build_uuid):
self.build_uuid = build_uuid
@property
def repo_build(self):
return self._load_repo_build()
@lru_cache(maxsize=1)
def _load_repo_build(self):
try:
return model.build.get_repository_build(self.build_uuid)
except model.InvalidRepositoryBuildException:
raise BuildJobLoadException(
'Could not load repository build with ID %s' % self.build_uuid)
@property
def build_config(self):
try:
return json.loads(self.repo_build.job_config)
except ValueError:
raise BuildJobLoadException(
'Could not parse repository build job config with ID %s' % self.repo_build.uuid
)
def send_notification(self, kind, error_message=None, image_id=None, manifest_digests=None):
with UseThenDisconnect(app.config):
tags = self.build_config.get('docker_tags', ['latest'])
trigger = self.repo_build.trigger
if trigger is not None and trigger.id is not None:
trigger_kind = trigger.service.name
else:
trigger_kind = None
event_data = {
'build_id': self.repo_build.uuid,
'build_name': self.repo_build.display_name,
'docker_tags': tags,
'trigger_id': trigger.uuid if trigger is not None else None,
'trigger_kind': trigger_kind,
'trigger_metadata': self.build_config.get('trigger_metadata', {})
}
if image_id is not None:
event_data['image_id'] = image_id
if manifest_digests:
event_data['manifest_digests'] = manifest_digests
if error_message is not None:
event_data['error_message'] = error_message
# TODO: remove when more endpoints have been converted to using
# interfaces
repo = AttrDict({
'namespace_name': self.repo_build.repository.namespace_user.username,
'name': self.repo_build.repository.name,
})
spawn_notification(repo, kind, event_data,
subpage='build/%s' % self.repo_build.uuid,
pathargs=['build', self.repo_build.uuid])

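A hedged sketch of how a BuildJob wraps a work-queue item: the item's body is a JSON blob that must at least carry 'build_uuid' (and optionally 'pull_credentials'), and retries_remaining comes from the queue item itself. AttrDict is used here the same way the ephemeral manager reconstructs jobs; all field values are made up.

import json

from buildman.jobutil.buildjob import BuildJob
from util.morecollections import AttrDict

# Hypothetical queue item payload; only 'build_uuid' is required by BuildJob itself.
queue_item = AttrDict({
    'body': json.dumps({
        'build_uuid': 'some-build-uuid',
        'pull_credentials': {'username': 'robot', 'password': 'token'},
    }),
    'retries_remaining': 3,
})

job = BuildJob(queue_item)
assert job.build_uuid == 'some-build-uuid'
assert job.has_retries_remaining()
assert job.pull_credentials['username'] == 'robot'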
buildman/jobutil/buildstatus.py Normal file

@@ -0,0 +1,88 @@
import datetime
import logging
from redis import RedisError
from trollius import From, Return, coroutine
from data.database import BUILD_PHASE
from data import model
from buildman.asyncutil import AsyncWrapper
logger = logging.getLogger(__name__)
class StatusHandler(object):
""" Context wrapper for writing status to build logs. """
def __init__(self, build_logs, repository_build_uuid):
self._current_phase = None
self._current_command = None
self._uuid = repository_build_uuid
self._build_logs = AsyncWrapper(build_logs)
self._sync_build_logs = build_logs
self._build_model = AsyncWrapper(model.build)
self._status = {
'total_commands': 0,
'current_command': None,
'push_completion': 0.0,
'pull_completion': 0.0,
}
# Write the initial status.
self.__exit__(None, None, None)
@coroutine
def _append_log_message(self, log_message, log_type=None, log_data=None):
log_data = log_data or {}
log_data['datetime'] = str(datetime.datetime.now())
try:
yield From(self._build_logs.append_log_message(self._uuid, log_message, log_type, log_data))
except RedisError:
logger.exception('Could not save build log for build %s: %s', self._uuid, log_message)
@coroutine
def append_log(self, log_message, extra_data=None):
if log_message is None:
return
yield From(self._append_log_message(log_message, log_data=extra_data))
@coroutine
def set_command(self, command, extra_data=None):
if self._current_command == command:
raise Return()
self._current_command = command
yield From(self._append_log_message(command, self._build_logs.COMMAND, extra_data))
@coroutine
def set_error(self, error_message, extra_data=None, internal_error=False, requeued=False):
error_phase = BUILD_PHASE.INTERNAL_ERROR if internal_error and requeued else BUILD_PHASE.ERROR
yield From(self.set_phase(error_phase))
extra_data = extra_data or {}
extra_data['internal_error'] = internal_error
yield From(self._append_log_message(error_message, self._build_logs.ERROR, extra_data))
@coroutine
def set_phase(self, phase, extra_data=None):
if phase == self._current_phase:
raise Return(False)
self._current_phase = phase
yield From(self._append_log_message(phase, self._build_logs.PHASE, extra_data))
# Update the repository build with the new phase
raise Return(self._build_model.update_phase_then_close(self._uuid, phase))
def __enter__(self):
return self._status
def __exit__(self, exc_type, value, traceback):
try:
self._sync_build_logs.set_status(self._uuid, self._status)
except RedisError:
logger.exception('Could not set status of build %s to %s', self._uuid, self._status)

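A short sketch of the StatusHandler contract as exercised by BuildComponent: the handler doubles as a context manager whose entered value is the mutable status dictionary (flushed on exit), while phase changes and log lines go through its coroutine methods. build_logs stands in for the real build-logs backend and the UUID is made up.

from trollius import From, coroutine

from buildman.jobutil.buildstatus import StatusHandler
from data.database import BUILD_PHASE


@coroutine
def report_progress(build_logs, build_uuid):
    handler = StatusHandler(build_logs, build_uuid)

    # The entered value is the status dict that is written back on exit.
    with handler as status_dict:
        status_dict['total_commands'] = 10
        status_dict['current_command'] = 1

    # Phase changes and log lines are coroutines backed by AsyncWrapper.
    yield From(handler.set_phase(BUILD_PHASE.BUILDING))
    yield From(handler.append_log('Step 1/10 : FROM busybox'))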
buildman/jobutil/workererror.py Normal file

@@ -0,0 +1,119 @@
class WorkerError(object):
""" Helper class which represents errors raised by a build worker. """
def __init__(self, error_code, base_message=None):
self._error_code = error_code
self._base_message = base_message
self._error_handlers = {
'io.quay.builder.buildpackissue': {
'message': 'Could not load build package',
'is_internal': True,
},
'io.quay.builder.gitfailure': {
'message': 'Could not clone git repository',
'show_base_error': True,
},
'io.quay.builder.gitcheckout': {
'message': 'Could not checkout git ref. If you force pushed recently, ' +
'the commit may be missing.',
'show_base_error': True,
},
'io.quay.builder.cannotextractbuildpack': {
'message': 'Could not extract the contents of the build package'
},
'io.quay.builder.cannotpullforcache': {
'message': 'Could not pull cached image',
'is_internal': True
},
'io.quay.builder.dockerfileissue': {
'message': 'Could not find or parse Dockerfile',
'show_base_error': True
},
'io.quay.builder.cannotpullbaseimage': {
'message': 'Could not pull base image',
'show_base_error': True
},
'io.quay.builder.internalerror': {
'message': 'An internal error occurred while building. Please submit a ticket.',
'is_internal': True
},
'io.quay.builder.buildrunerror': {
'message': 'Could not start the build process',
'is_internal': True
},
'io.quay.builder.builderror': {
'message': 'A build step failed',
'show_base_error': True
},
'io.quay.builder.tagissue': {
'message': 'Could not tag built image',
'is_internal': True
},
'io.quay.builder.pushissue': {
'message': 'Could not push built image',
'show_base_error': True,
'is_internal': True
},
'io.quay.builder.dockerconnecterror': {
'message': 'Could not connect to Docker daemon',
'is_internal': True
},
'io.quay.builder.missingorinvalidargument': {
'message': 'Missing required arguments for builder',
'is_internal': True
},
'io.quay.builder.cachelookupissue': {
'message': 'Error checking for a cached tag',
'is_internal': True
},
'io.quay.builder.errorduringphasetransition': {
'message': 'Error during phase transition. If this problem persists ' +
'please contact customer support.',
'is_internal': True
},
'io.quay.builder.clientrejectedtransition': {
'message': 'Build can not be finished due to user cancellation.',
}
}
def is_internal_error(self):
handler = self._error_handlers.get(self._error_code)
return handler.get('is_internal', False) if handler else True
def public_message(self):
handler = self._error_handlers.get(self._error_code)
if not handler:
return 'An unknown error occurred'
message = handler['message']
if handler.get('show_base_error', False) and self._base_message:
message = message + ': ' + self._base_message
return message
def extra_data(self):
if self._base_message:
return {
'base_error': self._base_message,
'error_code': self._error_code
}
return {
'error_code': self._error_code
}

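A small sketch of how WorkerError translates a worker-reported error code into user-facing data, as BuildComponent._build_complete does for an ApplicationError. The error codes come from the table above; the base message is a made-up git error.

from buildman.jobutil.workererror import WorkerError

err = WorkerError('io.quay.builder.gitfailure', 'remote: repository not found')

# 'gitfailure' shows the base error and is not flagged as internal.
assert err.public_message() == 'Could not clone git repository: remote: repository not found'
assert not err.is_internal_error()
assert err.extra_data() == {
    'base_error': 'remote: repository not found',
    'error_code': 'io.quay.builder.gitfailure',
}

# Unknown codes are treated as internal errors with a generic message.
unknown = WorkerError('io.quay.builder.somenewcode')
assert unknown.is_internal_error()
assert unknown.public_message() == 'An unknown error occurred'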
buildman/manager/basemanager.py Normal file

@@ -0,0 +1,71 @@
from trollius import coroutine
class BaseManager(object):
""" Base for all worker managers. """
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
job_complete_callback, manager_hostname, heartbeat_period_sec):
self.register_component = register_component
self.unregister_component = unregister_component
self.job_heartbeat_callback = job_heartbeat_callback
self.job_complete_callback = job_complete_callback
self.manager_hostname = manager_hostname
self.heartbeat_period_sec = heartbeat_period_sec
@coroutine
def job_heartbeat(self, build_job):
""" Method invoked to tell the manager that a job is still running. This method will be called
every few minutes. """
self.job_heartbeat_callback(build_job)
def overall_setup_time(self):
""" Returns the number of seconds that the build system should wait before allowing the job
to be picked up again after calling 'schedule'.
"""
raise NotImplementedError
def shutdown(self):
""" Indicates that the build controller server is in a shutdown state and that no new jobs
or workers should be scheduled. Existing workers should be cleaned up once their jobs
have completed.
"""
raise NotImplementedError
@coroutine
def schedule(self, build_job):
""" Schedules a queue item to be built. Returns a 2-tuple with (True, None) if the item was
properly scheduled and (False, a retry timeout in seconds) if all workers are busy or an
error occurs.
"""
raise NotImplementedError
def initialize(self, manager_config):
""" Runs any initialization code for the manager. Called once the server is in a ready state.
"""
raise NotImplementedError
@coroutine
def build_component_ready(self, build_component):
""" Method invoked whenever a build component announces itself as ready.
"""
raise NotImplementedError
def build_component_disposed(self, build_component, timed_out):
""" Method invoked whenever a build component has been disposed. The timed_out boolean indicates
whether the component's heartbeat timed out.
"""
raise NotImplementedError
@coroutine
def job_completed(self, build_job, job_status, build_component):
""" Method invoked once a job_item has completed, in some manner. The job_status will be
one of: incomplete, error, complete. Implementations of this method should call coroutine
self.job_complete_callback with a status of Incomplete if they wish for the job to be
automatically requeued.
"""
raise NotImplementedError
def num_workers(self):
""" Returns the number of active build workers currently registered. This includes those
that are currently busy and awaiting more work.
"""
raise NotImplementedError

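To make the extension points concrete, here is a skeleton of a manager subclass showing which methods the server expects to be overridden (see EnterpriseManager and EphemeralBuilderManager below for the real implementations). InMemoryManager is hypothetical and its bodies are placeholders.

from trollius import From, Return, coroutine

from buildman.manager.basemanager import BaseManager


class InMemoryManager(BaseManager):
    """ Hypothetical manager used only to illustrate the required overrides. """
    def initialize(self, manager_config):
        self._ready_workers = []

    def overall_setup_time(self):
        return 60  # seconds before a scheduled job may be picked up again

    def shutdown(self):
        self._ready_workers = []

    @coroutine
    def schedule(self, build_job):
        if not self._ready_workers:
            raise Return(False, 30)  # all workers busy: retry in 30 seconds
        # ... hand the job to a worker here ...
        raise Return(True, None)

    @coroutine
    def build_component_ready(self, build_component):
        self._ready_workers.append(build_component)

    def build_component_disposed(self, build_component, timed_out):
        if build_component in self._ready_workers:
            self._ready_workers.remove(build_component)

    @coroutine
    def job_completed(self, build_job, job_status, build_component):
        yield From(self.job_complete_callback(build_job, job_status))

    def num_workers(self):
        return len(self._ready_workers)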

@@ -0,0 +1,27 @@
import logging
from buildman.manager.orchestrator_canceller import OrchestratorCanceller
from buildman.manager.noop_canceller import NoopCanceller
logger = logging.getLogger(__name__)
CANCELLERS = {'ephemeral': OrchestratorCanceller}
class BuildCanceller(object):
""" A class to manage cancelling a build """
def __init__(self, app=None):
self.build_manager_config = app.config.get('BUILD_MANAGER') if app is not None else None
if app is None or self.build_manager_config is None:
self.handler = NoopCanceller()
else:
self.handler = None
def try_cancel_build(self, uuid):
""" A method to kill a running build """
if self.handler is None:
canceller = CANCELLERS.get(self.build_manager_config[0], NoopCanceller)
self.handler = canceller(self.build_manager_config[1])
return self.handler.try_cancel_build(uuid)

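A brief sketch of the intended use: construct the canceller with the application object (so it can read BUILD_MANAGER) and call try_cancel_build with the build's UUID; the concrete handler is resolved lazily on the first cancellation attempt. The module path in the import and the UUID below are assumptions for illustration.

from app import app
from buildman.manager.buildcanceller import BuildCanceller  # module path assumed

canceller = BuildCanceller(app)

# Resolves the configured canceller ('ephemeral' -> OrchestratorCanceller,
# anything else -> NoopCanceller) and asks it to stop the build.
was_cancelled = canceller.try_cancel_build('some-build-uuid')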
buildman/manager/enterprise.py Normal file

@@ -0,0 +1,92 @@
import logging
import uuid
from buildman.component.basecomponent import BaseComponent
from buildman.component.buildcomponent import BuildComponent
from buildman.manager.basemanager import BaseManager
from trollius import From, Return, coroutine
REGISTRATION_REALM = 'registration'
RETRY_TIMEOUT = 5
logger = logging.getLogger(__name__)
class DynamicRegistrationComponent(BaseComponent):
""" Component session that handles dynamic registration of the builder components. """
def onConnect(self):
self.join(REGISTRATION_REALM)
def onJoin(self, details):
logger.debug('Registering registration method')
yield From(self.register(self._worker_register, u'io.quay.buildworker.register'))
def _worker_register(self):
realm = self.parent_manager.add_build_component()
logger.debug('Registering new build component+worker with realm %s', realm)
return realm
def kind(self):
return 'registration'
class EnterpriseManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
def __init__(self, *args, **kwargs):
self.ready_components = set()
self.all_components = set()
self.shutting_down = False
super(EnterpriseManager, self).__init__(*args, **kwargs)
def initialize(self, manager_config):
# Add a component which is used by build workers for dynamic registration. Unlike
# production, build workers in enterprise are long-lived and register dynamically.
self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
def overall_setup_time(self):
# Builders are already registered, so the setup time should be essentially instant. We therefore
# only return a minute here.
return 60
def add_build_component(self):
""" Adds a new build component for an Enterprise Registry. """
# Generate a new unique realm ID for the build worker.
realm = str(uuid.uuid4())
new_component = self.register_component(realm, BuildComponent, token="")
self.all_components.add(new_component)
return realm
@coroutine
def schedule(self, build_job):
""" Schedules a build for an Enterprise Registry. """
if self.shutting_down or not self.ready_components:
raise Return(False, RETRY_TIMEOUT)
component = self.ready_components.pop()
yield From(component.start_build(build_job))
raise Return(True, None)
@coroutine
def build_component_ready(self, build_component):
self.ready_components.add(build_component)
def shutdown(self):
self.shutting_down = True
@coroutine
def job_completed(self, build_job, job_status, build_component):
yield From(self.job_complete_callback(build_job, job_status))
def build_component_disposed(self, build_component, timed_out):
self.all_components.remove(build_component)
if build_component in self.ready_components:
self.ready_components.remove(build_component)
self.unregister_component(build_component)
def num_workers(self):
return len(self.all_components)

buildman/manager/ephemeral.py Normal file

@@ -0,0 +1,710 @@
import logging
import uuid
import calendar
import json
import time
from collections import namedtuple
from datetime import datetime, timedelta
from six import iteritems
from trollius import From, coroutine, Return, async, sleep
from app import metric_queue
from buildman.orchestrator import (orchestrator_from_config, KeyEvent,
OrchestratorError, OrchestratorConnectionError,
ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor, KubernetesExecutor
from buildman.component.buildcomponent import BuildComponent
from buildman.jobutil.buildjob import BuildJob
from buildman.server import BuildJobResult
from util import slash_join
from util.morecollections import AttrDict
logger = logging.getLogger(__name__)
JOB_PREFIX = 'building/'
LOCK_PREFIX = 'lock/'
REALM_PREFIX = 'realm/'
CANCEL_PREFIX = 'cancel/'
METRIC_PREFIX = 'metric/'
CANCELED_LOCK_PREFIX = slash_join(LOCK_PREFIX, 'job-cancelled')
EXPIRED_LOCK_PREFIX = slash_join(LOCK_PREFIX, 'job-expired')
EPHEMERAL_API_TIMEOUT = 20
EPHEMERAL_SETUP_TIMEOUT = 500
RETRY_IMMEDIATELY_SLEEP_DURATION = 0
TOO_MANY_WORKERS_SLEEP_DURATION = 10
BuildInfo = namedtuple('BuildInfo', ['component', 'build_job', 'execution_id', 'executor_name'])
class EphemeralBuilderManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
EXECUTORS = {
'popen': PopenExecutor,
'ec2': EC2Executor,
'kubernetes': KubernetesExecutor,
}
def __init__(self, *args, **kwargs):
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
self._shutting_down = False
self._manager_config = None
self._orchestrator = None
# The registered executors available for running jobs, in order.
self._ordered_executors = []
# The registered executors, mapped by their unique name.
self._executor_name_to_executor = {}
# Map from builder component to its associated job.
self._component_to_job = {}
# Map from build UUID to a BuildInfo tuple with information about the build.
self._build_uuid_to_info = {}
def overall_setup_time(self):
return EPHEMERAL_SETUP_TIMEOUT
@coroutine
def _mark_job_incomplete(self, build_job, build_info):
""" Marks a job as incomplete, in response to a failure to start or a timeout. """
executor_name = build_info.executor_name
execution_id = build_info.execution_id
logger.warning('Build executor failed to successfully boot with execution id %s',
execution_id)
# Take a lock to ensure that only one manager reports the build as incomplete for this
# execution.
lock_key = slash_join(self._expired_lock_prefix, build_job.build_uuid, execution_id)
acquired_lock = yield From(self._orchestrator.lock(lock_key))
if acquired_lock:
try:
# Clean up the bookkeeping for the job.
yield From(self._orchestrator.delete_key(self._job_key(build_job)))
except KeyError:
logger.debug('Could not delete job key %s; might have been removed already',
build_job.build_uuid)
logger.error('[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s',
build_job.build_uuid, executor_name, execution_id)
yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
update_phase=True))
else:
logger.debug('Did not get lock for job-expiration for job %s', build_job.build_uuid)
@coroutine
def _job_callback(self, key_change):
"""
This is the callback invoked when keys related to jobs are changed.
It ignores all events related to the creation of new jobs.
Deletes or expirations cause checks to ensure they've been properly marked as completed.
:param key_change: the event and value produced by a key changing in the orchestrator
:type key_change: :class:`KeyChange`
"""
if key_change.event in (KeyEvent.CREATE, KeyEvent.SET):
raise Return()
elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
# Handle the expiration/deletion.
job_metadata = json.loads(key_change.value)
build_job = BuildJob(AttrDict(job_metadata['job_queue_item']))
logger.debug('Got "%s" of job %s', key_change.event, build_job.build_uuid)
# Get the build info.
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
if build_info is None:
logger.debug('No build info for "%s" job %s (%s); probably already deleted by this manager',
key_change.event, build_job.build_uuid, job_metadata)
raise Return()
if key_change.event != KeyEvent.EXPIRE:
# If the etcd action was not an expiration, then it was already deleted by some manager and
# the execution was therefore already shutdown. All that's left is to remove the build info.
self._build_uuid_to_info.pop(build_job.build_uuid, None)
raise Return()
logger.debug('got expiration for job %s with metadata: %s', build_job.build_uuid,
job_metadata)
if not job_metadata.get('had_heartbeat', False):
# If we have not yet received a heartbeat, then the node failed to boot in some way.
# We mark the job as incomplete here.
yield From(self._mark_job_incomplete(build_job, build_info))
# Finally, we terminate the build execution for the job. We don't do this under a lock as
# terminating a node is an atomic operation; better to make sure it is terminated than not.
logger.info('Terminating expired build executor for job %s with execution id %s',
build_job.build_uuid, build_info.execution_id)
yield From(self.kill_builder_executor(build_job.build_uuid))
else:
logger.warning('Unexpected KeyEvent (%s) on job key: %s', key_change.event, key_change.key)
@coroutine
def _realm_callback(self, key_change):
logger.debug('realm callback for key: %s', key_change.key)
if key_change.event == KeyEvent.CREATE:
# Listen on the realm created by ourselves or another worker.
realm_spec = json.loads(key_change.value)
self._register_realm(realm_spec)
elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
# Stop listening for new connections on the realm, if we did not get the connection.
realm_spec = json.loads(key_change.value)
realm_id = realm_spec['realm']
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
build_uuid = build_job.build_uuid
logger.debug('Realm key %s for build %s was %s', realm_id, build_uuid, key_change.event)
build_info = self._build_uuid_to_info.get(build_uuid, None)
if build_info is not None:
# Pop off the component and if we find one, then the build has not connected to this
# manager, so we can safely unregister its component.
component = self._component_to_job.pop(build_info.component, None)
if component is not None:
# We were not the manager which the worker connected to, remove the bookkeeping for it
logger.debug('Unregistering unused component for build %s', build_uuid)
self.unregister_component(build_info.component)
# If the realm has expired, then perform cleanup of the executor.
if key_change.event == KeyEvent.EXPIRE:
execution_id = realm_spec.get('execution_id', None)
executor_name = realm_spec.get('executor_name', 'EC2Executor')
# Cleanup the job, since it never started.
logger.debug('Job %s for incomplete marking: %s', build_uuid, build_info)
if build_info is not None:
yield From(self._mark_job_incomplete(build_job, build_info))
# Cleanup the executor.
logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s',
realm_id, build_uuid, executor_name, execution_id)
yield From(self.terminate_executor(executor_name, execution_id))
else:
logger.warning('Unexpected action (%s) on realm key: %s', key_change.event, key_change.key)
def _register_realm(self, realm_spec):
logger.debug('Got call to register realm %s with manager', realm_spec['realm'])
# Create the build information block for the registered realm.
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
execution_id = realm_spec.get('execution_id', None)
executor_name = realm_spec.get('executor_name', 'EC2Executor')
logger.debug('Registering realm %s with manager: %s', realm_spec['realm'], realm_spec)
component = self.register_component(realm_spec['realm'], BuildComponent,
token=realm_spec['token'])
build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id,
executor_name=executor_name)
self._component_to_job[component] = build_job
self._build_uuid_to_info[build_job.build_uuid] = build_info
logger.debug('Registered realm %s with manager', realm_spec['realm'])
return component
@property
def registered_executors(self):
return self._ordered_executors
@coroutine
def _register_existing_realms(self):
try:
all_realms = yield From(self._orchestrator.get_prefixed_keys(self._realm_prefix))
# Register all existing realms found.
encountered = {self._register_realm(json.loads(realm_data))
for _realm, realm_data in all_realms}
# Remove any components not encountered so we can clean up.
for component, job in list(iteritems(self._component_to_job)):
if component not in encountered:
self._component_to_job.pop(component, None)
self._build_uuid_to_info.pop(job.build_uuid, None)
except KeyError:
pass
def _load_executor(self, executor_kind_name, executor_config):
executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name)
if executor_klass is None:
logger.error('Unknown executor %s; skipping install', executor_kind_name)
return
executor = executor_klass(executor_config, self.manager_hostname)
if executor.name in self._executor_name_to_executor:
raise Exception('Executor with name %s already registered' % executor.name)
self._ordered_executors.append(executor)
self._executor_name_to_executor[executor.name] = executor
def _config_prefix(self, key):
if self._manager_config.get('ORCHESTRATOR') is None:
return key
prefix = self._manager_config.get('ORCHESTRATOR_PREFIX', '')
return slash_join(prefix, key).lstrip('/') + '/'
@property
def _job_prefix(self):
return self._config_prefix(JOB_PREFIX)
@property
def _realm_prefix(self):
return self._config_prefix(REALM_PREFIX)
@property
def _cancel_prefix(self):
return self._config_prefix(CANCEL_PREFIX)
@property
def _metric_prefix(self):
return self._config_prefix(METRIC_PREFIX)
@property
def _expired_lock_prefix(self):
return self._config_prefix(EXPIRED_LOCK_PREFIX)
@property
def _canceled_lock_prefix(self):
return self._config_prefix(CANCELED_LOCK_PREFIX)
def _metric_key(self, realm):
"""
Create a key which is used to track a job in the Orchestrator.
:param realm: realm for the build
:type realm: str
:returns: key used to track jobs
:rtype: str
"""
return slash_join(self._metric_prefix, realm)
def _job_key(self, build_job):
"""
Creates a key which is used to track a job in the Orchestrator.
:param build_job: unique job identifier for a build
:type build_job: str
:returns: key used to track the job
:rtype: str
"""
return slash_join(self._job_prefix, build_job.job_details['build_uuid'])
def _realm_key(self, realm):
"""
Create a key which is used to track an incoming connection on a realm.
:param realm: realm for the build
:type realm: str
:returns: key used to track the connection to the realm
:rtype: str
"""
return slash_join(self._realm_prefix, realm)
def initialize(self, manager_config):
logger.debug('Calling initialize')
self._manager_config = manager_config
# Note: Executor config can be defined either as a single block of EXECUTOR_CONFIG (old style)
# or as a new set of executor configurations, with the order determining how we fallback. We
# check for both here to ensure backwards compatibility.
if manager_config.get('EXECUTORS'):
for executor_config in manager_config['EXECUTORS']:
self._load_executor(executor_config.get('EXECUTOR'), executor_config)
else:
self._load_executor(manager_config.get('EXECUTOR'), manager_config.get('EXECUTOR_CONFIG'))
logger.debug('calling orchestrator_from_config')
self._orchestrator = orchestrator_from_config(manager_config)
logger.debug('setting on_key_change callbacks for job, cancel, realm')
self._orchestrator.on_key_change(self._job_prefix, self._job_callback)
self._orchestrator.on_key_change(self._cancel_prefix, self._cancel_callback)
self._orchestrator.on_key_change(self._realm_prefix, self._realm_callback,
restarter=self._register_existing_realms)
# Load components for all realms currently known to the cluster
async(self._register_existing_realms())
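# Illustrative sketch (not part of the original module) of the two configuration shapes
# accepted by the EXECUTORS / EXECUTOR_CONFIG handling in initialize() above. The executor
# names and field values below are examples only, not required settings:
#
#   # New style: an ordered list of executors, tried in order as fallbacks.
#   manager_config = {
#       'EXECUTORS': [
#           {'EXECUTOR': 'ec2', 'MINIMUM_RETRY_THRESHOLD': 1},
#           {'EXECUTOR': 'kubernetes'},
#       ],
#   }
#
#   # Old style: a single executor with one configuration block.
#   manager_config = {
#       'EXECUTOR': 'ec2',
#       'EXECUTOR_CONFIG': {'EC2_REGION': 'us-east-1'},
#   }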
def shutdown(self):
logger.debug('Shutting down worker.')
if self._orchestrator is not None:
self._orchestrator.shutdown()
@coroutine
def schedule(self, build_job):
build_uuid = build_job.job_details['build_uuid']
logger.debug('Calling schedule with job: %s', build_uuid)
# Check if there are worker slots available by checking the number of jobs in the orchestrator
allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
try:
active_jobs = yield From(self._orchestrator.get_prefixed_keys(self._job_prefix))
workers_alive = len(active_jobs)
except KeyError:
workers_alive = 0
except OrchestratorConnectionError:
logger.exception('Could not read job count from orchestrator for job %s; orchestrator is unavailable',
build_uuid)
raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
except OrchestratorError:
logger.exception('Exception when reading job count from orchestrator for job: %s', build_uuid)
raise Return(False, RETRY_IMMEDIATELY_SLEEP_DURATION)
logger.debug('Total jobs (scheduling job %s): %s', build_uuid, workers_alive)
if workers_alive >= allowed_worker_count:
logger.info('Too many workers alive, unable to start new worker for build job: %s. %s >= %s',
build_uuid, workers_alive, allowed_worker_count)
raise Return(False, TOO_MANY_WORKERS_SLEEP_DURATION)
job_key = self._job_key(build_job)
# First try to take a lock for this job, meaning we will be responsible for its lifecycle.
realm = str(uuid.uuid4())
token = str(uuid.uuid4())
nonce = str(uuid.uuid4())
machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)
payload = {
'max_expiration': calendar.timegm(max_expiration.timetuple()),
'nonce': nonce,
'had_heartbeat': False,
'job_queue_item': build_job.job_item,
}
lock_payload = json.dumps(payload)
logger.debug('Writing key for job %s with expiration in %s seconds', build_uuid,
EPHEMERAL_SETUP_TIMEOUT)
try:
yield From(self._orchestrator.set_key(job_key, lock_payload, overwrite=False,
expiration=EPHEMERAL_SETUP_TIMEOUT))
except KeyError:
logger.warning('Job: %s already exists in orchestrator, timeout may be misconfigured',
build_uuid)
raise Return(False, EPHEMERAL_API_TIMEOUT)
except OrchestratorConnectionError:
logger.exception('Exception when writing job %s to orchestrator; could not connect',
build_uuid)
raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
except OrchestratorError:
logger.exception('Exception when writing job %s to orchestrator', build_uuid)
raise Return(False, RETRY_IMMEDIATELY_SLEEP_DURATION)
# Got a lock, now lets boot the job via one of the registered executors.
started_with_executor = None
execution_id = None
logger.debug("Registered executors are: %s", [ex.name for ex in self._ordered_executors])
for executor in self._ordered_executors:
# Check if we can use this executor based on its whitelist, by namespace.
namespace = build_job.namespace
if not executor.allowed_for_namespace(namespace):
logger.debug('Job %s (namespace: %s) cannot use executor %s', build_uuid, namespace,
executor.name)
continue
# Check if we can use this executor based on the retries remaining.
if executor.minimum_retry_threshold > build_job.retries_remaining:
metric_queue.builder_fallback.Inc()
logger.debug('Job %s cannot use executor %s as it is below retry threshold %s (retry #%s)',
build_uuid, executor.name, executor.minimum_retry_threshold,
build_job.retries_remaining)
continue
logger.debug('Starting builder for job %s with selected executor: %s', build_uuid,
executor.name)
try:
execution_id = yield From(executor.start_builder(realm, token, build_uuid))
except:
try:
metric_queue.build_start_failure.Inc(labelvalues=[executor.name])
metric_queue.put_deprecated(('ExecutorFailure-%s' % executor.name), 1, unit='Count')
except:
logger.exception('Exception when writing failure metric for execution %s for job %s',
execution_id, build_uuid)
logger.exception('Exception when starting builder for job: %s', build_uuid)
continue
try:
metric_queue.build_start_success.Inc(labelvalues=[executor.name])
except:
logger.exception('Exception when writing success metric for execution %s for job %s',
execution_id, build_uuid)
try:
metric_queue.ephemeral_build_workers.Inc()
except:
logger.exception('Exception when writing start metrics for execution %s for job %s',
execution_id, build_uuid)
started_with_executor = executor
# Break out of the loop now that we've started a builder successfully.
break
# If we didn't start the job, cleanup and return it to the queue.
if started_with_executor is None:
logger.error('Could not start ephemeral worker for build %s', build_uuid)
# Delete the associated build job record.
yield From(self._orchestrator.delete_key(job_key))
raise Return(False, EPHEMERAL_API_TIMEOUT)
# Job was started!
logger.debug('Started execution with ID %s for job: %s with executor: %s',
execution_id, build_uuid, started_with_executor.name)
# Store metric data
metric_spec = json.dumps({
'executor_name': started_with_executor.name,
'start_time': time.time(),
})
try:
yield From(self._orchestrator.set_key(self._metric_key(realm), metric_spec, overwrite=False,
expiration=machine_max_expiration + 10))
except KeyError:
logger.error('Realm %s already exists in orchestrator for job %s; '
'UUID collision or something is very wrong.', realm, build_uuid)
except OrchestratorError:
logger.exception('Exception when writing realm %s to orchestrator for job %s',
realm, build_uuid)
# Store the realm spec which will allow any manager to accept this builder when it connects
realm_spec = json.dumps({
'realm': realm,
'token': token,
'execution_id': execution_id,
'executor_name': started_with_executor.name,
'job_queue_item': build_job.job_item,
})
try:
setup_time = started_with_executor.setup_time or self.overall_setup_time()
logger.debug('Writing job key for job %s using executor %s with ID %s and ttl %s', build_uuid,
started_with_executor.name, execution_id, setup_time)
yield From(self._orchestrator.set_key(self._realm_key(realm), realm_spec,
expiration=setup_time))
except OrchestratorConnectionError:
logger.exception('Exception when writing realm %s to orchestrator for job %s',
realm, build_uuid)
raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
except OrchestratorError:
logger.exception('Exception when writing realm %s to orchestrator for job %s',
realm, build_uuid)
raise Return(False, setup_time)
logger.debug('Builder spawn complete for job %s using executor %s with ID %s ',
build_uuid, started_with_executor.name, execution_id)
raise Return(True, None)
@coroutine
def build_component_ready(self, build_component):
logger.debug('Got component ready for component with realm %s', build_component.builder_realm)
# Pop off the job for the component.
# We do so before we send out the watch below, as it will also remove this mapping.
job = self._component_to_job.pop(build_component, None)
if job is None:
# This will occur once the build finishes, so no need to worry about it.
# We log in case it happens outside of the expected flow.
logger.debug('Could not find job for the build component on realm %s; component is ready',
build_component.builder_realm)
raise Return()
# Start the build job.
logger.debug('Sending build %s to newly ready component on realm %s',
job.build_uuid, build_component.builder_realm)
yield From(build_component.start_build(job))
yield From(self._write_duration_metric(metric_queue.builder_time_to_build,
build_component.builder_realm))
# Clean up the bookkeeping for allowing any manager to take the job.
try:
yield From(self._orchestrator.delete_key(self._realm_key(build_component.builder_realm)))
except KeyError:
logger.warning('Could not delete realm key %s', build_component.builder_realm)
def build_component_disposed(self, build_component, timed_out):
logger.debug('Calling build_component_disposed.')
self.unregister_component(build_component)
@coroutine
def job_completed(self, build_job, job_status, build_component):
logger.debug('Calling job_completed for job %s with status: %s',
build_job.build_uuid, job_status)
yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))
# Mark the job as completed. Since this is being invoked from the component, we don't need
# to ask for the phase to be updated as well.
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
executor_name = build_info.executor_name if build_info else None
yield From(self.job_complete_callback(build_job, job_status, executor_name, update_phase=False))
# Kill the ephemeral builder.
yield From(self.kill_builder_executor(build_job.build_uuid))
# Delete the build job from the orchestrator.
try:
job_key = self._job_key(build_job)
yield From(self._orchestrator.delete_key(job_key))
except KeyError:
logger.debug('Builder is asking for job to be removed, but work already completed')
except OrchestratorConnectionError:
logger.exception('Could not remove job key as orchestrator is not available')
yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
raise Return()
# Delete the metric from the orchestrator.
try:
metric_key = self._metric_key(build_component.builder_realm)
yield From(self._orchestrator.delete_key(metric_key))
except KeyError:
logger.debug('Builder is asking for metric to be removed, but key not found')
except OrchestratorConnectionError:
logger.exception('Could not remove metric key as orchestrator is not available')
yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
raise Return()
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
@coroutine
def kill_builder_executor(self, build_uuid):
logger.info('Starting termination of executor for job %s', build_uuid)
build_info = self._build_uuid_to_info.pop(build_uuid, None)
if build_info is None:
logger.debug('Build information not found for build %s; skipping termination', build_uuid)
raise Return()
# Remove the build's component.
self._component_to_job.pop(build_info.component, None)
# Stop the build node/executor itself.
yield From(self.terminate_executor(build_info.executor_name, build_info.execution_id))
@coroutine
def terminate_executor(self, executor_name, execution_id):
executor = self._executor_name_to_executor.get(executor_name)
if executor is None:
logger.error('Could not find registered executor %s', executor_name)
raise Return()
# Terminate the executor's execution.
logger.info('Terminating executor %s with execution id %s', executor_name, execution_id)
yield From(executor.stop_builder(execution_id))
@coroutine
def job_heartbeat(self, build_job):
"""
:param build_job: the identifier for the build
:type build_job: str
"""
self.job_heartbeat_callback(build_job)
self._extend_job_in_orchestrator(build_job)
@coroutine
def _extend_job_in_orchestrator(self, build_job):
try:
job_data = yield From(self._orchestrator.get_key(self._job_key(build_job)))
except KeyError:
logger.info('Job %s no longer exists in the orchestrator', build_job.build_uuid)
raise Return()
except OrchestratorConnectionError:
logger.exception('Failed to connect to the orchestrator when attempting to extend job %s',
build_job.build_uuid)
raise Return()
build_job_metadata = json.loads(job_data)
max_expiration = datetime.utcfromtimestamp(build_job_metadata['max_expiration'])
max_expiration_remaining = max_expiration - datetime.utcnow()
max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds()))
ttl = min(self.heartbeat_period_sec * 2, max_expiration_sec)
payload = {
'job_queue_item': build_job.job_item,
'max_expiration': build_job_metadata['max_expiration'],
'had_heartbeat': True,
}
try:
yield From(self._orchestrator.set_key(self._job_key(build_job), json.dumps(payload),
expiration=ttl))
except OrchestratorConnectionError:
logger.exception('Could not update heartbeat for job as the orchestrator is not available')
yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
@coroutine
def _write_duration_metric(self, metric, realm):
"""
:returns: True if the metric was written, otherwise False
:rtype: bool
"""
try:
metric_data = yield From(self._orchestrator.get_key(self._metric_key(realm)))
parsed_metric_data = json.loads(metric_data)
start_time = parsed_metric_data['start_time']
metric.Observe(time.time() - start_time,
labelvalues=[parsed_metric_data.get('executor_name',
'unknown')])
except Exception:
logger.exception("Could not write metric for realm %s", realm)
def num_workers(self):
"""
The number of workers we're managing locally.
:returns: the number of the workers locally managed
:rtype: int
"""
return len(self._component_to_job)
@coroutine
def _cancel_callback(self, key_change):
if key_change.event not in (KeyEvent.CREATE, KeyEvent.SET):
raise Return()
build_uuid = key_change.value
build_info = self._build_uuid_to_info.get(build_uuid, None)
if build_info is None:
logger.debug('No build info for "%s" job %s', key_change.event, build_uuid)
raise Return(False)
lock_key = slash_join(self._canceled_lock_prefix,
build_uuid, build_info.execution_id)
lock_acquired = yield From(self._orchestrator.lock(lock_key))
if lock_acquired:
builder_realm = build_info.component.builder_realm
yield From(self.kill_builder_executor(build_uuid))
yield From(self._orchestrator.delete_key(self._realm_key(builder_realm)))
yield From(self._orchestrator.delete_key(self._metric_key(builder_realm)))
yield From(self._orchestrator.delete_key(slash_join(self._job_prefix, build_uuid)))
# This is outside the lock so we can un-register the component wherever it is registered to.
yield From(build_info.component.cancel_build())

View file

@ -0,0 +1,37 @@
import logging
import etcd
logger = logging.getLogger(__name__)
class EtcdCanceller(object):
""" A class that sends a message to etcd to cancel a build """
def __init__(self, config):
etcd_host = config.get('ETCD_HOST', '127.0.0.1')
etcd_port = config.get('ETCD_PORT', 2379)
etcd_ca_cert = config.get('ETCD_CA_CERT', None)
etcd_auth = config.get('ETCD_CERT_AND_KEY', None)
if etcd_auth is not None:
etcd_auth = tuple(etcd_auth)
etcd_protocol = 'http' if etcd_auth is None else 'https'
logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
self._cancel_prefix = config.get('ETCD_CANCEL_PREFIX', 'cancel/')
self._etcd_client = etcd.Client(
host=etcd_host,
port=etcd_port,
cert=etcd_auth,
ca_cert=etcd_ca_cert,
protocol=etcd_protocol,
read_timeout=5)
def try_cancel_build(self, build_uuid):
""" Writes etcd message to cancel build_uuid. """
logger.info("Cancelling build %s".format(build_uuid))
try:
self._etcd_client.write("{}{}".format(self._cancel_prefix, build_uuid), build_uuid, ttl=60)
return True
except etcd.EtcdException:
logger.exception("Failed to write to etcd client %s", build_uuid)
return False

View file

@ -0,0 +1,560 @@
import datetime
import hashlib
import logging
import os
import socket
import subprocess
import threading
import uuid
from functools import partial
import boto.ec2
import cachetools.func
import requests
import trollius
from container_cloud_config import CloudConfigContext
from jinja2 import FileSystemLoader, Environment
from trollius import coroutine, From, Return, get_event_loop
import release
from buildman.asyncutil import AsyncWrapper
from app import metric_queue, app
from util.metrics.metricqueue import duration_collector_async
from _init import ROOT_DIR
logger = logging.getLogger(__name__)
ONE_HOUR = 60*60
_TAG_RETRY_COUNT = 3 # Number of times to retry adding tags.
_TAG_RETRY_SLEEP = 2 # Number of seconds to wait between tag retries.
ENV = Environment(loader=FileSystemLoader(os.path.join(ROOT_DIR, "buildman/templates")))
TEMPLATE = ENV.get_template('cloudconfig.yaml')
CloudConfigContext().populate_jinja_environment(ENV)
class ExecutorException(Exception):
""" Exception raised when there is a problem starting or stopping a builder.
"""
pass
class BuilderExecutor(object):
def __init__(self, executor_config, manager_hostname):
""" Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
starting and stopping builders.
"""
self.executor_config = executor_config
self.manager_hostname = manager_hostname
default_websocket_scheme = 'wss' if app.config['PREFERRED_URL_SCHEME'] == 'https' else 'ws'
self.websocket_scheme = executor_config.get("WEBSOCKET_SCHEME", default_websocket_scheme)
@property
def name(self):
""" Name returns the unique name for this executor. """
return self.executor_config.get('NAME') or self.__class__.__name__
@property
def setup_time(self):
""" Returns the amount of time (in seconds) to wait for the execution to start for the build.
If None, the manager's default will be used.
"""
return self.executor_config.get('SETUP_TIME')
@coroutine
def start_builder(self, realm, token, build_uuid):
""" Create a builder with the specified config. Returns a unique id which can be used to manage
the builder.
"""
raise NotImplementedError
@coroutine
def stop_builder(self, builder_id):
""" Stop a builder which is currently running.
"""
raise NotImplementedError
def allowed_for_namespace(self, namespace):
""" Returns true if this executor can be used for builds in the given namespace. """
# Check for an explicit namespace whitelist.
namespace_whitelist = self.executor_config.get('NAMESPACE_WHITELIST')
if namespace_whitelist is not None and namespace in namespace_whitelist:
return True
# Check for a staged rollout percentage. If found, we hash the namespace and, if it is found
# in the first X% of the character space, we allow this executor to be used.
staged_rollout = self.executor_config.get('STAGED_ROLLOUT')
if staged_rollout is not None:
bucket = int(hashlib.sha256(namespace).hexdigest()[-2:], 16)
return bucket < (256 * staged_rollout)
# If there are no restrictions in place, we are free to use this executor.
return staged_rollout is None and namespace_whitelist is None
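# Worked example (illustrative): with STAGED_ROLLOUT = 0.25, a namespace whose SHA-256 digest
# ends in '3f' falls into bucket int('3f', 16) == 63, and 63 < 256 * 0.25 == 64, so the
# executor is allowed for that namespace; a namespace whose digest ends in '40' (bucket 64)
# is not.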
@property
def minimum_retry_threshold(self):
""" Returns the minimum number of retries required for this executor to be used or 0 if
none. """
return self.executor_config.get('MINIMUM_RETRY_THRESHOLD', 0)
def generate_cloud_config(self, realm, token, build_uuid, coreos_channel,
manager_hostname, quay_username=None,
quay_password=None):
if quay_username is None:
quay_username = self.executor_config['QUAY_USERNAME']
if quay_password is None:
quay_password = self.executor_config['QUAY_PASSWORD']
return TEMPLATE.render(
realm=realm,
token=token,
build_uuid=build_uuid,
quay_username=quay_username,
quay_password=quay_password,
manager_hostname=manager_hostname,
websocket_scheme=self.websocket_scheme,
coreos_channel=coreos_channel,
worker_image=self.executor_config.get('WORKER_IMAGE', 'quay.io/coreos/registry-build-worker'),
worker_tag=self.executor_config['WORKER_TAG'],
logentries_token=self.executor_config.get('LOGENTRIES_TOKEN', None),
volume_size=self.executor_config.get('VOLUME_SIZE', '42G'),
max_lifetime_s=self.executor_config.get('MAX_LIFETIME_S', 10800),
ssh_authorized_keys=self.executor_config.get('SSH_AUTHORIZED_KEYS', []),
)
class EC2Executor(BuilderExecutor):
""" Implementation of BuilderExecutor which uses libcloud to start machines on a variety of cloud
providers.
"""
COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt'
def __init__(self, *args, **kwargs):
self._loop = get_event_loop()
super(EC2Executor, self).__init__(*args, **kwargs)
def _get_conn(self):
""" Creates an ec2 connection which can be used to manage instances.
"""
return AsyncWrapper(boto.ec2.connect_to_region(
self.executor_config['EC2_REGION'],
aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'],
aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'],
))
@classmethod
@cachetools.func.ttl_cache(ttl=ONE_HOUR)
def _get_coreos_ami(cls, ec2_region, coreos_channel):
""" Retrieve the CoreOS AMI id from the canonical listing.
"""
stack_list_string = requests.get(EC2Executor.COREOS_STACK_URL % coreos_channel).text
stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
return stack_amis[ec2_region]
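# Illustrative note: the listing fetched above is a '|'-separated string of 'region=ami'
# pairs, e.g. (AMI IDs made up):
#
#   'us-east-1=ami-0123456789abcdef0|us-west-2=ami-0fedcba9876543210'
#
# which _get_coreos_ami parses into a {region: ami_id} dict and then indexes by the
# configured EC2 region.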
@coroutine
@duration_collector_async(metric_queue.builder_time_to_start, ['ec2'])
def start_builder(self, realm, token, build_uuid):
region = self.executor_config['EC2_REGION']
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
coreos_ami = self.executor_config.get('COREOS_AMI', None)
if coreos_ami is None:
get_ami_callable = partial(self._get_coreos_ami, region, channel)
coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
user_data = self.generate_cloud_config(realm, token, build_uuid, channel, self.manager_hostname)
logger.debug('Generated cloud config for build %s: %s', build_uuid, user_data)
ec2_conn = self._get_conn()
ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType(
size=int(self.executor_config.get('BLOCK_DEVICE_SIZE', 48)),
volume_type='gp2',
delete_on_termination=True,
)
block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping()
block_devices['/dev/xvda'] = ssd_root_ebs
interfaces = None
if self.executor_config.get('EC2_VPC_SUBNET_ID', None) is not None:
interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
subnet_id=self.executor_config['EC2_VPC_SUBNET_ID'],
groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
associate_public_ip_address=True,
)
interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
try:
reservation = yield From(ec2_conn.run_instances(
coreos_ami,
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
key_name=self.executor_config.get('EC2_KEY_NAME', None),
user_data=user_data,
instance_initiated_shutdown_behavior='terminate',
block_device_map=block_devices,
network_interfaces=interfaces,
))
except boto.exception.EC2ResponseError as ec2e:
logger.exception('Unable to spawn builder instance')
metric_queue.ephemeral_build_worker_failure.Inc()
raise ec2e
if not reservation.instances:
raise ExecutorException('Unable to spawn builder instance.')
elif len(reservation.instances) != 1:
raise ExecutorException('EC2 started wrong number of instances!')
launched = AsyncWrapper(reservation.instances[0])
# Sleep a few seconds to wait for AWS to spawn the instance.
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
# Tag the instance with its metadata.
for i in range(0, _TAG_RETRY_COUNT):
try:
yield From(launched.add_tags({
'Name': 'Quay Ephemeral Builder',
'Realm': realm,
'Token': token,
'BuildUUID': build_uuid,
}))
except boto.exception.EC2ResponseError as ec2e:
if ec2e.error_code == 'InvalidInstanceID.NotFound':
if i < _TAG_RETRY_COUNT - 1:
logger.warning('Failed to write EC2 tags for instance %s for build %s (attempt #%s)',
launched.id, build_uuid, i)
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
continue
raise ExecutorException('Unable to find builder instance.')
logger.exception('Failed to write EC2 tags (attempt #%s)', i)
logger.debug('Machine with ID %s started for build %s', launched.id, build_uuid)
raise Return(launched.id)
@coroutine
def stop_builder(self, builder_id):
try:
ec2_conn = self._get_conn()
terminated_instances = yield From(ec2_conn.terminate_instances([builder_id]))
except boto.exception.EC2ResponseError as ec2e:
if ec2e.error_code == 'InvalidInstanceID.NotFound':
logger.debug('Instance %s already terminated', builder_id)
return
logger.exception('Exception when trying to terminate instance %s', builder_id)
raise
if builder_id not in [si.id for si in terminated_instances]:
raise ExecutorException('Unable to terminate instance: %s' % builder_id)
class PopenExecutor(BuilderExecutor):
""" Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
"""
def __init__(self, executor_config, manager_hostname):
self._jobs = {}
super(PopenExecutor, self).__init__(executor_config, manager_hostname)
""" Executor which uses Popen to fork a quay-builder process.
"""
@coroutine
@duration_collector_async(metric_queue.builder_time_to_start, ['fork'])
def start_builder(self, realm, token, build_uuid):
# Now start a machine for this job, adding the machine id to the etcd information
logger.debug('Forking process for build')
ws_host = os.environ.get("BUILDMAN_WS_HOST", "localhost")
ws_port = os.environ.get("BUILDMAN_WS_PORT", "8787")
builder_env = {
'TOKEN': token,
'REALM': realm,
'ENDPOINT': 'ws://%s:%s' % (ws_host, ws_port),
'DOCKER_TLS_VERIFY': os.environ.get('DOCKER_TLS_VERIFY', ''),
'DOCKER_CERT_PATH': os.environ.get('DOCKER_CERT_PATH', ''),
'DOCKER_HOST': os.environ.get('DOCKER_HOST', ''),
'PATH': "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
}
logpipe = LogPipe(logging.INFO)
spawned = subprocess.Popen(os.environ.get('BUILDER_BINARY_LOCATION',
'/usr/local/bin/quay-builder'),
stdout=logpipe,
stderr=logpipe,
env=builder_env)
builder_id = str(uuid.uuid4())
self._jobs[builder_id] = (spawned, logpipe)
logger.debug('Builder spawned with id: %s', builder_id)
raise Return(builder_id)
@coroutine
def stop_builder(self, builder_id):
if builder_id not in self._jobs:
raise ExecutorException('Builder id not being tracked by executor.')
logger.debug('Killing builder with id: %s', builder_id)
spawned, logpipe = self._jobs[builder_id]
if spawned.poll() is None:
spawned.kill()
logpipe.close()
class KubernetesExecutor(BuilderExecutor):
""" Executes build jobs by creating Kubernetes jobs which run a qemu-kvm virtual
machine in a pod """
def __init__(self, *args, **kwargs):
super(KubernetesExecutor, self).__init__(*args, **kwargs)
self._loop = get_event_loop()
self.namespace = self.executor_config.get('BUILDER_NAMESPACE', 'builder')
self.image = self.executor_config.get('BUILDER_VM_CONTAINER_IMAGE',
'quay.io/quay/quay-builder-qemu-coreos:stable')
@coroutine
def _request(self, method, path, **kwargs):
request_options = dict(kwargs)
tls_cert = self.executor_config.get('K8S_API_TLS_CERT')
tls_key = self.executor_config.get('K8S_API_TLS_KEY')
tls_ca = self.executor_config.get('K8S_API_TLS_CA')
service_account_token = self.executor_config.get('SERVICE_ACCOUNT_TOKEN')
if 'timeout' not in request_options:
request_options['timeout'] = self.executor_config.get("K8S_API_TIMEOUT", 20)
if service_account_token:
scheme = 'https'
request_options['headers'] = {'Authorization': 'Bearer ' + service_account_token}
logger.debug('Using service account token for Kubernetes authentication')
elif tls_cert and tls_key:
scheme = 'https'
request_options['cert'] = (tls_cert, tls_key)
logger.debug('Using tls certificate and key for Kubernetes authentication')
if tls_ca:
request_options['verify'] = tls_ca
else:
scheme = 'http'
server = self.executor_config.get('K8S_API_SERVER', 'localhost:8080')
url = '%s://%s%s' % (scheme, server, path)
logger.debug('Executor config: %s', self.executor_config)
logger.debug('Kubernetes request: %s %s: %s', method, url, request_options)
res = requests.request(method, url, **request_options)
logger.debug('Kubernetes response: %s: %s', res.status_code, res.text)
raise Return(res)
def _jobs_path(self):
return '/apis/batch/v1/namespaces/%s/jobs' % self.namespace
def _job_path(self, build_uuid):
return '%s/%s' % (self._jobs_path(), build_uuid)
def _kubernetes_distribution(self):
return self.executor_config.get('KUBERNETES_DISTRIBUTION', 'basic').lower()
def _is_basic_kubernetes_distribution(self):
return self._kubernetes_distribution() == 'basic'
def _is_openshift_kubernetes_distribution(self):
return self._kubernetes_distribution() == 'openshift'
def _build_job_container_resources(self):
# Minimum acceptable free resources for this container to "fit" in a quota
# These may be lower than the absolute limits if the cluster is knowingly
# oversubscribed by some amount.
container_requests = {
'memory' : self.executor_config.get('CONTAINER_MEMORY_REQUEST', '3968Mi'),
}
container_limits = {
'memory' : self.executor_config.get('CONTAINER_MEMORY_LIMITS', '5120Mi'),
'cpu' : self.executor_config.get('CONTAINER_CPU_LIMITS', '1000m'),
}
resources = {
'requests': container_requests,
}
if self._is_openshift_kubernetes_distribution():
resources['requests']['cpu'] = self.executor_config.get('CONTAINER_CPU_REQUEST', '500m')
resources['limits'] = container_limits
return resources
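# Illustrative sketch of the dict produced above with the default configuration values on an
# OpenShift distribution (the numbers are simply the defaults shown in the code):
#
#   {
#       'requests': {'memory': '3968Mi', 'cpu': '500m'},
#       'limits': {'memory': '5120Mi', 'cpu': '1000m'},
#   }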
def _build_job_containers(self, user_data):
vm_memory_limit = self.executor_config.get('VM_MEMORY_LIMIT', '4G')
vm_volume_size = self.executor_config.get('VOLUME_SIZE', '32G')
container = {
'name': 'builder',
'imagePullPolicy': 'IfNotPresent',
'image': self.image,
'securityContext': {'privileged': True},
'env': [
{'name': 'USERDATA', 'value': user_data},
{'name': 'VM_MEMORY', 'value': vm_memory_limit},
{'name': 'VM_VOLUME_SIZE', 'value': vm_volume_size},
],
'resources': self._build_job_container_resources(),
}
if self._is_basic_kubernetes_distribution():
container['volumeMounts'] = [{'name': 'secrets-mask','mountPath': '/var/run/secrets/kubernetes.io/serviceaccount'}]
return container
def _job_resource(self, build_uuid, user_data, coreos_channel='stable'):
image_pull_secret_name = self.executor_config.get('IMAGE_PULL_SECRET_NAME', 'builder')
service_account = self.executor_config.get('SERVICE_ACCOUNT_NAME', 'quay-builder-sa')
node_selector_label_key = self.executor_config.get('NODE_SELECTOR_LABEL_KEY', 'beta.kubernetes.io/instance-type')
node_selector_label_value = self.executor_config.get('NODE_SELECTOR_LABEL_VALUE', '')
node_selector = {
node_selector_label_key : node_selector_label_value
}
release_sha = release.GIT_HEAD or 'none'
if ' ' in release_sha:
release_sha = 'HEAD'
job_resource = {
'apiVersion': 'batch/v1',
'kind': 'Job',
'metadata': {
'namespace': self.namespace,
'generateName': build_uuid + '-',
'labels': {
'build': build_uuid,
'time': datetime.datetime.now().strftime('%Y-%m-%d-%H'),
'manager': socket.gethostname(),
'quay-sha': release_sha,
},
},
'spec' : {
'activeDeadlineSeconds': self.executor_config.get('MAXIMUM_JOB_TIME', 7200),
'template': {
'metadata': {
'labels': {
'build': build_uuid,
'time': datetime.datetime.now().strftime('%Y-%m-%d-%H'),
'manager': socket.gethostname(),
'quay-sha': release_sha,
},
},
'spec': {
'imagePullSecrets': [{ 'name': image_pull_secret_name }],
'restartPolicy': 'Never',
'dnsPolicy': 'Default',
'containers': [self._build_job_containers(user_data)],
},
},
},
}
if self._is_openshift_kubernetes_distribution():
# Setting `automountServiceAccountToken` to false will prevent automounting API credentials for a service account.
job_resource['spec']['template']['spec']['automountServiceAccountToken'] = False
# Use dedicated service account that has no authorization to any resources.
job_resource['spec']['template']['spec']['serviceAccount'] = service_account
# Setting `enableServiceLinks` to false prevents information about other services from being injected into pod's
# environment variables. Pod has no visibility into other services on the cluster.
job_resource['spec']['template']['spec']['enableServiceLinks'] = False
if node_selector_label_value.strip() != '':
job_resource['spec']['template']['spec']['nodeSelector'] = node_selector
if self._is_basic_kubernetes_distribution():
# This volume is a hack to mask the token for the namespace's
# default service account, which is placed in a file mounted under
# `/var/run/secrets/kubernetes.io/serviceaccount` in all pods.
# There's currently no other way to just disable the service
# account at either the pod or namespace level.
#
# https://github.com/kubernetes/kubernetes/issues/16779
#
job_resource['spec']['template']['spec']['volumes'] = [{'name': 'secrets-mask','emptyDir': {'medium': 'Memory'}}]
return job_resource
@coroutine
@duration_collector_async(metric_queue.builder_time_to_start, ['k8s'])
def start_builder(self, realm, token, build_uuid):
# generate resource
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
user_data = self.generate_cloud_config(realm, token, build_uuid, channel, self.manager_hostname)
resource = self._job_resource(build_uuid, user_data, channel)
logger.debug('Using Kubernetes Distribution: %s', self._kubernetes_distribution())
logger.debug('Generated kubernetes resource:\n%s', resource)
# schedule
create_job = yield From(self._request('POST', self._jobs_path(), json=resource))
if int(create_job.status_code / 100) != 2:
raise ExecutorException('Failed to create job: %s: %s: %s' %
(build_uuid, create_job.status_code, create_job.text))
job = create_job.json()
raise Return(job['metadata']['name'])
@coroutine
def stop_builder(self, builder_id):
pods_path = '/api/v1/namespaces/%s/pods' % self.namespace
# Delete the job itself.
try:
yield From(self._request('DELETE', self._job_path(builder_id)))
except:
logger.exception('Failed to send delete job call for job %s', builder_id)
# Delete the pod(s) for the job.
selectorString = "job-name=%s" % builder_id
try:
yield From(self._request('DELETE', pods_path, params=dict(labelSelector=selectorString)))
except:
logger.exception("Failed to send delete pod call for job %s", builder_id)
class LogPipe(threading.Thread):
""" Adapted from http://codereview.stackexchange.com/a/17959
"""
def __init__(self, level):
"""Setup the object with a logger and a loglevel
and start the thread
"""
threading.Thread.__init__(self)
self.daemon = False
self.level = level
self.fd_read, self.fd_write = os.pipe()
self.pipe_reader = os.fdopen(self.fd_read)
self.start()
def fileno(self):
"""Return the write file descriptor of the pipe
"""
return self.fd_write
def run(self):
"""Run the thread, logging everything.
"""
for line in iter(self.pipe_reader.readline, ''):
logging.log(self.level, line.strip('\n'))
self.pipe_reader.close()
def close(self):
"""Close the write end of the pipe.
"""
os.close(self.fd_write)

View file

@ -0,0 +1,8 @@
class NoopCanceller(object):
""" A class that can not cancel a build """
def __init__(self, config=None):
pass
def try_cancel_build(self, uuid):
""" Does nothing and fails to cancel build. """
return False

View file

@ -0,0 +1,26 @@
import logging
from buildman.orchestrator import orchestrator_from_config, OrchestratorError
from util import slash_join
logger = logging.getLogger(__name__)
CANCEL_PREFIX = 'cancel/'
class OrchestratorCanceller(object):
""" An asynchronous way to cancel a build with any Orchestrator. """
def __init__(self, config):
self._orchestrator = orchestrator_from_config(config, canceller_only=True)
def try_cancel_build(self, build_uuid):
logger.info('Cancelling build %s', build_uuid)
cancel_key = slash_join(CANCEL_PREFIX, build_uuid)
try:
self._orchestrator.set_key_sync(cancel_key, build_uuid, expiration=60)
return True
except OrchestratorError:
logger.exception('Failed to write cancel action to the orchestrator for build %s', build_uuid)
return False

753
buildman/orchestrator.py Normal file
View file

@ -0,0 +1,753 @@
from abc import ABCMeta, abstractmethod
from collections import namedtuple
import datetime
import json
import logging
import re
import time
from enum import IntEnum, unique
from six import add_metaclass, iteritems
from trollius import async, coroutine, From, Return
from urllib3.exceptions import ReadTimeoutError, ProtocolError
import etcd
import redis
from buildman.asyncutil import wrap_with_threadpool
from util import slash_join
from util.expiresdict import ExpiresDict
logger = logging.getLogger(__name__)
ONE_DAY = 60 * 60 * 24
ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION = 5
DEFAULT_LOCK_EXPIRATION = 10000
ETCD_READ_TIMEOUT = 5
ETCD_MAX_WATCH_TIMEOUT = 30
REDIS_EXPIRING_SUFFIX = '/expiring'
REDIS_DEFAULT_PUBSUB_KEY = 'orchestrator_events'
REDIS_EVENT_KIND_MESSAGE = 'message'
REDIS_EVENT_KIND_PMESSAGE = 'pmessage'
REDIS_NONEXPIRING_KEY = -1
# This constant defines the Redis configuration flags used to watch [K]eyspace and e[x]pired
# events on keys. For more info, see https://redis.io/topics/notifications#configuration
REDIS_KEYSPACE_EVENT_CONFIG_VALUE = 'Kx'
REDIS_KEYSPACE_EVENT_CONFIG_KEY = 'notify-keyspace-events'
REDIS_KEYSPACE_KEY_PATTERN = '__keyspace@%s__:%s'
REDIS_EXPIRED_KEYSPACE_PATTERN = slash_join(REDIS_KEYSPACE_KEY_PATTERN, REDIS_EXPIRING_SUFFIX)
REDIS_EXPIRED_KEYSPACE_REGEX = re.compile(REDIS_EXPIRED_KEYSPACE_PATTERN % (r'(\S+)', r'(\S+)'))
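# Illustrative note (key name made up): with db 0, an expiring key 'buildman/job/1234' is
# shadowed by a companion key 'buildman/job/1234/expiring', and its expiration arrives on the
# keyspace channel '__keyspace@0__:buildman/job/1234/expiring'; REDIS_EXPIRED_KEYSPACE_REGEX
# then recovers the original key name ('buildman/job/1234') from that channel.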
def orchestrator_from_config(manager_config, canceller_only=False):
"""
Allocates a new Orchestrator from the 'ORCHESTRATOR' block from provided manager config.
Checks for legacy configuration prefixed with 'ETCD_' when the 'ORCHESTRATOR' is not present.
:param manager_config: the configuration for the orchestrator
:type manager_config: dict
:rtype: :class: Orchestrator
"""
# Legacy codepath only knows how to configure etcd.
if manager_config.get('ORCHESTRATOR') is None:
manager_config['ORCHESTRATOR'] = {key: value
for (key, value) in iteritems(manager_config)
if key.startswith('ETCD_') and not key.endswith('_PREFIX')}
# Sanity check that legacy prefixes are no longer being used.
for key in manager_config['ORCHESTRATOR'].keys():
words = key.split('_')
if len(words) > 1 and words[-1].lower() == 'prefix':
raise AssertionError('legacy prefix used, use ORCHESTRATOR_PREFIX instead')
def _dict_key_prefix(d):
"""
:param d: the dict that has keys prefixed with underscore
:type d: {str: any}
:rtype: str
"""
return d.keys()[0].split('_', 1)[0].lower()
orchestrator_name = _dict_key_prefix(manager_config['ORCHESTRATOR'])
def format_key(key):
return key.lower().split('_', 1)[1]
orchestrator_kwargs = {format_key(key): value
for (key, value) in iteritems(manager_config['ORCHESTRATOR'])}
if manager_config.get('ORCHESTRATOR_PREFIX') is not None:
orchestrator_kwargs['orchestrator_prefix'] = manager_config['ORCHESTRATOR_PREFIX']
orchestrator_kwargs['canceller_only'] = canceller_only
logger.debug('attempting to create orchestrator %s with kwargs %s',
orchestrator_name, orchestrator_kwargs)
return orchestrator_by_name(orchestrator_name, **orchestrator_kwargs)
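# Illustrative sketch (names and values are examples, not requirements): a manager config of
#
#   {'ORCHESTRATOR': {'REDIS_HOST': 'redis.example.com', 'REDIS_PORT': 6379},
#    'ORCHESTRATOR_PREFIX': 'buildman/'}
#
# yields orchestrator_name == 'redis' and kwargs of
#   {'host': 'redis.example.com', 'port': 6379,
#    'orchestrator_prefix': 'buildman/', 'canceller_only': False}
# which orchestrator_by_name() uses to construct a RedisOrchestrator.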
def orchestrator_by_name(name, **kwargs):
_ORCHESTRATORS = {
'etcd': Etcd2Orchestrator,
'mem': MemoryOrchestrator,
'redis': RedisOrchestrator,
}
return _ORCHESTRATORS.get(name, MemoryOrchestrator)(**kwargs)
class OrchestratorError(Exception):
pass
# TODO: replace with ConnectionError when this codebase is Python 3.
class OrchestratorConnectionError(OrchestratorError):
pass
@unique
class KeyEvent(IntEnum):
CREATE = 1
SET = 2
DELETE = 3
EXPIRE = 4
class KeyChange(namedtuple('KeyChange', ['event', 'key', 'value'])):
pass
@add_metaclass(ABCMeta)
class Orchestrator(object):
"""
Orchestrator is the interface that is used to synchronize the build states
across build managers.
This interface assumes that storage is being done by a key-value store
that supports watching for events on keys.
Missing keys should return KeyError; otherwise, errors should raise an
OrchestratorError.
:param key_prefix: the prefix of keys being watched
:type key_prefix: str
"""
@abstractmethod
def on_key_change(self, key, callback, restarter=None):
"""
The callback parameter takes in a KeyChange object as a parameter.
"""
pass
@abstractmethod
def get_prefixed_keys(self, prefix):
"""
:returns: a dict of key value pairs beginning with prefix
:rtype: {str: str}
"""
pass
@abstractmethod
def get_key(self, key):
"""
:returns: the value stored at the provided key
:rtype: str
"""
pass
@abstractmethod
def set_key(self, key, value, overwrite=False, expiration=None):
"""
:param key: the identifier for the value
:type key: str
:param value: the value being stored
:type value: str
:param overwrite: whether or not a KeyError is thrown if the key already exists
:type overwrite: bool
:param expiration: the duration in seconds that a key should be available
:type expiration: int
"""
pass
@abstractmethod
def set_key_sync(self, key, value, overwrite=False, expiration=None):
"""
set_key, but without trollius coroutines.
"""
pass
@abstractmethod
def delete_key(self, key):
"""
Deletes a key that has been set in the orchestrator.
:param key: the identifier for the key
:type key: str
"""
pass
@abstractmethod
def lock(self, key, expiration=DEFAULT_LOCK_EXPIRATION):
"""
Takes a lock for synchronizing exclusive operations cluster-wide.
:param key: the identifier for the lock
:type key: str
:param expiration: the duration until the lock expires
:type expiration: :class:`datetime.timedelta` or int (seconds)
:returns: whether or not the lock was acquired
:rtype: bool
"""
pass
@abstractmethod
def shutdown(self):
"""
This function should shutdown any final resources allocated by the Orchestrator.
"""
pass
def _sleep_orchestrator():
"""
This function blocks the trollius event loop by sleeping in order to backoff if a failure
such as a ConnectionError has occurred.
"""
logger.exception('Connecting to the orchestrator failed; sleeping for %s and then trying again',
ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
time.sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
logger.exception('Connecting to the orchestrator failed; slept for %s and now trying again',
ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
class EtcdAction(object):
""" Enumeration of the various kinds of etcd actions we can observe via a watch. """
GET = 'get'
SET = 'set'
EXPIRE = 'expire'
UPDATE = 'update'
DELETE = 'delete'
CREATE = 'create'
COMPARE_AND_SWAP = 'compareAndSwap'
COMPARE_AND_DELETE = 'compareAndDelete'
class Etcd2Orchestrator(Orchestrator):
def __init__(self, host='127.0.0.1', port=2379, cert_and_key=None, ca_cert=None,
client_threads=5, canceller_only=False, **kwargs):
self.is_canceller_only = canceller_only
logger.debug('initializing async etcd client')
self._sync_etcd_client = etcd.Client(
host=host,
port=port,
cert=tuple(cert_and_key) if cert_and_key is not None else None,
ca_cert=ca_cert,
protocol='http' if cert_and_key is None else 'https',
read_timeout=ETCD_READ_TIMEOUT,
)
if not self.is_canceller_only:
(self._etcd_client, self._async_executor) = wrap_with_threadpool(self._sync_etcd_client,
client_threads)
logger.debug('creating initial orchestrator state')
self._shutting_down = False
self._watch_tasks = {}
@staticmethod
def _sanity_check_ttl(ttl):
"""
A TTL of < 0 in etcd results in the key *never being expired*.
We use a max here to ensure that if the TTL is < 0, the key will expire immediately.
"""
return max(ttl, 0)
def _watch_etcd(self, key, callback, restarter=None, start_index=None):
def callback_wrapper(changed_key_future):
new_index = start_index
etcd_result = None
if not changed_key_future.cancelled():
try:
etcd_result = changed_key_future.result()
existing_index = getattr(etcd_result, 'etcd_index', None)
new_index = etcd_result.modifiedIndex + 1
logger.debug('Got watch of key: %s at #%s with result: %s',
key, existing_index, etcd_result)
except ReadTimeoutError:
logger.debug('Read-timeout on etcd watch %s, rescheduling', key)
except etcd.EtcdEventIndexCleared:
# This happens if etcd2 has moved forward too fast for us to start watching at the index
# we retrieved. We therefore start a new watch at HEAD and (if specified) call the
# restarter method which should conduct a read and reset the state of the manager.
logger.debug('Etcd moved forward too quickly. Restarting watch cycle.')
new_index = None
if restarter is not None:
async(restarter())
except (KeyError, etcd.EtcdKeyError):
logger.debug('Etcd key already cleared: %s', key)
return
except etcd.EtcdConnectionFailed:
_sleep_orchestrator()
except etcd.EtcdException as eex:
# TODO: This is a quick and dirty hack and should be replaced with a proper
# exception check.
if str(eex.message).find('Read timed out') >= 0:
logger.debug('Read-timeout on etcd watch %s, rescheduling', key)
else:
logger.exception('Exception on etcd watch: %s', key)
except ProtocolError:
logger.exception('Exception on etcd watch: %s', key)
if key not in self._watch_tasks or self._watch_tasks[key].done():
self._watch_etcd(key, callback, start_index=new_index, restarter=restarter)
if etcd_result and etcd_result.value is not None:
async(callback(self._etcd_result_to_keychange(etcd_result)))
if not self._shutting_down:
logger.debug('Scheduling watch of key: %s at start index %s', key, start_index)
watch_future = self._etcd_client.watch(key, recursive=True, index=start_index,
timeout=ETCD_MAX_WATCH_TIMEOUT)
watch_future.add_done_callback(callback_wrapper)
self._watch_tasks[key] = async(watch_future)
@staticmethod
def _etcd_result_to_keychange(etcd_result):
event = Etcd2Orchestrator._etcd_result_to_keyevent(etcd_result)
return KeyChange(event, etcd_result.key, etcd_result.value)
@staticmethod
def _etcd_result_to_keyevent(etcd_result):
if etcd_result.action == EtcdAction.CREATE:
return KeyEvent.CREATE
if etcd_result.action == EtcdAction.SET:
return KeyEvent.CREATE if etcd_result.createdIndex == etcd_result.modifiedIndex else KeyEvent.SET
if etcd_result.action == EtcdAction.DELETE:
return KeyEvent.DELETE
if etcd_result.action == EtcdAction.EXPIRE:
return KeyEvent.EXPIRE
raise AssertionError('etcd action must have an equivalent key event')
def on_key_change(self, key, callback, restarter=None):
assert not self.is_canceller_only
logger.debug('creating watch on %s', key)
self._watch_etcd(key, callback, restarter=restarter)
@coroutine
def get_prefixed_keys(self, prefix):
assert not self.is_canceller_only
try:
etcd_result = yield From(self._etcd_client.read(prefix, recursive=True))
raise Return({leaf.key: leaf.value for leaf in etcd_result.leaves})
except etcd.EtcdKeyError:
raise KeyError
except etcd.EtcdConnectionFailed as ex:
raise OrchestratorConnectionError(ex)
except etcd.EtcdException as ex:
raise OrchestratorError(ex)
@coroutine
def get_key(self, key):
assert not self.is_canceller_only
try:
# Ignore pylint: the value property on EtcdResult is added dynamically using setattr.
etcd_result = yield From(self._etcd_client.read(key))
raise Return(etcd_result.value)
except etcd.EtcdKeyError:
raise KeyError
except etcd.EtcdConnectionFailed as ex:
raise OrchestratorConnectionError(ex)
except etcd.EtcdException as ex:
raise OrchestratorError(ex)
@coroutine
def set_key(self, key, value, overwrite=False, expiration=None):
assert not self.is_canceller_only
yield From(self._etcd_client.write(key, value, prevExists=overwrite,
ttl=self._sanity_check_ttl(expiration)))
def set_key_sync(self, key, value, overwrite=False, expiration=None):
self._sync_etcd_client.write(key, value, prevExists=overwrite,
ttl=self._sanity_check_ttl(expiration))
@coroutine
def delete_key(self, key):
assert not self.is_canceller_only
try:
yield From(self._etcd_client.delete(key))
except etcd.EtcdKeyError:
raise KeyError
except etcd.EtcdConnectionFailed as ex:
raise OrchestratorConnectionError(ex)
except etcd.EtcdException as ex:
raise OrchestratorError(ex)
@coroutine
def lock(self, key, expiration=DEFAULT_LOCK_EXPIRATION):
assert not self.is_canceller_only
try:
yield From(self._etcd_client.write(key, {}, prevExist=False,
ttl=self._sanity_check_ttl(expiration)))
raise Return(True)
except (KeyError, etcd.EtcdKeyError):
raise Return(False)
except etcd.EtcdConnectionFailed:
logger.exception('Could not get etcd atomic lock as etcd is down')
raise Return(False)
except etcd.EtcdException as ex:
raise OrchestratorError(ex)
def shutdown(self):
logger.debug('Shutting down etcd client.')
self._shutting_down = True
if self.is_canceller_only:
return
for key, task in self._watch_tasks.items():
if not task.done():
logger.debug('Canceling watch task for %s', key)
task.cancel()
if self._async_executor is not None:
self._async_executor.shutdown()
class MemoryOrchestrator(Orchestrator):
def __init__(self, **kwargs):
self.state = ExpiresDict()
self.callbacks = {}
def _callbacks_prefixed(self, prefix):
return (callback for (key, callback) in iteritems(self.callbacks)
if key.startswith(prefix))
def on_key_change(self, key, callback, restarter=None):
self.callbacks[key] = callback
@coroutine
def get_prefixed_keys(self, prefix):
raise Return({k: value for (k, value) in self.state.items()
if k.startswith(prefix)})
@coroutine
def get_key(self, key):
raise Return(self.state[key])
@coroutine
def set_key(self, key, value, overwrite=False, expiration=None):
preexisting_key = key in self.state
if preexisting_key and not overwrite:
raise KeyError
absolute_expiration = None
if expiration is not None:
absolute_expiration = datetime.datetime.now() + datetime.timedelta(seconds=expiration)
self.state.set(key, value, expires=absolute_expiration)
event = KeyEvent.CREATE if not preexisting_key else KeyEvent.SET
for callback in self._callbacks_prefixed(key):
yield From(callback(KeyChange(event, key, value)))
def set_key_sync(self, key, value, overwrite=False, expiration=None):
"""
set_key, but without trollius coroutines.
"""
preexisting_key = key in self.state
if preexisting_key and not overwrite:
raise KeyError
absolute_expiration = None
if expiration is not None:
absolute_expiration = datetime.datetime.now() + datetime.timedelta(seconds=expiration)
self.state.set(key, value, expires=absolute_expiration)
event = KeyEvent.CREATE if not preexisting_key else KeyEvent.SET
for callback in self._callbacks_prefixed(key):
callback(KeyChange(event, key, value))
@coroutine
def delete_key(self, key):
value = self.state[key]
del self.state[key]
for callback in self._callbacks_prefixed(key):
yield From(callback(KeyChange(KeyEvent.DELETE, key, value)))
@coroutine
def lock(self, key, expiration=DEFAULT_LOCK_EXPIRATION):
if key in self.state:
raise Return(False)
self.state.set(key, None, expires=expiration)
raise Return(True)
def shutdown(self):
self.state = None
self.callbacks = None
class RedisOrchestrator(Orchestrator):
def __init__(self, host='127.0.0.1', port=6379, password=None, db=0, cert_and_key=None,
ca_cert=None, client_threads=5, ssl=False, skip_keyspace_event_setup=False,
canceller_only=False, **kwargs):
self.is_canceller_only = canceller_only
(cert, key) = tuple(cert_and_key) if cert_and_key is not None else (None, None)
self._sync_client = redis.StrictRedis(
host=host,
port=port,
password=password,
db=db,
ssl_certfile=cert,
ssl_keyfile=key,
ssl_ca_certs=ca_cert,
ssl=ssl,
)
self._shutting_down = False
self._tasks = {}
self._watched_keys = {}
self._pubsub_key = slash_join(kwargs.get('orchestrator_prefix', ''),
REDIS_DEFAULT_PUBSUB_KEY).lstrip('/')
if not self.is_canceller_only:
(self._client, self._async_executor) = wrap_with_threadpool(self._sync_client, client_threads)
# Configure a subscription to watch events that the orchestrator manually publishes.
logger.debug('creating pubsub with key %s', self._pubsub_key)
published_pubsub = self._sync_client.pubsub()
published_pubsub.subscribe(self._pubsub_key)
(self._pubsub, self._async_executor_pub) = wrap_with_threadpool(published_pubsub)
self._watch_published_key()
# Configure a subscription to watch expired keyspace events.
if not skip_keyspace_event_setup:
self._sync_client.config_set(REDIS_KEYSPACE_EVENT_CONFIG_KEY,
REDIS_KEYSPACE_EVENT_CONFIG_VALUE)
expiring_pubsub = self._sync_client.pubsub()
expiring_pubsub.psubscribe(REDIS_EXPIRED_KEYSPACE_PATTERN % (db, '*'))
(self._pubsub_expiring, self._async_executor_ex) = wrap_with_threadpool(expiring_pubsub)
self._watch_expiring_key()
def _watch_published_key(self):
def published_callback_wrapper(event_future):
logger.debug('published callback called')
event_result = None
if not event_future.cancelled():
try:
event_result = event_future.result()
(redis_event, event_key, event_value) = event_result
logger.debug('Got watch of key: (%s, %s, %s)', redis_event, event_key, event_value)
except redis.ConnectionError:
_sleep_orchestrator()
except redis.RedisError:
logger.exception('Exception watching redis publish: %s', event_key)
# Schedule creating a new future if this one has been consumed.
if 'pub' not in self._tasks or self._tasks['pub'].done():
self._watch_published_key()
if event_result is not None and redis_event == REDIS_EVENT_KIND_MESSAGE:
keychange = self._publish_to_keychange(event_value)
for watched_key, callback in iteritems(self._watched_keys):
if keychange.key.startswith(watched_key):
async(callback(keychange))
if not self._shutting_down:
logger.debug('Scheduling watch of publish stream')
watch_future = self._pubsub.parse_response()
watch_future.add_done_callback(published_callback_wrapper)
self._tasks['pub'] = async(watch_future)
def _watch_expiring_key(self):
def expiring_callback_wrapper(event_future):
logger.debug('expiring callback called')
event_result = None
if not event_future.cancelled():
try:
event_result = event_future.result()
if self._is_expired_keyspace_event(event_result):
# Get the value of the original key before the expiration happened.
key = self._key_from_expiration(event_future)
expired_value = yield From(self._client.get(key))
# $KEY/expiring is gone, but the original key still remains, set an expiration for it
# so that other managers have time to get the event and still read the expired value.
yield From(self._client.expire(key, ONE_DAY))
except redis.ConnectionError:
_sleep_orchestrator()
except redis.RedisError:
logger.exception('Exception watching redis expirations: %s', key)
# Schedule creating a new future if this one has been consumed.
if 'expire' not in self._tasks or self._tasks['expire'].done():
self._watch_expiring_key()
if self._is_expired_keyspace_event(event_result) and expired_value is not None:
for watched_key, callback in iteritems(self._watched_keys):
if key.startswith(watched_key):
async(callback(KeyChange(KeyEvent.EXPIRE, key, expired_value)))
if not self._shutting_down:
logger.debug('Scheduling watch of expiration')
watch_future = self._pubsub_expiring.parse_response()
watch_future.add_done_callback(expiring_callback_wrapper)
self._tasks['expire'] = async(watch_future)
def on_key_change(self, key, callback, restarter=None):
assert not self.is_canceller_only
logger.debug('watching key: %s', key)
self._watched_keys[key] = callback
@staticmethod
def _is_expired_keyspace_event(event_result):
"""
Sanity check that this isn't an unrelated keyspace event.
There could be a more efficient keyspace event config to avoid this client-side filter.
"""
if event_result is None:
return False
(redis_event, _pattern, matched_key, expired) = event_result
return (redis_event == REDIS_EVENT_KIND_PMESSAGE and
expired == 'expired' and
REDIS_EXPIRED_KEYSPACE_REGEX.match(matched_key) is not None)
@staticmethod
def _key_from_expiration(event_result):
(_redis_event, _pattern, matched_key, _expired) = event_result
return REDIS_EXPIRED_KEYSPACE_REGEX.match(matched_key).groups()[1]
@staticmethod
def _publish_to_keychange(event_value):
e = json.loads(event_value)
return KeyChange(KeyEvent(e['event']), e['key'], e['value'])
@coroutine
def get_prefixed_keys(self, prefix):
assert not self.is_canceller_only
# TODO: This can probably be done with redis pipelines to make it transactional.
keys = yield From(self._client.keys(prefix + '*'))
# Yielding to the event loop is required, thus this cannot be written as a dict comprehension.
results = {}
for key in keys:
if key.endswith(REDIS_EXPIRING_SUFFIX):
continue
ttl = yield From(self._client.ttl(key))
if ttl != REDIS_NONEXPIRING_KEY:
# Only redis keys without expirations are live build manager keys.
value = yield From(self._client.get(key))
results.update({key: value})
raise Return(results)
@coroutine
def get_key(self, key):
assert not self.is_canceller_only
value = yield From(self._client.get(key))
raise Return(value)
@coroutine
def set_key(self, key, value, overwrite=False, expiration=None):
assert not self.is_canceller_only
already_exists = yield From(self._client.exists(key))
yield From(self._client.set(key, value, xx=overwrite))
if expiration is not None:
yield From(self._client.set(slash_join(key, REDIS_EXPIRING_SUFFIX), value,
xx=overwrite, ex=expiration))
key_event = KeyEvent.SET if already_exists else KeyEvent.CREATE
yield From(self._publish(event=key_event, key=key, value=value))
def set_key_sync(self, key, value, overwrite=False, expiration=None):
already_exists = self._sync_client.exists(key)
self._sync_client.set(key, value, xx=overwrite)
if expiration is not None:
self._sync_client.set(slash_join(key, REDIS_EXPIRING_SUFFIX), value,
xx=overwrite, ex=expiration)
self._sync_client.publish(self._pubsub_key, json.dumps({
'event': int(KeyEvent.SET if already_exists else KeyEvent.CREATE),
'key': key,
'value': value,
}))
@coroutine
def _publish(self, **kwargs):
kwargs['event'] = int(kwargs['event'])
event_json = json.dumps(kwargs)
logger.debug('publishing event: %s', event_json)
yield From(self._client.publish(self._pubsub_key, event_json))
@coroutine
def delete_key(self, key):
assert not self.is_canceller_only
value = yield From(self._client.get(key))
yield From(self._client.delete(key))
yield From(self._client.delete(slash_join(key, REDIS_EXPIRING_SUFFIX)))
yield From(self._publish(event=KeyEvent.DELETE, key=key, value=value))
@coroutine
def lock(self, key, expiration=DEFAULT_LOCK_EXPIRATION):
assert not self.is_canceller_only
yield From(self.set_key(key, '', expiration=expiration))
raise Return(True)
@coroutine
def shutdown(self):
logger.debug('Shutting down redis client.')
self._shutting_down = True
if self.is_canceller_only:
return
for key, task in iteritems(self._tasks):
if not task.done():
logger.debug('Canceling watch task for %s', key)
task.cancel()
if self._async_executor is not None:
self._async_executor.shutdown()
if self._async_executor_ex is not None:
self._async_executor_ex.shutdown()
if self._async_executor_pub is not None:
self._async_executor_pub.shutdown()

View file

@ -0,0 +1,26 @@
FROM debian
RUN apt-get clean && apt-get update && apt-get upgrade -y # 03APR2017
RUN apt-get install -y \
bzip2 \
curl \
openssh-client \
qemu-kvm
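# channel/version select which CoreOS image is baked into the builder; override them at build
# time with --build-arg (see the accompanying README).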
ARG channel=stable
ARG version=current
RUN echo "Downloading http://${channel}.release.core-os.net/amd64-usr/${version}/coreos_production_qemu_image.img.bz2"
RUN curl -s -O http://${channel}.release.core-os.net/amd64-usr/${version}/coreos_production_qemu_image.img.bz2 && \
bzip2 -d coreos_production_qemu_image.img.bz2
RUN apt-get remove -y curl bzip2 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
COPY start.sh /start.sh
LABEL com.coreos.channel=${channel}
LABEL com.coreos.version=${version}
ENTRYPOINT ["/bin/bash", "/start.sh"]

View file

@ -0,0 +1,5 @@
# Builder Image
```
docker build --build-arg channel=stable --build-arg version=current -t quay.io/quay/quay-builder-qemu-coreos:staging .
```

View file

@ -0,0 +1,26 @@
#!/bin/bash
VM_VOLUME_SIZE="${VM_VOLUME_SIZE:-32G}"
VM_MEMORY="${VM_MEMORY:-4G}"
set -e
set -x
set -o nounset
mkdir -p /userdata/openstack/latest
echo "${USERDATA}" > /userdata/openstack/latest/user_data
time qemu-img resize ./coreos_production_qemu_image.img "${VM_VOLUME_SIZE}"
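# Boot the CoreOS image under KVM, exposing /userdata over virtio-9p as the "config-2" config
# drive (the OpenStack config-drive layout written above, which CoreOS reads user_data from)
# and forwarding host port 2222 to the guest's SSH port 22.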
qemu-system-x86_64 \
-enable-kvm \
-cpu host \
-device virtio-9p-pci,fsdev=conf,mount_tag=config-2 \
-nographic \
-drive if=virtio,file=./coreos_production_qemu_image.img \
-fsdev local,id=conf,security_model=none,readonly,path=/userdata \
-m "${VM_MEMORY}" \
-machine accel=kvm \
-net nic,model=virtio \
-net user,hostfwd=tcp::2222-:22 \
-smp 2

266
buildman/server.py Normal file
View file

@ -0,0 +1,266 @@
import logging
import json
import trollius
from threading import Event
from datetime import timedelta
from trollius.coroutines import From
from autobahn.asyncio.wamp import RouterFactory, RouterSessionFactory
from autobahn.asyncio.websocket import WampWebSocketServerFactory
from autobahn.wamp import types
from aiowsgi import create_server as create_wsgi_server
from flask import Flask
from buildman.enums import BuildJobResult, BuildServerStatus, RESULT_PHASES
from buildman.jobutil.buildstatus import StatusHandler
from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException
from data import database, model
from app import app, metric_queue
logger = logging.getLogger(__name__)
WORK_CHECK_TIMEOUT = 10
TIMEOUT_PERIOD_MINUTES = 20
JOB_TIMEOUT_SECONDS = 300
SETUP_LEEWAY_SECONDS = 30
MINIMUM_JOB_EXTENSION = timedelta(minutes=1)
HEARTBEAT_PERIOD_SEC = 30
class BuilderServer(object):
""" Server which handles both HTTP and WAMP requests, managing the full state of the build
controller.
"""
def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass,
lifecycle_manager_config, manager_hostname):
self._loop = None
self._current_status = BuildServerStatus.STARTING
self._current_components = []
self._realm_map = {}
self._job_count = 0
self._session_factory = RouterSessionFactory(RouterFactory())
self._registry_hostname = registry_hostname
self._queue = queue
self._build_logs = build_logs
self._user_files = user_files
self._lifecycle_manager = lifecycle_manager_klass(
self._register_component,
self._unregister_component,
self._job_heartbeat,
self._job_complete,
manager_hostname,
HEARTBEAT_PERIOD_SEC,
)
self._lifecycle_manager_config = lifecycle_manager_config
self._shutdown_event = Event()
self._current_status = BuildServerStatus.RUNNING
self._register_controller()
def _register_controller(self):
controller_app = Flask('controller')
server = self
@controller_app.route('/status')
def status():
metrics = server._queue.get_metrics()
(running_count, available_not_running_count, available_count) = metrics
workers = [component for component in server._current_components
if component.kind() == 'builder']
data = {
'status': server._current_status,
'running_local': server._job_count,
'running_total': running_count,
'workers': len(workers),
'job_total': available_count + running_count
}
return json.dumps(data)
self._controller_app = controller_app
def run(self, host, websocket_port, controller_port, ssl=None):
logger.debug('Initializing the lifecycle manager')
self._lifecycle_manager.initialize(self._lifecycle_manager_config)
logger.debug('Initializing all members of the event loop')
loop = trollius.get_event_loop()
logger.debug('Starting server on port %s, with controller on port %s', websocket_port,
controller_port)
try:
loop.run_until_complete(self._initialize(loop, host, websocket_port, controller_port, ssl))
except KeyboardInterrupt:
pass
finally:
loop.close()
def close(self):
logger.debug('Requested server shutdown')
self._current_status = BuildServerStatus.SHUTDOWN
self._lifecycle_manager.shutdown()
self._shutdown_event.wait()
logger.debug('Shutting down server')
def _register_component(self, realm, component_klass, **kwargs):
""" Registers a component with the server. The component_klass must derive from
BaseComponent.
"""
logger.debug('Registering component with realm %s', realm)
if realm in self._realm_map:
logger.debug('Component with realm %s already registered', realm)
return self._realm_map[realm]
component = component_klass(types.ComponentConfig(realm=realm), realm=realm, **kwargs)
component.server = self
component.parent_manager = self._lifecycle_manager
component.build_logs = self._build_logs
component.user_files = self._user_files
component.registry_hostname = self._registry_hostname
self._realm_map[realm] = component
self._current_components.append(component)
self._session_factory.add(component)
return component
def _unregister_component(self, component):
logger.debug('Unregistering component with realm %s and token %s',
component.builder_realm, component.expected_token)
self._realm_map.pop(component.builder_realm, None)
if component in self._current_components:
self._current_components.remove(component)
self._session_factory.remove(component)
def _job_heartbeat(self, build_job):
self._queue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS,
minimum_extension=MINIMUM_JOB_EXTENSION)
@trollius.coroutine
def _job_complete(self, build_job, job_status, executor_name=None, update_phase=False):
if job_status == BuildJobResult.INCOMPLETE:
logger.warning('[BUILD INCOMPLETE: job complete] Build ID: %s. No retry restore.',
build_job.repo_build.uuid)
self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30)
else:
self._queue.complete(build_job.job_item)
# Update the trigger failure tracking (if applicable).
if build_job.repo_build.trigger is not None:
model.build.update_trigger_disable_status(build_job.repo_build.trigger,
RESULT_PHASES[job_status])
if update_phase:
status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid)
yield From(status_handler.set_phase(RESULT_PHASES[job_status]))
self._job_count = self._job_count - 1
if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
self._shutdown_event.set()
_report_completion_status(build_job, job_status, executor_name)
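  # Polls the work queue every WORK_CHECK_TIMEOUT seconds, hands any job found to the lifecycle
  # manager for scheduling, and requeues it (with its retry restored) when scheduling fails or
  # no worker is available.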
@trollius.coroutine
def _work_checker(self):
logger.debug('Initializing work checker')
while self._current_status == BuildServerStatus.RUNNING:
with database.CloseForLongOperation(app.config):
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
logger.debug('Checking for more work for %d active workers',
self._lifecycle_manager.num_workers())
processing_time = self._lifecycle_manager.overall_setup_time() + SETUP_LEEWAY_SECONDS
job_item = self._queue.get(processing_time=processing_time, ordering_required=True)
if job_item is None:
logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
continue
try:
build_job = BuildJob(job_item)
except BuildJobLoadException as irbe:
logger.warning('[BUILD INCOMPLETE: job load exception] Job data: %s. No retry restore.',
job_item.body)
logger.exception(irbe)
self._queue.incomplete(job_item, restore_retry=False)
continue
      logger.debug('Checking for an available worker for build job %s',
build_job.repo_build.uuid)
try:
schedule_success, retry_timeout = yield From(self._lifecycle_manager.schedule(build_job))
      except Exception:
logger.warning('[BUILD INCOMPLETE: scheduling] Build ID: %s. Retry restored.',
build_job.repo_build.uuid)
logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid)
self._current_status = BuildServerStatus.EXCEPTION
self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT)
return
if schedule_success:
logger.debug('Marking build %s as scheduled', build_job.repo_build.uuid)
status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid)
yield From(status_handler.set_phase(database.BUILD_PHASE.BUILD_SCHEDULED))
self._job_count = self._job_count + 1
logger.debug('Build job %s scheduled. Running: %s', build_job.repo_build.uuid,
self._job_count)
else:
logger.warning('[BUILD INCOMPLETE: no schedule] Build ID: %s. Retry restored.',
build_job.repo_build.uuid)
        logger.debug('All workers are busy for job %s. Requeuing after %s seconds.',
build_job.repo_build.uuid, retry_timeout)
self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)
@trollius.coroutine
def _queue_metrics_updater(self):
logger.debug('Initializing queue metrics updater')
while self._current_status == BuildServerStatus.RUNNING:
logger.debug('Writing metrics')
self._queue.update_metrics()
logger.debug('Metrics going to sleep for 30 seconds')
yield From(trollius.sleep(30))
@trollius.coroutine
def _initialize(self, loop, host, websocket_port, controller_port, ssl=None):
self._loop = loop
# Create the WAMP server.
transport_factory = WampWebSocketServerFactory(self._session_factory, debug_wamp=False)
transport_factory.setProtocolOptions(failByDrop=True)
# Initialize the controller server and the WAMP server
create_wsgi_server(self._controller_app, loop=loop, host=host, port=controller_port, ssl=ssl)
yield From(loop.create_server(transport_factory, host, websocket_port, ssl=ssl))
# Initialize the metrics updater
trollius.async(self._queue_metrics_updater())
# Initialize the work queue checker.
yield From(self._work_checker())
def _report_completion_status(build_job, status, executor_name):
metric_queue.build_counter.Inc(labelvalues=[status])
metric_queue.repository_build_completed.Inc(labelvalues=[build_job.namespace, build_job.repo_name,
status, executor_name or 'executor'])
if status == BuildJobResult.COMPLETE:
status_name = 'CompleteBuilds'
elif status == BuildJobResult.ERROR:
status_name = 'FailedBuilds'
elif status == BuildJobResult.INCOMPLETE:
status_name = 'IncompletedBuilds'
else:
return
metric_queue.put_deprecated(status_name, 1, unit='Count')

View file

@ -0,0 +1,102 @@
#cloud-config
hostname: {{ build_uuid | default('quay-builder', True) }}
users:
groups:
- sudo
- docker
{% if ssh_authorized_keys -%}
ssh_authorized_keys:
{% for ssh_key in ssh_authorized_keys -%}
- {{ ssh_key }}
{%- endfor %}
{%- endif %}
write_files:
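# disable-aws-metadata.sh DNATs requests for the instance metadata service (169.254.169.254)
# to an unrelated address so builds running inside the VM cannot reach it.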
- path: /root/disable-aws-metadata.sh
  permissions: '0755'
content: |
iptables -t nat -I PREROUTING -p tcp -d 169.254.169.254 --dport 80 -j DNAT --to-destination 1.1.1.1
- path: /etc/docker/daemon.json
  permissions: '0644'
content: |
{
"storage-driver": "overlay2"
}
- path: /root/overrides.list
  permissions: '0644'
content: |
REALM={{ realm }}
TOKEN={{ token }}
SERVER={{ websocket_scheme }}://{{ manager_hostname }}
{% if logentries_token -%}
LOGENTRIES_TOKEN={{ logentries_token }}
{%- endif %}
coreos:
update:
reboot-strategy: off
group: {{ coreos_channel }}
units:
- name: update-engine.service
command: stop
- name: locksmithd.service
command: stop
- name: systemd-journal-gatewayd.socket
command: start
enable: yes
content: |
[Unit]
Description=Journal Gateway Service Socket
[Socket]
ListenStream=/var/run/journald.sock
Service=systemd-journal-gatewayd.service
[Install]
WantedBy=sockets.target
{{ dockersystemd('quay-builder',
worker_image,
quay_username,
quay_password,
worker_tag,
extra_args='--net=host --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock -v /usr/share/ca-certificates:/etc/ssl/certs',
exec_stop_post=['/bin/sh -xc "/bin/sleep 120; /usr/bin/systemctl --no-block poweroff"'],
flattened=True,
restart_policy='no'
) | indent(4) }}
{% if logentries_token -%}
    # The journal-2-logentries image moved to quay.io/coreos; see https://github.com/kelseyhightower/journal-2-logentries/pull/11
{{ dockersystemd('builder-logs',
'quay.io/coreos/journal-2-logentries',
extra_args='--env-file /root/overrides.list -v /run/journald.sock:/run/journald.sock',
flattened=True,
after_units=['quay-builder.service']
) | indent(4) }}
{%- endif %}
- name: disable-aws-metadata.service
command: start
enable: yes
content: |
[Unit]
Description=Disable AWS metadata service
Before=network-pre.target
Wants=network-pre.target
[Service]
Type=oneshot
ExecStart=/root/disable-aws-metadata.sh
RemainAfterExit=yes
[Install]
WantedBy=multi-user.target
- name: machine-lifetime.service
command: start
enable: yes
content: |
[Unit]
Description=Machine Lifetime Service
[Service]
Type=oneshot
ExecStart=/bin/sh -xc "/bin/sleep {{ max_lifetime_s }}; /usr/bin/systemctl --no-block poweroff"

View file

@ -0,0 +1,679 @@
import unittest
import json
import uuid
from mock import Mock, ANY
from six import iteritems
from trollius import coroutine, get_event_loop, From, Future, Return
from app import metric_queue
from buildman.asyncutil import AsyncWrapper
from buildman.component.buildcomponent import BuildComponent
from buildman.manager.ephemeral import (EphemeralBuilderManager, REALM_PREFIX,
JOB_PREFIX)
from buildman.manager.executor import BuilderExecutor, ExecutorException
from buildman.orchestrator import KeyEvent, KeyChange
from buildman.server import BuildJobResult
from util import slash_join
from util.metrics.metricqueue import duration_collector_async
BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead'
REALM_ID = '1234-realm'
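# Runs a test method as a trollius coroutine to completion on the event loop, so tests can use
# `yield From(...)` directly in their bodies.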
def async_test(f):
def wrapper(*args, **kwargs):
coro = coroutine(f)
future = coro(*args, **kwargs)
loop = get_event_loop()
loop.run_until_complete(future)
return wrapper
class TestExecutor(BuilderExecutor):
job_started = None
job_stopped = None
@coroutine
@duration_collector_async(metric_queue.builder_time_to_start, labelvalues=["testlabel"])
def start_builder(self, realm, token, build_uuid):
self.job_started = str(uuid.uuid4())
raise Return(self.job_started)
@coroutine
def stop_builder(self, execution_id):
self.job_stopped = execution_id
class BadExecutor(BuilderExecutor):
@coroutine
@duration_collector_async(metric_queue.builder_time_to_start, labelvalues=["testlabel"])
def start_builder(self, realm, token, build_uuid):
raise ExecutorException('raised on purpose!')
class EphemeralBuilderTestCase(unittest.TestCase):
def __init__(self, *args, **kwargs):
self.etcd_client_mock = None
super(EphemeralBuilderTestCase, self).__init__(*args, **kwargs)
@staticmethod
def _create_completed_future(result=None):
def inner(*args, **kwargs):
new_future = Future()
new_future.set_result(result)
return new_future
return inner
def setUp(self):
self._existing_executors = dict(EphemeralBuilderManager.EXECUTORS)
def tearDown(self):
EphemeralBuilderManager.EXECUTORS = self._existing_executors
@coroutine
def _register_component(self, realm_spec, build_component, token):
raise Return('hello')
def _create_build_job(self, namespace='namespace', retries=3):
mock_job = Mock()
mock_job.job_details = {'build_uuid': BUILD_UUID}
mock_job.job_item = {
'body': json.dumps(mock_job.job_details),
'id': 1,
}
mock_job.namespace = namespace
mock_job.retries_remaining = retries
mock_job.build_uuid = BUILD_UUID
return mock_job
class TestEphemeralLifecycle(EphemeralBuilderTestCase):
""" Tests the various lifecycles of the ephemeral builder and its interaction with etcd. """
def __init__(self, *args, **kwargs):
super(TestEphemeralLifecycle, self).__init__(*args, **kwargs)
self.etcd_client_mock = None
self.test_executor = None
def _create_completed_future(self, result=None):
def inner(*args, **kwargs):
new_future = Future()
new_future.set_result(result)
return new_future
return inner
def _create_mock_executor(self, *args, **kwargs):
self.test_executor = Mock(spec=BuilderExecutor)
self.test_executor.start_builder = Mock(side_effect=self._create_completed_future('123'))
self.test_executor.stop_builder = Mock(side_effect=self._create_completed_future())
self.test_executor.setup_time = 60
self.test_executor.name = 'MockExecutor'
self.test_executor.minimum_retry_threshold = 0
return self.test_executor
def setUp(self):
super(TestEphemeralLifecycle, self).setUp()
EphemeralBuilderManager.EXECUTORS['test'] = self._create_mock_executor
self.register_component_callback = Mock()
self.unregister_component_callback = Mock()
self.job_heartbeat_callback = Mock()
self.job_complete_callback = AsyncWrapper(Mock())
self.manager = EphemeralBuilderManager(
self.register_component_callback,
self.unregister_component_callback,
self.job_heartbeat_callback,
self.job_complete_callback,
'127.0.0.1',
30,
)
self.manager.initialize({
'EXECUTOR': 'test',
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
    # Ensure that the realm and building callbacks have been registered
callback_keys = [key for key in self.manager._orchestrator.callbacks]
self.assertIn(REALM_PREFIX, callback_keys)
self.assertIn(JOB_PREFIX, callback_keys)
self.mock_job = self._create_build_job()
self.mock_job_key = slash_join('building', BUILD_UUID)
def tearDown(self):
super(TestEphemeralLifecycle, self).tearDown()
self.manager.shutdown()
@coroutine
def _setup_job_for_managers(self):
test_component = Mock(spec=BuildComponent)
test_component.builder_realm = REALM_ID
test_component.start_build = Mock(side_effect=self._create_completed_future())
self.register_component_callback.return_value = test_component
is_scheduled = yield From(self.manager.schedule(self.mock_job))
self.assertTrue(is_scheduled)
self.assertEqual(self.test_executor.start_builder.call_count, 1)
    # Ensure that the job, realm, and metric callbacks have been registered
callback_keys = [key for key in self.manager._orchestrator.callbacks]
self.assertIn(self.mock_job_key, self.manager._orchestrator.state)
self.assertIn(REALM_PREFIX, callback_keys)
# TODO: assert metric key has been set
realm_for_build = self._find_realm_key(self.manager._orchestrator, BUILD_UUID)
raw_realm_data = yield From(self.manager._orchestrator.get_key(slash_join('realm',
realm_for_build)))
realm_data = json.loads(raw_realm_data)
realm_data['realm'] = REALM_ID
# Right now the job is not registered with any managers because etcd has not accepted the job
self.assertEqual(self.register_component_callback.call_count, 0)
# Fire off a realm changed with the same data.
yield From(self.manager._realm_callback(
KeyChange(KeyEvent.CREATE,
slash_join(REALM_PREFIX, REALM_ID),
json.dumps(realm_data))))
# Ensure that we have at least one component node.
self.assertEqual(self.register_component_callback.call_count, 1)
self.assertEqual(1, self.manager.num_workers())
# Ensure that the build info exists.
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
raise Return(test_component)
@staticmethod
def _find_realm_key(orchestrator, build_uuid):
for key, value in iteritems(orchestrator.state):
if key.startswith(REALM_PREFIX):
parsed_value = json.loads(value)
body = json.loads(parsed_value['job_queue_item']['body'])
if body['build_uuid'] == build_uuid:
return parsed_value['realm']
continue
raise KeyError
@async_test
def test_schedule_and_complete(self):
# Test that a job is properly registered with all of the managers
test_component = yield From(self._setup_job_for_managers())
# Take the job ourselves
yield From(self.manager.build_component_ready(test_component))
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
# Finish the job
yield From(self.manager.job_completed(self.mock_job, BuildJobResult.COMPLETE, test_component))
# Ensure that the executor kills the job.
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
# Ensure the build information is cleaned up.
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
self.assertEqual(0, self.manager.num_workers())
@async_test
def test_another_manager_takes_job(self):
# Prepare a job to be taken by another manager
test_component = yield From(self._setup_job_for_managers())
yield From(self.manager._realm_callback(
KeyChange(KeyEvent.DELETE,
slash_join(REALM_PREFIX, REALM_ID),
json.dumps({'realm': REALM_ID,
'token': 'beef',
'execution_id': '123',
'job_queue_item': self.mock_job.job_item}))))
self.unregister_component_callback.assert_called_once_with(test_component)
# Ensure that the executor does not kill the job.
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
# Ensure that we still have the build info, but not the component.
self.assertEqual(0, self.manager.num_workers())
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
# Delete the job once it has "completed".
yield From(self.manager._job_callback(
KeyChange(KeyEvent.DELETE,
self.mock_job_key,
json.dumps({'had_heartbeat': False,
'job_queue_item': self.mock_job.job_item}))))
# Ensure the job was removed from the info, but stop was not called.
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
@async_test
def test_job_started_by_other_manager(self):
    # Ensure that the building callbacks have been registered
callback_keys = [key for key in self.manager._orchestrator.callbacks]
self.assertIn(JOB_PREFIX, callback_keys)
# Send a signal to the callback that the job has been created.
yield From(self.manager._job_callback(
KeyChange(KeyEvent.CREATE,
self.mock_job_key,
json.dumps({'had_heartbeat': False,
'job_queue_item': self.mock_job.job_item}))))
# Ensure the create does nothing.
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
@async_test
def test_expiring_worker_not_started(self):
    # Ensure that the building callbacks have been registered
callback_keys = [key for key in self.manager._orchestrator.callbacks]
self.assertIn(JOB_PREFIX, callback_keys)
# Send a signal to the callback that a worker has expired
yield From(self.manager._job_callback(
KeyChange(KeyEvent.EXPIRE,
self.mock_job_key,
json.dumps({'had_heartbeat': True,
'job_queue_item': self.mock_job.job_item}))))
# Since the realm was never registered, expiration should do nothing.
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
@async_test
def test_expiring_worker_started(self):
test_component = yield From(self._setup_job_for_managers())
    # Ensure that the building callbacks have been registered
callback_keys = [key for key in self.manager._orchestrator.callbacks]
self.assertIn(JOB_PREFIX, callback_keys)
yield From(self.manager._job_callback(
KeyChange(KeyEvent.EXPIRE,
self.mock_job_key,
json.dumps({'had_heartbeat': True,
'job_queue_item': self.mock_job.job_item}))))
self.test_executor.stop_builder.assert_called_once_with('123')
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
@async_test
def test_buildjob_deleted(self):
test_component = yield From(self._setup_job_for_managers())
    # Ensure that the building callbacks have been registered
callback_keys = [key for key in self.manager._orchestrator.callbacks]
self.assertIn(JOB_PREFIX, callback_keys)
# Send a signal to the callback that a worker has expired
yield From(self.manager._job_callback(
KeyChange(KeyEvent.DELETE,
self.mock_job_key,
json.dumps({'had_heartbeat': False,
'job_queue_item': self.mock_job.job_item}))))
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
self.assertEqual(self.job_complete_callback.call_count, 0)
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
@async_test
def test_builder_never_starts(self):
test_component = yield From(self._setup_job_for_managers())
    # Ensure that the building callbacks have been registered
callback_keys = [key for key in self.manager._orchestrator.callbacks]
self.assertIn(JOB_PREFIX, callback_keys)
# Send a signal to the callback that a worker has expired
yield From(self.manager._job_callback(
KeyChange(KeyEvent.EXPIRE,
self.mock_job_key,
json.dumps({'had_heartbeat': False,
'job_queue_item': self.mock_job.job_item}))))
self.test_executor.stop_builder.assert_called_once_with('123')
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
# Ensure the job was marked as incomplete, with an update_phase to True (so the DB record and
# logs are updated as well)
yield From(self.job_complete_callback.assert_called_once_with(ANY, BuildJobResult.INCOMPLETE,
'MockExecutor',
update_phase=True))
@async_test
def test_change_worker(self):
# Send a signal to the callback that a worker key has been changed
self.manager._job_callback(KeyChange(KeyEvent.SET, self.mock_job_key, 'value'))
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
@async_test
def test_realm_expired(self):
test_component = yield From(self._setup_job_for_managers())
# Send a signal to the callback that a realm has expired
yield From(self.manager._realm_callback(KeyChange(
KeyEvent.EXPIRE,
self.mock_job_key,
json.dumps({
'realm': REALM_ID,
'execution_id': 'foobar',
'executor_name': 'MockExecutor',
'job_queue_item': {'body': '{"build_uuid": "fakeid"}'},
}))))
# Ensure that the cleanup code for the executor was called.
self.test_executor.stop_builder.assert_called_once_with('foobar')
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
class TestEphemeral(EphemeralBuilderTestCase):
""" Simple unit tests for the ephemeral builder around config management, starting and stopping
jobs.
"""
def setUp(self):
super(TestEphemeral, self).setUp()
unregister_component_callback = Mock()
job_heartbeat_callback = Mock()
@coroutine
def job_complete_callback(*args, **kwargs):
raise Return()
self.manager = EphemeralBuilderManager(
self._register_component,
unregister_component_callback,
job_heartbeat_callback,
job_complete_callback,
'127.0.0.1',
30,
)
def tearDown(self):
super(TestEphemeral, self).tearDown()
self.manager.shutdown()
def test_verify_executor_oldconfig(self):
EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
self.manager.initialize({
'EXECUTOR': 'test',
'EXECUTOR_CONFIG': dict(MINIMUM_RETRY_THRESHOLD=42),
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
# Ensure that we have a single test executor.
self.assertEqual(1, len(self.manager.registered_executors))
self.assertEqual(42, self.manager.registered_executors[0].minimum_retry_threshold)
self.assertEqual('TestExecutor', self.manager.registered_executors[0].name)
def test_verify_executor_newconfig(self):
EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
self.manager.initialize({
'EXECUTORS': [{
'EXECUTOR': 'test',
'MINIMUM_RETRY_THRESHOLD': 42
}],
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
# Ensure that we have a single test executor.
self.assertEqual(1, len(self.manager.registered_executors))
self.assertEqual(42, self.manager.registered_executors[0].minimum_retry_threshold)
def test_multiple_executors_samename(self):
EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
EphemeralBuilderManager.EXECUTORS['anotherexecutor'] = TestExecutor
with self.assertRaises(Exception):
self.manager.initialize({
'EXECUTORS': [
{
'NAME': 'primary',
'EXECUTOR': 'test',
'MINIMUM_RETRY_THRESHOLD': 42
},
{
'NAME': 'primary',
'EXECUTOR': 'anotherexecutor',
'MINIMUM_RETRY_THRESHOLD': 24
},
],
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
def test_verify_multiple_executors(self):
EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
EphemeralBuilderManager.EXECUTORS['anotherexecutor'] = TestExecutor
self.manager.initialize({
'EXECUTORS': [
{
'NAME': 'primary',
'EXECUTOR': 'test',
'MINIMUM_RETRY_THRESHOLD': 42
},
{
'NAME': 'secondary',
'EXECUTOR': 'anotherexecutor',
'MINIMUM_RETRY_THRESHOLD': 24
},
],
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
    # Ensure that we have two test executors.
self.assertEqual(2, len(self.manager.registered_executors))
self.assertEqual(42, self.manager.registered_executors[0].minimum_retry_threshold)
self.assertEqual(24, self.manager.registered_executors[1].minimum_retry_threshold)
def test_skip_invalid_executor(self):
self.manager.initialize({
'EXECUTORS': [
{
'EXECUTOR': 'unknown',
'MINIMUM_RETRY_THRESHOLD': 42
},
],
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
self.assertEqual(0, len(self.manager.registered_executors))
@async_test
def test_schedule_job_namespace_filter(self):
EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
self.manager.initialize({
'EXECUTORS': [{
'EXECUTOR': 'test',
'NAMESPACE_WHITELIST': ['something'],
}],
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
# Try with a build job in an invalid namespace.
build_job = self._create_build_job(namespace='somethingelse')
result = yield From(self.manager.schedule(build_job))
self.assertFalse(result[0])
# Try with a valid namespace.
build_job = self._create_build_job(namespace='something')
result = yield From(self.manager.schedule(build_job))
self.assertTrue(result[0])
@async_test
def test_schedule_job_retries_filter(self):
EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
self.manager.initialize({
'EXECUTORS': [{
'EXECUTOR': 'test',
'MINIMUM_RETRY_THRESHOLD': 2,
}],
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
# Try with a build job that has too few retries.
build_job = self._create_build_job(retries=1)
result = yield From(self.manager.schedule(build_job))
self.assertFalse(result[0])
# Try with a valid job.
build_job = self._create_build_job(retries=2)
result = yield From(self.manager.schedule(build_job))
self.assertTrue(result[0])
@async_test
def test_schedule_job_executor_fallback(self):
EphemeralBuilderManager.EXECUTORS['primary'] = TestExecutor
EphemeralBuilderManager.EXECUTORS['secondary'] = TestExecutor
self.manager.initialize({
'EXECUTORS': [
{
'NAME': 'primary',
'EXECUTOR': 'primary',
'NAMESPACE_WHITELIST': ['something'],
'MINIMUM_RETRY_THRESHOLD': 3,
},
{
'NAME': 'secondary',
'EXECUTOR': 'secondary',
'MINIMUM_RETRY_THRESHOLD': 2,
},
],
'ALLOWED_WORKER_COUNT': 5,
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
# Try a job not matching the primary's namespace filter. Should schedule on secondary.
build_job = self._create_build_job(namespace='somethingelse')
result = yield From(self.manager.schedule(build_job))
self.assertTrue(result[0])
self.assertIsNone(self.manager.registered_executors[0].job_started)
self.assertIsNotNone(self.manager.registered_executors[1].job_started)
self.manager.registered_executors[0].job_started = None
self.manager.registered_executors[1].job_started = None
# Try a job not matching the primary's retry minimum. Should schedule on secondary.
build_job = self._create_build_job(namespace='something', retries=2)
result = yield From(self.manager.schedule(build_job))
self.assertTrue(result[0])
self.assertIsNone(self.manager.registered_executors[0].job_started)
self.assertIsNotNone(self.manager.registered_executors[1].job_started)
self.manager.registered_executors[0].job_started = None
self.manager.registered_executors[1].job_started = None
# Try a job matching the primary. Should schedule on the primary.
build_job = self._create_build_job(namespace='something', retries=3)
result = yield From(self.manager.schedule(build_job))
self.assertTrue(result[0])
self.assertIsNotNone(self.manager.registered_executors[0].job_started)
self.assertIsNone(self.manager.registered_executors[1].job_started)
self.manager.registered_executors[0].job_started = None
self.manager.registered_executors[1].job_started = None
# Try a job not matching either's restrictions.
build_job = self._create_build_job(namespace='somethingelse', retries=1)
result = yield From(self.manager.schedule(build_job))
self.assertFalse(result[0])
self.assertIsNone(self.manager.registered_executors[0].job_started)
self.assertIsNone(self.manager.registered_executors[1].job_started)
self.manager.registered_executors[0].job_started = None
self.manager.registered_executors[1].job_started = None
@async_test
def test_schedule_job_single_executor(self):
EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
self.manager.initialize({
'EXECUTOR': 'test',
'EXECUTOR_CONFIG': {},
'ALLOWED_WORKER_COUNT': 5,
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
build_job = self._create_build_job(namespace='something', retries=3)
result = yield From(self.manager.schedule(build_job))
self.assertTrue(result[0])
self.assertIsNotNone(self.manager.registered_executors[0].job_started)
self.manager.registered_executors[0].job_started = None
build_job = self._create_build_job(namespace='something', retries=0)
result = yield From(self.manager.schedule(build_job))
self.assertTrue(result[0])
self.assertIsNotNone(self.manager.registered_executors[0].job_started)
self.manager.registered_executors[0].job_started = None
@async_test
def test_executor_exception(self):
EphemeralBuilderManager.EXECUTORS['bad'] = BadExecutor
self.manager.initialize({
'EXECUTOR': 'bad',
'EXECUTOR_CONFIG': {},
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
build_job = self._create_build_job(namespace='something', retries=3)
result = yield From(self.manager.schedule(build_job))
self.assertFalse(result[0])
@async_test
def test_schedule_and_stop(self):
EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
self.manager.initialize({
'EXECUTOR': 'test',
'EXECUTOR_CONFIG': {},
'ORCHESTRATOR': {'MEM_CONFIG': None},
})
# Start the build job.
build_job = self._create_build_job(namespace='something', retries=3)
result = yield From(self.manager.schedule(build_job))
self.assertTrue(result[0])
executor = self.manager.registered_executors[0]
self.assertIsNotNone(executor.job_started)
# Register the realm so the build information is added.
yield From(self.manager._register_realm({
'realm': str(uuid.uuid4()),
'token': str(uuid.uuid4()),
'execution_id': executor.job_started,
'executor_name': 'TestExecutor',
'build_uuid': build_job.build_uuid,
'job_queue_item': build_job.job_item,
}))
# Stop the build job.
yield From(self.manager.kill_builder_executor(build_job.build_uuid))
self.assertEqual(executor.job_stopped, executor.job_started)
if __name__ == '__main__':
unittest.main()