initial import for Open Source 🎉
This commit is contained in:
parent
1898c361f3
commit
9c0dd3b722
2048 changed files with 218743 additions and 0 deletions
2
buildman/MAINTAINERS
Normal file
2
buildman/MAINTAINERS
Normal file
|
@ -0,0 +1,2 @@
|
|||
Charlton Austin <charlton.austin@coreos.com> (@charltonaustin)
|
||||
Joseph Schorr <joseph.schorr@coreos.com> (@josephschorr)
|
0
buildman/__init__.py
Normal file
0
buildman/__init__.py
Normal file
42
buildman/asyncutil.py
Normal file
42
buildman/asyncutil.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import partial
|
||||
|
||||
from trollius import get_event_loop, coroutine
|
||||
|
||||
|
||||
def wrap_with_threadpool(obj, worker_threads=1):
|
||||
"""
|
||||
Wraps a class in an async executor so that it can be safely used in an event loop like trollius.
|
||||
"""
|
||||
async_executor = ThreadPoolExecutor(worker_threads)
|
||||
return AsyncWrapper(obj, executor=async_executor), async_executor
|
||||
|
||||
|
||||
class AsyncWrapper(object):
|
||||
""" Wrapper class which will transform a syncronous library to one that can be used with
|
||||
trollius coroutines.
|
||||
"""
|
||||
def __init__(self, delegate, loop=None, executor=None):
|
||||
self._loop = loop if loop is not None else get_event_loop()
|
||||
self._delegate = delegate
|
||||
self._executor = executor
|
||||
|
||||
def __getattr__(self, attrib):
|
||||
delegate_attr = getattr(self._delegate, attrib)
|
||||
|
||||
if not callable(delegate_attr):
|
||||
return delegate_attr
|
||||
|
||||
def wrapper(*args, **kwargs):
|
||||
""" Wraps the delegate_attr with primitives that will transform sync calls to ones shelled
|
||||
out to a thread pool.
|
||||
"""
|
||||
callable_delegate_attr = partial(delegate_attr, *args, **kwargs)
|
||||
return self._loop.run_in_executor(self._executor, callable_delegate_attr)
|
||||
|
||||
return wrapper
|
||||
|
||||
@coroutine
|
||||
def __call__(self, *args, **kwargs):
|
||||
callable_delegate_attr = partial(self._delegate, *args, **kwargs)
|
||||
return self._loop.run_in_executor(self._executor, callable_delegate_attr)
|
97
buildman/builder.py
Normal file
97
buildman/builder.py
Normal file
|
@ -0,0 +1,97 @@
|
|||
import logging
|
||||
import os
|
||||
import time
|
||||
import socket
|
||||
|
||||
import features
|
||||
|
||||
from app import app, userfiles as user_files, build_logs, dockerfile_build_queue
|
||||
from util.log import logfile_path
|
||||
|
||||
from buildman.manager.enterprise import EnterpriseManager
|
||||
from buildman.manager.ephemeral import EphemeralBuilderManager
|
||||
from buildman.server import BuilderServer
|
||||
|
||||
from trollius import SSLContext
|
||||
from raven.handlers.logging import SentryHandler
|
||||
from raven.conf import setup_logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
BUILD_MANAGERS = {
|
||||
'enterprise': EnterpriseManager,
|
||||
'ephemeral': EphemeralBuilderManager,
|
||||
}
|
||||
|
||||
EXTERNALLY_MANAGED = 'external'
|
||||
|
||||
DEFAULT_WEBSOCKET_PORT = 8787
|
||||
DEFAULT_CONTROLLER_PORT = 8686
|
||||
|
||||
LOG_FORMAT = "%(asctime)s [%(process)d] [%(levelname)s] [%(name)s] %(message)s"
|
||||
|
||||
def run_build_manager():
|
||||
if not features.BUILD_SUPPORT:
|
||||
logger.debug('Building is disabled. Please enable the feature flag')
|
||||
while True:
|
||||
time.sleep(1000)
|
||||
return
|
||||
|
||||
if app.config.get('REGISTRY_STATE', 'normal') == 'readonly':
|
||||
logger.debug('Building is disabled while in read-only mode.')
|
||||
while True:
|
||||
time.sleep(1000)
|
||||
return
|
||||
|
||||
build_manager_config = app.config.get('BUILD_MANAGER')
|
||||
if build_manager_config is None:
|
||||
return
|
||||
|
||||
# If the build system is externally managed, then we just sleep this process.
|
||||
if build_manager_config[0] == EXTERNALLY_MANAGED:
|
||||
logger.debug('Builds are externally managed.')
|
||||
while True:
|
||||
time.sleep(1000)
|
||||
return
|
||||
|
||||
logger.debug('Asking to start build manager with lifecycle "%s"', build_manager_config[0])
|
||||
manager_klass = BUILD_MANAGERS.get(build_manager_config[0])
|
||||
if manager_klass is None:
|
||||
return
|
||||
|
||||
manager_hostname = os.environ.get('BUILDMAN_HOSTNAME',
|
||||
app.config.get('BUILDMAN_HOSTNAME',
|
||||
app.config['SERVER_HOSTNAME']))
|
||||
websocket_port = int(os.environ.get('BUILDMAN_WEBSOCKET_PORT',
|
||||
app.config.get('BUILDMAN_WEBSOCKET_PORT',
|
||||
DEFAULT_WEBSOCKET_PORT)))
|
||||
controller_port = int(os.environ.get('BUILDMAN_CONTROLLER_PORT',
|
||||
app.config.get('BUILDMAN_CONTROLLER_PORT',
|
||||
DEFAULT_CONTROLLER_PORT)))
|
||||
|
||||
logger.debug('Will pass buildman hostname %s to builders for websocket connection',
|
||||
manager_hostname)
|
||||
|
||||
logger.debug('Starting build manager with lifecycle "%s"', build_manager_config[0])
|
||||
ssl_context = None
|
||||
if os.environ.get('SSL_CONFIG'):
|
||||
logger.debug('Loading SSL cert and key')
|
||||
ssl_context = SSLContext()
|
||||
ssl_context.load_cert_chain(os.path.join(os.environ.get('SSL_CONFIG'), 'ssl.cert'),
|
||||
os.path.join(os.environ.get('SSL_CONFIG'), 'ssl.key'))
|
||||
|
||||
server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs,
|
||||
user_files, manager_klass, build_manager_config[1], manager_hostname)
|
||||
server.run('0.0.0.0', websocket_port, controller_port, ssl=ssl_context)
|
||||
|
||||
if __name__ == '__main__':
|
||||
logging.config.fileConfig(logfile_path(debug=True), disable_existing_loggers=False)
|
||||
logging.getLogger('peewee').setLevel(logging.WARN)
|
||||
logging.getLogger('boto').setLevel(logging.WARN)
|
||||
|
||||
if app.config.get('EXCEPTION_LOG_TYPE', 'FakeSentry') == 'Sentry':
|
||||
buildman_name = '%s:buildman' % socket.gethostname()
|
||||
setup_logging(SentryHandler(app.config.get('SENTRY_DSN', ''), name=buildman_name,
|
||||
level=logging.ERROR))
|
||||
|
||||
run_build_manager()
|
0
buildman/component/__init__.py
Normal file
0
buildman/component/__init__.py
Normal file
13
buildman/component/basecomponent.py
Normal file
13
buildman/component/basecomponent.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
from autobahn.asyncio.wamp import ApplicationSession
|
||||
|
||||
class BaseComponent(ApplicationSession):
|
||||
""" Base class for all registered component sessions in the server. """
|
||||
def __init__(self, config, **kwargs):
|
||||
ApplicationSession.__init__(self, config)
|
||||
self.server = None
|
||||
self.parent_manager = None
|
||||
self.build_logs = None
|
||||
self.user_files = None
|
||||
|
||||
def kind(self):
|
||||
raise NotImplementedError
|
539
buildman/component/buildcomponent.py
Normal file
539
buildman/component/buildcomponent.py
Normal file
|
@ -0,0 +1,539 @@
|
|||
import datetime
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
import trollius
|
||||
|
||||
from autobahn.wamp.exception import ApplicationError
|
||||
from trollius import From, Return
|
||||
|
||||
from active_migration import ActiveDataMigration, ERTMigrationFlags
|
||||
from buildman.server import BuildJobResult
|
||||
from buildman.component.basecomponent import BaseComponent
|
||||
from buildman.component.buildparse import extract_current_step
|
||||
from buildman.jobutil.buildjob import BuildJobLoadException
|
||||
from buildman.jobutil.buildstatus import StatusHandler
|
||||
from buildman.jobutil.workererror import WorkerError
|
||||
|
||||
from app import app
|
||||
from data.database import BUILD_PHASE, UseThenDisconnect
|
||||
from data.model import InvalidRepositoryBuildException
|
||||
from data.registry_model import registry_model
|
||||
from util import slash_join
|
||||
|
||||
HEARTBEAT_DELTA = datetime.timedelta(seconds=60)
|
||||
BUILD_HEARTBEAT_DELAY = datetime.timedelta(seconds=30)
|
||||
HEARTBEAT_TIMEOUT = 10
|
||||
INITIAL_TIMEOUT = 25
|
||||
|
||||
SUPPORTED_WORKER_VERSIONS = ['0.3']
|
||||
|
||||
# Label which marks a manifest with its source build ID.
|
||||
INTERNAL_LABEL_BUILD_UUID = 'quay.build.uuid'
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ComponentStatus(object):
|
||||
""" ComponentStatus represents the possible states of a component. """
|
||||
JOINING = 'joining'
|
||||
WAITING = 'waiting'
|
||||
RUNNING = 'running'
|
||||
BUILDING = 'building'
|
||||
TIMED_OUT = 'timeout'
|
||||
|
||||
class BuildComponent(BaseComponent):
|
||||
""" An application session component which conducts one (or more) builds. """
|
||||
def __init__(self, config, realm=None, token=None, **kwargs):
|
||||
self.expected_token = token
|
||||
self.builder_realm = realm
|
||||
|
||||
self.parent_manager = None
|
||||
self.registry_hostname = None
|
||||
|
||||
self._component_status = ComponentStatus.JOINING
|
||||
self._last_heartbeat = None
|
||||
self._current_job = None
|
||||
self._build_status = None
|
||||
self._image_info = None
|
||||
self._worker_version = None
|
||||
|
||||
BaseComponent.__init__(self, config, **kwargs)
|
||||
|
||||
def kind(self):
|
||||
return 'builder'
|
||||
|
||||
def onConnect(self):
|
||||
self.join(self.builder_realm)
|
||||
|
||||
@trollius.coroutine
|
||||
def onJoin(self, details):
|
||||
logger.debug('Registering methods and listeners for component %s', self.builder_realm)
|
||||
yield From(self.register(self._on_ready, u'io.quay.buildworker.ready'))
|
||||
yield From(self.register(self._determine_cache_tag, u'io.quay.buildworker.determinecachetag'))
|
||||
yield From(self.register(self._ping, u'io.quay.buildworker.ping'))
|
||||
yield From(self.register(self._on_log_message, u'io.quay.builder.logmessagesynchronously'))
|
||||
|
||||
yield From(self.subscribe(self._on_heartbeat, u'io.quay.builder.heartbeat'))
|
||||
|
||||
yield From(self._set_status(ComponentStatus.WAITING))
|
||||
|
||||
@trollius.coroutine
|
||||
def start_build(self, build_job):
|
||||
""" Starts a build. """
|
||||
if self._component_status not in (ComponentStatus.WAITING, ComponentStatus.RUNNING):
|
||||
logger.debug('Could not start build for component %s (build %s, worker version: %s): %s',
|
||||
self.builder_realm, build_job.repo_build.uuid, self._worker_version,
|
||||
self._component_status)
|
||||
raise Return()
|
||||
|
||||
logger.debug('Starting build for component %s (build %s, worker version: %s)',
|
||||
self.builder_realm, build_job.repo_build.uuid, self._worker_version)
|
||||
|
||||
self._current_job = build_job
|
||||
self._build_status = StatusHandler(self.build_logs, build_job.repo_build.uuid)
|
||||
self._image_info = {}
|
||||
|
||||
yield From(self._set_status(ComponentStatus.BUILDING))
|
||||
|
||||
# Send the notification that the build has started.
|
||||
build_job.send_notification('build_start')
|
||||
|
||||
# Parse the build configuration.
|
||||
try:
|
||||
build_config = build_job.build_config
|
||||
except BuildJobLoadException as irbe:
|
||||
yield From(self._build_failure('Could not load build job information', irbe))
|
||||
raise Return()
|
||||
|
||||
base_image_information = {}
|
||||
|
||||
# Add the pull robot information, if any.
|
||||
if build_job.pull_credentials:
|
||||
base_image_information['username'] = build_job.pull_credentials.get('username', '')
|
||||
base_image_information['password'] = build_job.pull_credentials.get('password', '')
|
||||
|
||||
# Retrieve the repository's fully qualified name.
|
||||
repo = build_job.repo_build.repository
|
||||
repository_name = repo.namespace_user.username + '/' + repo.name
|
||||
|
||||
# Parse the build queue item into build arguments.
|
||||
# build_package: URL to the build package to download and untar/unzip.
|
||||
# defaults to empty string to avoid requiring a pointer on the builder.
|
||||
# sub_directory: The location within the build package of the Dockerfile and the build context.
|
||||
# repository: The repository for which this build is occurring.
|
||||
# registry: The registry for which this build is occuring (e.g. 'quay.io').
|
||||
# pull_token: The token to use when pulling the cache for building.
|
||||
# push_token: The token to use to push the built image.
|
||||
# tag_names: The name(s) of the tag(s) for the newly built image.
|
||||
# base_image: The image name and credentials to use to conduct the base image pull.
|
||||
# username: The username for pulling the base image (if any).
|
||||
# password: The password for pulling the base image (if any).
|
||||
context, dockerfile_path = self.extract_dockerfile_args(build_config)
|
||||
build_arguments = {
|
||||
'build_package': build_job.get_build_package_url(self.user_files),
|
||||
'context': context,
|
||||
'dockerfile_path': dockerfile_path,
|
||||
'repository': repository_name,
|
||||
'registry': self.registry_hostname,
|
||||
'pull_token': build_job.repo_build.access_token.get_code(),
|
||||
'push_token': build_job.repo_build.access_token.get_code(),
|
||||
'tag_names': build_config.get('docker_tags', ['latest']),
|
||||
'base_image': base_image_information,
|
||||
}
|
||||
|
||||
# If the trigger has a private key, it's using git, thus we should add
|
||||
# git data to the build args.
|
||||
# url: url used to clone the git repository
|
||||
# sha: the sha1 identifier of the commit to check out
|
||||
# private_key: the key used to get read access to the git repository
|
||||
|
||||
# TODO(remove-unenc): Remove legacy field.
|
||||
private_key = None
|
||||
if build_job.repo_build.trigger is not None and \
|
||||
build_job.repo_build.trigger.secure_private_key is not None:
|
||||
private_key = build_job.repo_build.trigger.secure_private_key.decrypt()
|
||||
|
||||
if ActiveDataMigration.has_flag(ERTMigrationFlags.READ_OLD_FIELDS) and \
|
||||
private_key is None and \
|
||||
build_job.repo_build.trigger is not None:
|
||||
private_key = build_job.repo_build.trigger.private_key
|
||||
|
||||
if private_key is not None:
|
||||
build_arguments['git'] = {
|
||||
'url': build_config['trigger_metadata'].get('git_url', ''),
|
||||
'sha': BuildComponent._commit_sha(build_config),
|
||||
'private_key': private_key or '',
|
||||
}
|
||||
|
||||
# If the build args have no buildpack, mark it as a failure before sending
|
||||
# it to a builder instance.
|
||||
if not build_arguments['build_package'] and not build_arguments['git']:
|
||||
logger.error('%s: insufficient build args: %s',
|
||||
self._current_job.repo_build.uuid, build_arguments)
|
||||
yield From(self._build_failure('Insufficient build arguments. No buildpack available.'))
|
||||
raise Return()
|
||||
|
||||
# Invoke the build.
|
||||
logger.debug('Invoking build: %s', self.builder_realm)
|
||||
logger.debug('With Arguments: %s', build_arguments)
|
||||
|
||||
def build_complete_callback(result):
|
||||
""" This function is used to execute a coroutine as the callback. """
|
||||
trollius.ensure_future(self._build_complete(result))
|
||||
|
||||
self.call("io.quay.builder.build", **build_arguments).add_done_callback(build_complete_callback)
|
||||
|
||||
# Set the heartbeat for the future. If the builder never receives the build call,
|
||||
# then this will cause a timeout after 30 seconds. We know the builder has registered
|
||||
# by this point, so it makes sense to have a timeout.
|
||||
self._last_heartbeat = datetime.datetime.utcnow() + BUILD_HEARTBEAT_DELAY
|
||||
|
||||
@staticmethod
|
||||
def extract_dockerfile_args(build_config):
|
||||
dockerfile_path = build_config.get('build_subdir', '')
|
||||
context = build_config.get('context', '')
|
||||
if not (dockerfile_path == '' or context == ''):
|
||||
# This should not happen and can be removed when we centralize validating build_config
|
||||
dockerfile_abspath = slash_join('', dockerfile_path)
|
||||
if ".." in os.path.relpath(dockerfile_abspath, context):
|
||||
return os.path.split(dockerfile_path)
|
||||
dockerfile_path = os.path.relpath(dockerfile_abspath, context)
|
||||
|
||||
return context, dockerfile_path
|
||||
|
||||
@staticmethod
|
||||
def _commit_sha(build_config):
|
||||
""" Determines whether the metadata is using an old schema or not and returns the commit. """
|
||||
commit_sha = build_config['trigger_metadata'].get('commit', '')
|
||||
old_commit_sha = build_config['trigger_metadata'].get('commit_sha', '')
|
||||
return commit_sha or old_commit_sha
|
||||
|
||||
@staticmethod
|
||||
def name_and_path(subdir):
|
||||
""" Returns the dockerfile path and name """
|
||||
if subdir.endswith("/"):
|
||||
subdir += "Dockerfile"
|
||||
elif not subdir.endswith("Dockerfile"):
|
||||
subdir += "/Dockerfile"
|
||||
return os.path.split(subdir)
|
||||
|
||||
@staticmethod
|
||||
def _total_completion(statuses, total_images):
|
||||
""" Returns the current amount completion relative to the total completion of a build. """
|
||||
percentage_with_sizes = float(len(statuses.values())) / total_images
|
||||
sent_bytes = sum([status['current'] for status in statuses.values()])
|
||||
total_bytes = sum([status['total'] for status in statuses.values()])
|
||||
return float(sent_bytes) / total_bytes * percentage_with_sizes
|
||||
|
||||
@staticmethod
|
||||
def _process_pushpull_status(status_dict, current_phase, docker_data, images):
|
||||
""" Processes the status of a push or pull by updating the provided status_dict and images. """
|
||||
if not docker_data:
|
||||
return
|
||||
|
||||
num_images = 0
|
||||
status_completion_key = ''
|
||||
|
||||
if current_phase == 'pushing':
|
||||
status_completion_key = 'push_completion'
|
||||
num_images = status_dict['total_commands']
|
||||
elif current_phase == 'pulling':
|
||||
status_completion_key = 'pull_completion'
|
||||
elif current_phase == 'priming-cache':
|
||||
status_completion_key = 'cache_completion'
|
||||
else:
|
||||
return
|
||||
|
||||
if 'progressDetail' in docker_data and 'id' in docker_data:
|
||||
image_id = docker_data['id']
|
||||
detail = docker_data['progressDetail']
|
||||
|
||||
if 'current' in detail and 'total' in detail:
|
||||
images[image_id] = detail
|
||||
status_dict[status_completion_key] = \
|
||||
BuildComponent._total_completion(images, max(len(images), num_images))
|
||||
|
||||
|
||||
@trollius.coroutine
|
||||
def _on_log_message(self, phase, json_data):
|
||||
""" Tails log messages and updates the build status. """
|
||||
# Update the heartbeat.
|
||||
self._last_heartbeat = datetime.datetime.utcnow()
|
||||
|
||||
# Parse any of the JSON data logged.
|
||||
log_data = {}
|
||||
if json_data:
|
||||
try:
|
||||
log_data = json.loads(json_data)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Extract the current status message (if any).
|
||||
fully_unwrapped = ''
|
||||
keys_to_extract = ['error', 'status', 'stream']
|
||||
for key in keys_to_extract:
|
||||
if key in log_data:
|
||||
fully_unwrapped = log_data[key]
|
||||
break
|
||||
|
||||
# Determine if this is a step string.
|
||||
current_step = None
|
||||
current_status_string = str(fully_unwrapped.encode('utf-8'))
|
||||
|
||||
if current_status_string and phase == BUILD_PHASE.BUILDING:
|
||||
current_step = extract_current_step(current_status_string)
|
||||
|
||||
# Parse and update the phase and the status_dict. The status dictionary contains
|
||||
# the pull/push progress, as well as the current step index.
|
||||
with self._build_status as status_dict:
|
||||
try:
|
||||
changed_phase = yield From(self._build_status.set_phase(phase, log_data.get('status_data')))
|
||||
if changed_phase:
|
||||
logger.debug('Build %s has entered a new phase: %s', self.builder_realm, phase)
|
||||
elif self._current_job.repo_build.phase == BUILD_PHASE.CANCELLED:
|
||||
build_id = self._current_job.repo_build.uuid
|
||||
logger.debug('Trying to move cancelled build into phase: %s with id: %s', phase, build_id)
|
||||
raise Return(False)
|
||||
except InvalidRepositoryBuildException:
|
||||
build_id = self._current_job.repo_build.uuid
|
||||
logger.warning('Build %s was not found; repo was probably deleted', build_id)
|
||||
raise Return(False)
|
||||
|
||||
BuildComponent._process_pushpull_status(status_dict, phase, log_data, self._image_info)
|
||||
|
||||
# If the current message represents the beginning of a new step, then update the
|
||||
# current command index.
|
||||
if current_step is not None:
|
||||
status_dict['current_command'] = current_step
|
||||
|
||||
# If the json data contains an error, then something went wrong with a push or pull.
|
||||
if 'error' in log_data:
|
||||
yield From(self._build_status.set_error(log_data['error']))
|
||||
|
||||
if current_step is not None:
|
||||
yield From(self._build_status.set_command(current_status_string))
|
||||
elif phase == BUILD_PHASE.BUILDING:
|
||||
yield From(self._build_status.append_log(current_status_string))
|
||||
raise Return(True)
|
||||
|
||||
@trollius.coroutine
|
||||
def _determine_cache_tag(self, command_comments, base_image_name, base_image_tag, base_image_id):
|
||||
with self._build_status as status_dict:
|
||||
status_dict['total_commands'] = len(command_comments) + 1
|
||||
|
||||
logger.debug('Checking cache on realm %s. Base image: %s:%s (%s)', self.builder_realm,
|
||||
base_image_name, base_image_tag, base_image_id)
|
||||
|
||||
tag_found = self._current_job.determine_cached_tag(base_image_id, command_comments)
|
||||
raise Return(tag_found or '')
|
||||
|
||||
@trollius.coroutine
|
||||
def _build_failure(self, error_message, exception=None):
|
||||
""" Handles and logs a failed build. """
|
||||
yield From(self._build_status.set_error(error_message, {
|
||||
'internal_error': str(exception) if exception else None
|
||||
}))
|
||||
|
||||
build_id = self._current_job.repo_build.uuid
|
||||
logger.warning('Build %s failed with message: %s', build_id, error_message)
|
||||
|
||||
# Mark that the build has finished (in an error state)
|
||||
yield From(self._build_finished(BuildJobResult.ERROR))
|
||||
|
||||
@trollius.coroutine
|
||||
def _build_complete(self, result):
|
||||
""" Wraps up a completed build. Handles any errors and calls self._build_finished. """
|
||||
build_id = self._current_job.repo_build.uuid
|
||||
|
||||
try:
|
||||
# Retrieve the result. This will raise an ApplicationError on any error that occurred.
|
||||
result_value = result.result()
|
||||
kwargs = {}
|
||||
|
||||
# Note: If we are hitting an older builder that didn't return ANY map data, then the result
|
||||
# value will be a bool instead of a proper CallResult object.
|
||||
# Therefore: we have a try-except guard here to ensure we don't hit this pitfall.
|
||||
try:
|
||||
kwargs = result_value.kwresults
|
||||
except:
|
||||
pass
|
||||
|
||||
try:
|
||||
yield From(self._build_status.set_phase(BUILD_PHASE.COMPLETE))
|
||||
except InvalidRepositoryBuildException:
|
||||
logger.warning('Build %s was not found; repo was probably deleted', build_id)
|
||||
raise Return()
|
||||
|
||||
yield From(self._build_finished(BuildJobResult.COMPLETE))
|
||||
|
||||
# Label the pushed manifests with the build metadata.
|
||||
manifest_digests = kwargs.get('digests') or []
|
||||
repository = registry_model.lookup_repository(self._current_job.namespace,
|
||||
self._current_job.repo_name)
|
||||
if repository is not None:
|
||||
for digest in manifest_digests:
|
||||
with UseThenDisconnect(app.config):
|
||||
manifest = registry_model.lookup_manifest_by_digest(repository, digest,
|
||||
require_available=True)
|
||||
if manifest is None:
|
||||
continue
|
||||
|
||||
registry_model.create_manifest_label(manifest, INTERNAL_LABEL_BUILD_UUID,
|
||||
build_id, 'internal', 'text/plain')
|
||||
|
||||
# Send the notification that the build has completed successfully.
|
||||
self._current_job.send_notification('build_success',
|
||||
image_id=kwargs.get('image_id'),
|
||||
manifest_digests=manifest_digests)
|
||||
except ApplicationError as aex:
|
||||
worker_error = WorkerError(aex.error, aex.kwargs.get('base_error'))
|
||||
|
||||
# Write the error to the log.
|
||||
yield From(self._build_status.set_error(worker_error.public_message(),
|
||||
worker_error.extra_data(),
|
||||
internal_error=worker_error.is_internal_error(),
|
||||
requeued=self._current_job.has_retries_remaining()))
|
||||
|
||||
# Send the notification that the build has failed.
|
||||
self._current_job.send_notification('build_failure',
|
||||
error_message=worker_error.public_message())
|
||||
|
||||
# Mark the build as completed.
|
||||
if worker_error.is_internal_error():
|
||||
logger.exception('[BUILD INTERNAL ERROR: Remote] Build ID: %s: %s', build_id,
|
||||
worker_error.public_message())
|
||||
yield From(self._build_finished(BuildJobResult.INCOMPLETE))
|
||||
else:
|
||||
logger.debug('Got remote failure exception for build %s: %s', build_id, aex)
|
||||
yield From(self._build_finished(BuildJobResult.ERROR))
|
||||
|
||||
# Remove the current job.
|
||||
self._current_job = None
|
||||
|
||||
|
||||
@trollius.coroutine
|
||||
def _build_finished(self, job_status):
|
||||
""" Alerts the parent that a build has completed and sets the status back to running. """
|
||||
yield From(self.parent_manager.job_completed(self._current_job, job_status, self))
|
||||
|
||||
# Set the component back to a running state.
|
||||
yield From(self._set_status(ComponentStatus.RUNNING))
|
||||
|
||||
@staticmethod
|
||||
def _ping():
|
||||
""" Ping pong. """
|
||||
return 'pong'
|
||||
|
||||
@trollius.coroutine
|
||||
def _on_ready(self, token, version):
|
||||
logger.debug('On ready called (token "%s")', token)
|
||||
self._worker_version = version
|
||||
|
||||
if not version in SUPPORTED_WORKER_VERSIONS:
|
||||
logger.warning('Build component (token "%s") is running an out-of-date version: %s', token,
|
||||
version)
|
||||
raise Return(False)
|
||||
|
||||
if self._component_status != ComponentStatus.WAITING:
|
||||
logger.warning('Build component (token "%s") is already connected', self.expected_token)
|
||||
raise Return(False)
|
||||
|
||||
if token != self.expected_token:
|
||||
logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token,
|
||||
token)
|
||||
raise Return(False)
|
||||
|
||||
yield From(self._set_status(ComponentStatus.RUNNING))
|
||||
|
||||
# Start the heartbeat check and updating loop.
|
||||
loop = trollius.get_event_loop()
|
||||
loop.create_task(self._heartbeat())
|
||||
logger.debug('Build worker %s is connected and ready', self.builder_realm)
|
||||
raise Return(True)
|
||||
|
||||
@trollius.coroutine
|
||||
def _set_status(self, phase):
|
||||
if phase == ComponentStatus.RUNNING:
|
||||
yield From(self.parent_manager.build_component_ready(self))
|
||||
|
||||
self._component_status = phase
|
||||
|
||||
def _on_heartbeat(self):
|
||||
""" Updates the last known heartbeat. """
|
||||
if self._component_status == ComponentStatus.TIMED_OUT:
|
||||
return
|
||||
|
||||
logger.debug('Got heartbeat on realm %s', self.builder_realm)
|
||||
self._last_heartbeat = datetime.datetime.utcnow()
|
||||
|
||||
@trollius.coroutine
|
||||
def _heartbeat(self):
|
||||
""" Coroutine that runs every HEARTBEAT_TIMEOUT seconds, both checking the worker's heartbeat
|
||||
and updating the heartbeat in the build status dictionary (if applicable). This allows
|
||||
the build system to catch crashes from either end.
|
||||
"""
|
||||
yield From(trollius.sleep(INITIAL_TIMEOUT))
|
||||
|
||||
while True:
|
||||
# If the component is no longer running or actively building, nothing more to do.
|
||||
if (self._component_status != ComponentStatus.RUNNING and
|
||||
self._component_status != ComponentStatus.BUILDING):
|
||||
raise Return()
|
||||
|
||||
# If there is an active build, write the heartbeat to its status.
|
||||
if self._build_status is not None:
|
||||
with self._build_status as status_dict:
|
||||
status_dict['heartbeat'] = int(time.time())
|
||||
|
||||
# Mark the build item.
|
||||
current_job = self._current_job
|
||||
if current_job is not None:
|
||||
yield From(self.parent_manager.job_heartbeat(current_job))
|
||||
|
||||
# Check the heartbeat from the worker.
|
||||
logger.debug('Checking heartbeat on realm %s', self.builder_realm)
|
||||
if (self._last_heartbeat and
|
||||
self._last_heartbeat < datetime.datetime.utcnow() - HEARTBEAT_DELTA):
|
||||
logger.debug('Heartbeat on realm %s has expired: %s', self.builder_realm,
|
||||
self._last_heartbeat)
|
||||
|
||||
yield From(self._timeout())
|
||||
raise Return()
|
||||
|
||||
logger.debug('Heartbeat on realm %s is valid: %s (%s).', self.builder_realm,
|
||||
self._last_heartbeat, self._component_status)
|
||||
|
||||
yield From(trollius.sleep(HEARTBEAT_TIMEOUT))
|
||||
|
||||
@trollius.coroutine
|
||||
def _timeout(self):
|
||||
if self._component_status == ComponentStatus.TIMED_OUT:
|
||||
raise Return()
|
||||
|
||||
yield From(self._set_status(ComponentStatus.TIMED_OUT))
|
||||
logger.warning('Build component with realm %s has timed out', self.builder_realm)
|
||||
|
||||
# If we still have a running job, then it has not completed and we need to tell the parent
|
||||
# manager.
|
||||
if self._current_job is not None:
|
||||
yield From(self._build_status.set_error('Build worker timed out', internal_error=True,
|
||||
requeued=self._current_job.has_retries_remaining()))
|
||||
|
||||
build_id = self._current_job.build_uuid
|
||||
logger.error('[BUILD INTERNAL ERROR: Timeout] Build ID: %s', build_id)
|
||||
yield From(self.parent_manager.job_completed(self._current_job,
|
||||
BuildJobResult.INCOMPLETE,
|
||||
self))
|
||||
|
||||
# Unregister the current component so that it cannot be invoked again.
|
||||
self.parent_manager.build_component_disposed(self, True)
|
||||
|
||||
# Remove the job reference.
|
||||
self._current_job = None
|
||||
|
||||
@trollius.coroutine
|
||||
def cancel_build(self):
|
||||
self.parent_manager.build_component_disposed(self, True)
|
||||
self._current_job = None
|
||||
yield From(self._set_status(ComponentStatus.RUNNING))
|
15
buildman/component/buildparse.py
Normal file
15
buildman/component/buildparse.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
import re
|
||||
|
||||
def extract_current_step(current_status_string):
|
||||
""" Attempts to extract the current step numeric identifier from the given status string. Returns the step
|
||||
number or None if none.
|
||||
"""
|
||||
# Older format: `Step 12 :`
|
||||
# Newer format: `Step 4/13 :`
|
||||
step_increment = re.search(r'Step ([0-9]+)/([0-9]+) :', current_status_string)
|
||||
if step_increment:
|
||||
return int(step_increment.group(1))
|
||||
|
||||
step_increment = re.search(r'Step ([0-9]+) :', current_status_string)
|
||||
if step_increment:
|
||||
return int(step_increment.group(1))
|
36
buildman/component/test/test_buildcomponent.py
Normal file
36
buildman/component/test/test_buildcomponent.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
import pytest
|
||||
|
||||
from buildman.component.buildcomponent import BuildComponent
|
||||
|
||||
|
||||
@pytest.mark.parametrize('input,expected_path,expected_file', [
|
||||
("", "/", "Dockerfile"),
|
||||
("/", "/", "Dockerfile"),
|
||||
("/Dockerfile", "/", "Dockerfile"),
|
||||
("/server.Dockerfile", "/", "server.Dockerfile"),
|
||||
("/somepath", "/somepath", "Dockerfile"),
|
||||
("/somepath/", "/somepath", "Dockerfile"),
|
||||
("/somepath/Dockerfile", "/somepath", "Dockerfile"),
|
||||
("/somepath/server.Dockerfile", "/somepath", "server.Dockerfile"),
|
||||
("/somepath/some_other_path", "/somepath/some_other_path", "Dockerfile"),
|
||||
("/somepath/some_other_path/", "/somepath/some_other_path", "Dockerfile"),
|
||||
("/somepath/some_other_path/Dockerfile", "/somepath/some_other_path", "Dockerfile"),
|
||||
("/somepath/some_other_path/server.Dockerfile", "/somepath/some_other_path", "server.Dockerfile"),
|
||||
])
|
||||
def test_path_is_dockerfile(input, expected_path, expected_file):
|
||||
actual_path, actual_file = BuildComponent.name_and_path(input)
|
||||
assert actual_path == expected_path
|
||||
assert actual_file == expected_file
|
||||
|
||||
@pytest.mark.parametrize('build_config,context,dockerfile_path', [
|
||||
({}, '', ''),
|
||||
({'build_subdir': '/builddir/Dockerfile'}, '', '/builddir/Dockerfile'),
|
||||
({'context': '/builddir'}, '/builddir', ''),
|
||||
({'context': '/builddir', 'build_subdir': '/builddir/Dockerfile'}, '/builddir', 'Dockerfile'),
|
||||
({'context': '/some_other_dir/Dockerfile', 'build_subdir': '/builddir/Dockerfile'}, '/builddir', 'Dockerfile'),
|
||||
({'context': '/', 'build_subdir':'Dockerfile'}, '/', 'Dockerfile')
|
||||
])
|
||||
def test_extract_dockerfile_args(build_config, context, dockerfile_path):
|
||||
actual_context, actual_dockerfile_path = BuildComponent.extract_dockerfile_args(build_config)
|
||||
assert context == actual_context
|
||||
assert dockerfile_path == actual_dockerfile_path
|
16
buildman/component/test/test_buildparse.py
Normal file
16
buildman/component/test/test_buildparse.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
import pytest
|
||||
|
||||
from buildman.component.buildparse import extract_current_step
|
||||
|
||||
|
||||
@pytest.mark.parametrize('input,expected_step', [
|
||||
("", None),
|
||||
("Step a :", None),
|
||||
("Step 1 :", 1),
|
||||
("Step 1 : ", 1),
|
||||
("Step 1/2 : ", 1),
|
||||
("Step 2/17 : ", 2),
|
||||
("Step 4/13 : ARG somearg=foo", 4),
|
||||
])
|
||||
def test_extract_current_step(input, expected_step):
|
||||
assert extract_current_step(input) == expected_step
|
21
buildman/enums.py
Normal file
21
buildman/enums.py
Normal file
|
@ -0,0 +1,21 @@
|
|||
from data.database import BUILD_PHASE
|
||||
|
||||
class BuildJobResult(object):
|
||||
""" Build job result enum """
|
||||
INCOMPLETE = 'incomplete'
|
||||
COMPLETE = 'complete'
|
||||
ERROR = 'error'
|
||||
|
||||
|
||||
class BuildServerStatus(object):
|
||||
""" Build server status enum """
|
||||
STARTING = 'starting'
|
||||
RUNNING = 'running'
|
||||
SHUTDOWN = 'shutting_down'
|
||||
EXCEPTION = 'exception'
|
||||
|
||||
RESULT_PHASES = {
|
||||
BuildJobResult.INCOMPLETE: BUILD_PHASE.INTERNAL_ERROR,
|
||||
BuildJobResult.COMPLETE: BUILD_PHASE.COMPLETE,
|
||||
BuildJobResult.ERROR: BUILD_PHASE.ERROR,
|
||||
}
|
0
buildman/jobutil/__init__.py
Normal file
0
buildman/jobutil/__init__.py
Normal file
183
buildman/jobutil/buildjob.py
Normal file
183
buildman/jobutil/buildjob.py
Normal file
|
@ -0,0 +1,183 @@
|
|||
import json
|
||||
import logging
|
||||
|
||||
from app import app
|
||||
from cachetools.func import lru_cache
|
||||
from notifications import spawn_notification
|
||||
from data import model
|
||||
from data.registry_model import registry_model
|
||||
from data.registry_model.datatypes import RepositoryReference
|
||||
from data.database import UseThenDisconnect
|
||||
from util.morecollections import AttrDict
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BuildJobLoadException(Exception):
|
||||
""" Exception raised if a build job could not be instantiated for some reason. """
|
||||
pass
|
||||
|
||||
|
||||
class BuildJob(object):
|
||||
""" Represents a single in-progress build job. """
|
||||
def __init__(self, job_item):
|
||||
self.job_item = job_item
|
||||
|
||||
try:
|
||||
self.job_details = json.loads(job_item.body)
|
||||
self.build_notifier = BuildJobNotifier(self.build_uuid)
|
||||
except ValueError:
|
||||
raise BuildJobLoadException(
|
||||
'Could not parse build queue item config with ID %s' % self.job_details['build_uuid']
|
||||
)
|
||||
|
||||
@property
|
||||
def retries_remaining(self):
|
||||
return self.job_item.retries_remaining
|
||||
|
||||
def has_retries_remaining(self):
|
||||
return self.job_item.retries_remaining > 0
|
||||
|
||||
def send_notification(self, kind, error_message=None, image_id=None, manifest_digests=None):
|
||||
self.build_notifier.send_notification(kind, error_message, image_id, manifest_digests)
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def _load_repo_build(self):
|
||||
with UseThenDisconnect(app.config):
|
||||
try:
|
||||
return model.build.get_repository_build(self.build_uuid)
|
||||
except model.InvalidRepositoryBuildException:
|
||||
raise BuildJobLoadException(
|
||||
'Could not load repository build with ID %s' % self.build_uuid)
|
||||
|
||||
@property
|
||||
def build_uuid(self):
|
||||
""" Returns the unique UUID for this build job. """
|
||||
return self.job_details['build_uuid']
|
||||
|
||||
@property
|
||||
def namespace(self):
|
||||
""" Returns the namespace under which this build is running. """
|
||||
return self.repo_build.repository.namespace_user.username
|
||||
|
||||
@property
|
||||
def repo_name(self):
|
||||
""" Returns the name of the repository under which this build is running. """
|
||||
return self.repo_build.repository.name
|
||||
|
||||
@property
|
||||
def repo_build(self):
|
||||
return self._load_repo_build()
|
||||
|
||||
def get_build_package_url(self, user_files):
|
||||
""" Returns the URL of the build package for this build, if any or empty string if none. """
|
||||
archive_url = self.build_config.get('archive_url', None)
|
||||
if archive_url:
|
||||
return archive_url
|
||||
|
||||
if not self.repo_build.resource_key:
|
||||
return ''
|
||||
|
||||
return user_files.get_file_url(self.repo_build.resource_key, '127.0.0.1', requires_cors=False)
|
||||
|
||||
@property
|
||||
def pull_credentials(self):
|
||||
""" Returns the pull credentials for this job, or None if none. """
|
||||
return self.job_details.get('pull_credentials')
|
||||
|
||||
@property
|
||||
def build_config(self):
|
||||
try:
|
||||
return json.loads(self.repo_build.job_config)
|
||||
except ValueError:
|
||||
raise BuildJobLoadException(
|
||||
'Could not parse repository build job config with ID %s' % self.job_details['build_uuid']
|
||||
)
|
||||
|
||||
def determine_cached_tag(self, base_image_id=None, cache_comments=None):
|
||||
""" Returns the tag to pull to prime the cache or None if none. """
|
||||
cached_tag = self._determine_cached_tag_by_tag()
|
||||
logger.debug('Determined cached tag %s for %s: %s', cached_tag, base_image_id, cache_comments)
|
||||
return cached_tag
|
||||
|
||||
def _determine_cached_tag_by_tag(self):
|
||||
""" Determines the cached tag by looking for one of the tags being built, and seeing if it
|
||||
exists in the repository. This is a fallback for when no comment information is available.
|
||||
"""
|
||||
with UseThenDisconnect(app.config):
|
||||
tags = self.build_config.get('docker_tags', ['latest'])
|
||||
repository = RepositoryReference.for_repo_obj(self.repo_build.repository)
|
||||
matching_tag = registry_model.find_matching_tag(repository, tags)
|
||||
if matching_tag is not None:
|
||||
return matching_tag.name
|
||||
|
||||
most_recent_tag = registry_model.get_most_recent_tag(repository)
|
||||
if most_recent_tag is not None:
|
||||
return most_recent_tag.name
|
||||
|
||||
return None
|
||||
|
||||
|
||||
class BuildJobNotifier(object):
|
||||
""" A class for sending notifications to a job that only relies on the build_uuid """
|
||||
|
||||
def __init__(self, build_uuid):
|
||||
self.build_uuid = build_uuid
|
||||
|
||||
@property
|
||||
def repo_build(self):
|
||||
return self._load_repo_build()
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def _load_repo_build(self):
|
||||
try:
|
||||
return model.build.get_repository_build(self.build_uuid)
|
||||
except model.InvalidRepositoryBuildException:
|
||||
raise BuildJobLoadException(
|
||||
'Could not load repository build with ID %s' % self.build_uuid)
|
||||
|
||||
@property
|
||||
def build_config(self):
|
||||
try:
|
||||
return json.loads(self.repo_build.job_config)
|
||||
except ValueError:
|
||||
raise BuildJobLoadException(
|
||||
'Could not parse repository build job config with ID %s' % self.repo_build.uuid
|
||||
)
|
||||
|
||||
def send_notification(self, kind, error_message=None, image_id=None, manifest_digests=None):
|
||||
with UseThenDisconnect(app.config):
|
||||
tags = self.build_config.get('docker_tags', ['latest'])
|
||||
trigger = self.repo_build.trigger
|
||||
if trigger is not None and trigger.id is not None:
|
||||
trigger_kind = trigger.service.name
|
||||
else:
|
||||
trigger_kind = None
|
||||
|
||||
event_data = {
|
||||
'build_id': self.repo_build.uuid,
|
||||
'build_name': self.repo_build.display_name,
|
||||
'docker_tags': tags,
|
||||
'trigger_id': trigger.uuid if trigger is not None else None,
|
||||
'trigger_kind': trigger_kind,
|
||||
'trigger_metadata': self.build_config.get('trigger_metadata', {})
|
||||
}
|
||||
|
||||
if image_id is not None:
|
||||
event_data['image_id'] = image_id
|
||||
|
||||
if manifest_digests:
|
||||
event_data['manifest_digests'] = manifest_digests
|
||||
|
||||
if error_message is not None:
|
||||
event_data['error_message'] = error_message
|
||||
|
||||
# TODO: remove when more endpoints have been converted to using
|
||||
# interfaces
|
||||
repo = AttrDict({
|
||||
'namespace_name': self.repo_build.repository.namespace_user.username,
|
||||
'name': self.repo_build.repository.name,
|
||||
})
|
||||
spawn_notification(repo, kind, event_data,
|
||||
subpage='build/%s' % self.repo_build.uuid,
|
||||
pathargs=['build', self.repo_build.uuid])
|
88
buildman/jobutil/buildstatus.py
Normal file
88
buildman/jobutil/buildstatus.py
Normal file
|
@ -0,0 +1,88 @@
|
|||
import datetime
|
||||
import logging
|
||||
|
||||
from redis import RedisError
|
||||
from trollius import From, Return, coroutine
|
||||
|
||||
from data.database import BUILD_PHASE
|
||||
from data import model
|
||||
from buildman.asyncutil import AsyncWrapper
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StatusHandler(object):
|
||||
""" Context wrapper for writing status to build logs. """
|
||||
|
||||
def __init__(self, build_logs, repository_build_uuid):
|
||||
self._current_phase = None
|
||||
self._current_command = None
|
||||
self._uuid = repository_build_uuid
|
||||
self._build_logs = AsyncWrapper(build_logs)
|
||||
self._sync_build_logs = build_logs
|
||||
self._build_model = AsyncWrapper(model.build)
|
||||
|
||||
self._status = {
|
||||
'total_commands': 0,
|
||||
'current_command': None,
|
||||
'push_completion': 0.0,
|
||||
'pull_completion': 0.0,
|
||||
}
|
||||
|
||||
# Write the initial status.
|
||||
self.__exit__(None, None, None)
|
||||
|
||||
@coroutine
|
||||
def _append_log_message(self, log_message, log_type=None, log_data=None):
|
||||
log_data = log_data or {}
|
||||
log_data['datetime'] = str(datetime.datetime.now())
|
||||
|
||||
try:
|
||||
yield From(self._build_logs.append_log_message(self._uuid, log_message, log_type, log_data))
|
||||
except RedisError:
|
||||
logger.exception('Could not save build log for build %s: %s', self._uuid, log_message)
|
||||
|
||||
@coroutine
|
||||
def append_log(self, log_message, extra_data=None):
|
||||
if log_message is None:
|
||||
return
|
||||
|
||||
yield From(self._append_log_message(log_message, log_data=extra_data))
|
||||
|
||||
@coroutine
|
||||
def set_command(self, command, extra_data=None):
|
||||
if self._current_command == command:
|
||||
raise Return()
|
||||
|
||||
self._current_command = command
|
||||
yield From(self._append_log_message(command, self._build_logs.COMMAND, extra_data))
|
||||
|
||||
@coroutine
|
||||
def set_error(self, error_message, extra_data=None, internal_error=False, requeued=False):
|
||||
error_phase = BUILD_PHASE.INTERNAL_ERROR if internal_error and requeued else BUILD_PHASE.ERROR
|
||||
yield From(self.set_phase(error_phase))
|
||||
|
||||
extra_data = extra_data or {}
|
||||
extra_data['internal_error'] = internal_error
|
||||
yield From(self._append_log_message(error_message, self._build_logs.ERROR, extra_data))
|
||||
|
||||
@coroutine
|
||||
def set_phase(self, phase, extra_data=None):
|
||||
if phase == self._current_phase:
|
||||
raise Return(False)
|
||||
|
||||
self._current_phase = phase
|
||||
yield From(self._append_log_message(phase, self._build_logs.PHASE, extra_data))
|
||||
|
||||
# Update the repository build with the new phase
|
||||
raise Return(self._build_model.update_phase_then_close(self._uuid, phase))
|
||||
|
||||
def __enter__(self):
|
||||
return self._status
|
||||
|
||||
def __exit__(self, exc_type, value, traceback):
|
||||
try:
|
||||
self._sync_build_logs.set_status(self._uuid, self._status)
|
||||
except RedisError:
|
||||
logger.exception('Could not set status of build %s to %s', self._uuid, self._status)
|
119
buildman/jobutil/workererror.py
Normal file
119
buildman/jobutil/workererror.py
Normal file
|
@ -0,0 +1,119 @@
|
|||
class WorkerError(object):
|
||||
""" Helper class which represents errors raised by a build worker. """
|
||||
def __init__(self, error_code, base_message=None):
|
||||
self._error_code = error_code
|
||||
self._base_message = base_message
|
||||
|
||||
self._error_handlers = {
|
||||
'io.quay.builder.buildpackissue': {
|
||||
'message': 'Could not load build package',
|
||||
'is_internal': True,
|
||||
},
|
||||
|
||||
'io.quay.builder.gitfailure': {
|
||||
'message': 'Could not clone git repository',
|
||||
'show_base_error': True,
|
||||
},
|
||||
|
||||
'io.quay.builder.gitcheckout': {
|
||||
'message': 'Could not checkout git ref. If you force pushed recently, ' +
|
||||
'the commit may be missing.',
|
||||
'show_base_error': True,
|
||||
},
|
||||
|
||||
'io.quay.builder.cannotextractbuildpack': {
|
||||
'message': 'Could not extract the contents of the build package'
|
||||
},
|
||||
|
||||
'io.quay.builder.cannotpullforcache': {
|
||||
'message': 'Could not pull cached image',
|
||||
'is_internal': True
|
||||
},
|
||||
|
||||
'io.quay.builder.dockerfileissue': {
|
||||
'message': 'Could not find or parse Dockerfile',
|
||||
'show_base_error': True
|
||||
},
|
||||
|
||||
'io.quay.builder.cannotpullbaseimage': {
|
||||
'message': 'Could not pull base image',
|
||||
'show_base_error': True
|
||||
},
|
||||
|
||||
'io.quay.builder.internalerror': {
|
||||
'message': 'An internal error occurred while building. Please submit a ticket.',
|
||||
'is_internal': True
|
||||
},
|
||||
|
||||
'io.quay.builder.buildrunerror': {
|
||||
'message': 'Could not start the build process',
|
||||
'is_internal': True
|
||||
},
|
||||
|
||||
'io.quay.builder.builderror': {
|
||||
'message': 'A build step failed',
|
||||
'show_base_error': True
|
||||
},
|
||||
|
||||
'io.quay.builder.tagissue': {
|
||||
'message': 'Could not tag built image',
|
||||
'is_internal': True
|
||||
},
|
||||
|
||||
'io.quay.builder.pushissue': {
|
||||
'message': 'Could not push built image',
|
||||
'show_base_error': True,
|
||||
'is_internal': True
|
||||
},
|
||||
|
||||
'io.quay.builder.dockerconnecterror': {
|
||||
'message': 'Could not connect to Docker daemon',
|
||||
'is_internal': True
|
||||
},
|
||||
|
||||
'io.quay.builder.missingorinvalidargument': {
|
||||
'message': 'Missing required arguments for builder',
|
||||
'is_internal': True
|
||||
},
|
||||
|
||||
'io.quay.builder.cachelookupissue': {
|
||||
'message': 'Error checking for a cached tag',
|
||||
'is_internal': True
|
||||
},
|
||||
|
||||
'io.quay.builder.errorduringphasetransition': {
|
||||
'message': 'Error during phase transition. If this problem persists ' +
|
||||
'please contact customer support.',
|
||||
'is_internal': True
|
||||
},
|
||||
|
||||
'io.quay.builder.clientrejectedtransition': {
|
||||
'message': 'Build can not be finished due to user cancellation.',
|
||||
}
|
||||
}
|
||||
|
||||
def is_internal_error(self):
|
||||
handler = self._error_handlers.get(self._error_code)
|
||||
return handler.get('is_internal', False) if handler else True
|
||||
|
||||
def public_message(self):
|
||||
handler = self._error_handlers.get(self._error_code)
|
||||
if not handler:
|
||||
return 'An unknown error occurred'
|
||||
|
||||
message = handler['message']
|
||||
if handler.get('show_base_error', False) and self._base_message:
|
||||
message = message + ': ' + self._base_message
|
||||
|
||||
return message
|
||||
|
||||
def extra_data(self):
|
||||
if self._base_message:
|
||||
return {
|
||||
'base_error': self._base_message,
|
||||
'error_code': self._error_code
|
||||
}
|
||||
|
||||
return {
|
||||
'error_code': self._error_code
|
||||
}
|
0
buildman/manager/__init__.py
Normal file
0
buildman/manager/__init__.py
Normal file
71
buildman/manager/basemanager.py
Normal file
71
buildman/manager/basemanager.py
Normal file
|
@ -0,0 +1,71 @@
|
|||
from trollius import coroutine
|
||||
|
||||
class BaseManager(object):
|
||||
""" Base for all worker managers. """
|
||||
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
|
||||
job_complete_callback, manager_hostname, heartbeat_period_sec):
|
||||
self.register_component = register_component
|
||||
self.unregister_component = unregister_component
|
||||
self.job_heartbeat_callback = job_heartbeat_callback
|
||||
self.job_complete_callback = job_complete_callback
|
||||
self.manager_hostname = manager_hostname
|
||||
self.heartbeat_period_sec = heartbeat_period_sec
|
||||
|
||||
@coroutine
|
||||
def job_heartbeat(self, build_job):
|
||||
""" Method invoked to tell the manager that a job is still running. This method will be called
|
||||
every few minutes. """
|
||||
self.job_heartbeat_callback(build_job)
|
||||
|
||||
def overall_setup_time(self):
|
||||
""" Returns the number of seconds that the build system should wait before allowing the job
|
||||
to be picked up again after called 'schedule'.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def shutdown(self):
|
||||
""" Indicates that the build controller server is in a shutdown state and that no new jobs
|
||||
or workers should be performed. Existing workers should be cleaned up once their jobs
|
||||
have completed
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@coroutine
|
||||
def schedule(self, build_job):
|
||||
""" Schedules a queue item to be built. Returns a 2-tuple with (True, None) if the item was
|
||||
properly scheduled and (False, a retry timeout in seconds) if all workers are busy or an
|
||||
error occurs.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def initialize(self, manager_config):
|
||||
""" Runs any initialization code for the manager. Called once the server is in a ready state.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@coroutine
|
||||
def build_component_ready(self, build_component):
|
||||
""" Method invoked whenever a build component announces itself as ready.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def build_component_disposed(self, build_component, timed_out):
|
||||
""" Method invoked whenever a build component has been disposed. The timed_out boolean indicates
|
||||
whether the component's heartbeat timed out.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@coroutine
|
||||
def job_completed(self, build_job, job_status, build_component):
|
||||
""" Method invoked once a job_item has completed, in some manner. The job_status will be
|
||||
one of: incomplete, error, complete. Implementations of this method should call coroutine
|
||||
self.job_complete_callback with a status of Incomplete if they wish for the job to be
|
||||
automatically requeued.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def num_workers(self):
|
||||
""" Returns the number of active build workers currently registered. This includes those
|
||||
that are currently busy and awaiting more work.
|
||||
"""
|
||||
raise NotImplementedError
|
27
buildman/manager/buildcanceller.py
Normal file
27
buildman/manager/buildcanceller.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
import logging
|
||||
|
||||
from buildman.manager.orchestrator_canceller import OrchestratorCanceller
|
||||
from buildman.manager.noop_canceller import NoopCanceller
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
CANCELLERS = {'ephemeral': OrchestratorCanceller}
|
||||
|
||||
|
||||
class BuildCanceller(object):
|
||||
""" A class to manage cancelling a build """
|
||||
|
||||
def __init__(self, app=None):
|
||||
self.build_manager_config = app.config.get('BUILD_MANAGER')
|
||||
if app is None or self.build_manager_config is None:
|
||||
self.handler = NoopCanceller()
|
||||
else:
|
||||
self.handler = None
|
||||
|
||||
def try_cancel_build(self, uuid):
|
||||
""" A method to kill a running build """
|
||||
if self.handler is None:
|
||||
canceller = CANCELLERS.get(self.build_manager_config[0], NoopCanceller)
|
||||
self.handler = canceller(self.build_manager_config[1])
|
||||
|
||||
return self.handler.try_cancel_build(uuid)
|
92
buildman/manager/enterprise.py
Normal file
92
buildman/manager/enterprise.py
Normal file
|
@ -0,0 +1,92 @@
|
|||
import logging
|
||||
import uuid
|
||||
|
||||
from buildman.component.basecomponent import BaseComponent
|
||||
from buildman.component.buildcomponent import BuildComponent
|
||||
from buildman.manager.basemanager import BaseManager
|
||||
|
||||
from trollius import From, Return, coroutine
|
||||
|
||||
REGISTRATION_REALM = 'registration'
|
||||
RETRY_TIMEOUT = 5
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DynamicRegistrationComponent(BaseComponent):
|
||||
""" Component session that handles dynamic registration of the builder components. """
|
||||
|
||||
def onConnect(self):
|
||||
self.join(REGISTRATION_REALM)
|
||||
|
||||
def onJoin(self, details):
|
||||
logger.debug('Registering registration method')
|
||||
yield From(self.register(self._worker_register, u'io.quay.buildworker.register'))
|
||||
|
||||
def _worker_register(self):
|
||||
realm = self.parent_manager.add_build_component()
|
||||
logger.debug('Registering new build component+worker with realm %s', realm)
|
||||
return realm
|
||||
|
||||
def kind(self):
|
||||
return 'registration'
|
||||
|
||||
|
||||
class EnterpriseManager(BaseManager):
|
||||
""" Build manager implementation for the Enterprise Registry. """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.ready_components = set()
|
||||
self.all_components = set()
|
||||
self.shutting_down = False
|
||||
|
||||
super(EnterpriseManager, self).__init__(*args, **kwargs)
|
||||
|
||||
def initialize(self, manager_config):
|
||||
# Add a component which is used by build workers for dynamic registration. Unlike
|
||||
# production, build workers in enterprise are long-lived and register dynamically.
|
||||
self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
|
||||
|
||||
def overall_setup_time(self):
|
||||
# Builders are already registered, so the setup time should be essentially instant. We therefore
|
||||
# only return a minute here.
|
||||
return 60
|
||||
|
||||
def add_build_component(self):
|
||||
""" Adds a new build component for an Enterprise Registry. """
|
||||
# Generate a new unique realm ID for the build worker.
|
||||
realm = str(uuid.uuid4())
|
||||
new_component = self.register_component(realm, BuildComponent, token="")
|
||||
self.all_components.add(new_component)
|
||||
return realm
|
||||
|
||||
@coroutine
|
||||
def schedule(self, build_job):
|
||||
""" Schedules a build for an Enterprise Registry. """
|
||||
if self.shutting_down or not self.ready_components:
|
||||
raise Return(False, RETRY_TIMEOUT)
|
||||
|
||||
component = self.ready_components.pop()
|
||||
|
||||
yield From(component.start_build(build_job))
|
||||
|
||||
raise Return(True, None)
|
||||
|
||||
@coroutine
|
||||
def build_component_ready(self, build_component):
|
||||
self.ready_components.add(build_component)
|
||||
|
||||
def shutdown(self):
|
||||
self.shutting_down = True
|
||||
|
||||
@coroutine
|
||||
def job_completed(self, build_job, job_status, build_component):
|
||||
yield From(self.job_complete_callback(build_job, job_status))
|
||||
|
||||
def build_component_disposed(self, build_component, timed_out):
|
||||
self.all_components.remove(build_component)
|
||||
if build_component in self.ready_components:
|
||||
self.ready_components.remove(build_component)
|
||||
|
||||
self.unregister_component(build_component)
|
||||
|
||||
def num_workers(self):
|
||||
return len(self.all_components)
|
710
buildman/manager/ephemeral.py
Normal file
710
buildman/manager/ephemeral.py
Normal file
|
@ -0,0 +1,710 @@
|
|||
import logging
|
||||
import uuid
|
||||
import calendar
|
||||
import json
|
||||
import time
|
||||
|
||||
from collections import namedtuple
|
||||
from datetime import datetime, timedelta
|
||||
from six import iteritems
|
||||
|
||||
from trollius import From, coroutine, Return, async, sleep
|
||||
|
||||
from app import metric_queue
|
||||
from buildman.orchestrator import (orchestrator_from_config, KeyEvent,
|
||||
OrchestratorError, OrchestratorConnectionError,
|
||||
ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
|
||||
from buildman.manager.basemanager import BaseManager
|
||||
from buildman.manager.executor import PopenExecutor, EC2Executor, KubernetesExecutor
|
||||
from buildman.component.buildcomponent import BuildComponent
|
||||
from buildman.jobutil.buildjob import BuildJob
|
||||
from buildman.server import BuildJobResult
|
||||
from util import slash_join
|
||||
from util.morecollections import AttrDict
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
JOB_PREFIX = 'building/'
|
||||
LOCK_PREFIX = 'lock/'
|
||||
REALM_PREFIX = 'realm/'
|
||||
CANCEL_PREFIX = 'cancel/'
|
||||
METRIC_PREFIX = 'metric/'
|
||||
|
||||
CANCELED_LOCK_PREFIX = slash_join(LOCK_PREFIX, 'job-cancelled')
|
||||
EXPIRED_LOCK_PREFIX = slash_join(LOCK_PREFIX, 'job-expired')
|
||||
|
||||
EPHEMERAL_API_TIMEOUT = 20
|
||||
EPHEMERAL_SETUP_TIMEOUT = 500
|
||||
|
||||
RETRY_IMMEDIATELY_SLEEP_DURATION = 0
|
||||
TOO_MANY_WORKERS_SLEEP_DURATION = 10
|
||||
|
||||
|
||||
BuildInfo = namedtuple('BuildInfo', ['component', 'build_job', 'execution_id', 'executor_name'])
|
||||
|
||||
|
||||
class EphemeralBuilderManager(BaseManager):
|
||||
""" Build manager implementation for the Enterprise Registry. """
|
||||
|
||||
EXECUTORS = {
|
||||
'popen': PopenExecutor,
|
||||
'ec2': EC2Executor,
|
||||
'kubernetes': KubernetesExecutor,
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
|
||||
|
||||
self._shutting_down = False
|
||||
|
||||
self._manager_config = None
|
||||
self._orchestrator = None
|
||||
|
||||
# The registered executors available for running jobs, in order.
|
||||
self._ordered_executors = []
|
||||
|
||||
# The registered executors, mapped by their unique name.
|
||||
self._executor_name_to_executor = {}
|
||||
|
||||
# Map from builder component to its associated job.
|
||||
self._component_to_job = {}
|
||||
|
||||
# Map from build UUID to a BuildInfo tuple with information about the build.
|
||||
self._build_uuid_to_info = {}
|
||||
|
||||
def overall_setup_time(self):
|
||||
return EPHEMERAL_SETUP_TIMEOUT
|
||||
|
||||
@coroutine
|
||||
def _mark_job_incomplete(self, build_job, build_info):
|
||||
""" Marks a job as incomplete, in response to a failure to start or a timeout. """
|
||||
executor_name = build_info.executor_name
|
||||
execution_id = build_info.execution_id
|
||||
|
||||
logger.warning('Build executor failed to successfully boot with execution id %s',
|
||||
execution_id)
|
||||
|
||||
# Take a lock to ensure that only one manager reports the build as incomplete for this
|
||||
# execution.
|
||||
lock_key = slash_join(self._expired_lock_prefix, build_job.build_uuid, execution_id)
|
||||
acquired_lock = yield From(self._orchestrator.lock(lock_key))
|
||||
if acquired_lock:
|
||||
try:
|
||||
# Clean up the bookkeeping for the job.
|
||||
yield From(self._orchestrator.delete_key(self._job_key(build_job)))
|
||||
except KeyError:
|
||||
logger.debug('Could not delete job key %s; might have been removed already',
|
||||
build_job.build_uuid)
|
||||
|
||||
logger.error('[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s',
|
||||
build_job.build_uuid, executor_name, execution_id)
|
||||
yield From(self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name,
|
||||
update_phase=True))
|
||||
else:
|
||||
logger.debug('Did not get lock for job-expiration for job %s', build_job.build_uuid)
|
||||
|
||||
@coroutine
|
||||
def _job_callback(self, key_change):
|
||||
"""
|
||||
This is the callback invoked when keys related to jobs are changed.
|
||||
It ignores all events related to the creation of new jobs.
|
||||
Deletes or expirations cause checks to ensure they've been properly marked as completed.
|
||||
|
||||
:param key_change: the event and value produced by a key changing in the orchestrator
|
||||
:type key_change: :class:`KeyChange`
|
||||
"""
|
||||
if key_change.event in (KeyEvent.CREATE, KeyEvent.SET):
|
||||
raise Return()
|
||||
|
||||
elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
|
||||
# Handle the expiration/deletion.
|
||||
job_metadata = json.loads(key_change.value)
|
||||
build_job = BuildJob(AttrDict(job_metadata['job_queue_item']))
|
||||
logger.debug('Got "%s" of job %s', key_change.event, build_job.build_uuid)
|
||||
|
||||
# Get the build info.
|
||||
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
|
||||
if build_info is None:
|
||||
logger.debug('No build info for "%s" job %s (%s); probably already deleted by this manager',
|
||||
key_change.event, build_job.build_uuid, job_metadata)
|
||||
raise Return()
|
||||
|
||||
if key_change.event != KeyEvent.EXPIRE:
|
||||
# If the etcd action was not an expiration, then it was already deleted by some manager and
|
||||
# the execution was therefore already shutdown. All that's left is to remove the build info.
|
||||
self._build_uuid_to_info.pop(build_job.build_uuid, None)
|
||||
raise Return()
|
||||
|
||||
logger.debug('got expiration for job %s with metadata: %s', build_job.build_uuid,
|
||||
job_metadata)
|
||||
|
||||
if not job_metadata.get('had_heartbeat', False):
|
||||
# If we have not yet received a heartbeat, then the node failed to boot in some way.
|
||||
# We mark the job as incomplete here.
|
||||
yield From(self._mark_job_incomplete(build_job, build_info))
|
||||
|
||||
# Finally, we terminate the build execution for the job. We don't do this under a lock as
|
||||
# terminating a node is an atomic operation; better to make sure it is terminated than not.
|
||||
logger.info('Terminating expired build executor for job %s with execution id %s',
|
||||
build_job.build_uuid, build_info.execution_id)
|
||||
yield From(self.kill_builder_executor(build_job.build_uuid))
|
||||
else:
|
||||
logger.warning('Unexpected KeyEvent (%s) on job key: %s', key_change.event, key_change.key)
|
||||
|
||||
|
||||
@coroutine
|
||||
def _realm_callback(self, key_change):
|
||||
logger.debug('realm callback for key: %s', key_change.key)
|
||||
if key_change.event == KeyEvent.CREATE:
|
||||
# Listen on the realm created by ourselves or another worker.
|
||||
realm_spec = json.loads(key_change.value)
|
||||
self._register_realm(realm_spec)
|
||||
|
||||
elif key_change.event in (KeyEvent.DELETE, KeyEvent.EXPIRE):
|
||||
# Stop listening for new connections on the realm, if we did not get the connection.
|
||||
realm_spec = json.loads(key_change.value)
|
||||
realm_id = realm_spec['realm']
|
||||
|
||||
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
|
||||
build_uuid = build_job.build_uuid
|
||||
|
||||
logger.debug('Realm key %s for build %s was %s', realm_id, build_uuid, key_change.event)
|
||||
build_info = self._build_uuid_to_info.get(build_uuid, None)
|
||||
if build_info is not None:
|
||||
# Pop off the component and if we find one, then the build has not connected to this
|
||||
# manager, so we can safely unregister its component.
|
||||
component = self._component_to_job.pop(build_info.component, None)
|
||||
if component is not None:
|
||||
# We were not the manager which the worker connected to, remove the bookkeeping for it
|
||||
logger.debug('Unregistering unused component for build %s', build_uuid)
|
||||
self.unregister_component(build_info.component)
|
||||
|
||||
# If the realm has expired, then perform cleanup of the executor.
|
||||
if key_change.event == KeyEvent.EXPIRE:
|
||||
execution_id = realm_spec.get('execution_id', None)
|
||||
executor_name = realm_spec.get('executor_name', 'EC2Executor')
|
||||
|
||||
# Cleanup the job, since it never started.
|
||||
logger.debug('Job %s for incomplete marking: %s', build_uuid, build_info)
|
||||
if build_info is not None:
|
||||
yield From(self._mark_job_incomplete(build_job, build_info))
|
||||
|
||||
# Cleanup the executor.
|
||||
logger.info('Realm %s expired for job %s, terminating executor %s with execution id %s',
|
||||
realm_id, build_uuid, executor_name, execution_id)
|
||||
yield From(self.terminate_executor(executor_name, execution_id))
|
||||
|
||||
else:
|
||||
logger.warning('Unexpected action (%s) on realm key: %s', key_change.event, key_change.key)
|
||||
|
||||
|
||||
def _register_realm(self, realm_spec):
|
||||
logger.debug('Got call to register realm %s with manager', realm_spec['realm'])
|
||||
|
||||
# Create the build information block for the registered realm.
|
||||
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
|
||||
execution_id = realm_spec.get('execution_id', None)
|
||||
executor_name = realm_spec.get('executor_name', 'EC2Executor')
|
||||
|
||||
logger.debug('Registering realm %s with manager: %s', realm_spec['realm'], realm_spec)
|
||||
component = self.register_component(realm_spec['realm'], BuildComponent,
|
||||
token=realm_spec['token'])
|
||||
|
||||
build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id,
|
||||
executor_name=executor_name)
|
||||
|
||||
self._component_to_job[component] = build_job
|
||||
self._build_uuid_to_info[build_job.build_uuid] = build_info
|
||||
|
||||
logger.debug('Registered realm %s with manager', realm_spec['realm'])
|
||||
return component
|
||||
|
||||
@property
|
||||
def registered_executors(self):
|
||||
return self._ordered_executors
|
||||
|
||||
@coroutine
|
||||
def _register_existing_realms(self):
|
||||
try:
|
||||
all_realms = yield From(self._orchestrator.get_prefixed_keys(self._realm_prefix))
|
||||
|
||||
# Register all existing realms found.
|
||||
encountered = {self._register_realm(json.loads(realm_data))
|
||||
for _realm, realm_data in all_realms}
|
||||
|
||||
# Remove any components not encountered so we can clean up.
|
||||
for component, job in list(iteritems(self._component_to_job)):
|
||||
if component not in encountered:
|
||||
self._component_to_job.pop(component, None)
|
||||
self._build_uuid_to_info.pop(job.build_uuid, None)
|
||||
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
def _load_executor(self, executor_kind_name, executor_config):
|
||||
executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name)
|
||||
if executor_klass is None:
|
||||
logger.error('Unknown executor %s; skipping install', executor_kind_name)
|
||||
return
|
||||
|
||||
executor = executor_klass(executor_config, self.manager_hostname)
|
||||
if executor.name in self._executor_name_to_executor:
|
||||
raise Exception('Executor with name %s already registered' % executor.name)
|
||||
|
||||
self._ordered_executors.append(executor)
|
||||
self._executor_name_to_executor[executor.name] = executor
|
||||
|
||||
def _config_prefix(self, key):
|
||||
if self._manager_config.get('ORCHESTRATOR') is None:
|
||||
return key
|
||||
|
||||
prefix = self._manager_config.get('ORCHESTRATOR_PREFIX', '')
|
||||
return slash_join(prefix, key).lstrip('/') + '/'
|
||||
|
||||
@property
|
||||
def _job_prefix(self):
|
||||
return self._config_prefix(JOB_PREFIX)
|
||||
|
||||
@property
|
||||
def _realm_prefix(self):
|
||||
return self._config_prefix(REALM_PREFIX)
|
||||
|
||||
@property
|
||||
def _cancel_prefix(self):
|
||||
return self._config_prefix(CANCEL_PREFIX)
|
||||
|
||||
@property
|
||||
def _metric_prefix(self):
|
||||
return self._config_prefix(METRIC_PREFIX)
|
||||
|
||||
@property
|
||||
def _expired_lock_prefix(self):
|
||||
return self._config_prefix(EXPIRED_LOCK_PREFIX)
|
||||
|
||||
@property
|
||||
def _canceled_lock_prefix(self):
|
||||
return self._config_prefix(CANCELED_LOCK_PREFIX)
|
||||
|
||||
def _metric_key(self, realm):
|
||||
"""
|
||||
Create a key which is used to track duration metrics for a build realm in the Orchestrator.
|
||||
|
||||
:param realm: realm for the build
|
||||
:type realm: str
|
||||
:returns: key used to track the realm's metrics
|
||||
:rtype: str
|
||||
"""
|
||||
return slash_join(self._metric_prefix, realm)
|
||||
|
||||
def _job_key(self, build_job):
|
||||
"""
|
||||
Creates a key which is used to track a job in the Orchestrator.
|
||||
|
||||
:param build_job: the build job to track
|
||||
:type build_job: BuildJob
|
||||
:returns: key used to track the job
|
||||
:rtype: str
|
||||
"""
|
||||
return slash_join(self._job_prefix, build_job.job_details['build_uuid'])
|
||||
|
||||
def _realm_key(self, realm):
|
||||
"""
|
||||
Create a key which is used to track an incoming connection on a realm.
|
||||
|
||||
:param realm: realm for the build
|
||||
:type realm: str
|
||||
:returns: key used to track the connection to the realm
|
||||
:rtype: str
|
||||
"""
|
||||
return slash_join(self._realm_prefix, realm)
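# Illustrative note (not part of the original module): with an ORCHESTRATOR block
# configured and ORCHESTRATOR_PREFIX set to, for example, 'quay/', the helpers above
# yield keys of the following shapes (the identifiers are hypothetical):
#
#   _job_key(build_job)    -> 'quay/building/<build_uuid>'
#   _realm_key(realm)      -> 'quay/realm/<realm_uuid>'
#   _metric_key(realm)     -> 'quay/metric/<realm_uuid>'
#   expired-lock keys      -> 'quay/lock/job-expired/<build_uuid>/<execution_id>'
#
# Without an ORCHESTRATOR block, _config_prefix returns the bare prefixes unchanged.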
|
||||
|
||||
|
||||
def initialize(self, manager_config):
|
||||
logger.debug('Calling initialize')
|
||||
self._manager_config = manager_config
|
||||
|
||||
# Note: Executor config can be defined either as a single block of EXECUTOR_CONFIG (old style)
|
||||
# or as a new set of executor configurations, with the order determining how we fallback. We
|
||||
# check for both here to ensure backwards compatibility.
|
||||
if manager_config.get('EXECUTORS'):
|
||||
for executor_config in manager_config['EXECUTORS']:
|
||||
self._load_executor(executor_config.get('EXECUTOR'), executor_config)
|
||||
else:
|
||||
self._load_executor(manager_config.get('EXECUTOR'), manager_config.get('EXECUTOR_CONFIG'))
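# Illustrative sketch of the two configuration styles handled above (names and values
# are hypothetical, not taken from a real deployment). New style, a list of executors
# tried in order with fallback:
#
#   EXECUTORS:
#   - EXECUTOR: ec2
#     NAME: prod-ec2
#     MINIMUM_RETRY_THRESHOLD: 0
#   - EXECUTOR: kubernetes
#     NAME: fallback-k8s
#     MINIMUM_RETRY_THRESHOLD: 1
#
# Old style, a single executor with its own config block:
#
#   EXECUTOR: ec2
#   EXECUTOR_CONFIG:
#     EC2_REGION: us-east-1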
|
||||
|
||||
logger.debug('calling orchestrator_from_config')
|
||||
self._orchestrator = orchestrator_from_config(manager_config)
|
||||
|
||||
logger.debug('setting on_key_change callbacks for job, cancel, realm')
|
||||
self._orchestrator.on_key_change(self._job_prefix, self._job_callback)
|
||||
self._orchestrator.on_key_change(self._cancel_prefix, self._cancel_callback)
|
||||
self._orchestrator.on_key_change(self._realm_prefix, self._realm_callback,
|
||||
restarter=self._register_existing_realms)
|
||||
|
||||
# Load components for all realms currently known to the cluster
|
||||
async(self._register_existing_realms())
|
||||
|
||||
def shutdown(self):
|
||||
logger.debug('Shutting down worker.')
|
||||
if self._orchestrator is not None:
|
||||
self._orchestrator.shutdown()
|
||||
|
||||
@coroutine
|
||||
def schedule(self, build_job):
|
||||
build_uuid = build_job.job_details['build_uuid']
|
||||
logger.debug('Calling schedule with job: %s', build_uuid)
|
||||
|
||||
# Check if there are worker slots available by checking the number of jobs in the orchestrator
|
||||
allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
|
||||
try:
|
||||
active_jobs = yield From(self._orchestrator.get_prefixed_keys(self._job_prefix))
|
||||
workers_alive = len(active_jobs)
|
||||
except KeyError:
|
||||
workers_alive = 0
|
||||
except OrchestratorConnectionError:
|
||||
logger.exception('Could not read job count from the orchestrator; orchestrator is unavailable')
|
||||
raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
|
||||
except OrchestratorError:
|
||||
logger.exception('Exception when reading job count from orchestrator for job: %s', build_uuid)
|
||||
raise Return(False, RETRY_IMMEDIATELY_SLEEP_DURATION)
|
||||
|
||||
logger.debug('Total jobs (scheduling job %s): %s', build_uuid, workers_alive)
|
||||
|
||||
if workers_alive >= allowed_worker_count:
|
||||
logger.info('Too many workers alive, unable to start new worker for build job: %s. %s >= %s',
|
||||
build_uuid, workers_alive, allowed_worker_count)
|
||||
raise Return(False, TOO_MANY_WORKERS_SLEEP_DURATION)
|
||||
|
||||
job_key = self._job_key(build_job)
|
||||
|
||||
# First try to take a lock for this job, meaning we will be responsible for its lifeline
|
||||
realm = str(uuid.uuid4())
|
||||
token = str(uuid.uuid4())
|
||||
nonce = str(uuid.uuid4())
|
||||
|
||||
machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
|
||||
max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)
|
||||
|
||||
payload = {
|
||||
'max_expiration': calendar.timegm(max_expiration.timetuple()),
|
||||
'nonce': nonce,
|
||||
'had_heartbeat': False,
|
||||
'job_queue_item': build_job.job_item,
|
||||
}
|
||||
|
||||
lock_payload = json.dumps(payload)
|
||||
logger.debug('Writing key for job %s with expiration in %s seconds', build_uuid,
|
||||
EPHEMERAL_SETUP_TIMEOUT)
|
||||
|
||||
try:
|
||||
yield From(self._orchestrator.set_key(job_key, lock_payload, overwrite=False,
|
||||
expiration=EPHEMERAL_SETUP_TIMEOUT))
|
||||
except KeyError:
|
||||
logger.warning('Job: %s already exists in orchestrator, timeout may be misconfigured',
|
||||
build_uuid)
|
||||
raise Return(False, EPHEMERAL_API_TIMEOUT)
|
||||
except OrchestratorConnectionError:
|
||||
logger.exception('Exception when writing job %s to orchestrator; could not connect',
|
||||
build_uuid)
|
||||
raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
|
||||
except OrchestratorError:
|
||||
logger.exception('Exception when writing job %s to orchestrator', build_uuid)
|
||||
raise Return(False, RETRY_IMMEDIATELY_SLEEP_DURATION)
|
||||
|
||||
# Got a lock, now lets boot the job via one of the registered executors.
|
||||
started_with_executor = None
|
||||
execution_id = None
|
||||
|
||||
logger.debug("Registered executors are: %s", [ex.name for ex in self._ordered_executors])
|
||||
for executor in self._ordered_executors:
|
||||
# Check if we can use this executor based on its whitelist, by namespace.
|
||||
namespace = build_job.namespace
|
||||
if not executor.allowed_for_namespace(namespace):
|
||||
logger.debug('Job %s (namespace: %s) cannot use executor %s', build_uuid, namespace,
|
||||
executor.name)
|
||||
continue
|
||||
|
||||
# Check if we can use this executor based on the retries remaining.
|
||||
if executor.minimum_retry_threshold > build_job.retries_remaining:
|
||||
metric_queue.builder_fallback.Inc()
|
||||
logger.debug('Job %s cannot use executor %s as it is below retry threshold %s (retry #%s)',
|
||||
build_uuid, executor.name, executor.minimum_retry_threshold,
|
||||
build_job.retries_remaining)
|
||||
continue
|
||||
|
||||
logger.debug('Starting builder for job %s with selected executor: %s', build_uuid,
|
||||
executor.name)
|
||||
|
||||
try:
|
||||
execution_id = yield From(executor.start_builder(realm, token, build_uuid))
|
||||
except:
|
||||
try:
|
||||
metric_queue.build_start_failure.Inc(labelvalues=[executor.name])
|
||||
metric_queue.put_deprecated(('ExecutorFailure-%s' % executor.name), 1, unit='Count')
|
||||
except:
|
||||
logger.exception('Exception when writing failure metric for execution %s for job %s',
|
||||
execution_id, build_uuid)
|
||||
|
||||
logger.exception('Exception when starting builder for job: %s', build_uuid)
|
||||
continue
|
||||
|
||||
try:
|
||||
metric_queue.build_start_success.Inc(labelvalues=[executor.name])
|
||||
except:
|
||||
logger.exception('Exception when writing success metric for execution %s for job %s',
|
||||
execution_id, build_uuid)
|
||||
|
||||
try:
|
||||
metric_queue.ephemeral_build_workers.Inc()
|
||||
except:
|
||||
logger.exception('Exception when writing start metrics for execution %s for job %s',
|
||||
execution_id, build_uuid)
|
||||
|
||||
started_with_executor = executor
|
||||
|
||||
# Break out of the loop now that we've started a builder successfully.
|
||||
break
|
||||
|
||||
# If we didn't start the job, cleanup and return it to the queue.
|
||||
if started_with_executor is None:
|
||||
logger.error('Could not start ephemeral worker for build %s', build_uuid)
|
||||
|
||||
# Delete the associated build job record.
|
||||
yield From(self._orchestrator.delete_key(job_key))
|
||||
raise Return(False, EPHEMERAL_API_TIMEOUT)
|
||||
|
||||
# Job was started!
|
||||
logger.debug('Started execution with ID %s for job: %s with executor: %s',
|
||||
execution_id, build_uuid, started_with_executor.name)
|
||||
|
||||
# Store metric data
|
||||
metric_spec = json.dumps({
|
||||
'executor_name': started_with_executor.name,
|
||||
'start_time': time.time(),
|
||||
})
|
||||
|
||||
try:
|
||||
yield From(self._orchestrator.set_key(self._metric_key(realm), metric_spec, overwrite=False,
|
||||
expiration=machine_max_expiration + 10))
|
||||
except KeyError:
|
||||
logger.error('Realm %s already exists in orchestrator for job %s; ' +
|
||||
'UUID collision or something is very very wrong.', realm, build_uuid)
|
||||
except OrchestratorError:
|
||||
logger.exception('Exception when writing realm %s to orchestrator for job %s',
|
||||
realm, build_uuid)
|
||||
|
||||
# Store the realm spec which will allow any manager to accept this builder when it connects
|
||||
realm_spec = json.dumps({
|
||||
'realm': realm,
|
||||
'token': token,
|
||||
'execution_id': execution_id,
|
||||
'executor_name': started_with_executor.name,
|
||||
'job_queue_item': build_job.job_item,
|
||||
})
|
||||
|
||||
try:
|
||||
setup_time = started_with_executor.setup_time or self.overall_setup_time()
|
||||
logger.debug('Writing job key for job %s using executor %s with ID %s and ttl %s', build_uuid,
|
||||
started_with_executor.name, execution_id, setup_time)
|
||||
yield From(self._orchestrator.set_key(self._realm_key(realm), realm_spec,
|
||||
expiration=setup_time))
|
||||
except OrchestratorConnectionError:
|
||||
logger.exception('Exception when writing realm %s to orchestrator for job %s',
|
||||
realm, build_uuid)
|
||||
raise Return(False, ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
|
||||
except OrchestratorError:
|
||||
logger.exception('Exception when writing realm %s to orchestrator for job %s',
|
||||
realm, build_uuid)
|
||||
raise Return(False, setup_time)
|
||||
|
||||
logger.debug('Builder spawn complete for job %s using executor %s with ID %s ',
|
||||
build_uuid, started_with_executor.name, execution_id)
|
||||
raise Return(True, None)
|
||||
|
||||
@coroutine
|
||||
def build_component_ready(self, build_component):
|
||||
logger.debug('Got component ready for component with realm %s', build_component.builder_realm)
|
||||
|
||||
# Pop off the job for the component.
|
||||
# We do so before we send out the watch below, as it will also remove this mapping.
|
||||
job = self._component_to_job.pop(build_component, None)
|
||||
if job is None:
|
||||
# This will occur once the build finishes, so no need to worry about it.
|
||||
# We log in case it happens outside of the expected flow.
|
||||
logger.debug('Could not find job for the build component on realm %s; component is ready',
|
||||
build_component.builder_realm)
|
||||
raise Return()
|
||||
|
||||
# Start the build job.
|
||||
logger.debug('Sending build %s to newly ready component on realm %s',
|
||||
job.build_uuid, build_component.builder_realm)
|
||||
yield From(build_component.start_build(job))
|
||||
|
||||
yield From(self._write_duration_metric(metric_queue.builder_time_to_build,
|
||||
build_component.builder_realm))
|
||||
|
||||
# Clean up the bookkeeping for allowing any manager to take the job.
|
||||
try:
|
||||
yield From(self._orchestrator.delete_key(self._realm_key(build_component.builder_realm)))
|
||||
except KeyError:
|
||||
logger.warning('Could not delete realm key %s', build_component.builder_realm)
|
||||
|
||||
def build_component_disposed(self, build_component, timed_out):
|
||||
logger.debug('Calling build_component_disposed.')
|
||||
self.unregister_component(build_component)
|
||||
|
||||
@coroutine
|
||||
def job_completed(self, build_job, job_status, build_component):
|
||||
logger.debug('Calling job_completed for job %s with status: %s',
|
||||
build_job.build_uuid, job_status)
|
||||
|
||||
yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))
|
||||
|
||||
# Mark the job as completed. Since this is being invoked from the component, we don't need
|
||||
# to ask for the phase to be updated as well.
|
||||
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
|
||||
executor_name = build_info.executor_name if build_info else None
|
||||
yield From(self.job_complete_callback(build_job, job_status, executor_name, update_phase=False))
|
||||
|
||||
# Kill the ephemeral builder.
|
||||
yield From(self.kill_builder_executor(build_job.build_uuid))
|
||||
|
||||
# Delete the build job from the orchestrator.
|
||||
try:
|
||||
job_key = self._job_key(build_job)
|
||||
yield From(self._orchestrator.delete_key(job_key))
|
||||
except KeyError:
|
||||
logger.debug('Builder is asking for job to be removed, but work already completed')
|
||||
except OrchestratorConnectionError:
|
||||
logger.exception('Could not remove job key as orchestrator is not available')
|
||||
yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
|
||||
raise Return()
|
||||
|
||||
# Delete the metric from the orchestrator.
|
||||
try:
|
||||
metric_key = self._metric_key(build_component.builder_realm)
|
||||
yield From(self._orchestrator.delete_key(metric_key))
|
||||
except KeyError:
|
||||
logger.debug('Builder is asking for metric to be removed, but key not found')
|
||||
except OrchestratorConnectionError:
|
||||
logger.exception('Could not remove metric key as orchestrator is not available')
|
||||
yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
|
||||
raise Return()
|
||||
|
||||
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
|
||||
|
||||
@coroutine
|
||||
def kill_builder_executor(self, build_uuid):
|
||||
logger.info('Starting termination of executor for job %s', build_uuid)
|
||||
build_info = self._build_uuid_to_info.pop(build_uuid, None)
|
||||
if build_info is None:
|
||||
logger.debug('Build information not found for build %s; skipping termination', build_uuid)
|
||||
raise Return()
|
||||
|
||||
# Remove the build's component.
|
||||
self._component_to_job.pop(build_info.component, None)
|
||||
|
||||
# Stop the build node/executor itself.
|
||||
yield From(self.terminate_executor(build_info.executor_name, build_info.execution_id))
|
||||
|
||||
@coroutine
|
||||
def terminate_executor(self, executor_name, execution_id):
|
||||
executor = self._executor_name_to_executor.get(executor_name)
|
||||
if executor is None:
|
||||
logger.error('Could not find registered executor %s', executor_name)
|
||||
raise Return()
|
||||
|
||||
# Terminate the executor's execution.
|
||||
logger.info('Terminating executor %s with execution id %s', executor_name, execution_id)
|
||||
yield From(executor.stop_builder(execution_id))
|
||||
|
||||
@coroutine
|
||||
def job_heartbeat(self, build_job):
|
||||
"""
|
||||
:param build_job: the build job being heartbeated
|
||||
:type build_job: BuildJob
|
||||
"""
|
||||
self.job_heartbeat_callback(build_job)
|
||||
yield From(self._extend_job_in_orchestrator(build_job))
|
||||
|
||||
@coroutine
|
||||
def _extend_job_in_orchestrator(self, build_job):
|
||||
try:
|
||||
job_data = yield From(self._orchestrator.get_key(self._job_key(build_job)))
|
||||
except KeyError:
|
||||
logger.info('Job %s no longer exists in the orchestrator', build_job.build_uuid)
|
||||
raise Return()
|
||||
except OrchestratorConnectionError:
|
||||
logger.exception('Could not connect to the orchestrator when attempting to extend job')
raise Return()
|
||||
|
||||
build_job_metadata = json.loads(job_data)
|
||||
|
||||
max_expiration = datetime.utcfromtimestamp(build_job_metadata['max_expiration'])
|
||||
max_expiration_remaining = max_expiration - datetime.utcnow()
|
||||
max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds()))
|
||||
|
||||
ttl = min(self.heartbeat_period_sec * 2, max_expiration_sec)
|
||||
payload = {
|
||||
'job_queue_item': build_job.job_item,
|
||||
'max_expiration': build_job_metadata['max_expiration'],
|
||||
'had_heartbeat': True,
|
||||
}
|
||||
|
||||
try:
|
||||
yield From(self._orchestrator.set_key(self._job_key(build_job), json.dumps(payload),
|
||||
expiration=ttl))
|
||||
except OrchestratorConnectionError:
|
||||
logger.exception('Could not update heartbeat for job as the orchestrator is not available')
|
||||
yield From(sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION))
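# Worked example of the TTL computed above (numbers are hypothetical): with
# heartbeat_period_sec = 30 and 50 seconds left until the job's max_expiration, the
# job key is refreshed with ttl = min(30 * 2, 50) = 50. Once max_expiration has
# passed, max_expiration_sec clamps to 0, the key is no longer extended, and the
# resulting expiration is picked up by _job_callback above.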
|
||||
|
||||
@coroutine
|
||||
def _write_duration_metric(self, metric, realm):
|
||||
"""
|
||||
:returns: True if the metric was written, otherwise False
|
||||
:rtype: bool
|
||||
"""
|
||||
try:
|
||||
metric_data = yield From(self._orchestrator.get_key(self._metric_key(realm)))
|
||||
parsed_metric_data = json.loads(metric_data)
|
||||
start_time = parsed_metric_data['start_time']
|
||||
metric.Observe(time.time() - start_time,
|
||||
labelvalues=[parsed_metric_data.get('executor_name',
|
||||
'unknown')])
|
||||
except Exception:
|
||||
logger.exception("Could not write metric for realm %s", realm)
|
||||
|
||||
def num_workers(self):
|
||||
"""
|
||||
The number of workers we're managing locally.
|
||||
|
||||
:returns: the number of the workers locally managed
|
||||
:rtype: int
|
||||
"""
|
||||
return len(self._component_to_job)
|
||||
|
||||
|
||||
@coroutine
|
||||
def _cancel_callback(self, key_change):
|
||||
if key_change.event not in (KeyEvent.CREATE, KeyEvent.SET):
|
||||
raise Return()
|
||||
|
||||
build_uuid = key_change.value
|
||||
build_info = self._build_uuid_to_info.get(build_uuid, None)
|
||||
if build_info is None:
|
||||
logger.debug('No build info for "%s" job %s', key_change.event, build_uuid)
|
||||
raise Return(False)
|
||||
|
||||
lock_key = slash_join(self._canceled_lock_prefix,
|
||||
build_uuid, build_info.execution_id)
|
||||
lock_acquired = yield From(self._orchestrator.lock(lock_key))
|
||||
if lock_acquired:
|
||||
builder_realm = build_info.component.builder_realm
|
||||
yield From(self.kill_builder_executor(build_uuid))
|
||||
yield From(self._orchestrator.delete_key(self._realm_key(builder_realm)))
|
||||
yield From(self._orchestrator.delete_key(self._metric_key(builder_realm)))
|
||||
yield From(self._orchestrator.delete_key(slash_join(self._job_prefix, build_uuid)))
|
||||
|
||||
# This is outside the lock so we can un-register the component wherever it is registered to.
|
||||
yield From(build_info.component.cancel_build())
|
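The _cancel_callback above is driven by a key written under the cancel/ prefix; OrchestratorCanceller further below is the production writer. A minimal sketch of the producing side, assuming no ORCHESTRATOR_PREFIX and a Redis-backed orchestrator that accepts a host option (the config values and build UUID are placeholders):

from buildman.orchestrator import orchestrator_from_config
from util import slash_join

manager_config = {'ORCHESTRATOR': {'REDIS_HOST': 'localhost'}}   # placeholder config
orchestrator = orchestrator_from_config(manager_config, canceller_only=True)

# Writing cancel/<build_uuid> fires the CREATE/SET event that _cancel_callback handles.
orchestrator.set_key_sync(slash_join('cancel/', 'some-build-uuid'), 'some-build-uuid', expiration=60)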
37
buildman/manager/etcd_canceller.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
import logging
|
||||
import etcd
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EtcdCanceller(object):
|
||||
""" A class that sends a message to etcd to cancel a build """
|
||||
|
||||
def __init__(self, config):
|
||||
etcd_host = config.get('ETCD_HOST', '127.0.0.1')
|
||||
etcd_port = config.get('ETCD_PORT', 2379)
|
||||
etcd_ca_cert = config.get('ETCD_CA_CERT', None)
|
||||
etcd_auth = config.get('ETCD_CERT_AND_KEY', None)
|
||||
if etcd_auth is not None:
|
||||
etcd_auth = tuple(etcd_auth)
|
||||
|
||||
etcd_protocol = 'http' if etcd_auth is None else 'https'
|
||||
logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
|
||||
self._cancel_prefix = config.get('ETCD_CANCEL_PREFIX', 'cancel/')
|
||||
self._etcd_client = etcd.Client(
|
||||
host=etcd_host,
|
||||
port=etcd_port,
|
||||
cert=etcd_auth,
|
||||
ca_cert=etcd_ca_cert,
|
||||
protocol=etcd_protocol,
|
||||
read_timeout=5)
|
||||
|
||||
def try_cancel_build(self, build_uuid):
|
||||
""" Writes etcd message to cancel build_uuid. """
|
||||
logger.info("Cancelling build %s".format(build_uuid))
|
||||
try:
|
||||
self._etcd_client.write("{}{}".format(self._cancel_prefix, build_uuid), build_uuid, ttl=60)
|
||||
return True
|
||||
except etcd.EtcdException:
|
||||
logger.exception("Failed to write to etcd client %s", build_uuid)
|
||||
return False
|
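A brief usage sketch for the class above; the etcd endpoint and build UUID are placeholders:

from buildman.manager.etcd_canceller import EtcdCanceller

canceller = EtcdCanceller({'ETCD_HOST': 'etcd.example.com', 'ETCD_PORT': 2379})
if canceller.try_cancel_build('some-build-uuid'):
    print('cancel message written to etcd')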
560
buildman/manager/executor.py
Normal file
|
@ -0,0 +1,560 @@
|
|||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import threading
|
||||
import uuid
|
||||
|
||||
from functools import partial
|
||||
|
||||
import boto.ec2
|
||||
import cachetools.func
|
||||
import requests
|
||||
import trollius
|
||||
|
||||
from container_cloud_config import CloudConfigContext
|
||||
from jinja2 import FileSystemLoader, Environment
|
||||
from trollius import coroutine, From, Return, get_event_loop
|
||||
|
||||
import release
|
||||
|
||||
from buildman.asyncutil import AsyncWrapper
|
||||
from app import metric_queue, app
|
||||
from util.metrics.metricqueue import duration_collector_async
|
||||
from _init import ROOT_DIR
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
ONE_HOUR = 60*60
|
||||
|
||||
_TAG_RETRY_COUNT = 3 # Number of times to retry adding tags.
|
||||
_TAG_RETRY_SLEEP = 2 # Number of seconds to wait between tag retries.
|
||||
|
||||
ENV = Environment(loader=FileSystemLoader(os.path.join(ROOT_DIR, "buildman/templates")))
|
||||
TEMPLATE = ENV.get_template('cloudconfig.yaml')
|
||||
CloudConfigContext().populate_jinja_environment(ENV)
|
||||
|
||||
class ExecutorException(Exception):
|
||||
""" Exception raised when there is a problem starting or stopping a builder.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class BuilderExecutor(object):
|
||||
def __init__(self, executor_config, manager_hostname):
|
||||
""" Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
|
||||
starting and stopping builders.
|
||||
"""
|
||||
self.executor_config = executor_config
|
||||
self.manager_hostname = manager_hostname
|
||||
|
||||
default_websocket_scheme = 'wss' if app.config['PREFERRED_URL_SCHEME'] == 'https' else 'ws'
|
||||
self.websocket_scheme = executor_config.get("WEBSOCKET_SCHEME", default_websocket_scheme)
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
""" Name returns the unique name for this executor. """
|
||||
return self.executor_config.get('NAME') or self.__class__.__name__
|
||||
|
||||
@property
|
||||
def setup_time(self):
|
||||
""" Returns the amount of time (in seconds) to wait for the execution to start for the build.
|
||||
If None, the manager's default will be used.
|
||||
"""
|
||||
return self.executor_config.get('SETUP_TIME')
|
||||
|
||||
@coroutine
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
""" Create a builder with the specified config. Returns a unique id which can be used to manage
|
||||
the builder.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, builder_id):
|
||||
""" Stop a builder which is currently running.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def allowed_for_namespace(self, namespace):
|
||||
""" Returns true if this executor can be used for builds in the given namespace. """
|
||||
|
||||
# Check for an explicit namespace whitelist.
|
||||
namespace_whitelist = self.executor_config.get('NAMESPACE_WHITELIST')
|
||||
if namespace_whitelist is not None and namespace in namespace_whitelist:
|
||||
return True
|
||||
|
||||
# Check for a staged rollout percentage. If found, we hash the namespace and, if it is found
|
||||
# in the first X% of the character space, we allow this executor to be used.
|
||||
staged_rollout = self.executor_config.get('STAGED_ROLLOUT')
|
||||
if staged_rollout is not None:
|
||||
bucket = int(hashlib.sha256(namespace).hexdigest()[-2:], 16)
|
||||
return bucket < (256 * staged_rollout)
|
||||
|
||||
# If there are no restrictions in place, we are free to use this executor.
|
||||
return staged_rollout is None and namespace_whitelist is None
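# Worked example of the staged rollout check above (the namespace is hypothetical):
# with STAGED_ROLLOUT set to 0.25, a namespace is allowed when the last byte of its
# SHA-256 digest falls in the first quarter of the 0-255 range, e.g.:
#
#   bucket = int(hashlib.sha256('mynamespace').hexdigest()[-2:], 16)
#   allowed = bucket < 256 * 0.25   # i.e. bucket < 64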
|
||||
|
||||
@property
|
||||
def minimum_retry_threshold(self):
|
||||
""" Returns the minimum number of retries required for this executor to be used or 0 if
|
||||
none. """
|
||||
return self.executor_config.get('MINIMUM_RETRY_THRESHOLD', 0)
|
||||
|
||||
def generate_cloud_config(self, realm, token, build_uuid, coreos_channel,
|
||||
manager_hostname, quay_username=None,
|
||||
quay_password=None):
|
||||
if quay_username is None:
|
||||
quay_username = self.executor_config['QUAY_USERNAME']
|
||||
|
||||
if quay_password is None:
|
||||
quay_password = self.executor_config['QUAY_PASSWORD']
|
||||
|
||||
return TEMPLATE.render(
|
||||
realm=realm,
|
||||
token=token,
|
||||
build_uuid=build_uuid,
|
||||
quay_username=quay_username,
|
||||
quay_password=quay_password,
|
||||
manager_hostname=manager_hostname,
|
||||
websocket_scheme=self.websocket_scheme,
|
||||
coreos_channel=coreos_channel,
|
||||
worker_image=self.executor_config.get('WORKER_IMAGE', 'quay.io/coreos/registry-build-worker'),
|
||||
worker_tag=self.executor_config['WORKER_TAG'],
|
||||
logentries_token=self.executor_config.get('LOGENTRIES_TOKEN', None),
|
||||
volume_size=self.executor_config.get('VOLUME_SIZE', '42G'),
|
||||
max_lifetime_s=self.executor_config.get('MAX_LIFETIME_S', 10800),
|
||||
ssh_authorized_keys=self.executor_config.get('SSH_AUTHORIZED_KEYS', []),
|
||||
)
|
||||
|
||||
|
||||
class EC2Executor(BuilderExecutor):
|
||||
""" Implementation of BuilderExecutor which uses libcloud to start machines on a variety of cloud
|
||||
on Amazon EC2.
|
||||
"""
|
||||
COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt'
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._loop = get_event_loop()
|
||||
super(EC2Executor, self).__init__(*args, **kwargs)
|
||||
|
||||
def _get_conn(self):
|
||||
""" Creates an ec2 connection which can be used to manage instances.
|
||||
"""
|
||||
return AsyncWrapper(boto.ec2.connect_to_region(
|
||||
self.executor_config['EC2_REGION'],
|
||||
aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'],
|
||||
aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'],
|
||||
))
|
||||
|
||||
@classmethod
|
||||
@cachetools.func.ttl_cache(ttl=ONE_HOUR)
|
||||
def _get_coreos_ami(cls, ec2_region, coreos_channel):
|
||||
""" Retrieve the CoreOS AMI id from the canonical listing.
|
||||
"""
|
||||
stack_list_string = requests.get(EC2Executor.COREOS_STACK_URL % coreos_channel).text
|
||||
stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
|
||||
return stack_amis[ec2_region]
|
||||
|
||||
@coroutine
|
||||
@duration_collector_async(metric_queue.builder_time_to_start, ['ec2'])
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
region = self.executor_config['EC2_REGION']
|
||||
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
|
||||
|
||||
coreos_ami = self.executor_config.get('COREOS_AMI', None)
|
||||
if coreos_ami is None:
|
||||
get_ami_callable = partial(self._get_coreos_ami, region, channel)
|
||||
coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
|
||||
|
||||
user_data = self.generate_cloud_config(realm, token, build_uuid, channel, self.manager_hostname)
|
||||
logger.debug('Generated cloud config for build %s: %s', build_uuid, user_data)
|
||||
|
||||
ec2_conn = self._get_conn()
|
||||
|
||||
ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType(
|
||||
size=int(self.executor_config.get('BLOCK_DEVICE_SIZE', 48)),
|
||||
volume_type='gp2',
|
||||
delete_on_termination=True,
|
||||
)
|
||||
block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping()
|
||||
block_devices['/dev/xvda'] = ssd_root_ebs
|
||||
|
||||
interfaces = None
|
||||
if self.executor_config.get('EC2_VPC_SUBNET_ID', None) is not None:
|
||||
interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
|
||||
subnet_id=self.executor_config['EC2_VPC_SUBNET_ID'],
|
||||
groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
|
||||
associate_public_ip_address=True,
|
||||
)
|
||||
interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
|
||||
|
||||
try:
|
||||
reservation = yield From(ec2_conn.run_instances(
|
||||
coreos_ami,
|
||||
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
|
||||
key_name=self.executor_config.get('EC2_KEY_NAME', None),
|
||||
user_data=user_data,
|
||||
instance_initiated_shutdown_behavior='terminate',
|
||||
block_device_map=block_devices,
|
||||
network_interfaces=interfaces,
|
||||
))
|
||||
except boto.exception.EC2ResponseError as ec2e:
|
||||
logger.exception('Unable to spawn builder instance')
|
||||
metric_queue.ephemeral_build_worker_failure.Inc()
|
||||
raise ec2e
|
||||
|
||||
if not reservation.instances:
|
||||
raise ExecutorException('Unable to spawn builder instance.')
|
||||
elif len(reservation.instances) != 1:
|
||||
raise ExecutorException('EC2 started wrong number of instances!')
|
||||
|
||||
launched = AsyncWrapper(reservation.instances[0])
|
||||
|
||||
# Sleep a few seconds to wait for AWS to spawn the instance.
|
||||
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
|
||||
|
||||
# Tag the instance with its metadata.
|
||||
for i in range(0, _TAG_RETRY_COUNT):
|
||||
try:
|
||||
yield From(launched.add_tags({
|
||||
'Name': 'Quay Ephemeral Builder',
|
||||
'Realm': realm,
|
||||
'Token': token,
|
||||
'BuildUUID': build_uuid,
|
||||
}))
|
||||
except boto.exception.EC2ResponseError as ec2e:
|
||||
if ec2e.error_code == 'InvalidInstanceID.NotFound':
|
||||
if i < _TAG_RETRY_COUNT - 1:
|
||||
logger.warning('Failed to write EC2 tags for instance %s for build %s (attempt #%s)',
|
||||
launched.id, build_uuid, i)
|
||||
yield From(trollius.sleep(_TAG_RETRY_SLEEP))
|
||||
continue
|
||||
|
||||
raise ExecutorException('Unable to find builder instance.')
|
||||
|
||||
logger.exception('Failed to write EC2 tags (attempt #%s)', i)
|
||||
|
||||
logger.debug('Machine with ID %s started for build %s', launched.id, build_uuid)
|
||||
raise Return(launched.id)
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, builder_id):
|
||||
try:
|
||||
ec2_conn = self._get_conn()
|
||||
terminated_instances = yield From(ec2_conn.terminate_instances([builder_id]))
|
||||
except boto.exception.EC2ResponseError as ec2e:
|
||||
if ec2e.error_code == 'InvalidInstanceID.NotFound':
|
||||
logger.debug('Instance %s already terminated', builder_id)
|
||||
return
|
||||
|
||||
logger.exception('Exception when trying to terminate instance %s', builder_id)
|
||||
raise
|
||||
|
||||
if builder_id not in [si.id for si in terminated_instances]:
|
||||
raise ExecutorException('Unable to terminate instance: %s' % builder_id)
|
||||
|
||||
|
||||
class PopenExecutor(BuilderExecutor):
|
||||
""" Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
|
||||
"""
|
||||
def __init__(self, executor_config, manager_hostname):
|
||||
self._jobs = {}
|
||||
|
||||
super(PopenExecutor, self).__init__(executor_config, manager_hostname)
|
||||
|
||||
""" Executor which uses Popen to fork a quay-builder process.
|
||||
"""
|
||||
@coroutine
|
||||
@duration_collector_async(metric_queue.builder_time_to_start, ['fork'])
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
# Now start a machine for this job, adding the machine id to the etcd information
|
||||
logger.debug('Forking process for build')
|
||||
|
||||
ws_host = os.environ.get("BUILDMAN_WS_HOST", "localhost")
|
||||
ws_port = os.environ.get("BUILDMAN_WS_PORT", "8787")
|
||||
builder_env = {
|
||||
'TOKEN': token,
|
||||
'REALM': realm,
|
||||
'ENDPOINT': 'ws://%s:%s' % (ws_host, ws_port),
|
||||
'DOCKER_TLS_VERIFY': os.environ.get('DOCKER_TLS_VERIFY', ''),
|
||||
'DOCKER_CERT_PATH': os.environ.get('DOCKER_CERT_PATH', ''),
|
||||
'DOCKER_HOST': os.environ.get('DOCKER_HOST', ''),
|
||||
'PATH': "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||
}
|
||||
|
||||
logpipe = LogPipe(logging.INFO)
|
||||
spawned = subprocess.Popen(os.environ.get('BUILDER_BINARY_LOCATION',
|
||||
'/usr/local/bin/quay-builder'),
|
||||
stdout=logpipe,
|
||||
stderr=logpipe,
|
||||
env=builder_env)
|
||||
|
||||
builder_id = str(uuid.uuid4())
|
||||
self._jobs[builder_id] = (spawned, logpipe)
|
||||
logger.debug('Builder spawned with id: %s', builder_id)
|
||||
raise Return(builder_id)
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, builder_id):
|
||||
if builder_id not in self._jobs:
|
||||
raise ExecutorException('Builder id not being tracked by executor.')
|
||||
|
||||
logger.debug('Killing builder with id: %s', builder_id)
|
||||
spawned, logpipe = self._jobs[builder_id]
|
||||
|
||||
if spawned.poll() is None:
|
||||
spawned.kill()
|
||||
logpipe.close()
|
||||
|
||||
|
||||
class KubernetesExecutor(BuilderExecutor):
|
||||
""" Executes build jobs by creating Kubernetes jobs which run a qemu-kvm virtual
|
||||
machine in a pod """
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(KubernetesExecutor, self).__init__(*args, **kwargs)
|
||||
self._loop = get_event_loop()
|
||||
self.namespace = self.executor_config.get('BUILDER_NAMESPACE', 'builder')
|
||||
self.image = self.executor_config.get('BUILDER_VM_CONTAINER_IMAGE',
|
||||
'quay.io/quay/quay-builder-qemu-coreos:stable')
|
||||
|
||||
@coroutine
|
||||
def _request(self, method, path, **kwargs):
|
||||
request_options = dict(kwargs)
|
||||
|
||||
tls_cert = self.executor_config.get('K8S_API_TLS_CERT')
|
||||
tls_key = self.executor_config.get('K8S_API_TLS_KEY')
|
||||
tls_ca = self.executor_config.get('K8S_API_TLS_CA')
|
||||
service_account_token = self.executor_config.get('SERVICE_ACCOUNT_TOKEN')
|
||||
|
||||
if 'timeout' not in request_options:
|
||||
request_options['timeout'] = self.executor_config.get("K8S_API_TIMEOUT", 20)
|
||||
|
||||
if service_account_token:
|
||||
scheme = 'https'
|
||||
request_options['headers'] = {'Authorization': 'Bearer ' + service_account_token}
|
||||
logger.debug('Using service account token for Kubernetes authentication')
|
||||
elif tls_cert and tls_key:
|
||||
scheme = 'https'
|
||||
request_options['cert'] = (tls_cert, tls_key)
|
||||
logger.debug('Using tls certificate and key for Kubernetes authentication')
|
||||
if tls_ca:
|
||||
request_options['verify'] = tls_ca
|
||||
else:
|
||||
scheme = 'http'
|
||||
|
||||
server = self.executor_config.get('K8S_API_SERVER', 'localhost:8080')
|
||||
url = '%s://%s%s' % (scheme, server, path)
|
||||
|
||||
logger.debug('Executor config: %s', self.executor_config)
|
||||
logger.debug('Kubernetes request: %s %s: %s', method, url, request_options)
|
||||
res = requests.request(method, url, **request_options)
|
||||
logger.debug('Kubernetes response: %s: %s', res.status_code, res.text)
|
||||
raise Return(res)
|
||||
|
||||
def _jobs_path(self):
|
||||
return '/apis/batch/v1/namespaces/%s/jobs' % self.namespace
|
||||
|
||||
def _job_path(self, build_uuid):
|
||||
return '%s/%s' % (self._jobs_path(), build_uuid)
|
||||
|
||||
def _kubernetes_distribution(self):
|
||||
return self.executor_config.get('KUBERNETES_DISTRIBUTION', 'basic').lower()
|
||||
|
||||
def _is_basic_kubernetes_distribution(self):
|
||||
return self._kubernetes_distribution() == 'basic'
|
||||
|
||||
def _is_openshift_kubernetes_distribution(self):
|
||||
return self._kubernetes_distribution() == 'openshift'
|
||||
|
||||
def _build_job_container_resources(self):
|
||||
# Minimum acceptable free resources for this container to "fit" in a quota
|
||||
# These may be lower than the absolute limits if the cluster is knowingly
|
||||
# oversubscribed by some amount.
|
||||
container_requests = {
|
||||
'memory' : self.executor_config.get('CONTAINER_MEMORY_REQUEST', '3968Mi'),
|
||||
}
|
||||
|
||||
container_limits = {
|
||||
'memory' : self.executor_config.get('CONTAINER_MEMORY_LIMITS', '5120Mi'),
|
||||
'cpu' : self.executor_config.get('CONTAINER_CPU_LIMITS', '1000m'),
|
||||
}
|
||||
|
||||
resources = {
|
||||
'requests': container_requests,
|
||||
}
|
||||
|
||||
if self._is_openshift_kubernetes_distribution():
|
||||
resources['requests']['cpu'] = self.executor_config.get('CONTAINER_CPU_REQUEST', '500m')
|
||||
resources['limits'] = container_limits
|
||||
|
||||
return resources
|
||||
|
||||
def _build_job_containers(self, user_data):
|
||||
vm_memory_limit = self.executor_config.get('VM_MEMORY_LIMIT', '4G')
|
||||
vm_volume_size = self.executor_config.get('VOLUME_SIZE', '32G')
|
||||
|
||||
container = {
|
||||
'name': 'builder',
|
||||
'imagePullPolicy': 'IfNotPresent',
|
||||
'image': self.image,
|
||||
'securityContext': {'privileged': True},
|
||||
'env': [
|
||||
{'name': 'USERDATA', 'value': user_data},
|
||||
{'name': 'VM_MEMORY', 'value': vm_memory_limit},
|
||||
{'name': 'VM_VOLUME_SIZE', 'value': vm_volume_size},
|
||||
],
|
||||
'resources': self._build_job_container_resources(),
|
||||
}
|
||||
|
||||
if self._is_basic_kubernetes_distribution():
|
||||
container['volumeMounts'] = [{'name': 'secrets-mask','mountPath': '/var/run/secrets/kubernetes.io/serviceaccount'}]
|
||||
|
||||
return container
|
||||
|
||||
def _job_resource(self, build_uuid, user_data, coreos_channel='stable'):
|
||||
image_pull_secret_name = self.executor_config.get('IMAGE_PULL_SECRET_NAME', 'builder')
|
||||
service_account = self.executor_config.get('SERVICE_ACCOUNT_NAME', 'quay-builder-sa')
|
||||
node_selector_label_key = self.executor_config.get('NODE_SELECTOR_LABEL_KEY', 'beta.kubernetes.io/instance-type')
|
||||
node_selector_label_value = self.executor_config.get('NODE_SELECTOR_LABEL_VALUE', '')
|
||||
|
||||
node_selector = {
|
||||
node_selector_label_key : node_selector_label_value
|
||||
}
|
||||
|
||||
release_sha = release.GIT_HEAD or 'none'
|
||||
if ' ' in release_sha:
|
||||
release_sha = 'HEAD'
|
||||
|
||||
job_resource = {
|
||||
'apiVersion': 'batch/v1',
|
||||
'kind': 'Job',
|
||||
'metadata': {
|
||||
'namespace': self.namespace,
|
||||
'generateName': build_uuid + '-',
|
||||
'labels': {
|
||||
'build': build_uuid,
|
||||
'time': datetime.datetime.now().strftime('%Y-%m-%d-%H'),
|
||||
'manager': socket.gethostname(),
|
||||
'quay-sha': release_sha,
|
||||
},
|
||||
},
|
||||
'spec' : {
|
||||
'activeDeadlineSeconds': self.executor_config.get('MAXIMUM_JOB_TIME', 7200),
|
||||
'template': {
|
||||
'metadata': {
|
||||
'labels': {
|
||||
'build': build_uuid,
|
||||
'time': datetime.datetime.now().strftime('%Y-%m-%d-%H'),
|
||||
'manager': socket.gethostname(),
|
||||
'quay-sha': release_sha,
|
||||
},
|
||||
},
|
||||
'spec': {
|
||||
'imagePullSecrets': [{ 'name': image_pull_secret_name }],
|
||||
'restartPolicy': 'Never',
|
||||
'dnsPolicy': 'Default',
|
||||
'containers': [self._build_job_containers(user_data)],
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
if self._is_openshift_kubernetes_distribution():
|
||||
# Setting `automountServiceAccountToken` to false will prevent automounting API credentials for a service account.
|
||||
job_resource['spec']['template']['spec']['automountServiceAccountToken'] = False
|
||||
|
||||
# Use dedicated service account that has no authorization to any resources.
|
||||
job_resource['spec']['template']['spec']['serviceAccount'] = service_account
|
||||
|
||||
# Setting `enableServiceLinks` to false prevents information about other services from being injected into pod's
|
||||
# environment variables. Pod has no visibility into other services on the cluster.
|
||||
job_resource['spec']['template']['spec']['enableServiceLinks'] = False
|
||||
|
||||
if node_selector_label_value.strip() != '':
|
||||
job_resource['spec']['template']['spec']['nodeSelector'] = node_selector
|
||||
|
||||
if self._is_basic_kubernetes_distribution():
|
||||
# This volume is a hack to mask the token for the namespace's
|
||||
# default service account, which is placed in a file mounted under
|
||||
# `/var/run/secrets/kubernetes.io/serviceaccount` in all pods.
|
||||
# There's currently no other way to just disable the service
|
||||
# account at either the pod or namespace level.
|
||||
#
|
||||
# https://github.com/kubernetes/kubernetes/issues/16779
|
||||
#
|
||||
job_resource['spec']['template']['spec']['volumes'] = [{'name': 'secrets-mask','emptyDir': {'medium': 'Memory'}}]
|
||||
|
||||
return job_resource
|
||||
|
||||
@coroutine
|
||||
@duration_collector_async(metric_queue.builder_time_to_start, ['k8s'])
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
# generate resource
|
||||
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
|
||||
user_data = self.generate_cloud_config(realm, token, build_uuid, channel, self.manager_hostname)
|
||||
resource = self._job_resource(build_uuid, user_data, channel)
|
||||
logger.debug('Using Kubernetes Distribution: %s', self._kubernetes_distribution())
|
||||
logger.debug('Generated kubernetes resource:\n%s', resource)
|
||||
|
||||
# schedule
|
||||
create_job = yield From(self._request('POST', self._jobs_path(), json=resource))
|
||||
if int(create_job.status_code / 100) != 2:
|
||||
raise ExecutorException('Failed to create job: %s: %s: %s' %
|
||||
(build_uuid, create_job.status_code, create_job.text))
|
||||
|
||||
job = create_job.json()
|
||||
raise Return(job['metadata']['name'])
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, builder_id):
|
||||
pods_path = '/api/v1/namespaces/%s/pods' % self.namespace
|
||||
|
||||
# Delete the job itself.
|
||||
try:
|
||||
yield From(self._request('DELETE', self._job_path(builder_id)))
|
||||
except:
|
||||
logger.exception('Failed to send delete job call for job %s', builder_id)
|
||||
|
||||
# Delete the pod(s) for the job.
|
||||
selectorString = "job-name=%s" % builder_id
|
||||
try:
|
||||
yield From(self._request('DELETE', pods_path, params=dict(labelSelector=selectorString)))
|
||||
except:
|
||||
logger.exception("Failed to send delete pod call for job %s", builder_id)
|
||||
|
||||
|
||||
class LogPipe(threading.Thread):
|
||||
""" Adapted from http://codereview.stackexchange.com/a/17959
|
||||
"""
|
||||
def __init__(self, level):
|
||||
"""Setup the object with a logger and a loglevel
|
||||
and start the thread
|
||||
"""
|
||||
threading.Thread.__init__(self)
|
||||
self.daemon = False
|
||||
self.level = level
|
||||
self.fd_read, self.fd_write = os.pipe()
|
||||
self.pipe_reader = os.fdopen(self.fd_read)
|
||||
self.start()
|
||||
|
||||
def fileno(self):
|
||||
"""Return the write file descriptor of the pipe
|
||||
"""
|
||||
return self.fd_write
|
||||
|
||||
def run(self):
|
||||
"""Run the thread, logging everything.
|
||||
"""
|
||||
for line in iter(self.pipe_reader.readline, ''):
|
||||
logging.log(self.level, line.strip('\n'))
|
||||
|
||||
self.pipe_reader.close()
|
||||
|
||||
def close(self):
|
||||
"""Close the write end of the pipe.
|
||||
"""
|
||||
os.close(self.fd_write)
|
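A minimal sketch of a custom executor plugging into the BuilderExecutor interface above. The class is hypothetical and not one of the shipped executors; to be selectable it would also need an entry in EphemeralBuilderManager.EXECUTORS.

from trollius import coroutine, Return

from buildman.manager.executor import BuilderExecutor


class NoopExecutor(BuilderExecutor):
    """ Hypothetical executor that only illustrates the start_builder/stop_builder contract. """

    @coroutine
    def start_builder(self, realm, token, build_uuid):
        # A real executor would boot a machine or pod here and return its identifier.
        raise Return('noop-' + build_uuid)

    @coroutine
    def stop_builder(self, builder_id):
        # A real executor would terminate the machine or pod identified by builder_id.
        pass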
8
buildman/manager/noop_canceller.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
class NoopCanceller(object):
|
||||
""" A class that can not cancel a build """
|
||||
def __init__(self, config=None):
|
||||
pass
|
||||
|
||||
def try_cancel_build(self, uuid):
|
||||
""" Does nothing and fails to cancel build. """
|
||||
return False
|
26
buildman/manager/orchestrator_canceller.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
import logging
|
||||
|
||||
from buildman.orchestrator import orchestrator_from_config, OrchestratorError
|
||||
from util import slash_join
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
CANCEL_PREFIX = 'cancel/'
|
||||
|
||||
|
||||
class OrchestratorCanceller(object):
|
||||
""" An asynchronous way to cancel a build with any Orchestrator. """
|
||||
def __init__(self, config):
|
||||
self._orchestrator = orchestrator_from_config(config, canceller_only=True)
|
||||
|
||||
def try_cancel_build(self, build_uuid):
|
||||
logger.info('Cancelling build %s', build_uuid)
|
||||
cancel_key = slash_join(CANCEL_PREFIX, build_uuid)
|
||||
try:
|
||||
self._orchestrator.set_key_sync(cancel_key, build_uuid, expiration=60)
|
||||
return True
|
||||
except OrchestratorError:
|
||||
logger.exception('Failed to write cancel action to the orchestrator for build %s', build_uuid)
|
||||
return False
|
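A brief usage sketch for the class above, assuming a Redis-backed orchestrator; the host and build UUID are placeholders:

from buildman.manager.orchestrator_canceller import OrchestratorCanceller

canceller = OrchestratorCanceller({'ORCHESTRATOR': {'REDIS_HOST': 'localhost'}})
canceller.try_cancel_build('some-build-uuid')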
753
buildman/orchestrator.py
Normal file
|
@ -0,0 +1,753 @@
|
|||
from abc import ABCMeta, abstractmethod
|
||||
from collections import namedtuple
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
from enum import IntEnum, unique
|
||||
from six import add_metaclass, iteritems
|
||||
from trollius import async, coroutine, From, Return
|
||||
from urllib3.exceptions import ReadTimeoutError, ProtocolError
|
||||
|
||||
import etcd
|
||||
import redis
|
||||
|
||||
from buildman.asyncutil import wrap_with_threadpool
|
||||
from util import slash_join
|
||||
from util.expiresdict import ExpiresDict
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
ONE_DAY = 60 * 60 * 24
|
||||
ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION = 5
|
||||
DEFAULT_LOCK_EXPIRATION = 10000
|
||||
|
||||
ETCD_READ_TIMEOUT = 5
|
||||
ETCD_MAX_WATCH_TIMEOUT = 30
|
||||
|
||||
REDIS_EXPIRING_SUFFIX = '/expiring'
|
||||
REDIS_DEFAULT_PUBSUB_KEY = 'orchestrator_events'
|
||||
REDIS_EVENT_KIND_MESSAGE = 'message'
|
||||
REDIS_EVENT_KIND_PMESSAGE = 'pmessage'
|
||||
REDIS_NONEXPIRING_KEY = -1
|
||||
|
||||
# This constant defines the Redis configuration flags used to watch [K]eyspace and e[x]pired
|
||||
# events on keys. For more info, see https://redis.io/topics/notifications#configuration
|
||||
REDIS_KEYSPACE_EVENT_CONFIG_VALUE = 'Kx'
|
||||
REDIS_KEYSPACE_EVENT_CONFIG_KEY = 'notify-keyspace-events'
|
||||
REDIS_KEYSPACE_KEY_PATTERN = '__keyspace@%s__:%s'
|
||||
REDIS_EXPIRED_KEYSPACE_PATTERN = slash_join(REDIS_KEYSPACE_KEY_PATTERN, REDIS_EXPIRING_SUFFIX)
|
||||
REDIS_EXPIRED_KEYSPACE_REGEX = re.compile(REDIS_EXPIRED_KEYSPACE_PATTERN % (r'(\S+)', r'(\S+)'))
|
||||
|
||||
|
||||
def orchestrator_from_config(manager_config, canceller_only=False):
|
||||
"""
|
||||
Allocates a new Orchestrator from the 'ORCHESTRATOR' block from provided manager config.
|
||||
Checks for legacy configuration prefixed with 'ETCD_' when the 'ORCHESTRATOR' is not present.
|
||||
|
||||
:param manager_config: the configuration for the orchestrator
|
||||
:type manager_config: dict
|
||||
:rtype: :class: Orchestrator
|
||||
"""
|
||||
# Legacy codepath only knows how to configure etcd.
|
||||
if manager_config.get('ORCHESTRATOR') is None:
|
||||
manager_config['ORCHESTRATOR'] = {key: value
|
||||
for (key, value) in iteritems(manager_config)
|
||||
if key.startswith('ETCD_') and not key.endswith('_PREFIX')}
|
||||
|
||||
# Sanity check that legacy prefixes are no longer being used.
|
||||
for key in manager_config['ORCHESTRATOR'].keys():
|
||||
words = key.split('_')
|
||||
if len(words) > 1 and words[-1].lower() == 'prefix':
|
||||
raise AssertionError('legacy prefix used, use ORCHESTRATOR_PREFIX instead')
|
||||
|
||||
def _dict_key_prefix(d):
|
||||
"""
|
||||
:param d: the dict that has keys prefixed with underscore
|
||||
:type d: {str: any}
|
||||
:rtype: str
|
||||
"""
|
||||
return d.keys()[0].split('_', 1)[0].lower()
|
||||
|
||||
orchestrator_name = _dict_key_prefix(manager_config['ORCHESTRATOR'])
|
||||
|
||||
def format_key(key):
|
||||
return key.lower().split('_', 1)[1]
|
||||
|
||||
orchestrator_kwargs = {format_key(key): value
|
||||
for (key, value) in iteritems(manager_config['ORCHESTRATOR'])}
|
||||
|
||||
if manager_config.get('ORCHESTRATOR_PREFIX') is not None:
|
||||
orchestrator_kwargs['orchestrator_prefix'] = manager_config['ORCHESTRATOR_PREFIX']
|
||||
|
||||
orchestrator_kwargs['canceller_only'] = canceller_only
|
||||
|
||||
logger.debug('attempting to create orchestrator %s with kwargs %s',
|
||||
orchestrator_name, orchestrator_kwargs)
|
||||
return orchestrator_by_name(orchestrator_name, **orchestrator_kwargs)
|
||||
|
||||
|
||||
def orchestrator_by_name(name, **kwargs):
|
||||
_ORCHESTRATORS = {
|
||||
'etcd': Etcd2Orchestrator,
|
||||
'mem': MemoryOrchestrator,
|
||||
'redis': RedisOrchestrator,
|
||||
}
|
||||
return _ORCHESTRATORS.get(name, MemoryOrchestrator)(**kwargs)
|
||||
|
||||
|
||||
class OrchestratorError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
# TODO: replace with ConnectionError when this codebase is Python 3.
|
||||
class OrchestratorConnectionError(OrchestratorError):
|
||||
pass
|
||||
|
||||
|
||||
@unique
|
||||
class KeyEvent(IntEnum):
|
||||
CREATE = 1
|
||||
SET = 2
|
||||
DELETE = 3
|
||||
EXPIRE = 4
|
||||
|
||||
|
||||
class KeyChange(namedtuple('KeyChange', ['event', 'key', 'value'])):
|
||||
pass
|
||||
|
||||
|
||||
@add_metaclass(ABCMeta)
|
||||
class Orchestrator(object):
|
||||
"""
|
||||
Orchestrator is the interface that is used to synchronize the build states
|
||||
across build managers.
|
||||
|
||||
This interface assumes that storage is being done by a key-value store
|
||||
that supports watching for events on keys.
|
||||
|
||||
Missing keys should return KeyError; otherwise, errors should raise an
|
||||
OrchestratorError.
|
||||
|
||||
:param key_prefix: the prefix of keys being watched
|
||||
:type key_prefix: str
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def on_key_change(self, key, callback, restarter=None):
|
||||
"""
|
||||
|
||||
The callback is invoked with a KeyChange object describing the change.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_prefixed_keys(self, prefix):
|
||||
"""
|
||||
|
||||
:returns: a dict of the key-value pairs whose keys begin with the given prefix
|
||||
:rtype: {str: str}
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_key(self, key):
|
||||
"""
|
||||
|
||||
:returns: the value stored at the provided key
|
||||
:rtype: str
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def set_key(self, key, value, overwrite=False, expiration=None):
|
||||
"""
|
||||
|
||||
:param key: the identifier for the value
|
||||
:type key: str
|
||||
:param value: the value being stored
|
||||
:type value: str
|
||||
:param overwrite: if False, a KeyError is raised when the key already exists
|
||||
:type overwrite: bool
|
||||
:param expiration: the duration in seconds that a key should be available
|
||||
:type expiration: int
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def set_key_sync(self, key, value, overwrite=False, expiration=None):
|
||||
"""
|
||||
set_key, but without trollius coroutines.
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def delete_key(self, key):
|
||||
"""
|
||||
Deletes a key that has been set in the orchestrator.
|
||||
|
||||
:param key: the identifier for the key
|
||||
:type key: str
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def lock(self, key, expiration=DEFAULT_LOCK_EXPIRATION):
|
||||
"""
|
||||
Takes a lock for synchronizing exclusive operations cluster-wide.
|
||||
|
||||
:param key: the identifier for the lock
|
||||
:type key: str
|
||||
:param expiration: the duration until the lock expires
|
||||
:type expiration: :class:`datetime.timedelta` or int (seconds)
|
||||
:returns: whether or not the lock was acquired
|
||||
:rtype: bool
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def shutdown(self):
|
||||
"""
|
||||
This function should shutdown any final resources allocated by the Orchestrator.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def _sleep_orchestrator():
|
||||
"""
|
||||
This function blocks the trollius event loop by sleeping in order to backoff if a failure
|
||||
such as a ConnectionError has occurred.
|
||||
"""
|
||||
logger.exception('Connecting to the orchestrator failed; sleeping for %s and then trying again',
|
||||
ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
|
||||
time.sleep(ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
|
||||
logger.exception('Connecting to the orchestrator failed; slept for %s and now trying again',
|
||||
ORCHESTRATOR_UNAVAILABLE_SLEEP_DURATION)
|
||||
|
||||
|
||||
class EtcdAction(object):
|
||||
""" Enumeration of the various kinds of etcd actions we can observe via a watch. """
|
||||
GET = 'get'
|
||||
SET = 'set'
|
||||
EXPIRE = 'expire'
|
||||
UPDATE = 'update'
|
||||
DELETE = 'delete'
|
||||
CREATE = 'create'
|
||||
COMPARE_AND_SWAP = 'compareAndSwap'
|
||||
COMPARE_AND_DELETE = 'compareAndDelete'
|
||||
|
||||
|
||||
class Etcd2Orchestrator(Orchestrator):
|
||||
def __init__(self, host='127.0.0.1', port=2379, cert_and_key=None, ca_cert=None,
|
||||
client_threads=5, canceller_only=False, **kwargs):
|
||||
self.is_canceller_only = canceller_only
|
||||
|
||||
logger.debug('initializing async etcd client')
|
||||
self._sync_etcd_client = etcd.Client(
|
||||
host=host,
|
||||
port=port,
|
||||
cert=tuple(cert_and_key) if cert_and_key is not None else None,
|
||||
ca_cert=ca_cert,
|
||||
protocol='http' if cert_and_key is None else 'https',
|
||||
read_timeout=ETCD_READ_TIMEOUT,
|
||||
)
|
||||
|
||||
if not self.is_canceller_only:
|
||||
(self._etcd_client, self._async_executor) = wrap_with_threadpool(self._sync_etcd_client,
|
||||
client_threads)
|
||||
|
||||
logger.debug('creating initial orchestrator state')
|
||||
self._shutting_down = False
|
||||
self._watch_tasks = {}
|
||||
|
||||
@staticmethod
|
||||
def _sanity_check_ttl(ttl):
|
||||
"""
|
||||
A TTL of < 0 in etcd results in the key *never being expired*.
|
||||
We use a max here to ensure that if the TTL is < 0, the key will expire immediately.
|
||||
"""
|
||||
return max(ttl, 0)
|
||||
|
||||
def _watch_etcd(self, key, callback, restarter=None, start_index=None):
|
||||
def callback_wrapper(changed_key_future):
|
||||
new_index = start_index
|
||||
etcd_result = None
|
||||
|
||||
if not changed_key_future.cancelled():
|
||||
try:
|
||||
etcd_result = changed_key_future.result()
|
||||
existing_index = getattr(etcd_result, 'etcd_index', None)
|
||||
new_index = etcd_result.modifiedIndex + 1
|
||||
|
||||
logger.debug('Got watch of key: %s at #%s with result: %s',
|
||||
key, existing_index, etcd_result)
|
||||
|
||||
except ReadTimeoutError:
|
||||
logger.debug('Read-timeout on etcd watch %s, rescheduling', key)
|
||||
|
||||
except etcd.EtcdEventIndexCleared:
|
||||
# This happens if etcd2 has moved forward too fast for us to start watching at the index
|
||||
# we retrieved. We therefore start a new watch at HEAD and (if specified) call the
|
||||
# restarter method which should conduct a read and reset the state of the manager.
|
||||
logger.debug('Etcd moved forward too quickly. Restarting watch cycle.')
|
||||
new_index = None
|
||||
if restarter is not None:
|
||||
async(restarter())
|
||||
|
||||
except (KeyError, etcd.EtcdKeyError):
|
||||
logger.debug('Etcd key already cleared: %s', key)
|
||||
return
|
||||
|
||||
except etcd.EtcdConnectionFailed:
|
||||
_sleep_orchestrator()
|
||||
|
||||
except etcd.EtcdException as eex:
|
||||
# TODO: This is a quick and dirty hack and should be replaced with a proper
|
||||
# exception check.
|
||||
if str(eex.message).find('Read timed out') >= 0:
|
||||
logger.debug('Read-timeout on etcd watch %s, rescheduling', key)
|
||||
else:
|
||||
logger.exception('Exception on etcd watch: %s', key)
|
||||
|
||||
except ProtocolError:
|
||||
logger.exception('Exception on etcd watch: %s', key)
|
||||
|
||||
if key not in self._watch_tasks or self._watch_tasks[key].done():
|
||||
self._watch_etcd(key, callback, start_index=new_index, restarter=restarter)
|
||||
|
||||
if etcd_result and etcd_result.value is not None:
|
||||
async(callback(self._etcd_result_to_keychange(etcd_result)))
|
||||
|
||||
if not self._shutting_down:
|
||||
logger.debug('Scheduling watch of key: %s at start index %s', key, start_index)
|
||||
watch_future = self._etcd_client.watch(key, recursive=True, index=start_index,
|
||||
timeout=ETCD_MAX_WATCH_TIMEOUT)
|
||||
watch_future.add_done_callback(callback_wrapper)
|
||||
|
||||
self._watch_tasks[key] = async(watch_future)
|
||||
|
||||
@staticmethod
|
||||
def _etcd_result_to_keychange(etcd_result):
|
||||
event = Etcd2Orchestrator._etcd_result_to_keyevent(etcd_result)
|
||||
return KeyChange(event, etcd_result.key, etcd_result.value)
|
||||
|
||||
@staticmethod
|
||||
def _etcd_result_to_keyevent(etcd_result):
|
||||
if etcd_result.action == EtcdAction.CREATE:
|
||||
return KeyEvent.CREATE
|
||||
if etcd_result.action == EtcdAction.SET:
|
||||
return KeyEvent.CREATE if etcd_result.createdIndex == etcd_result.modifiedIndex else KeyEvent.SET
|
||||
if etcd_result.action == EtcdAction.DELETE:
|
||||
return KeyEvent.DELETE
|
||||
if etcd_result.action == EtcdAction.EXPIRE:
|
||||
return KeyEvent.EXPIRE
|
||||
raise AssertionError('etcd action must have an equivalent KeyEvent')
|
||||
|
||||
def on_key_change(self, key, callback, restarter=None):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
logger.debug('creating watch on %s', key)
|
||||
self._watch_etcd(key, callback, restarter=restarter)
|
||||
|
||||
@coroutine
|
||||
def get_prefixed_keys(self, prefix):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
try:
|
||||
etcd_result = yield From(self._etcd_client.read(prefix, recursive=True))
|
||||
raise Return({leaf.key: leaf.value for leaf in etcd_result.leaves})
|
||||
except etcd.EtcdKeyError:
|
||||
raise KeyError
|
||||
except etcd.EtcdConnectionFailed as ex:
|
||||
raise OrchestratorConnectionError(ex)
|
||||
except etcd.EtcdException as ex:
|
||||
raise OrchestratorError(ex)
|
||||
|
||||
@coroutine
|
||||
def get_key(self, key):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
try:
|
||||
# Ignore pylint: the value property on EtcdResult is added dynamically using setattr.
|
||||
etcd_result = yield From(self._etcd_client.read(key))
|
||||
raise Return(etcd_result.value)
|
||||
except etcd.EtcdKeyError:
|
||||
raise KeyError
|
||||
except etcd.EtcdConnectionFailed as ex:
|
||||
raise OrchestratorConnectionError(ex)
|
||||
except etcd.EtcdException as ex:
|
||||
raise OrchestratorError(ex)
|
||||
|
||||
@coroutine
|
||||
def set_key(self, key, value, overwrite=False, expiration=None):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
yield From(self._etcd_client.write(key, value, prevExist=overwrite,
|
||||
ttl=self._sanity_check_ttl(expiration)))
|
||||
|
||||
def set_key_sync(self, key, value, overwrite=False, expiration=None):
|
||||
self._sync_etcd_client.write(key, value, prevExist=overwrite,
|
||||
ttl=self._sanity_check_ttl(expiration))
|
||||
|
||||
@coroutine
|
||||
def delete_key(self, key):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
try:
|
||||
yield From(self._etcd_client.delete(key))
|
||||
except etcd.EtcdKeyError:
|
||||
raise KeyError
|
||||
except etcd.EtcdConnectionFailed as ex:
|
||||
raise OrchestratorConnectionError(ex)
|
||||
except etcd.EtcdException as ex:
|
||||
raise OrchestratorError(ex)
|
||||
|
||||
@coroutine
|
||||
def lock(self, key, expiration=DEFAULT_LOCK_EXPIRATION):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
try:
|
||||
yield From(self._etcd_client.write(key, {}, prevExist=False,
|
||||
ttl=self._sanity_check_ttl(expiration)))
|
||||
raise Return(True)
|
||||
except (KeyError, etcd.EtcdKeyError):
|
||||
raise Return(False)
|
||||
except etcd.EtcdConnectionFailed:
|
||||
logger.exception('Could not get etcd atomic lock as etcd is down')
|
||||
raise Return(False)
|
||||
except etcd.EtcdException as ex:
|
||||
raise OrchestratorError(ex)
|
||||
|
||||
def shutdown(self):
|
||||
logger.debug('Shutting down etcd client.')
|
||||
self._shutting_down = True
|
||||
|
||||
if self.is_canceller_only:
|
||||
return
|
||||
|
||||
for key, task in self._watch_tasks.items():
|
||||
if not task.done():
|
||||
logger.debug('Canceling watch task for %s', key)
|
||||
task.cancel()
|
||||
|
||||
if self._async_executor is not None:
|
||||
self._async_executor.shutdown()
|
||||
|
||||
|
||||
class MemoryOrchestrator(Orchestrator):
|
||||
def __init__(self, **kwargs):
|
||||
self.state = ExpiresDict()
|
||||
self.callbacks = {}
|
||||
|
||||
def _callbacks_prefixed(self, prefix):
|
||||
return (callback for (key, callback) in iteritems(self.callbacks)
|
||||
if key.startswith(prefix))
|
||||
|
||||
def on_key_change(self, key, callback, restarter=None):
|
||||
self.callbacks[key] = callback
|
||||
|
||||
@coroutine
|
||||
def get_prefixed_keys(self, prefix):
|
||||
raise Return({k: value for (k, value) in self.state.items()
|
||||
if k.startswith(prefix)})
|
||||
|
||||
@coroutine
|
||||
def get_key(self, key):
|
||||
raise Return(self.state[key])
|
||||
|
||||
@coroutine
|
||||
def set_key(self, key, value, overwrite=False, expiration=None):
|
||||
preexisting_key = key in self.state
|
||||
if preexisting_key and not overwrite:
|
||||
raise KeyError
|
||||
|
||||
absolute_expiration = None
|
||||
if expiration is not None:
|
||||
absolute_expiration = datetime.datetime.now() + datetime.timedelta(seconds=expiration)
|
||||
|
||||
self.state.set(key, value, expires=absolute_expiration)
|
||||
|
||||
event = KeyEvent.CREATE if not preexisting_key else KeyEvent.SET
|
||||
for callback in self._callbacks_prefixed(key):
|
||||
yield From(callback(KeyChange(event, key, value)))
|
||||
|
||||
def set_key_sync(self, key, value, overwrite=False, expiration=None):
|
||||
"""
|
||||
set_key, but without trollius coroutines.
|
||||
"""
|
||||
preexisting_key = key in self.state
|
||||
if preexisting_key and not overwrite:
|
||||
raise KeyError
|
||||
|
||||
absolute_expiration = None
|
||||
if expiration is not None:
|
||||
absolute_expiration = datetime.datetime.now() + datetime.timedelta(seconds=expiration)
|
||||
|
||||
self.state.set(key, value, expires=absolute_expiration)
|
||||
|
||||
event = KeyEvent.CREATE if not preexisting_key else KeyEvent.SET
|
||||
for callback in self._callbacks_prefixed(key):
|
||||
callback(KeyChange(event, key, value))
|
||||
|
||||
@coroutine
|
||||
def delete_key(self, key):
|
||||
value = self.state[key]
|
||||
del self.state[key]
|
||||
|
||||
for callback in self._callbacks_prefixed(key):
|
||||
yield From(callback(KeyChange(KeyEvent.DELETE, key, value)))
|
||||
|
||||
@coroutine
|
||||
def lock(self, key, expiration=DEFAULT_LOCK_EXPIRATION):
|
||||
if key in self.state:
|
||||
raise Return(False)
|
||||
self.state.set(key, None, expires=expiration)
|
||||
raise Return(True)
|
||||
|
||||
def shutdown(self):
|
||||
self.state = None
|
||||
self.callbacks = None
|
||||
|
||||
|
||||
class RedisOrchestrator(Orchestrator):
|
||||
def __init__(self, host='127.0.0.1', port=6379, password=None, db=0, cert_and_key=None,
|
||||
ca_cert=None, client_threads=5, ssl=False, skip_keyspace_event_setup=False,
|
||||
canceller_only=False, **kwargs):
|
||||
self.is_canceller_only = canceller_only
|
||||
(cert, key) = tuple(cert_and_key) if cert_and_key is not None else (None, None)
|
||||
self._sync_client = redis.StrictRedis(
|
||||
host=host,
|
||||
port=port,
|
||||
password=password,
|
||||
db=db,
|
||||
ssl_certfile=cert,
|
||||
ssl_keyfile=key,
|
||||
ssl_ca_certs=ca_cert,
|
||||
ssl=ssl,
|
||||
)
|
||||
|
||||
self._shutting_down = False
|
||||
self._tasks = {}
|
||||
self._watched_keys = {}
|
||||
self._pubsub_key = slash_join(kwargs.get('orchestrator_prefix', ''),
|
||||
REDIS_DEFAULT_PUBSUB_KEY).lstrip('/')
|
||||
|
||||
if not self.is_canceller_only:
|
||||
(self._client, self._async_executor) = wrap_with_threadpool(self._sync_client, client_threads)
|
||||
|
||||
# Configure a subscription to watch events that the orchestrator manually publishes.
|
||||
logger.debug('creating pubsub with key %s', self._pubsub_key)
|
||||
published_pubsub = self._sync_client.pubsub()
|
||||
published_pubsub.subscribe(self._pubsub_key)
|
||||
(self._pubsub, self._async_executor_pub) = wrap_with_threadpool(published_pubsub)
|
||||
self._watch_published_key()
|
||||
|
||||
# Configure a subscription to watch expired keyspace events.
|
||||
if not skip_keyspace_event_setup:
|
||||
self._sync_client.config_set(REDIS_KEYSPACE_EVENT_CONFIG_KEY,
|
||||
REDIS_KEYSPACE_EVENT_CONFIG_VALUE)
|
||||
|
||||
expiring_pubsub = self._sync_client.pubsub()
|
||||
expiring_pubsub.psubscribe(REDIS_EXPIRED_KEYSPACE_PATTERN % (db, '*'))
|
||||
(self._pubsub_expiring, self._async_executor_ex) = wrap_with_threadpool(expiring_pubsub)
|
||||
self._watch_expiring_key()
|
||||
|
||||
def _watch_published_key(self):
|
||||
def published_callback_wrapper(event_future):
|
||||
logger.debug('published callback called')
|
||||
event_result = None
|
||||
|
||||
if not event_future.cancelled():
|
||||
try:
|
||||
event_result = event_future.result()
|
||||
(redis_event, event_key, event_value) = event_result
|
||||
logger.debug('Got watch of key: (%s, %s, %s)', redis_event, event_key, event_value)
|
||||
except redis.ConnectionError:
|
||||
_sleep_orchestrator()
|
||||
except redis.RedisError:
|
||||
logger.exception('Exception watching redis publish: %s', event_key)
|
||||
|
||||
# Schedule creating a new future if this one has been consumed.
|
||||
if 'pub' not in self._tasks or self._tasks['pub'].done():
|
||||
self._watch_published_key()
|
||||
|
||||
if event_result is not None and redis_event == REDIS_EVENT_KIND_MESSAGE:
|
||||
keychange = self._publish_to_keychange(event_value)
|
||||
for watched_key, callback in iteritems(self._watched_keys):
|
||||
if keychange.key.startswith(watched_key):
|
||||
async(callback(keychange))
|
||||
|
||||
if not self._shutting_down:
|
||||
logger.debug('Scheduling watch of publish stream')
|
||||
watch_future = self._pubsub.parse_response()
|
||||
watch_future.add_done_callback(published_callback_wrapper)
|
||||
self._tasks['pub'] = async(watch_future)
|
||||
|
||||
def _watch_expiring_key(self):
|
||||
def expiring_callback_wrapper(event_future):
|
||||
logger.debug('expiring callback called')
|
||||
event_result = None
|
||||
|
||||
if not event_future.cancelled():
|
||||
try:
|
||||
event_result = event_future.result()
|
||||
if self._is_expired_keyspace_event(event_result):
|
||||
# Get the value of the original key before the expiration happened.
|
||||
key = self._key_from_expiration(event_future)
|
||||
expired_value = yield From(self._client.get(key))
|
||||
|
||||
# $KEY/expiring is gone, but the original key still remains; set an expiration for it
|
||||
# so that other managers have time to get the event and still read the expired value.
|
||||
yield From(self._client.expire(key, ONE_DAY))
|
||||
except redis.ConnectionError:
|
||||
_sleep_orchestrator()
|
||||
except redis.RedisError:
|
||||
logger.exception('Exception watching redis expirations: %s', key)
|
||||
|
||||
# Schedule creating a new future if this one has been consumed.
|
||||
if 'expire' not in self._tasks or self._tasks['expire'].done():
|
||||
self._watch_expiring_key()
|
||||
|
||||
if self._is_expired_keyspace_event(event_result) and expired_value is not None:
|
||||
for watched_key, callback in iteritems(self._watched_keys):
|
||||
if key.startswith(watched_key):
|
||||
async(callback(KeyChange(KeyEvent.EXPIRE, key, expired_value)))
|
||||
|
||||
if not self._shutting_down:
|
||||
logger.debug('Scheduling watch of expiration')
|
||||
watch_future = self._pubsub_expiring.parse_response()
|
||||
watch_future.add_done_callback(expiring_callback_wrapper)
|
||||
self._tasks['expire'] = async(watch_future)
|
||||
|
||||
def on_key_change(self, key, callback, restarter=None):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
logger.debug('watching key: %s', key)
|
||||
self._watched_keys[key] = callback
|
||||
|
||||
@staticmethod
|
||||
def _is_expired_keyspace_event(event_result):
|
||||
"""
|
||||
Sanity check that this isn't an unrelated keyspace event.
|
||||
There could be a more efficient keyspace event config to avoid this client-side filter.
|
||||
"""
|
||||
if event_result is None:
|
||||
return False
|
||||
|
||||
(redis_event, _pattern, matched_key, expired) = event_result
|
||||
return (redis_event == REDIS_EVENT_KIND_PMESSAGE and
|
||||
expired == 'expired' and
|
||||
REDIS_EXPIRED_KEYSPACE_REGEX.match(matched_key) is not None)
|
||||
|
||||
@staticmethod
|
||||
def _key_from_expiration(event_result):
|
||||
(_redis_event, _pattern, matched_key, _expired) = event_result
|
||||
return REDIS_EXPIRED_KEYSPACE_REGEX.match(matched_key).groups()[1]
|
||||
|
||||
@staticmethod
|
||||
def _publish_to_keychange(event_value):
|
||||
e = json.loads(event_value)
|
||||
return KeyChange(KeyEvent(e['event']), e['key'], e['value'])
|
||||
|
||||
@coroutine
|
||||
def get_prefixed_keys(self, prefix):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
# TODO: This can probably be done with redis pipelines to make it transactional.
|
||||
keys = yield From(self._client.keys(prefix + '*'))
|
||||
|
||||
# Yielding to the event loop is required, thus this cannot be written as a dict comprehension.
|
||||
results = {}
|
||||
for key in keys:
|
||||
if key.endswith(REDIS_EXPIRING_SUFFIX):
|
||||
continue
|
||||
ttl = yield From(self._client.ttl(key))
|
||||
if ttl == REDIS_NONEXPIRING_KEY:
|
||||
# Only redis keys without expirations are live build manager keys.
|
||||
value = yield From(self._client.get(key))
|
||||
results.update({key: value})
|
||||
|
||||
raise Return(results)
|
||||
|
||||
@coroutine
|
||||
def get_key(self, key):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
value = yield From(self._client.get(key))
|
||||
raise Return(value)
|
||||
|
||||
@coroutine
|
||||
def set_key(self, key, value, overwrite=False, expiration=None):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
already_exists = yield From(self._client.exists(key))
|
||||
|
||||
yield From(self._client.set(key, value, xx=overwrite))
|
||||
if expiration is not None:
|
||||
yield From(self._client.set(slash_join(key, REDIS_EXPIRING_SUFFIX), value,
|
||||
xx=overwrite, ex=expiration))
|
||||
|
||||
key_event = KeyEvent.SET if already_exists else KeyEvent.CREATE
|
||||
yield From(self._publish(event=key_event, key=key, value=value))
|
||||
|
||||
def set_key_sync(self, key, value, overwrite=False, expiration=None):
|
||||
already_exists = self._sync_client.exists(key)
|
||||
|
||||
self._sync_client.set(key, value, xx=overwrite)
|
||||
if expiration is not None:
|
||||
self._sync_client.set(slash_join(key, REDIS_EXPIRING_SUFFIX), value,
|
||||
xx=overwrite, ex=expiration)
|
||||
|
||||
self._sync_client.publish(self._pubsub_key, json.dumps({
|
||||
'event': int(KeyEvent.SET if already_exists else KeyEvent.CREATE),
|
||||
'key': key,
|
||||
'value': value,
|
||||
}))
|
||||
|
||||
@coroutine
|
||||
def _publish(self, **kwargs):
|
||||
kwargs['event'] = int(kwargs['event'])
|
||||
event_json = json.dumps(kwargs)
|
||||
logger.debug('publishing event: %s', event_json)
|
||||
yield From(self._client.publish(self._pubsub_key, event_json))
|
||||
|
||||
@coroutine
|
||||
def delete_key(self, key):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
value = yield From(self._client.get(key))
|
||||
yield From(self._client.delete(key))
|
||||
yield From(self._client.delete(slash_join(key, REDIS_EXPIRING_SUFFIX)))
|
||||
yield From(self._publish(event=KeyEvent.DELETE, key=key, value=value))
|
||||
|
||||
@coroutine
|
||||
def lock(self, key, expiration=DEFAULT_LOCK_EXPIRATION):
|
||||
assert not self.is_canceller_only
|
||||
|
||||
yield From(self.set_key(key, '', expiration=expiration))
|
||||
raise Return(True)
|
||||
|
||||
@coroutine
|
||||
def shutdown(self):
|
||||
logger.debug('Shutting down redis client.')
|
||||
|
||||
self._shutting_down = True
|
||||
|
||||
if self.is_canceller_only:
|
||||
return
|
||||
|
||||
for key, task in iteritems(self._tasks):
|
||||
if not task.done():
|
||||
logger.debug('Canceling watch task for %s', key)
|
||||
task.cancel()
|
||||
|
||||
if self._async_executor is not None:
|
||||
self._async_executor.shutdown()
|
||||
if self._async_executor_ex is not None:
|
||||
self._async_executor_ex.shutdown()
|
||||
if self._async_executor_pub is not None:
|
||||
self._async_executor_pub.shutdown()
|
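To illustrate the configuration convention implemented by orchestrator_from_config above, a small sketch. The host values are placeholders; only the in-memory variant runs without an external service, and the repository's dependencies (trollius, etcd, redis) are assumed to be installed.

```
from buildman.orchestrator import orchestrator_from_config, MemoryOrchestrator

# The prefix of the keys inside ORCHESTRATOR picks the implementation
# (ETCD_, MEM_, REDIS_); the remainder, lowercased, becomes a constructor kwarg.
# e.g. {'ORCHESTRATOR': {'REDIS_HOST': '10.0.0.5'}} -> RedisOrchestrator(host='10.0.0.5')
#      {'ORCHESTRATOR': {'ETCD_HOST': '10.0.0.6'}}  -> Etcd2Orchestrator(host='10.0.0.6')

# The in-memory orchestrator needs no external service and is what the tests use.
mem = orchestrator_from_config({'ORCHESTRATOR': {'MEM_CONFIG': None}}, canceller_only=True)
assert isinstance(mem, MemoryOrchestrator)
```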
26
buildman/qemu-coreos/Dockerfile
Normal file
|
@ -0,0 +1,26 @@
|
|||
FROM debian
|
||||
|
||||
RUN apt-get clean && apt-get update && apt-get upgrade -y # 03APR2017
|
||||
RUN apt-get install -y \
|
||||
bzip2 \
|
||||
curl \
|
||||
openssh-client \
|
||||
qemu-kvm
|
||||
|
||||
ARG channel=stable
|
||||
ARG version=current
|
||||
|
||||
RUN echo "Downloading http://${channel}.release.core-os.net/amd64-usr/${version}/coreos_production_qemu_image.img.bz2"
|
||||
RUN curl -s -O http://${channel}.release.core-os.net/amd64-usr/${version}/coreos_production_qemu_image.img.bz2 && \
|
||||
bzip2 -d coreos_production_qemu_image.img.bz2
|
||||
|
||||
RUN apt-get remove -y curl bzip2 && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
COPY start.sh /start.sh
|
||||
|
||||
LABEL com.coreos.channel ${channel}
|
||||
LABEL com.coreos.version ${version}
|
||||
|
||||
ENTRYPOINT ["/bin/bash", "/start.sh"]
|
5
buildman/qemu-coreos/README.md
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Builder Image
|
||||
|
||||
```
|
||||
docker build --build-arg channel=stable --build-arg version=current -t quay.io/quay/quay-builder-qemu-coreos:staging .
|
||||
```
|
26
buildman/qemu-coreos/start.sh
Normal file
|
@ -0,0 +1,26 @@
|
|||
#!/bin/bash
|
||||
|
||||
VM_VOLUME_SIZE="${VM_VOLUME_SIZE:-32G}"
|
||||
VM_MEMORY="${VM_MEMORY:-4G}"
|
||||
|
||||
set -e
|
||||
set -x
|
||||
set -o nounset
|
||||
|
||||
mkdir -p /userdata/openstack/latest
|
||||
echo "${USERDATA}" > /userdata/openstack/latest/user_data
|
||||
|
||||
time qemu-img resize ./coreos_production_qemu_image.img "${VM_VOLUME_SIZE}"
|
||||
|
||||
qemu-system-x86_64 \
|
||||
-enable-kvm \
|
||||
-cpu host \
|
||||
-device virtio-9p-pci,fsdev=conf,mount_tag=config-2 \
|
||||
-nographic \
|
||||
-drive if=virtio,file=./coreos_production_qemu_image.img \
|
||||
-fsdev local,id=conf,security_model=none,readonly,path=/userdata \
|
||||
-m "${VM_MEMORY}" \
|
||||
-machine accel=kvm \
|
||||
-net nic,model=virtio \
|
||||
-net user,hostfwd=tcp::2222-:22 \
|
||||
-smp 2
|
266
buildman/server.py
Normal file
|
@ -0,0 +1,266 @@
|
|||
import logging
|
||||
import json
|
||||
import trollius
|
||||
|
||||
from threading import Event
|
||||
from datetime import timedelta
|
||||
from trollius.coroutines import From
|
||||
from autobahn.asyncio.wamp import RouterFactory, RouterSessionFactory
|
||||
from autobahn.asyncio.websocket import WampWebSocketServerFactory
|
||||
from autobahn.wamp import types
|
||||
from aiowsgi import create_server as create_wsgi_server
|
||||
from flask import Flask
|
||||
|
||||
from buildman.enums import BuildJobResult, BuildServerStatus, RESULT_PHASES
|
||||
from buildman.jobutil.buildstatus import StatusHandler
|
||||
from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException
|
||||
from data import database, model
|
||||
from app import app, metric_queue
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
WORK_CHECK_TIMEOUT = 10
|
||||
TIMEOUT_PERIOD_MINUTES = 20
|
||||
JOB_TIMEOUT_SECONDS = 300
|
||||
SETUP_LEEWAY_SECONDS = 30
|
||||
MINIMUM_JOB_EXTENSION = timedelta(minutes=1)
|
||||
|
||||
HEARTBEAT_PERIOD_SEC = 30
|
||||
|
||||
|
||||
class BuilderServer(object):
|
||||
""" Server which handles both HTTP and WAMP requests, managing the full state of the build
|
||||
controller.
|
||||
"""
|
||||
def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass,
|
||||
lifecycle_manager_config, manager_hostname):
|
||||
self._loop = None
|
||||
self._current_status = BuildServerStatus.STARTING
|
||||
self._current_components = []
|
||||
self._realm_map = {}
|
||||
self._job_count = 0
|
||||
|
||||
self._session_factory = RouterSessionFactory(RouterFactory())
|
||||
self._registry_hostname = registry_hostname
|
||||
self._queue = queue
|
||||
self._build_logs = build_logs
|
||||
self._user_files = user_files
|
||||
self._lifecycle_manager = lifecycle_manager_klass(
|
||||
self._register_component,
|
||||
self._unregister_component,
|
||||
self._job_heartbeat,
|
||||
self._job_complete,
|
||||
manager_hostname,
|
||||
HEARTBEAT_PERIOD_SEC,
|
||||
)
|
||||
self._lifecycle_manager_config = lifecycle_manager_config
|
||||
|
||||
self._shutdown_event = Event()
|
||||
self._current_status = BuildServerStatus.RUNNING
|
||||
|
||||
self._register_controller()
|
||||
|
||||
def _register_controller(self):
|
||||
controller_app = Flask('controller')
|
||||
server = self
|
||||
|
||||
@controller_app.route('/status')
|
||||
def status():
|
||||
metrics = server._queue.get_metrics()
|
||||
(running_count, available_not_running_count, available_count) = metrics
|
||||
|
||||
workers = [component for component in server._current_components
|
||||
if component.kind() == 'builder']
|
||||
|
||||
data = {
|
||||
'status': server._current_status,
|
||||
'running_local': server._job_count,
|
||||
'running_total': running_count,
|
||||
'workers': len(workers),
|
||||
'job_total': available_count + running_count
|
||||
}
|
||||
|
||||
return json.dumps(data)
|
||||
|
||||
self._controller_app = controller_app
|
||||
|
||||
def run(self, host, websocket_port, controller_port, ssl=None):
|
||||
logger.debug('Initializing the lifecycle manager')
|
||||
self._lifecycle_manager.initialize(self._lifecycle_manager_config)
|
||||
|
||||
logger.debug('Initializing all members of the event loop')
|
||||
loop = trollius.get_event_loop()
|
||||
|
||||
logger.debug('Starting server on port %s, with controller on port %s', websocket_port,
|
||||
controller_port)
|
||||
|
||||
try:
|
||||
loop.run_until_complete(self._initialize(loop, host, websocket_port, controller_port, ssl))
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
finally:
|
||||
loop.close()
|
||||
|
||||
def close(self):
|
||||
logger.debug('Requested server shutdown')
|
||||
self._current_status = BuildServerStatus.SHUTDOWN
|
||||
self._lifecycle_manager.shutdown()
|
||||
self._shutdown_event.wait()
|
||||
logger.debug('Shutting down server')
|
||||
|
||||
def _register_component(self, realm, component_klass, **kwargs):
|
||||
""" Registers a component with the server. The component_klass must derive from
|
||||
BaseComponent.
|
||||
"""
|
||||
logger.debug('Registering component with realm %s', realm)
|
||||
if realm in self._realm_map:
|
||||
logger.debug('Component with realm %s already registered', realm)
|
||||
return self._realm_map[realm]
|
||||
|
||||
component = component_klass(types.ComponentConfig(realm=realm), realm=realm, **kwargs)
|
||||
component.server = self
|
||||
component.parent_manager = self._lifecycle_manager
|
||||
component.build_logs = self._build_logs
|
||||
component.user_files = self._user_files
|
||||
component.registry_hostname = self._registry_hostname
|
||||
|
||||
self._realm_map[realm] = component
|
||||
self._current_components.append(component)
|
||||
self._session_factory.add(component)
|
||||
return component
|
||||
|
||||
def _unregister_component(self, component):
|
||||
logger.debug('Unregistering component with realm %s and token %s',
|
||||
component.builder_realm, component.expected_token)
|
||||
|
||||
self._realm_map.pop(component.builder_realm, None)
|
||||
|
||||
if component in self._current_components:
|
||||
self._current_components.remove(component)
|
||||
self._session_factory.remove(component)
|
||||
|
||||
def _job_heartbeat(self, build_job):
|
||||
self._queue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS,
|
||||
minimum_extension=MINIMUM_JOB_EXTENSION)
|
||||
|
||||
@trollius.coroutine
|
||||
def _job_complete(self, build_job, job_status, executor_name=None, update_phase=False):
|
||||
if job_status == BuildJobResult.INCOMPLETE:
|
||||
logger.warning('[BUILD INCOMPLETE: job complete] Build ID: %s. No retry restore.',
|
||||
build_job.repo_build.uuid)
|
||||
self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30)
|
||||
else:
|
||||
self._queue.complete(build_job.job_item)
|
||||
|
||||
# Update the trigger failure tracking (if applicable).
|
||||
if build_job.repo_build.trigger is not None:
|
||||
model.build.update_trigger_disable_status(build_job.repo_build.trigger,
|
||||
RESULT_PHASES[job_status])
|
||||
|
||||
if update_phase:
|
||||
status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid)
|
||||
yield From(status_handler.set_phase(RESULT_PHASES[job_status]))
|
||||
|
||||
self._job_count = self._job_count - 1
|
||||
|
||||
if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
|
||||
self._shutdown_event.set()
|
||||
|
||||
_report_completion_status(build_job, job_status, executor_name)
|
||||
|
||||
@trollius.coroutine
|
||||
def _work_checker(self):
|
||||
logger.debug('Initializing work checker')
|
||||
while self._current_status == BuildServerStatus.RUNNING:
|
||||
with database.CloseForLongOperation(app.config):
|
||||
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
|
||||
|
||||
logger.debug('Checking for more work for %d active workers',
|
||||
self._lifecycle_manager.num_workers())
|
||||
|
||||
processing_time = self._lifecycle_manager.overall_setup_time() + SETUP_LEEWAY_SECONDS
|
||||
job_item = self._queue.get(processing_time=processing_time, ordering_required=True)
|
||||
if job_item is None:
|
||||
logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
|
||||
continue
|
||||
|
||||
try:
|
||||
build_job = BuildJob(job_item)
|
||||
except BuildJobLoadException as irbe:
|
||||
logger.warning('[BUILD INCOMPLETE: job load exception] Job data: %s. No retry restore.',
|
||||
job_item.body)
|
||||
logger.exception(irbe)
|
||||
self._queue.incomplete(job_item, restore_retry=False)
|
||||
continue
|
||||
|
||||
logger.debug('Checking for an available worker for build job %s',
|
||||
build_job.repo_build.uuid)
|
||||
|
||||
try:
|
||||
schedule_success, retry_timeout = yield From(self._lifecycle_manager.schedule(build_job))
|
||||
except:
|
||||
logger.warning('[BUILD INCOMPLETE: scheduling] Build ID: %s. Retry restored.',
|
||||
build_job.repo_build.uuid)
|
||||
logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid)
|
||||
self._current_status = BuildServerStatus.EXCEPTION
|
||||
self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT)
|
||||
return
|
||||
|
||||
if schedule_success:
|
||||
logger.debug('Marking build %s as scheduled', build_job.repo_build.uuid)
|
||||
status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid)
|
||||
yield From(status_handler.set_phase(database.BUILD_PHASE.BUILD_SCHEDULED))
|
||||
|
||||
self._job_count = self._job_count + 1
|
||||
logger.debug('Build job %s scheduled. Running: %s', build_job.repo_build.uuid,
|
||||
self._job_count)
|
||||
else:
|
||||
logger.warning('[BUILD INCOMPLETE: no schedule] Build ID: %s. Retry restored.',
|
||||
build_job.repo_build.uuid)
|
||||
logger.debug('All workers are busy for job %s. Requeuing after %s seconds.',
|
||||
build_job.repo_build.uuid, retry_timeout)
|
||||
self._queue.incomplete(job_item, restore_retry=True, retry_after=retry_timeout)
|
||||
|
||||
@trollius.coroutine
|
||||
def _queue_metrics_updater(self):
|
||||
logger.debug('Initializing queue metrics updater')
|
||||
while self._current_status == BuildServerStatus.RUNNING:
|
||||
logger.debug('Writing metrics')
|
||||
self._queue.update_metrics()
|
||||
|
||||
logger.debug('Metrics going to sleep for 30 seconds')
|
||||
yield From(trollius.sleep(30))
|
||||
|
||||
@trollius.coroutine
|
||||
def _initialize(self, loop, host, websocket_port, controller_port, ssl=None):
|
||||
self._loop = loop
|
||||
|
||||
# Create the WAMP server.
|
||||
transport_factory = WampWebSocketServerFactory(self._session_factory, debug_wamp=False)
|
||||
transport_factory.setProtocolOptions(failByDrop=True)
|
||||
|
||||
# Initialize the controller server and the WAMP server
|
||||
create_wsgi_server(self._controller_app, loop=loop, host=host, port=controller_port, ssl=ssl)
|
||||
yield From(loop.create_server(transport_factory, host, websocket_port, ssl=ssl))
|
||||
|
||||
# Initialize the metrics updater
|
||||
trollius.async(self._queue_metrics_updater())
|
||||
|
||||
# Initialize the work queue checker.
|
||||
yield From(self._work_checker())
|
||||
|
||||
def _report_completion_status(build_job, status, executor_name):
|
||||
metric_queue.build_counter.Inc(labelvalues=[status])
|
||||
metric_queue.repository_build_completed.Inc(labelvalues=[build_job.namespace, build_job.repo_name,
|
||||
status, executor_name or 'executor'])
|
||||
if status == BuildJobResult.COMPLETE:
|
||||
status_name = 'CompleteBuilds'
|
||||
elif status == BuildJobResult.ERROR:
|
||||
status_name = 'FailedBuilds'
|
||||
elif status == BuildJobResult.INCOMPLETE:
|
||||
status_name = 'IncompletedBuilds'
|
||||
else:
|
||||
return
|
||||
|
||||
metric_queue.put_deprecated(status_name, 1, unit='Count')
|
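A rough sketch of how the BuilderServer above is constructed and started. The queue, log store, and user-files objects are stand-in mocks, the hostnames and ports are arbitrary examples, and the executor/orchestrator config is a placeholder; this only illustrates the call shape and assumes a fully configured Quay environment, not a working deployment.

```
from mock import Mock

from buildman.server import BuilderServer
from buildman.manager.ephemeral import EphemeralBuilderManager

server = BuilderServer(
    'registry.example.com',        # registry_hostname (placeholder)
    Mock(),                        # work queue
    Mock(),                        # build_logs
    Mock(),                        # user_files
    EphemeralBuilderManager,       # lifecycle_manager_klass
    {                              # lifecycle_manager_config (placeholder values)
        'EXECUTOR': 'test',        # must name an executor registered in EphemeralBuilderManager.EXECUTORS
        'ORCHESTRATOR': {'MEM_CONFIG': None},
    },
    'buildman.example.com',        # manager_hostname (placeholder)
)

# Serves the WAMP endpoint and the HTTP controller until interrupted.
server.run('0.0.0.0', websocket_port=8787, controller_port=8686)
```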
102
buildman/templates/cloudconfig.yaml
Normal file
|
@ -0,0 +1,102 @@
|
|||
#cloud-config
|
||||
|
||||
hostname: {{ build_uuid | default('quay-builder', True) }}
|
||||
|
||||
users:
|
||||
groups:
|
||||
- sudo
|
||||
- docker
|
||||
|
||||
{% if ssh_authorized_keys -%}
|
||||
ssh_authorized_keys:
|
||||
{% for ssh_key in ssh_authorized_keys -%}
|
||||
- {{ ssh_key }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
|
||||
write_files:
|
||||
- path: /root/disable-aws-metadata.sh
|
||||
permission: '0755'
|
||||
content: |
|
||||
iptables -t nat -I PREROUTING -p tcp -d 169.254.169.254 --dport 80 -j DNAT --to-destination 1.1.1.1
|
||||
|
||||
- path: /etc/docker/daemon.json
|
||||
permission: '0644'
|
||||
content: |
|
||||
{
|
||||
"storage-driver": "overlay2"
|
||||
}
|
||||
|
||||
- path: /root/overrides.list
|
||||
permission: '0644'
|
||||
content: |
|
||||
REALM={{ realm }}
|
||||
TOKEN={{ token }}
|
||||
SERVER={{ websocket_scheme }}://{{ manager_hostname }}
|
||||
{% if logentries_token -%}
|
||||
LOGENTRIES_TOKEN={{ logentries_token }}
|
||||
{%- endif %}
|
||||
|
||||
coreos:
|
||||
update:
|
||||
reboot-strategy: off
|
||||
group: {{ coreos_channel }}
|
||||
|
||||
units:
|
||||
- name: update-engine.service
|
||||
command: stop
|
||||
- name: locksmithd.service
|
||||
command: stop
|
||||
- name: systemd-journal-gatewayd.socket
|
||||
command: start
|
||||
enable: yes
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Journal Gateway Service Socket
|
||||
[Socket]
|
||||
ListenStream=/var/run/journald.sock
|
||||
Service=systemd-journal-gatewayd.service
|
||||
[Install]
|
||||
WantedBy=sockets.target
|
||||
{{ dockersystemd('quay-builder',
|
||||
worker_image,
|
||||
quay_username,
|
||||
quay_password,
|
||||
worker_tag,
|
||||
extra_args='--net=host --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock -v /usr/share/ca-certificates:/etc/ssl/certs',
|
||||
exec_stop_post=['/bin/sh -xc "/bin/sleep 120; /usr/bin/systemctl --no-block poweroff"'],
|
||||
flattened=True,
|
||||
restart_policy='no'
|
||||
) | indent(4) }}
|
||||
{% if logentries_token -%}
|
||||
# journal-2-logentries was moved to coreos; see https://github.com/kelseyhightower/journal-2-logentries/pull/11
|
||||
{{ dockersystemd('builder-logs',
|
||||
'quay.io/coreos/journal-2-logentries',
|
||||
extra_args='--env-file /root/overrides.list -v /run/journald.sock:/run/journald.sock',
|
||||
flattened=True,
|
||||
after_units=['quay-builder.service']
|
||||
) | indent(4) }}
|
||||
{%- endif %}
|
||||
- name: disable-aws-metadata.service
|
||||
command: start
|
||||
enable: yes
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Disable AWS metadata service
|
||||
Before=network-pre.target
|
||||
Wants=network-pre.target
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/root/disable-aws-metadata.sh
|
||||
RemainAfterExit=yes
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
- name: machine-lifetime.service
|
||||
command: start
|
||||
enable: yes
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Machine Lifetime Service
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/bin/sh -xc "/bin/sleep {{ max_lifetime_s }}; /usr/bin/systemctl --no-block poweroff"
|
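The template above is Jinja2. A small sketch of how such a cloud-config might be rendered, using a trimmed-down template string, since the real file also expects a dockersystemd macro and more context than shown here; the variable values are placeholders and the Jinja2 library is assumed.

```
from jinja2 import Template

# Trimmed-down stand-in for buildman/templates/cloudconfig.yaml.
TEMPLATE = """#cloud-config
hostname: {{ build_uuid | default('quay-builder', True) }}
write_files:
- path: /root/overrides.list
  content: |
    REALM={{ realm }}
    TOKEN={{ token }}
    SERVER={{ websocket_scheme }}://{{ manager_hostname }}
"""

rendered = Template(TEMPLATE).render(
    build_uuid='deadbeef-dead-beef-dead-deadbeefdead',
    realm='1234-realm',
    token='sometoken',
    websocket_scheme='wss',
    manager_hostname='buildman.example.com:8787',
)
print(rendered)
```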
679
buildman/test/test_buildman.py
Normal file
|
@ -0,0 +1,679 @@
|
|||
import unittest
|
||||
import json
|
||||
import uuid
|
||||
|
||||
from mock import Mock, ANY
|
||||
from six import iteritems
|
||||
from trollius import coroutine, get_event_loop, From, Future, Return
|
||||
|
||||
from app import metric_queue
|
||||
from buildman.asyncutil import AsyncWrapper
|
||||
from buildman.component.buildcomponent import BuildComponent
|
||||
from buildman.manager.ephemeral import (EphemeralBuilderManager, REALM_PREFIX,
|
||||
JOB_PREFIX)
|
||||
from buildman.manager.executor import BuilderExecutor, ExecutorException
|
||||
from buildman.orchestrator import KeyEvent, KeyChange
|
||||
from buildman.server import BuildJobResult
|
||||
from util import slash_join
|
||||
from util.metrics.metricqueue import duration_collector_async
|
||||
|
||||
|
||||
BUILD_UUID = 'deadbeef-dead-beef-dead-deadbeefdead'
|
||||
REALM_ID = '1234-realm'
|
||||
|
||||
|
||||
def async_test(f):
|
||||
def wrapper(*args, **kwargs):
|
||||
coro = coroutine(f)
|
||||
future = coro(*args, **kwargs)
|
||||
loop = get_event_loop()
|
||||
loop.run_until_complete(future)
|
||||
return wrapper
|
||||
|
||||
|
||||
class TestExecutor(BuilderExecutor):
|
||||
job_started = None
|
||||
job_stopped = None
|
||||
|
||||
@coroutine
|
||||
@duration_collector_async(metric_queue.builder_time_to_start, labelvalues=["testlabel"])
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
self.job_started = str(uuid.uuid4())
|
||||
raise Return(self.job_started)
|
||||
|
||||
@coroutine
|
||||
def stop_builder(self, execution_id):
|
||||
self.job_stopped = execution_id
|
||||
|
||||
|
||||
class BadExecutor(BuilderExecutor):
|
||||
@coroutine
|
||||
@duration_collector_async(metric_queue.builder_time_to_start, labelvalues=["testlabel"])
|
||||
def start_builder(self, realm, token, build_uuid):
|
||||
raise ExecutorException('raised on purpose!')
|
||||
|
||||
|
||||
class EphemeralBuilderTestCase(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.etcd_client_mock = None
|
||||
super(EphemeralBuilderTestCase, self).__init__(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def _create_completed_future(result=None):
|
||||
def inner(*args, **kwargs):
|
||||
new_future = Future()
|
||||
new_future.set_result(result)
|
||||
return new_future
|
||||
return inner
|
||||
|
||||
def setUp(self):
|
||||
self._existing_executors = dict(EphemeralBuilderManager.EXECUTORS)
|
||||
|
||||
def tearDown(self):
|
||||
EphemeralBuilderManager.EXECUTORS = self._existing_executors
|
||||
|
||||
@coroutine
|
||||
def _register_component(self, realm_spec, build_component, token):
|
||||
raise Return('hello')
|
||||
|
||||
def _create_build_job(self, namespace='namespace', retries=3):
|
||||
mock_job = Mock()
|
||||
mock_job.job_details = {'build_uuid': BUILD_UUID}
|
||||
mock_job.job_item = {
|
||||
'body': json.dumps(mock_job.job_details),
|
||||
'id': 1,
|
||||
}
|
||||
|
||||
mock_job.namespace = namespace
|
||||
mock_job.retries_remaining = retries
|
||||
mock_job.build_uuid = BUILD_UUID
|
||||
return mock_job
|
||||
|
||||
|
||||
class TestEphemeralLifecycle(EphemeralBuilderTestCase):
|
||||
""" Tests the various lifecycles of the ephemeral builder and its interaction with etcd. """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(TestEphemeralLifecycle, self).__init__(*args, **kwargs)
|
||||
self.etcd_client_mock = None
|
||||
self.test_executor = None
|
||||
|
||||
def _create_completed_future(self, result=None):
|
||||
def inner(*args, **kwargs):
|
||||
new_future = Future()
|
||||
new_future.set_result(result)
|
||||
return new_future
|
||||
return inner
|
||||
|
||||
def _create_mock_executor(self, *args, **kwargs):
|
||||
self.test_executor = Mock(spec=BuilderExecutor)
|
||||
self.test_executor.start_builder = Mock(side_effect=self._create_completed_future('123'))
|
||||
self.test_executor.stop_builder = Mock(side_effect=self._create_completed_future())
|
||||
self.test_executor.setup_time = 60
|
||||
self.test_executor.name = 'MockExecutor'
|
||||
self.test_executor.minimum_retry_threshold = 0
|
||||
return self.test_executor
|
||||
|
||||
def setUp(self):
|
||||
super(TestEphemeralLifecycle, self).setUp()
|
||||
|
||||
EphemeralBuilderManager.EXECUTORS['test'] = self._create_mock_executor
|
||||
|
||||
self.register_component_callback = Mock()
|
||||
self.unregister_component_callback = Mock()
|
||||
self.job_heartbeat_callback = Mock()
|
||||
self.job_complete_callback = AsyncWrapper(Mock())
|
||||
|
||||
self.manager = EphemeralBuilderManager(
|
||||
self.register_component_callback,
|
||||
self.unregister_component_callback,
|
||||
self.job_heartbeat_callback,
|
||||
self.job_complete_callback,
|
||||
'127.0.0.1',
|
||||
30,
|
||||
)
|
||||
|
||||
self.manager.initialize({
|
||||
'EXECUTOR': 'test',
|
||||
'ORCHESTRATOR': {'MEM_CONFIG': None},
|
||||
})
|
||||
|
||||
# Ensure that the realm and building callbacks have been registered
|
||||
callback_keys = [key for key in self.manager._orchestrator.callbacks]
|
||||
self.assertIn(REALM_PREFIX, callback_keys)
|
||||
self.assertIn(JOB_PREFIX, callback_keys)
|
||||
|
||||
self.mock_job = self._create_build_job()
|
||||
self.mock_job_key = slash_join('building', BUILD_UUID)
|
||||
|
||||
def tearDown(self):
|
||||
super(TestEphemeralLifecycle, self).tearDown()
|
||||
self.manager.shutdown()
|
||||
|
||||
|
||||
@coroutine
|
||||
def _setup_job_for_managers(self):
|
||||
test_component = Mock(spec=BuildComponent)
|
||||
test_component.builder_realm = REALM_ID
|
||||
test_component.start_build = Mock(side_effect=self._create_completed_future())
|
||||
self.register_component_callback.return_value = test_component
|
||||
|
||||
is_scheduled = yield From(self.manager.schedule(self.mock_job))
|
||||
self.assertTrue(is_scheduled)
|
||||
self.assertEqual(self.test_executor.start_builder.call_count, 1)
|
||||
|
||||
# Ensure that the job, realm, and metric callbacks have been registered
|
||||
callback_keys = [key for key in self.manager._orchestrator.callbacks]
|
||||
self.assertIn(self.mock_job_key, self.manager._orchestrator.state)
|
||||
self.assertIn(REALM_PREFIX, callback_keys)
|
||||
# TODO: assert metric key has been set
|
||||
|
||||
realm_for_build = self._find_realm_key(self.manager._orchestrator, BUILD_UUID)
|
||||
|
||||
raw_realm_data = yield From(self.manager._orchestrator.get_key(slash_join('realm',
|
||||
realm_for_build)))
|
||||
realm_data = json.loads(raw_realm_data)
|
||||
realm_data['realm'] = REALM_ID
|
||||
|
||||
# Right now the job is not registered with any managers because etcd has not accepted the job
|
||||
self.assertEqual(self.register_component_callback.call_count, 0)
|
||||
|
||||
# Fire off a realm changed with the same data.
|
||||
yield From(self.manager._realm_callback(
|
||||
KeyChange(KeyEvent.CREATE,
|
||||
slash_join(REALM_PREFIX, REALM_ID),
|
||||
json.dumps(realm_data))))
|
||||
|
||||
# Ensure that we have at least one component node.
|
||||
self.assertEqual(self.register_component_callback.call_count, 1)
|
||||
self.assertEqual(1, self.manager.num_workers())
|
||||
|
||||
# Ensure that the build info exists.
|
||||
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
||||
|
||||
raise Return(test_component)
|
||||
|
||||
@staticmethod
|
||||
def _find_realm_key(orchestrator, build_uuid):
|
||||
for key, value in iteritems(orchestrator.state):
|
||||
if key.startswith(REALM_PREFIX):
|
||||
parsed_value = json.loads(value)
|
||||
body = json.loads(parsed_value['job_queue_item']['body'])
|
||||
if body['build_uuid'] == build_uuid:
|
||||
return parsed_value['realm']
|
||||
continue
|
||||
raise KeyError
|
||||
|
||||
|
||||
@async_test
|
||||
def test_schedule_and_complete(self):
|
||||
# Test that a job is properly registered with all of the managers
|
||||
test_component = yield From(self._setup_job_for_managers())
|
||||
|
||||
# Take the job ourselves
|
||||
yield From(self.manager.build_component_ready(test_component))
|
||||
|
||||
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
||||
|
||||
# Finish the job
|
||||
yield From(self.manager.job_completed(self.mock_job, BuildJobResult.COMPLETE, test_component))
|
||||
|
||||
# Ensure that the executor kills the job.
|
||||
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
|
||||
|
||||
# Ensure the build information is cleaned up.
|
||||
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
||||
self.assertEqual(0, self.manager.num_workers())
|
||||
|
||||
@async_test
|
||||
def test_another_manager_takes_job(self):
|
||||
# Prepare a job to be taken by another manager
|
||||
test_component = yield From(self._setup_job_for_managers())
|
||||
|
||||
yield From(self.manager._realm_callback(
|
||||
KeyChange(KeyEvent.DELETE,
|
||||
slash_join(REALM_PREFIX, REALM_ID),
|
||||
json.dumps({'realm': REALM_ID,
|
||||
'token': 'beef',
|
||||
'execution_id': '123',
|
||||
'job_queue_item': self.mock_job.job_item}))))
|
||||
|
||||
self.unregister_component_callback.assert_called_once_with(test_component)
|
||||
|
||||
# Ensure that the executor does not kill the job.
|
||||
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
|
||||
|
||||
# Ensure that we still have the build info, but not the component.
|
||||
self.assertEqual(0, self.manager.num_workers())
|
||||
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
||||
|
||||
# Delete the job once it has "completed".
|
||||
yield From(self.manager._job_callback(
|
||||
KeyChange(KeyEvent.DELETE,
|
||||
self.mock_job_key,
|
||||
json.dumps({'had_heartbeat': False,
|
||||
'job_queue_item': self.mock_job.job_item}))))
|
||||
|
||||
# Ensure the job was removed from the info, but stop was not called.
|
||||
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
||||
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
|
||||
|
||||
@async_test
|
||||
def test_job_started_by_other_manager(self):
|
||||
# Ensure that the building callbacks have been registered
|
||||
callback_keys = [key for key in self.manager._orchestrator.callbacks]
|
||||
self.assertIn(JOB_PREFIX, callback_keys)
|
||||
|
||||
# Send a signal to the callback that the job has been created.
|
||||
yield From(self.manager._job_callback(
|
||||
KeyChange(KeyEvent.CREATE,
|
||||
self.mock_job_key,
|
||||
json.dumps({'had_heartbeat': False,
|
||||
'job_queue_item': self.mock_job.job_item}))))
|
||||
|
||||
# Ensure the create does nothing.
|
||||
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
|
||||
|
||||
@async_test
|
||||
def test_expiring_worker_not_started(self):
|
||||
# Ensure that the building callbacks have been registered
|
||||
callback_keys = [key for key in self.manager._orchestrator.callbacks]
|
||||
self.assertIn(JOB_PREFIX, callback_keys)
|
||||
|
||||
# Send a signal to the callback that a worker has expired
|
||||
yield From(self.manager._job_callback(
|
||||
KeyChange(KeyEvent.EXPIRE,
|
||||
self.mock_job_key,
|
||||
json.dumps({'had_heartbeat': True,
|
||||
'job_queue_item': self.mock_job.job_item}))))
|
||||
|
||||
# Since the realm was never registered, expiration should do nothing.
|
||||
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
|
||||
|
||||
@async_test
|
||||
def test_expiring_worker_started(self):
|
||||
test_component = yield From(self._setup_job_for_managers())
|
||||
|
||||
# Ensure that the building callbacks have been registered
|
||||
callback_keys = [key for key in self.manager._orchestrator.callbacks]
|
||||
self.assertIn(JOB_PREFIX, callback_keys)
|
||||
|
||||
yield From(self.manager._job_callback(
|
||||
KeyChange(KeyEvent.EXPIRE,
|
||||
self.mock_job_key,
|
||||
json.dumps({'had_heartbeat': True,
|
||||
'job_queue_item': self.mock_job.job_item}))))
|
||||
|
||||
self.test_executor.stop_builder.assert_called_once_with('123')
|
||||
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
|
||||
|
||||
@async_test
|
||||
def test_buildjob_deleted(self):
|
||||
test_component = yield From(self._setup_job_for_managers())
|
||||
|
||||
# Ensure that the building callbacks have been registered
|
||||
callback_keys = [key for key in self.manager._orchestrator.callbacks]
|
||||
self.assertIn(JOB_PREFIX, callback_keys)
|
||||
|
||||
# Send a signal to the callback that a worker has expired
|
||||
yield From(self.manager._job_callback(
|
||||
KeyChange(KeyEvent.DELETE,
|
||||
self.mock_job_key,
|
||||
json.dumps({'had_heartbeat': False,
|
||||
'job_queue_item': self.mock_job.job_item}))))
|
||||
|
||||
self.assertEqual(self.test_executor.stop_builder.call_count, 0)
|
||||
self.assertEqual(self.job_complete_callback.call_count, 0)
|
||||
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
||||
|
||||
@async_test
|
||||
def test_builder_never_starts(self):
|
||||
test_component = yield From(self._setup_job_for_managers())
|
||||
|
||||
# Ensure that the building callbacks have been registered
|
||||
callback_keys = [key for key in self.manager._orchestrator.callbacks]
|
||||
self.assertIn(JOB_PREFIX, callback_keys)
|
||||
|
||||
# Send a signal to the callback that a worker has expired
|
||||
yield From(self.manager._job_callback(
|
||||
KeyChange(KeyEvent.EXPIRE,
|
||||
self.mock_job_key,
|
||||
json.dumps({'had_heartbeat': False,
|
||||
'job_queue_item': self.mock_job.job_item}))))
|
||||
|
||||
self.test_executor.stop_builder.assert_called_once_with('123')
|
||||
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
|
||||
|
||||
# Ensure the job was marked as incomplete, with update_phase set to True (so the DB record and
|
||||
# logs are updated as well)
|
||||
yield From(self.job_complete_callback.assert_called_once_with(ANY, BuildJobResult.INCOMPLETE,
|
||||
'MockExecutor',
|
||||
update_phase=True))
|
||||
|
||||
  @async_test
  def test_change_worker(self):
    # Send a signal to the callback that a worker key has been changed
    self.manager._job_callback(KeyChange(KeyEvent.SET, self.mock_job_key, 'value'))
    self.assertEqual(self.test_executor.stop_builder.call_count, 0)

  @async_test
  def test_realm_expired(self):
    test_component = yield From(self._setup_job_for_managers())

    # Send a signal to the callback that a realm has expired
    yield From(self.manager._realm_callback(KeyChange(
      KeyEvent.EXPIRE,
      self.mock_job_key,
      json.dumps({
        'realm': REALM_ID,
        'execution_id': 'foobar',
        'executor_name': 'MockExecutor',
        'job_queue_item': {'body': '{"build_uuid": "fakeid"}'},
      }))))

    # Ensure that the cleanup code for the executor was called.
    self.test_executor.stop_builder.assert_called_once_with('foobar')
    self.assertEqual(self.test_executor.stop_builder.call_count, 1)


class TestEphemeral(EphemeralBuilderTestCase):
  """ Simple unit tests for the ephemeral builder around config management, starting and stopping
      jobs.
  """

  def setUp(self):
    super(TestEphemeral, self).setUp()

    unregister_component_callback = Mock()
    job_heartbeat_callback = Mock()

    @coroutine
    def job_complete_callback(*args, **kwargs):
      raise Return()

    self.manager = EphemeralBuilderManager(
      self._register_component,
      unregister_component_callback,
      job_heartbeat_callback,
      job_complete_callback,
      '127.0.0.1',
      30,
    )

  def tearDown(self):
    super(TestEphemeral, self).tearDown()
    self.manager.shutdown()

  def test_verify_executor_oldconfig(self):
    EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
    self.manager.initialize({
      'EXECUTOR': 'test',
      'EXECUTOR_CONFIG': dict(MINIMUM_RETRY_THRESHOLD=42),
      'ORCHESTRATOR': {'MEM_CONFIG': None},
    })

    # Ensure that we have a single test executor.
    self.assertEqual(1, len(self.manager.registered_executors))
    self.assertEqual(42, self.manager.registered_executors[0].minimum_retry_threshold)
    self.assertEqual('TestExecutor', self.manager.registered_executors[0].name)

  def test_verify_executor_newconfig(self):
    EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
    self.manager.initialize({
      'EXECUTORS': [{
        'EXECUTOR': 'test',
        'MINIMUM_RETRY_THRESHOLD': 42
      }],
      'ORCHESTRATOR': {'MEM_CONFIG': None},
    })

    # Ensure that we have a single test executor.
    self.assertEqual(1, len(self.manager.registered_executors))
    self.assertEqual(42, self.manager.registered_executors[0].minimum_retry_threshold)

  def test_multiple_executors_samename(self):
    EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
    EphemeralBuilderManager.EXECUTORS['anotherexecutor'] = TestExecutor

    with self.assertRaises(Exception):
      self.manager.initialize({
        'EXECUTORS': [
          {
            'NAME': 'primary',
            'EXECUTOR': 'test',
            'MINIMUM_RETRY_THRESHOLD': 42
          },
          {
            'NAME': 'primary',
            'EXECUTOR': 'anotherexecutor',
            'MINIMUM_RETRY_THRESHOLD': 24
          },
        ],
        'ORCHESTRATOR': {'MEM_CONFIG': None},
      })

  def test_verify_multiple_executors(self):
    EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
    EphemeralBuilderManager.EXECUTORS['anotherexecutor'] = TestExecutor

    self.manager.initialize({
      'EXECUTORS': [
        {
          'NAME': 'primary',
          'EXECUTOR': 'test',
          'MINIMUM_RETRY_THRESHOLD': 42
        },
        {
          'NAME': 'secondary',
          'EXECUTOR': 'anotherexecutor',
          'MINIMUM_RETRY_THRESHOLD': 24
        },
      ],
      'ORCHESTRATOR': {'MEM_CONFIG': None},
    })

    # Ensure that we have two test executors.
    self.assertEqual(2, len(self.manager.registered_executors))
    self.assertEqual(42, self.manager.registered_executors[0].minimum_retry_threshold)
    self.assertEqual(24, self.manager.registered_executors[1].minimum_retry_threshold)

  def test_skip_invalid_executor(self):
    self.manager.initialize({
      'EXECUTORS': [
        {
          'EXECUTOR': 'unknown',
          'MINIMUM_RETRY_THRESHOLD': 42
        },
      ],
      'ORCHESTRATOR': {'MEM_CONFIG': None},
    })

    self.assertEqual(0, len(self.manager.registered_executors))

  @async_test
  def test_schedule_job_namespace_filter(self):
    EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
    self.manager.initialize({
      'EXECUTORS': [{
        'EXECUTOR': 'test',
        'NAMESPACE_WHITELIST': ['something'],
      }],
      'ORCHESTRATOR': {'MEM_CONFIG': None},
    })

    # Try with a build job in an invalid namespace.
    build_job = self._create_build_job(namespace='somethingelse')
    result = yield From(self.manager.schedule(build_job))
    self.assertFalse(result[0])

    # Try with a valid namespace.
    build_job = self._create_build_job(namespace='something')
    result = yield From(self.manager.schedule(build_job))
    self.assertTrue(result[0])

  @async_test
  def test_schedule_job_retries_filter(self):
    EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor
    self.manager.initialize({
      'EXECUTORS': [{
        'EXECUTOR': 'test',
        'MINIMUM_RETRY_THRESHOLD': 2,
      }],
      'ORCHESTRATOR': {'MEM_CONFIG': None},
    })

    # Try with a build job that has too few retries.
    build_job = self._create_build_job(retries=1)
    result = yield From(self.manager.schedule(build_job))
    self.assertFalse(result[0])

    # Try with a valid job.
    build_job = self._create_build_job(retries=2)
    result = yield From(self.manager.schedule(build_job))
    self.assertTrue(result[0])

  @async_test
  def test_schedule_job_executor_fallback(self):
    EphemeralBuilderManager.EXECUTORS['primary'] = TestExecutor
    EphemeralBuilderManager.EXECUTORS['secondary'] = TestExecutor

    self.manager.initialize({
      'EXECUTORS': [
        {
          'NAME': 'primary',
          'EXECUTOR': 'primary',
          'NAMESPACE_WHITELIST': ['something'],
          'MINIMUM_RETRY_THRESHOLD': 3,
        },
        {
          'NAME': 'secondary',
          'EXECUTOR': 'secondary',
          'MINIMUM_RETRY_THRESHOLD': 2,
        },
      ],
      'ALLOWED_WORKER_COUNT': 5,
      'ORCHESTRATOR': {'MEM_CONFIG': None},
    })

    # Try a job not matching the primary's namespace filter. Should schedule on secondary.
    build_job = self._create_build_job(namespace='somethingelse')
    result = yield From(self.manager.schedule(build_job))
    self.assertTrue(result[0])

    self.assertIsNone(self.manager.registered_executors[0].job_started)
    self.assertIsNotNone(self.manager.registered_executors[1].job_started)

    self.manager.registered_executors[0].job_started = None
    self.manager.registered_executors[1].job_started = None

    # Try a job not matching the primary's retry minimum. Should schedule on secondary.
    build_job = self._create_build_job(namespace='something', retries=2)
    result = yield From(self.manager.schedule(build_job))
    self.assertTrue(result[0])

    self.assertIsNone(self.manager.registered_executors[0].job_started)
    self.assertIsNotNone(self.manager.registered_executors[1].job_started)

    self.manager.registered_executors[0].job_started = None
    self.manager.registered_executors[1].job_started = None

    # Try a job matching the primary. Should schedule on the primary.
    build_job = self._create_build_job(namespace='something', retries=3)
    result = yield From(self.manager.schedule(build_job))
    self.assertTrue(result[0])

    self.assertIsNotNone(self.manager.registered_executors[0].job_started)
    self.assertIsNone(self.manager.registered_executors[1].job_started)

    self.manager.registered_executors[0].job_started = None
    self.manager.registered_executors[1].job_started = None

    # Try a job not matching either's restrictions.
    build_job = self._create_build_job(namespace='somethingelse', retries=1)
    result = yield From(self.manager.schedule(build_job))
    self.assertFalse(result[0])

    self.assertIsNone(self.manager.registered_executors[0].job_started)
    self.assertIsNone(self.manager.registered_executors[1].job_started)

    self.manager.registered_executors[0].job_started = None
    self.manager.registered_executors[1].job_started = None

  @async_test
  def test_schedule_job_single_executor(self):
    EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor

    self.manager.initialize({
      'EXECUTOR': 'test',
      'EXECUTOR_CONFIG': {},
      'ALLOWED_WORKER_COUNT': 5,
      'ORCHESTRATOR': {'MEM_CONFIG': None},
    })

    build_job = self._create_build_job(namespace='something', retries=3)
    result = yield From(self.manager.schedule(build_job))
    self.assertTrue(result[0])

    self.assertIsNotNone(self.manager.registered_executors[0].job_started)
    self.manager.registered_executors[0].job_started = None

    build_job = self._create_build_job(namespace='something', retries=0)
    result = yield From(self.manager.schedule(build_job))
    self.assertTrue(result[0])

    self.assertIsNotNone(self.manager.registered_executors[0].job_started)
    self.manager.registered_executors[0].job_started = None

  @async_test
  def test_executor_exception(self):
    EphemeralBuilderManager.EXECUTORS['bad'] = BadExecutor

    self.manager.initialize({
      'EXECUTOR': 'bad',
      'EXECUTOR_CONFIG': {},
      'ORCHESTRATOR': {'MEM_CONFIG': None},
    })

    build_job = self._create_build_job(namespace='something', retries=3)
    result = yield From(self.manager.schedule(build_job))
    self.assertFalse(result[0])

  @async_test
  def test_schedule_and_stop(self):
    EphemeralBuilderManager.EXECUTORS['test'] = TestExecutor

    self.manager.initialize({
      'EXECUTOR': 'test',
      'EXECUTOR_CONFIG': {},
      'ORCHESTRATOR': {'MEM_CONFIG': None},
    })

    # Start the build job.
    build_job = self._create_build_job(namespace='something', retries=3)
    result = yield From(self.manager.schedule(build_job))
    self.assertTrue(result[0])

    executor = self.manager.registered_executors[0]
    self.assertIsNotNone(executor.job_started)

    # Register the realm so the build information is added.
    yield From(self.manager._register_realm({
      'realm': str(uuid.uuid4()),
      'token': str(uuid.uuid4()),
      'execution_id': executor.job_started,
      'executor_name': 'TestExecutor',
      'build_uuid': build_job.build_uuid,
      'job_queue_item': build_job.job_item,
    }))

    # Stop the build job.
    yield From(self.manager.kill_builder_executor(build_job.build_uuid))
    self.assertEqual(executor.job_stopped, executor.job_started)


if __name__ == '__main__':
  unittest.main()