Merge branch 'master' into quark

Joseph Schorr 2015-02-13 16:24:53 -05:00
commit fbdbc21eb1
137 changed files with 8691 additions and 2414 deletions

buildman/asyncutil.py Normal file
View file

@ -0,0 +1,27 @@
from functools import partial, wraps
from trollius import get_event_loop
class AsyncWrapper(object):
""" Wrapper class which will transform a syncronous library to one that can be used with
trollius coroutines.
"""
def __init__(self, delegate, loop=None, executor=None):
self._loop = loop if loop is not None else get_event_loop()
self._delegate = delegate
self._executor = executor
def __getattr__(self, attrib):
delegate_attr = getattr(self._delegate, attrib)
if not callable(delegate_attr):
return delegate_attr
def wrapper(*args, **kwargs):
""" Wraps the delegate_attr with primitives that will transform sync calls to ones shelled
out to a thread pool.
"""
callable_delegate_attr = partial(delegate_attr, *args, **kwargs)
return self._loop.run_in_executor(self._executor, callable_delegate_attr)
return wrapper
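
A minimal usage sketch of the wrapper (hedged: the etcd client and key below are illustrative, but the pattern mirrors how the ephemeral manager wraps its etcd client later in this commit):

import etcd
from trollius import From, Return, coroutine
from buildman.asyncutil import AsyncWrapper

async_client = AsyncWrapper(etcd.Client(host='127.0.0.1', port=2379))

@coroutine
def read_value(key):
  # read() runs in the wrapped executor's thread pool; yielding the returned
  # future gives back the ordinary python-etcd result object.
  result = yield From(async_client.read(key))
  raise Return(result.value)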

View file

@ -6,6 +6,7 @@ import time
from app import app, userfiles as user_files, build_logs, dockerfile_build_queue
from buildman.manager.enterprise import EnterpriseManager
from buildman.manager.ephemeral import EphemeralBuilderManager
from buildman.server import BuilderServer
from trollius import SSLContext
@ -13,14 +14,22 @@ from trollius import SSLContext
logger = logging.getLogger(__name__)
BUILD_MANAGERS = {
'enterprise': EnterpriseManager
'enterprise': EnterpriseManager,
'ephemeral': EphemeralBuilderManager,
}
EXTERNALLY_MANAGED = 'external'
DEFAULT_WEBSOCKET_PORT = 8787
DEFAULT_CONTROLLER_PORT = 8686
LOG_FORMAT = "%(asctime)s [%(process)d] [%(levelname)s] [%(name)s] %(message)s"
def run_build_manager():
if not features.BUILD_SUPPORT:
logger.debug('Building is disabled. Please enable the feature flag')
while True:
time.sleep(1000)
return
build_manager_config = app.config.get('BUILD_MANAGER')
@ -39,6 +48,19 @@ def run_build_manager():
if manager_klass is None:
return
manager_hostname = os.environ.get('BUILDMAN_HOSTNAME',
app.config.get('BUILDMAN_HOSTNAME',
app.config['SERVER_HOSTNAME']))
websocket_port = int(os.environ.get('BUILDMAN_WEBSOCKET_PORT',
app.config.get('BUILDMAN_WEBSOCKET_PORT',
DEFAULT_WEBSOCKET_PORT)))
controller_port = int(os.environ.get('BUILDMAN_CONTROLLER_PORT',
app.config.get('BUILDMAN_CONTROLLER_PORT',
DEFAULT_CONTROLLER_PORT)))
logger.debug('Will pass buildman hostname %s to builders for websocket connection',
manager_hostname)
logger.debug('Starting build manager with lifecycle "%s"', build_manager_config[0])
ssl_context = None
if os.environ.get('SSL_CONFIG'):
@ -48,9 +70,10 @@ def run_build_manager():
os.path.join(os.environ.get('SSL_CONFIG'), 'ssl.key'))
server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs,
user_files, manager_klass)
server.run('0.0.0.0', ssl=ssl_context)
user_files, manager_klass, build_manager_config[1], manager_hostname)
server.run('0.0.0.0', websocket_port, controller_port, ssl=ssl_context)
if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)
logging.getLogger('peewee').setLevel(logging.WARN)
run_build_manager()
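
For reference, a hedged sketch of the configuration these options read; the key names come from this diff and the values are illustrative only:

# config.py (illustrative values)
BUILD_MANAGER = ('ephemeral', {
  'EXECUTOR': 'ec2',                                # unknown values fall back to PopenExecutor
  'EXECUTOR_CONFIG': {'EC2_REGION': 'us-east-1'},
  'ETCD_HOST': '127.0.0.1',
  'ETCD_PORT': 2379,
  'ALLOWED_WORKER_COUNT': 1,
})
BUILDMAN_HOSTNAME = 'buildman.example.com'          # overridable via the BUILDMAN_HOSTNAME env var
BUILDMAN_WEBSOCKET_PORT = 8787
BUILDMAN_CONTROLLER_PORT = 8686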

View file

@ -6,11 +6,10 @@ import trollius
import re
from autobahn.wamp.exception import ApplicationError
from trollius.coroutines import From
from buildman.server import BuildJobResult
from buildman.component.basecomponent import BaseComponent
from buildman.jobutil.buildpack import BuildPackage, BuildPackageException
from buildman.jobutil.buildjob import BuildJobLoadException
from buildman.jobutil.buildstatus import StatusHandler
from buildman.jobutil.workererror import WorkerError
@ -20,7 +19,7 @@ HEARTBEAT_DELTA = datetime.timedelta(seconds=30)
HEARTBEAT_TIMEOUT = 10
INITIAL_TIMEOUT = 25
SUPPORTED_WORKER_VERSIONS = ['0.1-beta']
SUPPORTED_WORKER_VERSIONS = ['0.3']
logger = logging.getLogger(__name__)
@ -39,13 +38,14 @@ class BuildComponent(BaseComponent):
self.builder_realm = realm
self.parent_manager = None
self.server_hostname = None
self.registry_hostname = None
self._component_status = ComponentStatus.JOINING
self._last_heartbeat = None
self._current_job = None
self._build_status = None
self._image_info = None
self._worker_version = None
BaseComponent.__init__(self, config, **kwargs)
@ -57,69 +57,52 @@ class BuildComponent(BaseComponent):
def onJoin(self, details):
logger.debug('Registering methods and listeners for component %s', self.builder_realm)
yield From(self.register(self._on_ready, u'io.quay.buildworker.ready'))
yield From(self.register(self._ping, u'io.quay.buildworker.ping'))
yield From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat'))
yield From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage'))
yield trollius.From(self.register(self._on_ready, u'io.quay.buildworker.ready'))
yield trollius.From(self.register(self._determine_cache_tag,
u'io.quay.buildworker.determinecachetag'))
yield trollius.From(self.register(self._ping, u'io.quay.buildworker.ping'))
self._set_status(ComponentStatus.WAITING)
yield trollius.From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat'))
yield trollius.From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage'))
yield trollius.From(self._set_status(ComponentStatus.WAITING))
def is_ready(self):
""" Determines whether a build component is ready to begin a build. """
return self._component_status == ComponentStatus.RUNNING
@trollius.coroutine
def start_build(self, build_job):
""" Starts a build. """
logger.debug('Starting build for component %s (worker version: %s)',
self.builder_realm, self._worker_version)
self._current_job = build_job
self._build_status = StatusHandler(self.build_logs, build_job.repo_build())
self._build_status = StatusHandler(self.build_logs, build_job.repo_build.uuid)
self._image_info = {}
self._set_status(ComponentStatus.BUILDING)
yield trollius.From(self._set_status(ComponentStatus.BUILDING))
# Retrieve the job's buildpack.
buildpack_url = self.user_files.get_file_url(build_job.repo_build().resource_key,
# Send the notification that the build has started.
build_job.send_notification('build_start')
# Parse the build configuration.
try:
build_config = build_job.build_config
except BuildJobLoadException as irbe:
self._build_failure('Could not load build job information', irbe)
base_image_information = {}
buildpack_url = self.user_files.get_file_url(build_job.repo_build.resource_key,
requires_cors=False)
logger.debug('Retrieving build package: %s', buildpack_url)
buildpack = None
try:
buildpack = BuildPackage.from_url(buildpack_url)
except BuildPackageException as bpe:
self._build_failure('Could not retrieve build package', bpe)
return
# Extract the base image information from the Dockerfile.
parsed_dockerfile = None
logger.debug('Parsing dockerfile')
build_config = build_job.build_config()
try:
parsed_dockerfile = buildpack.parse_dockerfile(build_config.get('build_subdir'))
except BuildPackageException as bpe:
self._build_failure('Could not find Dockerfile in build package', bpe)
return
image_and_tag_tuple = parsed_dockerfile.get_image_and_tag()
if image_and_tag_tuple is None or image_and_tag_tuple[0] is None:
self._build_failure('Missing FROM line in Dockerfile')
return
base_image_information = {
'repository': image_and_tag_tuple[0],
'tag': image_and_tag_tuple[1]
}
# Extract the number of steps from the Dockerfile.
with self._build_status as status_dict:
status_dict['total_commands'] = len(parsed_dockerfile.commands)
# Add the pull robot information, if any.
if build_config.get('pull_credentials') is not None:
base_image_information['username'] = build_config['pull_credentials'].get('username', '')
base_image_information['password'] = build_config['pull_credentials'].get('password', '')
if build_job.pull_credentials:
base_image_information['username'] = build_job.pull_credentials.get('username', '')
base_image_information['password'] = build_job.pull_credentials.get('password', '')
# Retrieve the repository's fully qualified name.
repo = build_job.repo_build().repository
repo = build_job.repo_build.repository
repository_name = repo.namespace_user.username + '/' + repo.name
# Parse the build queue item into build arguments.
@ -131,29 +114,26 @@ class BuildComponent(BaseComponent):
# push_token: The token to use to push the built image.
# tag_names: The name(s) of the tag(s) for the newly built image.
# base_image: The image name and credentials to use to conduct the base image pull.
# repository: The repository to pull.
# tag: The tag to pull.
# repository: The repository to pull (DEPRECATED 0.2)
# tag: The tag to pull (DEPRECATED in 0.2)
# username: The username for pulling the base image (if any).
# password: The password for pulling the base image (if any).
build_arguments = {
'build_package': buildpack_url,
'sub_directory': build_config.get('build_subdir', ''),
'repository': repository_name,
'registry': self.server_hostname,
'pull_token': build_job.repo_build().access_token.code,
'push_token': build_job.repo_build().access_token.code,
'registry': self.registry_hostname,
'pull_token': build_job.repo_build.access_token.code,
'push_token': build_job.repo_build.access_token.code,
'tag_names': build_config.get('docker_tags', ['latest']),
'base_image': base_image_information,
'cached_tag': build_job.determine_cached_tag() or ''
'base_image': base_image_information
}
# Invoke the build.
logger.debug('Invoking build: %s', self.builder_realm)
logger.debug('With Arguments: %s', build_arguments)
return (self
.call("io.quay.builder.build", **build_arguments)
.add_done_callback(self._build_complete))
self.call("io.quay.builder.build", **build_arguments).add_done_callback(self._build_complete)
@staticmethod
def _total_completion(statuses, total_images):
@ -240,18 +220,28 @@ class BuildComponent(BaseComponent):
elif phase == BUILD_PHASE.BUILDING:
self._build_status.append_log(current_status_string)
@trollius.coroutine
def _determine_cache_tag(self, command_comments, base_image_name, base_image_tag, base_image_id):
with self._build_status as status_dict:
status_dict['total_commands'] = len(command_comments) + 1
logger.debug('Checking cache on realm %s. Base image: %s:%s (%s)', self.builder_realm,
base_image_name, base_image_tag, base_image_id)
tag_found = self._current_job.determine_cached_tag(base_image_id, command_comments)
raise trollius.Return(tag_found or '')
def _build_failure(self, error_message, exception=None):
""" Handles and logs a failed build. """
self._build_status.set_error(error_message, {
'internal_error': exception.message if exception else None
'internal_error': str(exception) if exception else None
})
build_id = self._current_job.repo_build().uuid
build_id = self._current_job.repo_build.uuid
logger.warning('Build %s failed with message: %s', build_id, error_message)
# Mark that the build has finished (in an error state)
self._build_finished(BuildJobResult.ERROR)
trollius.async(self._build_finished(BuildJobResult.ERROR))
def _build_complete(self, result):
""" Wraps up a completed build. Handles any errors and calls self._build_finished. """
@ -259,60 +249,78 @@ class BuildComponent(BaseComponent):
# Retrieve the result. This will raise an ApplicationError on any error that occurred.
result.result()
self._build_status.set_phase(BUILD_PHASE.COMPLETE)
self._build_finished(BuildJobResult.COMPLETE)
trollius.async(self._build_finished(BuildJobResult.COMPLETE))
# Send the notification that the build has completed successfully.
self._current_job.send_notification('build_success')
except ApplicationError as aex:
worker_error = WorkerError(aex.error, aex.kwargs.get('base_error'))
# Write the error to the log.
self._build_status.set_error(worker_error.public_message(), worker_error.extra_data(),
internal_error=worker_error.is_internal_error())
internal_error=worker_error.is_internal_error(),
requeued=self._current_job.has_retries_remaining())
# Send the notification that the build has failed.
self._current_job.send_notification('build_failure',
error_message=worker_error.public_message())
# Mark the build as completed.
if worker_error.is_internal_error():
self._build_finished(BuildJobResult.INCOMPLETE)
trollius.async(self._build_finished(BuildJobResult.INCOMPLETE))
else:
self._build_finished(BuildJobResult.ERROR)
trollius.async(self._build_finished(BuildJobResult.ERROR))
@trollius.coroutine
def _build_finished(self, job_status):
""" Alerts the parent that a build has completed and sets the status back to running. """
self.parent_manager.job_completed(self._current_job, job_status, self)
yield trollius.From(self.parent_manager.job_completed(self._current_job, job_status, self))
self._current_job = None
# Set the component back to a running state.
self._set_status(ComponentStatus.RUNNING)
yield trollius.From(self._set_status(ComponentStatus.RUNNING))
@staticmethod
def _ping():
""" Ping pong. """
return 'pong'
@trollius.coroutine
def _on_ready(self, token, version):
if not version in SUPPORTED_WORKER_VERSIONS:
logger.warning('Build component (token "%s") is running an out-of-date version: %s', version)
return False
self._worker_version = version
if self._component_status != 'waiting':
if not version in SUPPORTED_WORKER_VERSIONS:
logger.warning('Build component (token "%s") is running an out-of-date version: %s', token,
version)
raise trollius.Return(False)
if self._component_status != ComponentStatus.WAITING:
logger.warning('Build component (token "%s") is already connected', self.expected_token)
return False
raise trollius.Return(False)
if token != self.expected_token:
logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token, token)
return False
logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token,
token)
raise trollius.Return(False)
self._set_status(ComponentStatus.RUNNING)
yield trollius.From(self._set_status(ComponentStatus.RUNNING))
# Start the heartbeat check and updating loop.
loop = trollius.get_event_loop()
loop.create_task(self._heartbeat())
logger.debug('Build worker %s is connected and ready', self.builder_realm)
return True
raise trollius.Return(True)
@trollius.coroutine
def _set_status(self, phase):
if phase == ComponentStatus.RUNNING:
yield trollius.From(self.parent_manager.build_component_ready(self))
self._component_status = phase
def _on_heartbeat(self):
""" Updates the last known heartbeat. """
self._last_heartbeat = datetime.datetime.now()
self._last_heartbeat = datetime.datetime.utcnow()
@trollius.coroutine
def _heartbeat(self):
@ -320,13 +328,13 @@ class BuildComponent(BaseComponent):
and updating the heartbeat in the build status dictionary (if applicable). This allows
the build system to catch crashes from either end.
"""
yield From(trollius.sleep(INITIAL_TIMEOUT))
yield trollius.From(trollius.sleep(INITIAL_TIMEOUT))
while True:
# If the component is no longer running or actively building, nothing more to do.
if (self._component_status != ComponentStatus.RUNNING and
self._component_status != ComponentStatus.BUILDING):
return
raise trollius.Return()
# If there is an active build, write the heartbeat to its status.
build_status = self._build_status
@ -334,35 +342,37 @@ class BuildComponent(BaseComponent):
with build_status as status_dict:
status_dict['heartbeat'] = int(time.time())
# Mark the build item.
current_job = self._current_job
if current_job is not None:
self.parent_manager.job_heartbeat(current_job)
yield trollius.From(self.parent_manager.job_heartbeat(current_job))
# Check the heartbeat from the worker.
logger.debug('Checking heartbeat on realm %s', self.builder_realm)
if self._last_heartbeat and self._last_heartbeat < datetime.datetime.now() - HEARTBEAT_DELTA:
self._timeout()
return
if (self._last_heartbeat and
self._last_heartbeat < datetime.datetime.utcnow() - HEARTBEAT_DELTA):
yield trollius.From(self._timeout())
raise trollius.Return()
yield From(trollius.sleep(HEARTBEAT_TIMEOUT))
yield trollius.From(trollius.sleep(HEARTBEAT_TIMEOUT))
@trollius.coroutine
def _timeout(self):
self._set_status(ComponentStatus.TIMED_OUT)
logger.warning('Build component with realm %s has timed out', self.builder_realm)
self._dispose(timed_out=True)
if self._component_status == ComponentStatus.TIMED_OUT:
raise trollius.Return()
yield trollius.From(self._set_status(ComponentStatus.TIMED_OUT))
logger.warning('Build component with realm %s has timed out', self.builder_realm)
def _dispose(self, timed_out=False):
# If we still have a running job, then it has not completed and we need to tell the parent
# manager.
if self._current_job is not None:
if timed_out:
self._build_status.set_error('Build worker timed out', internal_error=True)
self._build_status.set_error('Build worker timed out', internal_error=True,
requeued=self._current_job.has_retries_remaining())
self.parent_manager.job_completed(self._current_job, BuildJobResult.INCOMPLETE, self)
self._build_status = None
self._current_job = None
# Unregister the current component so that it cannot be invoked again.
self.parent_manager.build_component_disposed(self, timed_out)
self.parent_manager.build_component_disposed(self, True)

View file

@ -1,6 +1,13 @@
from data import model
import json
import logging
from cachetools import lru_cache
from endpoints.notificationhelper import spawn_notification
from data import model
from util.imagetree import ImageTree
logger = logging.getLogger(__name__)
class BuildJobLoadException(Exception):
""" Exception raised if a build job could not be instantiated for some reason. """
@ -9,50 +16,123 @@ class BuildJobLoadException(Exception):
class BuildJob(object):
""" Represents a single in-progress build job. """
def __init__(self, job_item):
self._job_item = job_item
self.job_item = job_item
try:
self._job_details = json.loads(job_item.body)
self.job_details = json.loads(job_item.body)
except ValueError:
raise BuildJobLoadException(
'Could not parse build queue item config with ID %s' % self._job_details['build_uuid']
'Could not parse build queue item config with ID %s' % self.job_details['build_uuid']
)
def has_retries_remaining(self):
return self.job_item.retries_remaining > 0
def send_notification(self, kind, error_message=None):
tags = self.build_config.get('docker_tags', ['latest'])
event_data = {
'build_id': self.repo_build.uuid,
'build_name': self.repo_build.display_name,
'docker_tags': tags,
'trigger_id': self.repo_build.trigger.uuid,
'trigger_kind': self.repo_build.trigger.service.name
}
if error_message is not None:
event_data['error_message'] = error_message
spawn_notification(self.repo_build.repository, kind, event_data,
subpage='build?current=%s' % self.repo_build.uuid,
pathargs=['build', self.repo_build.uuid])
@lru_cache(maxsize=1)
def _load_repo_build(self):
try:
self._repo_build = model.get_repository_build(self._job_details['build_uuid'])
return model.get_repository_build(self.job_details['build_uuid'])
except model.InvalidRepositoryBuildException:
raise BuildJobLoadException(
'Could not load repository build with ID %s' % self._job_details['build_uuid'])
'Could not load repository build with ID %s' % self.job_details['build_uuid'])
@property
def repo_build(self):
return self._load_repo_build()
@property
def pull_credentials(self):
""" Returns the pull credentials for this job, or None if none. """
return self.job_details.get('pull_credentials')
@property
def build_config(self):
try:
self._build_config = json.loads(self._repo_build.job_config)
return json.loads(self.repo_build.job_config)
except ValueError:
raise BuildJobLoadException(
'Could not parse repository build job config with ID %s' % self._job_details['build_uuid']
'Could not parse repository build job config with ID %s' % self.job_details['build_uuid']
)
def determine_cached_tag(self):
def determine_cached_tag(self, base_image_id=None, cache_comments=None):
""" Returns the tag to pull to prime the cache or None if none. """
# TODO(jschorr): Change this to use the more complicated caching rules, once we have caching
# be a pull of things besides the constructed tags.
tags = self._build_config.get('docker_tags', ['latest'])
existing_tags = model.list_repository_tags(self._repo_build.repository.namespace_user.username,
self._repo_build.repository.name)
cached_tag = None
if base_image_id and cache_comments:
cached_tag = self._determine_cached_tag_by_comments(base_image_id, cache_comments)
if not cached_tag:
cached_tag = self._determine_cached_tag_by_tag()
logger.debug('Determined cached tag %s for %s: %s', cached_tag, base_image_id, cache_comments)
return cached_tag
def _determine_cached_tag_by_comments(self, base_image_id, cache_commands):
""" Determines the tag to use for priming the cache for this build job, by matching commands
starting at the given base_image_id. This mimics the Docker cache checking, so it should,
in theory, provide "perfect" caching.
"""
# Lookup the base image in the repository. If it doesn't exist, nothing more to do.
repo_build = self.repo_build
repo_namespace = repo_build.repository.namespace_user.username
repo_name = repo_build.repository.name
base_image = model.get_image(repo_build.repository, base_image_id)
if base_image is None:
return None
# Build an in-memory tree of the full hierarchy of images in the repository.
all_images = model.get_repository_images(repo_namespace, repo_name)
all_tags = model.list_repository_tags(repo_namespace, repo_name)
tree = ImageTree(all_images, all_tags, base_filter=base_image.id)
# Find a path in the tree, starting at the base image, that matches the cache comments
# or some subset thereof.
def checker(step, image):
if step >= len(cache_commands):
return False
full_command = '["/bin/sh", "-c", "%s"]' % cache_commands[step]
logger.debug('Checking step #%s: %s, %s == %s', step, image.id,
image.storage.command, full_command)
return image.storage.command == full_command
path = tree.find_longest_path(base_image.id, checker)
if not path:
return None
# Find any tag associated with the last image in the path.
return tree.tag_containing_image(path[-1])
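
For intuition, a hedged illustration of the comparison the checker performs, assuming the worker sends raw RUN command strings as the cache comments:

cache_commands = ['apt-get update && apt-get install -y git']   # assumed worker payload

# Docker records the layer command as a JSON-style shell invocation, so step 0
# matches only when image.storage.command equals:
expected = '["/bin/sh", "-c", "%s"]' % cache_commands[0]
# i.e. '["/bin/sh", "-c", "apt-get update && apt-get install -y git"]'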
def _determine_cached_tag_by_tag(self):
""" Determines the cached tag by looking for one of the tags being built, and seeing if it
exists in the repository. This is a fallback for when no comment information is available.
"""
tags = self.build_config.get('docker_tags', ['latest'])
repository = self.repo_build.repository
existing_tags = model.list_repository_tags(repository.namespace_user.username, repository.name)
cached_tags = set(tags) & set([tag.name for tag in existing_tags])
if cached_tags:
return list(cached_tags)[0]
return None
def job_item(self):
""" Returns the job's queue item. """
return self._job_item
def repo_build(self):
""" Returns the repository build DB row for the job. """
return self._repo_build
def build_config(self):
""" Returns the parsed repository build config for the job. """
return self._build_config
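
A hedged sketch of the queue-item shape BuildJob now consumes, based on the fields read above (the uuid and credentials are placeholders):

import json
from collections import namedtuple
from buildman.jobutil.buildjob import BuildJob

# Stand-in for the queue row; the real item is a database row with the same fields.
FakeItem = namedtuple('FakeItem', ['body', 'retries_remaining'])
item = FakeItem(retries_remaining=2, body=json.dumps({
  'build_uuid': '0123abcd-placeholder',
  'pull_credentials': {'username': 'example+robot', 'password': '...'},   # optional
}))

job = BuildJob(item)
job.pull_credentials          # the dict above
job.has_retries_remaining()   # True
# job.repo_build and job.build_config hit the database for the real build row.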

View file

@ -1,88 +0,0 @@
import tarfile
import requests
import os
from tempfile import TemporaryFile, mkdtemp
from zipfile import ZipFile
from util.dockerfileparse import parse_dockerfile
from util.safetar import safe_extractall
class BuildPackageException(Exception):
""" Exception raised when retrieving or parsing a build package. """
pass
class BuildPackage(object):
""" Helper class for easy reading and updating of a Dockerfile build pack. """
def __init__(self, requests_file):
self._mime_processors = {
'application/zip': BuildPackage._prepare_zip,
'application/x-zip-compressed': BuildPackage._prepare_zip,
'text/plain': BuildPackage._prepare_dockerfile,
'application/octet-stream': BuildPackage._prepare_dockerfile,
'application/x-tar': BuildPackage._prepare_tarball,
'application/gzip': BuildPackage._prepare_tarball,
'application/x-gzip': BuildPackage._prepare_tarball,
}
c_type = requests_file.headers['content-type']
c_type = c_type.split(';')[0] if ';' in c_type else c_type
if c_type not in self._mime_processors:
raise BuildPackageException('Unknown build package mime type: %s' % c_type)
self._package_directory = None
try:
self._package_directory = self._mime_processors[c_type](requests_file)
except Exception as ex:
raise BuildPackageException(ex.message)
def parse_dockerfile(self, subdirectory):
dockerfile_path = os.path.join(self._package_directory, subdirectory, 'Dockerfile')
if not os.path.exists(dockerfile_path):
if subdirectory:
message = 'Build package did not contain a Dockerfile at sub directory %s.' % subdirectory
else:
message = 'Build package did not contain a Dockerfile at the root directory.'
raise BuildPackageException(message)
with open(dockerfile_path, 'r') as dockerfileobj:
return parse_dockerfile(dockerfileobj.read())
@staticmethod
def from_url(url):
buildpack_resource = requests.get(url, stream=True)
return BuildPackage(buildpack_resource)
@staticmethod
def _prepare_zip(request_file):
build_dir = mkdtemp(prefix='docker-build-')
# Save the zip file to temp somewhere
with TemporaryFile() as zip_file:
zip_file.write(request_file.content)
to_extract = ZipFile(zip_file)
to_extract.extractall(build_dir)
return build_dir
@staticmethod
def _prepare_dockerfile(request_file):
build_dir = mkdtemp(prefix='docker-build-')
dockerfile_path = os.path.join(build_dir, "Dockerfile")
with open(dockerfile_path, 'w') as dockerfile:
dockerfile.write(request_file.content)
return build_dir
@staticmethod
def _prepare_tarball(request_file):
build_dir = mkdtemp(prefix='docker-build-')
# Save the zip file to temp somewhere
with tarfile.open(mode='r|*', fileobj=request_file.raw) as tar_stream:
safe_extractall(tar_stream, build_dir)
return build_dir

View file

@ -1,16 +1,18 @@
from data.database import BUILD_PHASE
from data import model
import datetime
class StatusHandler(object):
""" Context wrapper for writing status to build logs. """
def __init__(self, build_logs, repository_build):
def __init__(self, build_logs, repository_build_uuid):
self._current_phase = None
self._repository_build = repository_build
self._uuid = repository_build.uuid
self._current_command = None
self._uuid = repository_build_uuid
self._build_logs = build_logs
self._status = {
'total_commands': None,
'total_commands': 0,
'current_command': None,
'push_completion': 0.0,
'pull_completion': 0.0,
@ -20,16 +22,25 @@ class StatusHandler(object):
self.__exit__(None, None, None)
def _append_log_message(self, log_message, log_type=None, log_data=None):
log_data = log_data or {}
log_data['datetime'] = str(datetime.datetime.now())
self._build_logs.append_log_message(self._uuid, log_message, log_type, log_data)
def append_log(self, log_message, extra_data=None):
if log_message is None:
return
self._append_log_message(log_message, log_data=extra_data)
def set_command(self, command, extra_data=None):
if self._current_command == command:
return
self._current_command = command
self._append_log_message(command, self._build_logs.COMMAND, extra_data)
def set_error(self, error_message, extra_data=None, internal_error=False):
self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error else BUILD_PHASE.ERROR)
def set_error(self, error_message, extra_data=None, internal_error=False, requeued=False):
self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error and requeued else BUILD_PHASE.ERROR)
extra_data = extra_data or {}
extra_data['internal_error'] = internal_error
@ -41,8 +52,12 @@ class StatusHandler(object):
self._current_phase = phase
self._append_log_message(phase, self._build_logs.PHASE, extra_data)
self._repository_build.phase = phase
self._repository_build.save()
# Update the repository build with the new phase
repo_build = model.get_repository_build(self._uuid)
repo_build.phase = phase
repo_build.save()
return True
def __enter__(self):

View file

@ -19,13 +19,19 @@ class WorkerError(object):
'is_internal': True
},
'io.quay.builder.dockerfileissue': {
'message': 'Could not find or parse Dockerfile',
'show_base_error': True
},
'io.quay.builder.cannotpullbaseimage': {
'message': 'Could not pull base image',
'show_base_error': True
},
'io.quay.builder.internalerror': {
'message': 'An internal error occurred while building. Please submit a ticket.'
'message': 'An internal error occurred while building. Please submit a ticket.',
'is_internal': True
},
'io.quay.builder.buildrunerror': {
@ -57,6 +63,11 @@ class WorkerError(object):
'io.quay.builder.missingorinvalidargument': {
'message': 'Missing required arguments for builder',
'is_internal': True
},
'io.quay.builder.cachelookupissue': {
'message': 'Error checking for a cached tag',
'is_internal': True
}
}
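
A hedged example of how these entries are consumed, mirroring the positional WorkerError construction in buildcomponent.py above:

from buildman.jobutil.workererror import WorkerError

err = WorkerError('io.quay.builder.cachelookupissue', None)
err.public_message()     # 'Error checking for a cached tag'
err.is_internal_error()  # True, per the 'is_internal' flag above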

View file

@ -1,12 +1,17 @@
from trollius import coroutine
class BaseManager(object):
""" Base for all worker managers. """
def __init__(self, register_component, unregister_component, job_heartbeat_callback,
job_complete_callback):
job_complete_callback, manager_hostname, heartbeat_period_sec):
self.register_component = register_component
self.unregister_component = unregister_component
self.job_heartbeat_callback = job_heartbeat_callback
self.job_complete_callback = job_complete_callback
self.manager_hostname = manager_hostname
self.heartbeat_period_sec = heartbeat_period_sec
@coroutine
def job_heartbeat(self, build_job):
""" Method invoked to tell the manager that a job is still running. This method will be called
every few minutes. """
@ -25,26 +30,36 @@ class BaseManager(object):
"""
raise NotImplementedError
def schedule(self, build_job, loop):
@coroutine
def schedule(self, build_job):
""" Schedules a queue item to be built. Returns True if the item was properly scheduled
and False if all workers are busy.
"""
raise NotImplementedError
def initialize(self):
def initialize(self, manager_config):
""" Runs any initialization code for the manager. Called once the server is in a ready state.
"""
raise NotImplementedError
@coroutine
def build_component_ready(self, build_component):
""" Method invoked whenever a build component announces itself as ready.
"""
raise NotImplementedError
def build_component_disposed(self, build_component, timed_out):
""" Method invoked whenever a build component has been disposed. The timed_out boolean indicates
whether the component's heartbeat timed out.
"""
raise NotImplementedError
@coroutine
def job_completed(self, build_job, job_status, build_component):
""" Method invoked once a job_item has completed, in some manner. The job_status will be
one of: incomplete, error, complete. If incomplete, the job should be requeued.
one of: incomplete, error, complete. Implementations of this method should call
self.job_complete_callback with a status of Incomplete if they wish for the job to be
automatically requeued.
"""
raise NotImplementedError
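
A skeletal subclass, sketched only to make the coroutine surface above concrete; NoopManager is hypothetical and never schedules work:

from trollius import Return, coroutine
from buildman.manager.basemanager import BaseManager

class NoopManager(BaseManager):
  def initialize(self, manager_config):
    pass

  @coroutine
  def schedule(self, build_job):
    raise Return(False)     # report "all workers busy"

  @coroutine
  def build_component_ready(self, build_component):
    pass

  def build_component_disposed(self, build_component, timed_out):
    pass

  @coroutine
  def job_completed(self, build_job, job_status, build_component):
    self.job_complete_callback(build_job, job_status)

  def num_workers(self):
    return 0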

View file

@ -5,7 +5,7 @@ from buildman.component.basecomponent import BaseComponent
from buildman.component.buildcomponent import BuildComponent
from buildman.manager.basemanager import BaseManager
from trollius.coroutines import From
from trollius import From, Return, coroutine
REGISTRATION_REALM = 'registration'
logger = logging.getLogger(__name__)
@ -13,9 +13,6 @@ logger = logging.getLogger(__name__)
class DynamicRegistrationComponent(BaseComponent):
""" Component session that handles dynamic registration of the builder components. """
def kind(self):
return 'registration'
def onConnect(self):
self.join(REGISTRATION_REALM)
@ -31,10 +28,15 @@ class DynamicRegistrationComponent(BaseComponent):
class EnterpriseManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
build_components = []
shutting_down = False
def initialize(self):
def __init__(self, *args, **kwargs):
self.ready_components = set()
self.all_components = set()
self.shutting_down = False
super(EnterpriseManager, self).__init__(*args, **kwargs)
def initialize(self, manager_config):
# Add a component which is used by build workers for dynamic registration. Unlike
# production, build workers in enterprise are long-lived and register dynamically.
self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
@ -48,31 +50,37 @@ class EnterpriseManager(BaseManager):
""" Adds a new build component for an Enterprise Registry. """
# Generate a new unique realm ID for the build worker.
realm = str(uuid.uuid4())
component = self.register_component(realm, BuildComponent, token="")
self.build_components.append(component)
new_component = self.register_component(realm, BuildComponent, token="")
self.all_components.add(new_component)
return realm
def schedule(self, build_job, loop):
@coroutine
def schedule(self, build_job):
""" Schedules a build for an Enterprise Registry. """
if self.shutting_down:
return False
if self.shutting_down or not self.ready_components:
raise Return(False)
for component in self.build_components:
if component.is_ready():
loop.call_soon(component.start_build, build_job)
return True
component = self.ready_components.pop()
return False
yield From(component.start_build(build_job))
raise Return(True)
@coroutine
def build_component_ready(self, build_component):
self.ready_components.add(build_component)
def shutdown(self):
self.shutting_down = True
@coroutine
def job_completed(self, build_job, job_status, build_component):
self.job_complete_callback(build_job, job_status)
def build_component_disposed(self, build_component, timed_out):
self.build_components.remove(build_component)
self.unregister_component(build_component)
self.all_components.remove(build_component)
if build_component in self.ready_components:
self.ready_components.remove(build_component)
def num_workers(self):
return len(self.build_components)
return len(self.all_components)

View file

@ -0,0 +1,328 @@
import logging
import etcd
import uuid
import calendar
import os.path
import json
from datetime import datetime, timedelta
from trollius import From, coroutine, Return, async
from concurrent.futures import ThreadPoolExecutor
from urllib3.exceptions import ReadTimeoutError, ProtocolError
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor
from buildman.component.buildcomponent import BuildComponent
from buildman.jobutil.buildjob import BuildJob
from buildman.asyncutil import AsyncWrapper
from util.morecollections import AttrDict
logger = logging.getLogger(__name__)
ETCD_DISABLE_TIMEOUT = 0
class EtcdAction(object):
GET = 'get'
SET = 'set'
EXPIRE = 'expire'
UPDATE = 'update'
DELETE = 'delete'
CREATE = 'create'
COMPARE_AND_SWAP = 'compareAndSwap'
COMPARE_AND_DELETE = 'compareAndDelete'
class EphemeralBuilderManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
_executors = {
'popen': PopenExecutor,
'ec2': EC2Executor,
}
_etcd_client_klass = etcd.Client
def __init__(self, *args, **kwargs):
self._shutting_down = False
self._manager_config = None
self._async_thread_executor = None
self._etcd_client = None
self._etcd_realm_prefix = None
self._etcd_builder_prefix = None
self._component_to_job = {}
self._job_uuid_to_component = {}
self._component_to_builder = {}
self._executor = None
# Map of etcd keys being watched to the tasks watching them
self._watch_tasks = {}
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
def _watch_etcd(self, etcd_key, change_callback, recursive=True):
watch_task_key = (etcd_key, recursive)
def callback_wrapper(changed_key_future):
if watch_task_key not in self._watch_tasks or self._watch_tasks[watch_task_key].done():
self._watch_etcd(etcd_key, change_callback)
if changed_key_future.cancelled():
# Due to lack of interest, tomorrow has been cancelled
return
try:
etcd_result = changed_key_future.result()
except (ReadTimeoutError, ProtocolError):
return
change_callback(etcd_result)
if not self._shutting_down:
watch_future = self._etcd_client.watch(etcd_key, recursive=recursive,
timeout=ETCD_DISABLE_TIMEOUT)
watch_future.add_done_callback(callback_wrapper)
logger.debug('Scheduling watch of key: %s%s', etcd_key, '/*' if recursive else '')
self._watch_tasks[watch_task_key] = async(watch_future)
def _handle_builder_expiration(self, etcd_result):
if etcd_result.action == EtcdAction.EXPIRE:
# Handle the expiration
logger.debug('Builder expired, clean up the old build node')
job_metadata = json.loads(etcd_result._prev_node.value)
if 'builder_id' in job_metadata:
logger.info('Terminating expired build node.')
async(self._executor.stop_builder(job_metadata['builder_id']))
def _handle_realm_change(self, etcd_result):
if etcd_result.action == EtcdAction.CREATE:
# We must listen on the realm created by ourselves or another worker
realm_spec = json.loads(etcd_result.value)
self._register_realm(realm_spec)
elif etcd_result.action == EtcdAction.DELETE or etcd_result.action == EtcdAction.EXPIRE:
# We must stop listening for new connections on the specified realm, if we did not get the
# connection
realm_spec = json.loads(etcd_result._prev_node.value)
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
component = self._job_uuid_to_component.pop(build_job.job_details['build_uuid'], None)
if component is not None:
# We were not the manager which the worker connected to, remove the bookkeeping for it
logger.debug('Unregistering unused component on realm: %s', realm_spec['realm'])
del self._component_to_job[component]
del self._component_to_builder[component]
self.unregister_component(component)
else:
logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key)
def _register_realm(self, realm_spec):
logger.debug('Registering realm with manager: %s', realm_spec['realm'])
component = self.register_component(realm_spec['realm'], BuildComponent,
token=realm_spec['token'])
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
self._component_to_job[component] = build_job
self._component_to_builder[component] = realm_spec['builder_id']
self._job_uuid_to_component[build_job.job_details['build_uuid']] = component
@coroutine
def _register_existing_realms(self):
try:
all_realms = yield From(self._etcd_client.read(self._etcd_realm_prefix, recursive=True))
for realm in all_realms.children:
if not realm.dir:
self._register_realm(json.loads(realm.value))
except KeyError:
# no realms have been registered yet
pass
def initialize(self, manager_config):
logger.debug('Calling initialize')
self._manager_config = manager_config
executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor)
self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}),
self.manager_hostname)
etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
etcd_port = self._manager_config.get('ETCD_PORT', 2379)
etcd_auth = self._manager_config.get('ETCD_CERT_AND_KEY', None)
etcd_ca_cert = self._manager_config.get('ETCD_CA_CERT', None)
etcd_protocol = 'http' if etcd_auth is None else 'https'
logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
worker_threads = self._manager_config.get('ETCD_WORKER_THREADS', 5)
self._async_thread_executor = ThreadPoolExecutor(worker_threads)
self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port,
cert=etcd_auth, ca_cert=etcd_ca_cert,
protocol=etcd_protocol),
executor=self._async_thread_executor)
self._etcd_builder_prefix = self._manager_config.get('ETCD_BUILDER_PREFIX', 'building/')
self._watch_etcd(self._etcd_builder_prefix, self._handle_builder_expiration)
self._etcd_realm_prefix = self._manager_config.get('ETCD_REALM_PREFIX', 'realm/')
self._watch_etcd(self._etcd_realm_prefix, self._handle_realm_change)
# Load components for all realms currently known to the cluster
async(self._register_existing_realms())
def setup_time(self):
setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
return setup_time
def shutdown(self):
logger.debug('Shutting down worker.')
self._shutting_down = True
for (etcd_key, _), task in self._watch_tasks.items():
if not task.done():
logger.debug('Canceling watch task for %s', etcd_key)
task.cancel()
if self._async_thread_executor is not None:
logger.debug('Shutting down thread pool executor.')
self._async_thread_executor.shutdown()
@coroutine
def schedule(self, build_job):
build_uuid = build_job.job_details['build_uuid']
logger.debug('Calling schedule with job: %s', build_uuid)
# Check if there are worker slots available by checking the number of jobs in etcd
allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
try:
building = yield From(self._etcd_client.read(self._etcd_builder_prefix, recursive=True))
workers_alive = sum(1 for child in building.children if not child.dir)
except KeyError:
workers_alive = 0
logger.debug('Total jobs: %s', workers_alive)
if workers_alive >= allowed_worker_count:
logger.info('Too many workers alive, unable to start new worker. %s >= %s', workers_alive,
allowed_worker_count)
raise Return(False)
job_key = self._etcd_job_key(build_job)
# First try to take a lock for this job, meaning we will be responsible for its lifeline
realm = str(uuid.uuid4())
token = str(uuid.uuid4())
ttl = self.setup_time()
expiration = datetime.utcnow() + timedelta(seconds=ttl)
machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)
payload = {
'expiration': calendar.timegm(expiration.timetuple()),
'max_expiration': calendar.timegm(max_expiration.timetuple()),
}
try:
yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=False, ttl=ttl))
except KeyError:
# The job was already taken by someone else, we are probably a retry
logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?')
raise Return(False)
logger.debug('Starting builder with executor: %s', self._executor)
builder_id = yield From(self._executor.start_builder(realm, token, build_uuid))
# Store the builder in etcd associated with the job id
payload['builder_id'] = builder_id
yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl))
# Store the realm spec which will allow any manager to accept this builder when it connects
realm_spec = json.dumps({
'realm': realm,
'token': token,
'builder_id': builder_id,
'job_queue_item': build_job.job_item,
})
try:
yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False,
ttl=ttl))
except KeyError:
logger.error('Realm already exists in etcd. UUID collision or something is very very wrong.')
raise Return(False)
raise Return(True)
@coroutine
def build_component_ready(self, build_component):
try:
# Clean up the bookkeeping for allowing any manager to take the job
job = self._component_to_job.pop(build_component)
del self._job_uuid_to_component[job.job_details['build_uuid']]
yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm)))
logger.debug('Sending build %s to newly ready component on realm %s',
job.job_details['build_uuid'], build_component.builder_realm)
yield From(build_component.start_build(job))
except KeyError:
logger.debug('Builder is asking for more work, but work already completed')
def build_component_disposed(self, build_component, timed_out):
logger.debug('Calling build_component_disposed.')
# TODO make it so that I don't have to unregister the component if it timed out
self.unregister_component(build_component)
@coroutine
def job_completed(self, build_job, job_status, build_component):
logger.debug('Calling job_completed with status: %s', job_status)
# Kill the ephemeral builder
yield From(self._executor.stop_builder(self._component_to_builder.pop(build_component)))
# Release the lock in etcd
job_key = self._etcd_job_key(build_job)
yield From(self._etcd_client.delete(job_key))
self.job_complete_callback(build_job, job_status)
@coroutine
def job_heartbeat(self, build_job):
# Extend the deadline in etcd
job_key = self._etcd_job_key(build_job)
build_job_metadata_response = yield From(self._etcd_client.read(job_key))
build_job_metadata = json.loads(build_job_metadata_response.value)
max_expiration = datetime.utcfromtimestamp(build_job_metadata['max_expiration'])
max_expiration_remaining = max_expiration - datetime.utcnow()
max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds()))
ttl = min(self.heartbeat_period_sec * 2, max_expiration_sec)
new_expiration = datetime.utcnow() + timedelta(seconds=ttl)
payload = {
'expiration': calendar.timegm(new_expiration.timetuple()),
'builder_id': build_job_metadata['builder_id'],
'max_expiration': build_job_metadata['max_expiration'],
}
yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=ttl))
self.job_heartbeat_callback(build_job)
def _etcd_job_key(self, build_job):
""" Create a key which is used to track a job in etcd.
"""
return os.path.join(self._etcd_builder_prefix, build_job.job_details['build_uuid'])
def _etcd_realm_key(self, realm):
""" Create a key which is used to track an incoming connection on a realm.
"""
return os.path.join(self._etcd_realm_prefix, realm)
def num_workers(self):
""" Return the number of workers we're managing locally.
"""
return len(self._component_to_builder)
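
To make the bookkeeping concrete, a hedged sketch of the etcd layout this manager maintains; the prefixes are the defaults from initialize() and every value below is a placeholder:

# building/<build_uuid>   (ttl = setup_time(), refreshed by job_heartbeat)
#   {"expiration": 1423862400, "max_expiration": 1423869600, "builder_id": "i-0abc1234"}
#
# realm/<realm_uuid>      (ttl = setup_time(), deleted once a manager claims the connection)
#   {"realm": "6f1e...", "token": "9c2b...", "builder_id": "i-0abc1234",
#    "job_queue_item": {...the serialized queue item...}}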

View file

@ -0,0 +1,238 @@
import logging
import os
import uuid
import threading
import boto.ec2
import requests
import cachetools
from jinja2 import FileSystemLoader, Environment
from trollius import coroutine, From, Return, get_event_loop
from functools import partial
from buildman.asyncutil import AsyncWrapper
from container_cloud_config import CloudConfigContext
logger = logging.getLogger(__name__)
ONE_HOUR = 60*60
ENV = Environment(loader=FileSystemLoader('buildman/templates'))
TEMPLATE = ENV.get_template('cloudconfig.yaml')
CloudConfigContext().populate_jinja_environment(ENV)
class ExecutorException(Exception):
""" Exception raised when there is a problem starting or stopping a builder.
"""
pass
class BuilderExecutor(object):
def __init__(self, executor_config, manager_hostname):
self.executor_config = executor_config
self.manager_hostname = manager_hostname
""" Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
starting and stopping builders.
"""
@coroutine
def start_builder(self, realm, token, build_uuid):
""" Create a builder with the specified config. Returns a unique id which can be used to manage
the builder.
"""
raise NotImplementedError
@coroutine
def stop_builder(self, builder_id):
""" Stop a builder which is currently running.
"""
raise NotImplementedError
def get_manager_websocket_url(self):
return 'ws://{0}:'
def generate_cloud_config(self, realm, token, coreos_channel, manager_hostname,
quay_username=None, quay_password=None):
if quay_username is None:
quay_username = self.executor_config['QUAY_USERNAME']
if quay_password is None:
quay_password = self.executor_config['QUAY_PASSWORD']
return TEMPLATE.render(
realm=realm,
token=token,
quay_username=quay_username,
quay_password=quay_password,
manager_hostname=manager_hostname,
coreos_channel=coreos_channel,
worker_tag=self.executor_config['WORKER_TAG'],
)
class EC2Executor(BuilderExecutor):
""" Implementation of BuilderExecutor which uses libcloud to start machines on a variety of cloud
providers.
"""
COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt'
def __init__(self, *args, **kwargs):
self._loop = get_event_loop()
super(EC2Executor, self).__init__(*args, **kwargs)
def _get_conn(self):
""" Creates an ec2 connection which can be used to manage instances.
"""
return AsyncWrapper(boto.ec2.connect_to_region(
self.executor_config['EC2_REGION'],
aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'],
aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'],
))
@classmethod
@cachetools.ttl_cache(ttl=ONE_HOUR)
def _get_coreos_ami(cls, ec2_region, coreos_channel):
""" Retrieve the CoreOS AMI id from the canonical listing.
"""
stack_list_string = requests.get(EC2Executor.COREOS_STACK_URL % coreos_channel).text
stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
return stack_amis[ec2_region]
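
The listing format is inferred from the split logic above: a single pipe-delimited line of region=ami pairs, for example:

stack_list_string = 'us-east-1=ami-00000000|us-west-2=ami-11111111'   # placeholder AMI ids
stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
assert stack_amis['us-east-1'] == 'ami-00000000'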
@coroutine
def start_builder(self, realm, token, build_uuid):
region = self.executor_config['EC2_REGION']
channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
get_ami_callable = partial(self._get_coreos_ami, region, channel)
coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
user_data = self.generate_cloud_config(realm, token, channel, self.manager_hostname)
logger.debug('Generated cloud config: %s', user_data)
ec2_conn = self._get_conn()
ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType(
size=32,
volume_type='gp2',
delete_on_termination=True,
)
block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping()
block_devices['/dev/xvda'] = ssd_root_ebs
interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
subnet_id=self.executor_config['EC2_VPC_SUBNET_ID'],
groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
associate_public_ip_address=True,
)
interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)
reservation = yield From(ec2_conn.run_instances(
coreos_ami,
instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
key_name=self.executor_config.get('EC2_KEY_NAME', None),
user_data=user_data,
instance_initiated_shutdown_behavior='terminate',
block_device_map=block_devices,
network_interfaces=interfaces,
))
if not reservation.instances:
raise ExecutorException('Unable to spawn builder instance.')
elif len(reservation.instances) != 1:
raise ExecutorException('EC2 started wrong number of instances!')
launched = AsyncWrapper(reservation.instances[0])
yield From(launched.add_tags({
'Name': 'Quay Ephemeral Builder',
'Realm': realm,
'Token': token,
'BuildUUID': build_uuid,
}))
raise Return(launched.id)
@coroutine
def stop_builder(self, builder_id):
ec2_conn = self._get_conn()
terminated_instances = yield From(ec2_conn.terminate_instances([builder_id]))
if builder_id not in [si.id for si in terminated_instances]:
raise ExecutorException('Unable to terminate instance: %s' % builder_id)
class PopenExecutor(BuilderExecutor):
""" Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
"""
def __init__(self, executor_config, manager_hostname):
self._jobs = {}
super(PopenExecutor, self).__init__(executor_config, manager_hostname)
""" Executor which uses Popen to fork a quay-builder process.
"""
@coroutine
def start_builder(self, realm, token, build_uuid):
# Now start a machine for this job, adding the machine id to the etcd information
logger.debug('Forking process for build')
import subprocess
builder_env = {
'TOKEN': token,
'REALM': realm,
'ENDPOINT': 'ws://localhost:8787',
'DOCKER_TLS_VERIFY': os.environ.get('DOCKER_TLS_VERIFY', ''),
'DOCKER_CERT_PATH': os.environ.get('DOCKER_CERT_PATH', ''),
'DOCKER_HOST': os.environ.get('DOCKER_HOST', ''),
}
logpipe = LogPipe(logging.INFO)
spawned = subprocess.Popen('/Users/jake/bin/quay-builder', stdout=logpipe, stderr=logpipe,
env=builder_env)
builder_id = str(uuid.uuid4())
self._jobs[builder_id] = (spawned, logpipe)
logger.debug('Builder spawned with id: %s', builder_id)
raise Return(builder_id)
@coroutine
def stop_builder(self, builder_id):
if builder_id not in self._jobs:
raise ExecutorException('Builder id not being tracked by executor.')
logger.debug('Killing builder with id: %s', builder_id)
spawned, logpipe = self._jobs[builder_id]
if spawned.poll() is None:
spawned.kill()
logpipe.close()
class LogPipe(threading.Thread):
""" Adapted from http://codereview.stackexchange.com/a/17959
"""
def __init__(self, level):
"""Setup the object with a logger and a loglevel
and start the thread
"""
threading.Thread.__init__(self)
self.daemon = False
self.level = level
self.fd_read, self.fd_write = os.pipe()
self.pipe_reader = os.fdopen(self.fd_read)
self.start()
def fileno(self):
"""Return the write file descriptor of the pipe
"""
return self.fd_write
def run(self):
"""Run the thread, logging everything.
"""
for line in iter(self.pipe_reader.readline, ''):
logging.log(self.level, line.strip('\n'))
self.pipe_reader.close()
def close(self):
"""Close the write end of the pipe.
"""
os.close(self.fd_write)

View file

@ -12,8 +12,11 @@ from threading import Event
from trollius.coroutines import From
from datetime import timedelta
from buildman.jobutil.buildstatus import StatusHandler
from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException
from data import database
from data.queue import WorkQueue
from app import app
logger = logging.getLogger(__name__)
@ -22,8 +25,7 @@ TIMEOUT_PERIOD_MINUTES = 20
JOB_TIMEOUT_SECONDS = 300
MINIMUM_JOB_EXTENSION = timedelta(minutes=2)
WEBSOCKET_PORT = 8787
CONTROLLER_PORT = 8686
HEARTBEAT_PERIOD_SEC = 30
class BuildJobResult(object):
""" Build job result enum """
@ -35,14 +37,15 @@ class BuilderServer(object):
""" Server which handles both HTTP and WAMP requests, managing the full state of the build
controller.
"""
def __init__(self, server_hostname, queue, build_logs, user_files, lifecycle_manager_klass):
def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass,
lifecycle_manager_config, manager_hostname):
self._loop = None
self._current_status = 'starting'
self._current_components = []
self._job_count = 0
self._session_factory = RouterSessionFactory(RouterFactory())
self._server_hostname = server_hostname
self._registry_hostname = registry_hostname
self._queue = queue
self._build_logs = build_logs
self._user_files = user_files
@ -50,8 +53,11 @@ class BuilderServer(object):
self._register_component,
self._unregister_component,
self._job_heartbeat,
self._job_complete
self._job_complete,
manager_hostname,
HEARTBEAT_PERIOD_SEC,
)
self._lifecycle_manager_config = lifecycle_manager_config
self._shutdown_event = Event()
self._current_status = 'running'
@ -81,18 +87,17 @@ class BuilderServer(object):
self._controller_app = controller_app
def run(self, host, ssl=None):
def run(self, host, websocket_port, controller_port, ssl=None):
logger.debug('Initializing the lifecycle manager')
self._lifecycle_manager.initialize()
self._lifecycle_manager.initialize(self._lifecycle_manager_config)
logger.debug('Initializing all members of the event loop')
loop = trollius.get_event_loop()
trollius.Task(self._initialize(loop, host, ssl))
logger.debug('Starting server on port %s, with controller on port %s', WEBSOCKET_PORT,
CONTROLLER_PORT)
logger.debug('Starting server on port %s, with controller on port %s', websocket_port,
controller_port)
try:
loop.run_forever()
loop.run_until_complete(self._initialize(loop, host, websocket_port, controller_port, ssl))
except KeyboardInterrupt:
pass
finally:
@ -116,7 +121,7 @@ class BuilderServer(object):
component.parent_manager = self._lifecycle_manager
component.build_logs = self._build_logs
component.user_files = self._user_files
component.server_hostname = self._server_hostname
component.registry_hostname = self._registry_hostname
self._current_components.append(component)
self._session_factory.add(component)
@ -130,32 +135,32 @@ class BuilderServer(object):
self._session_factory.remove(component)
def _job_heartbeat(self, build_job):
WorkQueue.extend_processing(build_job.job_item(), seconds_from_now=JOB_TIMEOUT_SECONDS,
retry_count=1, minimum_extension=MINIMUM_JOB_EXTENSION)
self._queue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS,
minimum_extension=MINIMUM_JOB_EXTENSION)
def _job_complete(self, build_job, job_status):
if job_status == BuildJobResult.INCOMPLETE:
self._queue.incomplete(build_job.job_item(), restore_retry=True, retry_after=30)
elif job_status == BuildJobResult.ERROR:
self._queue.incomplete(build_job.job_item(), restore_retry=False)
self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30)
else:
self._queue.complete(build_job.job_item())
self._queue.complete(build_job.job_item)
self._job_count = self._job_count - 1
if self._current_status == 'shutting_down' and not self._job_count:
self._shutdown_event.set()
# TODO(jschorr): check for work here?
@trollius.coroutine
def _work_checker(self):
while self._current_status == 'running':
logger.debug('Checking for more work for %d active workers', self._lifecycle_manager.num_workers())
with database.CloseForLongOperation(app.config):
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
logger.debug('Checking for more work for %d active workers',
self._lifecycle_manager.num_workers())
job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time())
if job_item is None:
logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
continue
try:
@ -163,20 +168,24 @@ class BuilderServer(object):
except BuildJobLoadException as irbe:
logger.exception(irbe)
self._queue.incomplete(job_item, restore_retry=False)
continue
logger.debug('Build job found. Checking for an available worker.')
if self._lifecycle_manager.schedule(build_job, self._loop):
scheduled = yield From(self._lifecycle_manager.schedule(build_job))
if scheduled:
status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid)
status_handler.set_phase('build-scheduled')
self._job_count = self._job_count + 1
logger.debug('Build job scheduled. Running: %s', self._job_count)
else:
logger.debug('All workers are busy. Requeuing.')
self._queue.incomplete(job_item, restore_retry=True, retry_after=0)
yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
@trollius.coroutine
def _initialize(self, loop, host, ssl=None):
def _initialize(self, loop, host, websocket_port, controller_port, ssl=None):
self._loop = loop
# Create the WAMP server.
@ -184,8 +193,8 @@ class BuilderServer(object):
transport_factory.setProtocolOptions(failByDrop=True)
# Initialize the controller server and the WAMP server
create_wsgi_server(self._controller_app, loop=loop, host=host, port=CONTROLLER_PORT, ssl=ssl)
yield From(loop.create_server(transport_factory, host, WEBSOCKET_PORT, ssl=ssl))
create_wsgi_server(self._controller_app, loop=loop, host=host, port=controller_port, ssl=ssl)
yield From(loop.create_server(transport_factory, host, websocket_port, ssl=ssl))
# Initialize the work queue checker.
yield From(self._work_checker())

View file

@ -0,0 +1,31 @@
#cloud-config
ssh_authorized_keys:
- ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCC0m+hVmyR3vn/xoxJe9+atRWBxSK+YXgyufNVDMcb7H00Jfnc341QH3kDVYZamUbhVh/nyc2RP7YbnZR5zORFtgOaNSdkMYrPozzBvxjnvSUokkCCWbLqXDHvIKiR12r+UTSijPJE/Yk702Mb2ejAFuae1C3Ec+qKAoOCagDjpQ3THyb5oaKE7VPHdwCWjWIQLRhC+plu77ObhoXIFJLD13gCi01L/rp4mYVCxIc2lX5A8rkK+bZHnIZwWUQ4t8SIjWxIaUo0FE7oZ83nKuNkYj5ngmLHQLY23Nx2WhE9H6NBthUpik9SmqQPtVYbhIG+bISPoH9Xs8CLrFb0VRjz Joey's Mac
- ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCo6FhAP7mFFOAzM91gtaKW7saahtaN4lur42FMMztz6aqUycIltCmvxo+3FmrXgCG30maMNU36Vm1+9QRtVQEd+eRuoIWP28t+8MT01Fh4zPuE2Wca3pOHSNo3X81FfWJLzmwEHiQKs9HPQqUhezR9PcVWVkbMyAzw85c0UycGmHGFNb0UiRd9HFY6XbgbxhZv/mvKLZ99xE3xkOzS1PNsdSNvjUKwZR7pSUPqNS5S/1NXyR4GhFTU24VPH/bTATOv2ATH+PSzsZ7Qyz9UHj38tKC+ALJHEDJ4HXGzobyOUP78cHGZOfCB5FYubq0zmOudAjKIAhwI8XTFvJ2DX1P3 jimmyzelinskie
- ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDNvw8qo9m8np7yQ/Smv/oklM8bo8VyNRZriGYBDuolWDL/mZpYCQnZJXphQo7RFdNABYistikjJlBuuwUohLf2uSq0iKoFa2TgwI43wViWzvuzU4nA02/ITD5BZdmWAFNyIoqeB50Ol4qUgDwLAZ+7Kv7uCi6chcgr9gTi99jY3GHyZjrMiXMHGVGi+FExFuzhVC2drKjbz5q6oRfQeLtNfG4psl5GU3MQU6FkX4fgoCx0r9R48/b7l4+TT7pWblJQiRfeldixu6308vyoTUEHasdkU3/X0OTaGz/h5XqTKnGQc6stvvoED3w+L3QFp0H5Z8sZ9stSsitmCBrmbcKZ jakemoshenko
write_files:
- path: /root/overrides.list
permission: '0644'
content: |
REALM={{ realm }}
TOKEN={{ token }}
SERVER=wss://{{ manager_hostname }}
coreos:
update:
reboot-strategy: off
group: {{ coreos_channel }}
units:
{{ dockersystemd('quay-builder',
'quay.io/coreos/registry-build-worker',
quay_username,
quay_password,
worker_tag,
extra_args='--net=host --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock -v /usr/share/ca-certificates:/etc/ssl/certs',
exec_stop_post=['/bin/sh -xc "/bin/sleep 120; /usr/bin/systemctl --no-block poweroff"'],
flattened=True,
restart_policy='no'
) | indent(4) }}