Merge branch 'master' into quark

Commit fbdbc21eb1: 137 changed files with 8,691 additions and 2,414 deletions
buildman/asyncutil.py (new file, 27 lines added)

@@ -0,0 +1,27 @@
+from functools import partial, wraps
+from trollius import get_event_loop
+
+
+class AsyncWrapper(object):
+  """ Wrapper class which will transform a syncronous library to one that can be used with
+      trollius coroutines.
+  """
+  def __init__(self, delegate, loop=None, executor=None):
+    self._loop = loop if loop is not None else get_event_loop()
+    self._delegate = delegate
+    self._executor = executor
+
+  def __getattr__(self, attrib):
+    delegate_attr = getattr(self._delegate, attrib)
+
+    if not callable(delegate_attr):
+      return delegate_attr
+
+    def wrapper(*args, **kwargs):
+      """ Wraps the delegate_attr with primitives that will transform sync calls to ones shelled
+          out to a thread pool.
+      """
+      callable_delegate_attr = partial(delegate_attr, *args, **kwargs)
+      return self._loop.run_in_executor(self._executor, callable_delegate_attr)
+
+    return wrapper
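Note: AsyncWrapper resolves attribute access lazily, so every method on the wrapped object becomes a call that returns a future running on a thread pool. A minimal usage sketch (the etcd client mirrors how buildman/manager/ephemeral.py below builds its wrapper; the key name is illustrative):

    import etcd
    import trollius
    from concurrent.futures import ThreadPoolExecutor
    from buildman.asyncutil import AsyncWrapper

    # Wrap a blocking client; calls are shelled out to the thread pool.
    client = AsyncWrapper(etcd.Client(host='127.0.0.1', port=2379),
                          executor=ThreadPoolExecutor(5))

    @trollius.coroutine
    def read_sketch():
      # client.read(...) now returns a future instead of blocking.
      result = yield trollius.From(client.read('building/', recursive=True))
      raise trollius.Return(result)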
@@ -6,6 +6,7 @@ import time
 from app import app, userfiles as user_files, build_logs, dockerfile_build_queue
 
 from buildman.manager.enterprise import EnterpriseManager
+from buildman.manager.ephemeral import EphemeralBuilderManager
 from buildman.server import BuilderServer
 
 from trollius import SSLContext
@@ -13,14 +14,22 @@ from trollius import SSLContext
 logger = logging.getLogger(__name__)
 
 BUILD_MANAGERS = {
-  'enterprise': EnterpriseManager
+  'enterprise': EnterpriseManager,
+  'ephemeral': EphemeralBuilderManager,
 }
 
+EXTERNALLY_MANAGED = 'external'
+
+DEFAULT_WEBSOCKET_PORT = 8787
+DEFAULT_CONTROLLER_PORT = 8686
+
+LOG_FORMAT = "%(asctime)s [%(process)d] [%(levelname)s] [%(name)s] %(message)s"
+
 def run_build_manager():
   if not features.BUILD_SUPPORT:
     logger.debug('Building is disabled. Please enable the feature flag')
     while True:
       time.sleep(1000)
     return
 
   build_manager_config = app.config.get('BUILD_MANAGER')
@@ -39,6 +48,19 @@ def run_build_manager():
   if manager_klass is None:
     return
 
+  manager_hostname = os.environ.get('BUILDMAN_HOSTNAME',
+                                    app.config.get('BUILDMAN_HOSTNAME',
+                                                   app.config['SERVER_HOSTNAME']))
+  websocket_port = int(os.environ.get('BUILDMAN_WEBSOCKET_PORT',
+                                      app.config.get('BUILDMAN_WEBSOCKET_PORT',
+                                                     DEFAULT_WEBSOCKET_PORT)))
+  controller_port = int(os.environ.get('BUILDMAN_CONTROLLER_PORT',
+                                       app.config.get('BUILDMAN_CONTROLLER_PORT',
+                                                      DEFAULT_CONTROLLER_PORT)))
+
+  logger.debug('Will pass buildman hostname %s to builders for websocket connection',
+               manager_hostname)
+
   logger.debug('Starting build manager with lifecycle "%s"', build_manager_config[0])
   ssl_context = None
   if os.environ.get('SSL_CONFIG'):
@@ -48,9 +70,10 @@ def run_build_manager():
                                 os.path.join(os.environ.get('SSL_CONFIG'), 'ssl.key'))
 
   server = BuilderServer(app.config['SERVER_HOSTNAME'], dockerfile_build_queue, build_logs,
-                         user_files, manager_klass)
-  server.run('0.0.0.0', ssl=ssl_context)
+                         user_files, manager_klass, build_manager_config[1], manager_hostname)
+  server.run('0.0.0.0', websocket_port, controller_port, ssl=ssl_context)
 
 if __name__ == '__main__':
-  logging.basicConfig(level=logging.DEBUG)
+  logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT)
   logging.getLogger('peewee').setLevel(logging.WARN)
   run_build_manager()
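Note: BUILD_MANAGER is consumed as a two-item tuple: build_manager_config[0] selects the lifecycle class from BUILD_MANAGERS, and build_manager_config[1] is the dict later handed to the manager's initialize(). A hypothetical configuration for the new ephemeral lifecycle (values are placeholders; the keys are the ones read in buildman/manager/ephemeral.py below):

    BUILD_MANAGER = ('ephemeral', {
      'EXECUTOR': 'ec2',              # or 'popen' (the default)
      'EXECUTOR_CONFIG': {},          # passed through to the executor
      'ETCD_HOST': '127.0.0.1',
      'ETCD_PORT': 2379,
      'ALLOWED_WORKER_COUNT': 1,
      'MACHINE_SETUP_TIME': 300,
      'MACHINE_MAX_TIME': 7200,
    })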
@@ -6,11 +6,10 @@ import trollius
 import re
 
 from autobahn.wamp.exception import ApplicationError
-from trollius.coroutines import From
 
 from buildman.server import BuildJobResult
 from buildman.component.basecomponent import BaseComponent
-from buildman.jobutil.buildpack import BuildPackage, BuildPackageException
 from buildman.jobutil.buildjob import BuildJobLoadException
 from buildman.jobutil.buildstatus import StatusHandler
 from buildman.jobutil.workererror import WorkerError
@@ -20,7 +19,7 @@ HEARTBEAT_DELTA = datetime.timedelta(seconds=30)
 HEARTBEAT_TIMEOUT = 10
 INITIAL_TIMEOUT = 25
 
-SUPPORTED_WORKER_VERSIONS = ['0.1-beta']
+SUPPORTED_WORKER_VERSIONS = ['0.3']
 
 logger = logging.getLogger(__name__)
@@ -39,13 +38,14 @@ class BuildComponent(BaseComponent):
     self.builder_realm = realm
 
     self.parent_manager = None
-    self.server_hostname = None
+    self.registry_hostname = None
 
     self._component_status = ComponentStatus.JOINING
     self._last_heartbeat = None
     self._current_job = None
     self._build_status = None
     self._image_info = None
+    self._worker_version = None
 
     BaseComponent.__init__(self, config, **kwargs)
@@ -57,69 +57,52 @@ class BuildComponent(BaseComponent):
 
   def onJoin(self, details):
     logger.debug('Registering methods and listeners for component %s', self.builder_realm)
-    yield From(self.register(self._on_ready, u'io.quay.buildworker.ready'))
-    yield From(self.register(self._ping, u'io.quay.buildworker.ping'))
-    yield From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat'))
-    yield From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage'))
-
-    self._set_status(ComponentStatus.WAITING)
+    yield trollius.From(self.register(self._on_ready, u'io.quay.buildworker.ready'))
+    yield trollius.From(self.register(self._determine_cache_tag,
+                                      u'io.quay.buildworker.determinecachetag'))
+    yield trollius.From(self.register(self._ping, u'io.quay.buildworker.ping'))
+
+    yield trollius.From(self.subscribe(self._on_heartbeat, 'io.quay.builder.heartbeat'))
+    yield trollius.From(self.subscribe(self._on_log_message, 'io.quay.builder.logmessage'))
+
+    yield trollius.From(self._set_status(ComponentStatus.WAITING))
 
   def is_ready(self):
     """ Determines whether a build component is ready to begin a build. """
     return self._component_status == ComponentStatus.RUNNING
 
+  @trollius.coroutine
   def start_build(self, build_job):
     """ Starts a build. """
+    logger.debug('Starting build for component %s (worker version: %s)',
+                 self.builder_realm, self._worker_version)
+
     self._current_job = build_job
-    self._build_status = StatusHandler(self.build_logs, build_job.repo_build())
+    self._build_status = StatusHandler(self.build_logs, build_job.repo_build.uuid)
     self._image_info = {}
 
-    self._set_status(ComponentStatus.BUILDING)
+    yield trollius.From(self._set_status(ComponentStatus.BUILDING))
 
-    # Retrieve the job's buildpack.
-    buildpack_url = self.user_files.get_file_url(build_job.repo_build().resource_key,
-                                                 requires_cors=False)
-
-    logger.debug('Retreiving build package: %s', buildpack_url)
-    buildpack = None
-    try:
-      buildpack = BuildPackage.from_url(buildpack_url)
-    except BuildPackageException as bpe:
-      self._build_failure('Could not retrieve build package', bpe)
-      return
-
-    # Extract the base image information from the Dockerfile.
-    parsed_dockerfile = None
-    logger.debug('Parsing dockerfile')
-
-    build_config = build_job.build_config()
-    try:
-      parsed_dockerfile = buildpack.parse_dockerfile(build_config.get('build_subdir'))
-    except BuildPackageException as bpe:
-      self._build_failure('Could not find Dockerfile in build package', bpe)
-      return
-
-    image_and_tag_tuple = parsed_dockerfile.get_image_and_tag()
-    if image_and_tag_tuple is None or image_and_tag_tuple[0] is None:
-      self._build_failure('Missing FROM line in Dockerfile')
-      return
-
-    base_image_information = {
-      'repository': image_and_tag_tuple[0],
-      'tag': image_and_tag_tuple[1]
-    }
-
-    # Extract the number of steps from the Dockerfile.
-    with self._build_status as status_dict:
-      status_dict['total_commands'] = len(parsed_dockerfile.commands)
+    # Send the notification that the build has started.
+    build_job.send_notification('build_start')
+
+    # Parse the build configuration.
+    try:
+      build_config = build_job.build_config
+    except BuildJobLoadException as irbe:
+      self._build_failure('Could not load build job information', irbe)
+
+    base_image_information = {}
+    buildpack_url = self.user_files.get_file_url(build_job.repo_build.resource_key,
+                                                 requires_cors=False)
 
     # Add the pull robot information, if any.
-    if build_config.get('pull_credentials') is not None:
-      base_image_information['username'] = build_config['pull_credentials'].get('username', '')
-      base_image_information['password'] = build_config['pull_credentials'].get('password', '')
+    if build_job.pull_credentials:
+      base_image_information['username'] = build_job.pull_credentials.get('username', '')
+      base_image_information['password'] = build_job.pull_credentials.get('password', '')
 
     # Retrieve the repository's fully qualified name.
-    repo = build_job.repo_build().repository
+    repo = build_job.repo_build.repository
     repository_name = repo.namespace_user.username + '/' + repo.name
 
     # Parse the build queue item into build arguments.
@@ -131,29 +114,26 @@ class BuildComponent(BaseComponent):
     #   push_token: The token to use to push the built image.
     #   tag_names: The name(s) of the tag(s) for the newly built image.
     #   base_image: The image name and credentials to use to conduct the base image pull.
-    #     repository: The repository to pull.
-    #     tag: The tag to pull.
+    #     repository: The repository to pull (DEPRECATED 0.2)
+    #     tag: The tag to pull (DEPRECATED in 0.2)
     #     username: The username for pulling the base image (if any).
     #     password: The password for pulling the base image (if any).
     build_arguments = {
       'build_package': buildpack_url,
       'sub_directory': build_config.get('build_subdir', ''),
       'repository': repository_name,
-      'registry': self.server_hostname,
-      'pull_token': build_job.repo_build().access_token.code,
-      'push_token': build_job.repo_build().access_token.code,
+      'registry': self.registry_hostname,
+      'pull_token': build_job.repo_build.access_token.code,
+      'push_token': build_job.repo_build.access_token.code,
       'tag_names': build_config.get('docker_tags', ['latest']),
-      'base_image': base_image_information,
-      'cached_tag': build_job.determine_cached_tag() or ''
+      'base_image': base_image_information
     }
 
     # Invoke the build.
     logger.debug('Invoking build: %s', self.builder_realm)
     logger.debug('With Arguments: %s', build_arguments)
 
-    return (self
-            .call("io.quay.builder.build", **build_arguments)
-            .add_done_callback(self._build_complete))
+    self.call("io.quay.builder.build", **build_arguments).add_done_callback(self._build_complete)
 
   @staticmethod
   def _total_completion(statuses, total_images):
@@ -240,18 +220,28 @@ class BuildComponent(BaseComponent):
       elif phase == BUILD_PHASE.BUILDING:
         self._build_status.append_log(current_status_string)
 
+  @trollius.coroutine
+  def _determine_cache_tag(self, command_comments, base_image_name, base_image_tag, base_image_id):
+    with self._build_status as status_dict:
+      status_dict['total_commands'] = len(command_comments) + 1
+
+    logger.debug('Checking cache on realm %s. Base image: %s:%s (%s)', self.builder_realm,
+                 base_image_name, base_image_tag, base_image_id)
+
+    tag_found = self._current_job.determine_cached_tag(base_image_id, command_comments)
+    raise trollius.Return(tag_found or '')
+
   def _build_failure(self, error_message, exception=None):
     """ Handles and logs a failed build. """
     self._build_status.set_error(error_message, {
-      'internal_error': exception.message if exception else None
+      'internal_error': str(exception) if exception else None
     })
 
-    build_id = self._current_job.repo_build().uuid
+    build_id = self._current_job.repo_build.uuid
     logger.warning('Build %s failed with message: %s', build_id, error_message)
 
     # Mark that the build has finished (in an error state)
-    self._build_finished(BuildJobResult.ERROR)
+    trollius.async(self._build_finished(BuildJobResult.ERROR))
 
   def _build_complete(self, result):
     """ Wraps up a completed build. Handles any errors and calls self._build_finished. """
@@ -259,60 +249,78 @@ class BuildComponent(BaseComponent):
       # Retrieve the result. This will raise an ApplicationError on any error that occurred.
       result.result()
       self._build_status.set_phase(BUILD_PHASE.COMPLETE)
-      self._build_finished(BuildJobResult.COMPLETE)
+      trollius.async(self._build_finished(BuildJobResult.COMPLETE))
+
+      # Send the notification that the build has completed successfully.
+      self._current_job.send_notification('build_success')
     except ApplicationError as aex:
       worker_error = WorkerError(aex.error, aex.kwargs.get('base_error'))
 
       # Write the error to the log.
       self._build_status.set_error(worker_error.public_message(), worker_error.extra_data(),
-                                   internal_error=worker_error.is_internal_error())
+                                   internal_error=worker_error.is_internal_error(),
+                                   requeued=self._current_job.has_retries_remaining())
+
+      # Send the notification that the build has failed.
+      self._current_job.send_notification('build_failure',
+                                          error_message=worker_error.public_message())
 
       # Mark the build as completed.
       if worker_error.is_internal_error():
-        self._build_finished(BuildJobResult.INCOMPLETE)
+        trollius.async(self._build_finished(BuildJobResult.INCOMPLETE))
       else:
-        self._build_finished(BuildJobResult.ERROR)
+        trollius.async(self._build_finished(BuildJobResult.ERROR))
 
+  @trollius.coroutine
   def _build_finished(self, job_status):
     """ Alerts the parent that a build has completed and sets the status back to running. """
-    self.parent_manager.job_completed(self._current_job, job_status, self)
+    yield trollius.From(self.parent_manager.job_completed(self._current_job, job_status, self))
    self._current_job = None
 
     # Set the component back to a running state.
-    self._set_status(ComponentStatus.RUNNING)
+    yield trollius.From(self._set_status(ComponentStatus.RUNNING))
 
   @staticmethod
   def _ping():
     """ Ping pong. """
     return 'pong'
 
+  @trollius.coroutine
   def _on_ready(self, token, version):
-    if not version in SUPPORTED_WORKER_VERSIONS:
-      logger.warning('Build component (token "%s") is running an out-of-date version: %s', version)
-      return False
+    self._worker_version = version
 
-    if self._component_status != 'waiting':
+    if not version in SUPPORTED_WORKER_VERSIONS:
+      logger.warning('Build component (token "%s") is running an out-of-date version: %s', token,
+                     version)
+      raise trollius.Return(False)
+
+    if self._component_status != ComponentStatus.WAITING:
       logger.warning('Build component (token "%s") is already connected', self.expected_token)
-      return False
+      raise trollius.Return(False)
 
     if token != self.expected_token:
-      logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token, token)
-      return False
+      logger.warning('Builder token mismatch. Expected: "%s". Found: "%s"', self.expected_token,
+                     token)
+      raise trollius.Return(False)
 
-    self._set_status(ComponentStatus.RUNNING)
+    yield trollius.From(self._set_status(ComponentStatus.RUNNING))
 
     # Start the heartbeat check and updating loop.
     loop = trollius.get_event_loop()
     loop.create_task(self._heartbeat())
     logger.debug('Build worker %s is connected and ready', self.builder_realm)
-    return True
+    raise trollius.Return(True)
+
+  @trollius.coroutine
+  def _set_status(self, phase):
+    if phase == ComponentStatus.RUNNING:
+      yield trollius.From(self.parent_manager.build_component_ready(self))
+
+    self._component_status = phase
 
   def _on_heartbeat(self):
     """ Updates the last known heartbeat. """
-    self._last_heartbeat = datetime.datetime.now()
+    self._last_heartbeat = datetime.datetime.utcnow()
 
   @trollius.coroutine
   def _heartbeat(self):
@@ -320,13 +328,13 @@ class BuildComponent(BaseComponent):
         and updating the heartbeat in the build status dictionary (if applicable). This allows
         the build system to catch crashes from either end.
     """
-    yield From(trollius.sleep(INITIAL_TIMEOUT))
+    yield trollius.From(trollius.sleep(INITIAL_TIMEOUT))
 
     while True:
       # If the component is no longer running or actively building, nothing more to do.
       if (self._component_status != ComponentStatus.RUNNING and
           self._component_status != ComponentStatus.BUILDING):
-        return
+        raise trollius.Return()
 
       # If there is an active build, write the heartbeat to its status.
       build_status = self._build_status
@@ -334,35 +342,37 @@ class BuildComponent(BaseComponent):
         with build_status as status_dict:
           status_dict['heartbeat'] = int(time.time())
 
      # Mark the build item.
       current_job = self._current_job
       if current_job is not None:
-        self.parent_manager.job_heartbeat(current_job)
+        yield trollius.From(self.parent_manager.job_heartbeat(current_job))
 
       # Check the heartbeat from the worker.
       logger.debug('Checking heartbeat on realm %s', self.builder_realm)
-      if self._last_heartbeat and self._last_heartbeat < datetime.datetime.now() - HEARTBEAT_DELTA:
-        self._timeout()
-        return
+      if (self._last_heartbeat and
+          self._last_heartbeat < datetime.datetime.utcnow() - HEARTBEAT_DELTA):
+        yield trollius.From(self._timeout())
+        raise trollius.Return()
 
-      yield From(trollius.sleep(HEARTBEAT_TIMEOUT))
+      yield trollius.From(trollius.sleep(HEARTBEAT_TIMEOUT))
 
+  @trollius.coroutine
   def _timeout(self):
-    self._set_status(ComponentStatus.TIMED_OUT)
-    logger.warning('Build component with realm %s has timed out', self.builder_realm)
-    self._dispose(timed_out=True)
+    if self._component_status == ComponentStatus.TIMED_OUT:
+      raise trollius.Return()
+
+    yield trollius.From(self._set_status(ComponentStatus.TIMED_OUT))
+    logger.warning('Build component with realm %s has timed out', self.builder_realm)
 
   def _dispose(self, timed_out=False):
     # If we still have a running job, then it has not completed and we need to tell the parent
     # manager.
     if self._current_job is not None:
       if timed_out:
-        self._build_status.set_error('Build worker timed out', internal_error=True)
+        self._build_status.set_error('Build worker timed out', internal_error=True,
+                                     requeued=self._current_job.has_retries_remaining())
 
       self.parent_manager.job_completed(self._current_job, BuildJobResult.INCOMPLETE, self)
       self._build_status = None
       self._current_job = None
 
     # Unregister the current component so that it cannot be invoked again.
-    self.parent_manager.build_component_disposed(self, timed_out)
+    self.parent_manager.build_component_disposed(self, True)
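Note: most of the edits above are one mechanical conversion: plain methods become trollius coroutines. trollius is the Python 2 backport of asyncio, and since pre-3.3 generators cannot return a value, it spells "await x" as "yield From(x)" and "return x" as "raise Return(x)". A minimal sketch of the idiom (fetch_ping and the component argument are hypothetical):

    import trollius

    @trollius.coroutine
    def fetch_ping(component):
      # Awaits the sub-coroutine on the trollius event loop.
      status = yield trollius.From(component.ping())
      # Coroutine "return": raise trollius.Return(value).
      raise trollius.Return(status == 'pong')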
@@ -1,6 +1,13 @@
-from data import model
-
 import json
 import logging
 
+from cachetools import lru_cache
+from endpoints.notificationhelper import spawn_notification
+from data import model
+from util.imagetree import ImageTree
+
+logger = logging.getLogger(__name__)
+
 
 class BuildJobLoadException(Exception):
   """ Exception raised if a build job could not be instantiated for some reason. """
@@ -9,50 +16,123 @@ class BuildJobLoadException(Exception):
 class BuildJob(object):
   """ Represents a single in-progress build job. """
   def __init__(self, job_item):
-    self._job_item = job_item
+    self.job_item = job_item
 
     try:
-      self._job_details = json.loads(job_item.body)
+      self.job_details = json.loads(job_item.body)
     except ValueError:
       raise BuildJobLoadException(
-        'Could not parse build queue item config with ID %s' % self._job_details['build_uuid']
+        'Could not parse build queue item config with ID %s' % self.job_details['build_uuid']
       )
 
+  def has_retries_remaining(self):
+    return self.job_item.retries_remaining > 0
+
+  def send_notification(self, kind, error_message=None):
+    tags = self.build_config.get('docker_tags', ['latest'])
+    event_data = {
+      'build_id': self.repo_build.uuid,
+      'build_name': self.repo_build.display_name,
+      'docker_tags': tags,
+      'trigger_id': self.repo_build.trigger.uuid,
+      'trigger_kind': self.repo_build.trigger.service.name
+    }
+
+    if error_message is not None:
+      event_data['error_message'] = error_message
+
+    spawn_notification(self.repo_build.repository, kind, event_data,
+                       subpage='build?current=%s' % self.repo_build.uuid,
+                       pathargs=['build', self.repo_build.uuid])
+
+  @lru_cache(maxsize=1)
+  def _load_repo_build(self):
     try:
-      self._repo_build = model.get_repository_build(self._job_details['build_uuid'])
+      return model.get_repository_build(self.job_details['build_uuid'])
     except model.InvalidRepositoryBuildException:
       raise BuildJobLoadException(
-        'Could not load repository build with ID %s' % self._job_details['build_uuid'])
+        'Could not load repository build with ID %s' % self.job_details['build_uuid'])
 
+  @property
+  def repo_build(self):
+    return self._load_repo_build()
+
+  @property
+  def pull_credentials(self):
+    """ Returns the pull credentials for this job, or None if none. """
+    return self.job_details.get('pull_credentials')
+
+  @property
+  def build_config(self):
     try:
-      self._build_config = json.loads(self._repo_build.job_config)
+      return json.loads(self.repo_build.job_config)
     except ValueError:
       raise BuildJobLoadException(
-        'Could not parse repository build job config with ID %s' % self._job_details['build_uuid']
+        'Could not parse repository build job config with ID %s' % self.job_details['build_uuid']
       )
 
-  def determine_cached_tag(self):
+  def determine_cached_tag(self, base_image_id=None, cache_comments=None):
     """ Returns the tag to pull to prime the cache or None if none. """
-    # TODO(jschorr): Change this to use the more complicated caching rules, once we have caching
-    # be a pull of things besides the constructed tags.
-    tags = self._build_config.get('docker_tags', ['latest'])
-    existing_tags = model.list_repository_tags(self._repo_build.repository.namespace_user.username,
-                                               self._repo_build.repository.name)
+    cached_tag = None
+    if base_image_id and cache_comments:
+      cached_tag = self._determine_cached_tag_by_comments(base_image_id, cache_comments)
+
+    if not cached_tag:
+      cached_tag = self._determine_cached_tag_by_tag()
+
+    logger.debug('Determined cached tag %s for %s: %s', cached_tag, base_image_id, cache_comments)
+
+    return cached_tag
+
+  def _determine_cached_tag_by_comments(self, base_image_id, cache_commands):
+    """ Determines the tag to use for priming the cache for this build job, by matching commands
+        starting at the given base_image_id. This mimics the Docker cache checking, so it should,
+        in theory, provide "perfect" caching.
+    """
+    # Lookup the base image in the repository. If it doesn't exist, nothing more to do.
+    repo_build = self.repo_build
+    repo_namespace = repo_build.repository.namespace_user.username
+    repo_name = repo_build.repository.name
+
+    base_image = model.get_image(repo_build.repository, base_image_id)
+    if base_image is None:
+      return None
+
+    # Build an in-memory tree of the full heirarchy of images in the repository.
+    all_images = model.get_repository_images(repo_namespace, repo_name)
+    all_tags = model.list_repository_tags(repo_namespace, repo_name)
+    tree = ImageTree(all_images, all_tags, base_filter=base_image.id)
+
+    # Find a path in the tree, starting at the base image, that matches the cache comments
+    # or some subset thereof.
+    def checker(step, image):
+      if step >= len(cache_commands):
+        return False
+
+      full_command = '["/bin/sh", "-c", "%s"]' % cache_commands[step]
+      logger.debug('Checking step #%s: %s, %s == %s', step, image.id,
+                   image.storage.command, full_command)
+
+      return image.storage.command == full_command
+
+    path = tree.find_longest_path(base_image.id, checker)
+    if not path:
+      return None
+
+    # Find any tag associated with the last image in the path.
+    return tree.tag_containing_image(path[-1])
+
+  def _determine_cached_tag_by_tag(self):
+    """ Determines the cached tag by looking for one of the tags being built, and seeing if it
+        exists in the repository. This is a fallback for when no comment information is available.
+    """
+    tags = self.build_config.get('docker_tags', ['latest'])
+    repository = self.repo_build.repository
+    existing_tags = model.list_repository_tags(repository.namespace_user.username, repository.name)
     cached_tags = set(tags) & set([tag.name for tag in existing_tags])
     if cached_tags:
       return list(cached_tags)[0]
 
     return None
-
-  def job_item(self):
-    """ Returns the job's queue item. """
-    return self._job_item
-
-  def repo_build(self):
-    """ Returns the repository build DB row for the job. """
-    return self._repo_build
-
-  def build_config(self):
-    """ Returns the parsed repository build config for the job. """
-    return self._build_config
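Note: the new repo_build accessor pairs @property with @lru_cache(maxsize=1) so the database row is loaded on first access and memoized, replacing the eager load the old constructor performed. The same pattern in isolation (Example and _load_row are illustrative names):

    from cachetools import lru_cache

    class Example(object):
      @lru_cache(maxsize=1)
      def _load_row(self):
        print('loading...')  # executes only on the first access for this instance
        return {'uuid': 'abc123'}

      @property
      def row(self):
        return self._load_row()

    e = Example()
    e.row  # prints 'loading...' and returns the dict
    e.row  # served from the cache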
@@ -1,88 +0,0 @@
-import tarfile
-import requests
-import os
-
-from tempfile import TemporaryFile, mkdtemp
-from zipfile import ZipFile
-from util.dockerfileparse import parse_dockerfile
-from util.safetar import safe_extractall
-
-class BuildPackageException(Exception):
-  """ Exception raised when retrieving or parsing a build package. """
-  pass
-
-
-class BuildPackage(object):
-  """ Helper class for easy reading and updating of a Dockerfile build pack. """
-
-  def __init__(self, requests_file):
-    self._mime_processors = {
-      'application/zip': BuildPackage._prepare_zip,
-      'application/x-zip-compressed': BuildPackage._prepare_zip,
-      'text/plain': BuildPackage._prepare_dockerfile,
-      'application/octet-stream': BuildPackage._prepare_dockerfile,
-      'application/x-tar': BuildPackage._prepare_tarball,
-      'application/gzip': BuildPackage._prepare_tarball,
-      'application/x-gzip': BuildPackage._prepare_tarball,
-    }
-
-    c_type = requests_file.headers['content-type']
-    c_type = c_type.split(';')[0] if ';' in c_type else c_type
-
-    if c_type not in self._mime_processors:
-      raise BuildPackageException('Unknown build package mime type: %s' % c_type)
-
-    self._package_directory = None
-    try:
-      self._package_directory = self._mime_processors[c_type](requests_file)
-    except Exception as ex:
-      raise BuildPackageException(ex.message)
-
-  def parse_dockerfile(self, subdirectory):
-    dockerfile_path = os.path.join(self._package_directory, subdirectory, 'Dockerfile')
-    if not os.path.exists(dockerfile_path):
-      if subdirectory:
-        message = 'Build package did not contain a Dockerfile at sub directory %s.' % subdirectory
-      else:
-        message = 'Build package did not contain a Dockerfile at the root directory.'
-
-      raise BuildPackageException(message)
-
-    with open(dockerfile_path, 'r') as dockerfileobj:
-      return parse_dockerfile(dockerfileobj.read())
-
-  @staticmethod
-  def from_url(url):
-    buildpack_resource = requests.get(url, stream=True)
-    return BuildPackage(buildpack_resource)
-
-  @staticmethod
-  def _prepare_zip(request_file):
-    build_dir = mkdtemp(prefix='docker-build-')
-
-    # Save the zip file to temp somewhere
-    with TemporaryFile() as zip_file:
-      zip_file.write(request_file.content)
-      to_extract = ZipFile(zip_file)
-      to_extract.extractall(build_dir)
-
-    return build_dir
-
-  @staticmethod
-  def _prepare_dockerfile(request_file):
-    build_dir = mkdtemp(prefix='docker-build-')
-    dockerfile_path = os.path.join(build_dir, "Dockerfile")
-    with open(dockerfile_path, 'w') as dockerfile:
-      dockerfile.write(request_file.content)
-
-    return build_dir
-
-  @staticmethod
-  def _prepare_tarball(request_file):
-    build_dir = mkdtemp(prefix='docker-build-')
-
-    # Save the zip file to temp somewhere
-    with tarfile.open(mode='r|*', fileobj=request_file.raw) as tar_stream:
-      safe_extractall(tar_stream, build_dir)
-
-    return build_dir
@@ -1,16 +1,18 @@
 from data.database import BUILD_PHASE
+from data import model
+import datetime
 
 class StatusHandler(object):
   """ Context wrapper for writing status to build logs. """
 
-  def __init__(self, build_logs, repository_build):
+  def __init__(self, build_logs, repository_build_uuid):
     self._current_phase = None
-    self._repository_build = repository_build
-    self._uuid = repository_build.uuid
+    self._current_command = None
+    self._uuid = repository_build_uuid
     self._build_logs = build_logs
 
     self._status = {
-      'total_commands': None,
+      'total_commands': 0,
       'current_command': None,
       'push_completion': 0.0,
       'pull_completion': 0.0,
@@ -20,16 +22,25 @@ class StatusHandler(object):
     self.__exit__(None, None, None)
 
   def _append_log_message(self, log_message, log_type=None, log_data=None):
+    log_data = log_data or {}
+    log_data['datetime'] = str(datetime.datetime.now())
     self._build_logs.append_log_message(self._uuid, log_message, log_type, log_data)
 
   def append_log(self, log_message, extra_data=None):
     if log_message is None:
       return
 
     self._append_log_message(log_message, log_data=extra_data)
 
+  def set_command(self, command, extra_data=None):
+    if self._current_command == command:
+      return
+
+    self._current_command = command
+    self._append_log_message(command, self._build_logs.COMMAND, extra_data)
+
-  def set_error(self, error_message, extra_data=None, internal_error=False):
-    self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error else BUILD_PHASE.ERROR)
+  def set_error(self, error_message, extra_data=None, internal_error=False, requeued=False):
+    self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error and requeued else BUILD_PHASE.ERROR)
 
     extra_data = extra_data or {}
     extra_data['internal_error'] = internal_error
@@ -41,8 +52,12 @@ class StatusHandler(object):
 
     self._current_phase = phase
     self._append_log_message(phase, self._build_logs.PHASE, extra_data)
-    self._repository_build.phase = phase
-    self._repository_build.save()
+
+    # Update the repository build with the new phase
+    repo_build = model.get_repository_build(self._uuid)
+    repo_build.phase = phase
+    repo_build.save()
+
     return True
 
   def __enter__(self):
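Note: StatusHandler is used two ways by buildcomponent.py: as a plain object (set_phase, set_command, append_log) and as a context manager whose status dictionary is written back to the build logs on exit. A usage sketch, assuming a build_logs handle and a placeholder build UUID:

    status = StatusHandler(build_logs, 'some-build-uuid')
    status.set_phase(BUILD_PHASE.BUILDING)
    status.set_command('RUN apt-get update')

    # Mutations to the yielded dict are flushed on __exit__.
    with status as status_dict:
      status_dict['total_commands'] = 12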
@@ -19,13 +19,19 @@ class WorkerError(object):
       'is_internal': True
     },
 
+    'io.quay.builder.dockerfileissue': {
+      'message': 'Could not find or parse Dockerfile',
+      'show_base_error': True
+    },
+
     'io.quay.builder.cannotpullbaseimage': {
       'message': 'Could not pull base image',
       'show_base_error': True
     },
 
     'io.quay.builder.internalerror': {
-      'message': 'An internal error occurred while building. Please submit a ticket.'
+      'message': 'An internal error occurred while building. Please submit a ticket.',
+      'is_internal': True
     },
 
     'io.quay.builder.buildrunerror': {
@@ -57,6 +63,11 @@ class WorkerError(object):
     'io.quay.builder.missingorinvalidargument': {
       'message': 'Missing required arguments for builder',
       'is_internal': True
+    },
+
+    'io.quay.builder.cachelookupissue': {
+      'message': 'Error checking for a cached tag',
+      'is_internal': True
     }
   }
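Note: buildcomponent.py consumes this table via WorkerError(aex.error, aex.kwargs.get('base_error')) and its accessors; an is_internal entry causes the job to finish as INCOMPLETE (and so be retried) rather than ERROR. A sketch of the lookup for the new error code, assuming the accessor behavior implied by those call sites:

    err = WorkerError('io.quay.builder.cachelookupissue', base_error='etcd read timed out')
    err.public_message()     # 'Error checking for a cached tag'
    err.is_internal_error()  # True, so _build_complete requeues as INCOMPLETE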
@@ -1,12 +1,17 @@
+from trollius import coroutine
+
 class BaseManager(object):
   """ Base for all worker managers. """
   def __init__(self, register_component, unregister_component, job_heartbeat_callback,
-               job_complete_callback):
+               job_complete_callback, manager_hostname, heartbeat_period_sec):
     self.register_component = register_component
     self.unregister_component = unregister_component
     self.job_heartbeat_callback = job_heartbeat_callback
     self.job_complete_callback = job_complete_callback
+    self.manager_hostname = manager_hostname
+    self.heartbeat_period_sec = heartbeat_period_sec
 
+  @coroutine
   def job_heartbeat(self, build_job):
     """ Method invoked to tell the manager that a job is still running. This method will be called
         every few minutes. """
@@ -25,26 +30,36 @@ class BaseManager(object):
     """
     raise NotImplementedError
 
-  def schedule(self, build_job, loop):
+  @coroutine
+  def schedule(self, build_job):
     """ Schedules a queue item to be built. Returns True if the item was properly scheduled
         and False if all workers are busy.
     """
     raise NotImplementedError
 
-  def initialize(self):
+  def initialize(self, manager_config):
     """ Runs any initialization code for the manager. Called once the server is in a ready state.
     """
     raise NotImplementedError
 
+  @coroutine
+  def build_component_ready(self, build_component):
+    """ Method invoked whenever a build component announces itself as ready.
+    """
+    raise NotImplementedError
+
   def build_component_disposed(self, build_component, timed_out):
     """ Method invoked whenever a build component has been disposed. The timed_out boolean indicates
         whether the component's heartbeat timed out.
     """
     raise NotImplementedError
 
+  @coroutine
   def job_completed(self, build_job, job_status, build_component):
     """ Method invoked once a job_item has completed, in some manner. The job_status will be
-        one of: incomplete, error, complete. If incomplete, the job should be requeued.
+        one of: incomplete, error, complete. Implementations of this method should call
+        self.job_complete_callback with a status of Incomplete if they wish for the job to be
+        automatically requeued.
     """
     raise NotImplementedError
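Note: subclasses now implement the coroutine-flavored contract above (EnterpriseManager and the new EphemeralBuilderManager both do). A hypothetical do-nothing manager showing the minimum surface:

    from trollius import coroutine, Return
    from buildman.manager.basemanager import BaseManager

    class NoopManager(BaseManager):
      def initialize(self, manager_config):
        pass

      @coroutine
      def schedule(self, build_job):
        raise Return(False)  # never has capacity

      @coroutine
      def build_component_ready(self, build_component):
        pass

      def build_component_disposed(self, build_component, timed_out):
        pass

      @coroutine
      def job_completed(self, build_job, job_status, build_component):
        self.job_complete_callback(build_job, job_status)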
@@ -5,7 +5,7 @@ from buildman.component.basecomponent import BaseComponent
 from buildman.component.buildcomponent import BuildComponent
 from buildman.manager.basemanager import BaseManager
 
-from trollius.coroutines import From
+from trollius import From, Return, coroutine
 
 REGISTRATION_REALM = 'registration'
 logger = logging.getLogger(__name__)
@@ -13,9 +13,6 @@ logger = logging.getLogger(__name__)
 class DynamicRegistrationComponent(BaseComponent):
   """ Component session that handles dynamic registration of the builder components. """
 
-  def kind(self):
-    return 'registration'
-
   def onConnect(self):
     self.join(REGISTRATION_REALM)
@@ -31,10 +28,15 @@ class DynamicRegistrationComponent(BaseComponent):
 class EnterpriseManager(BaseManager):
   """ Build manager implementation for the Enterprise Registry. """
-  build_components = []
-  shutting_down = False
 
-  def initialize(self):
+  def __init__(self, *args, **kwargs):
+    self.ready_components = set()
+    self.all_components = set()
+    self.shutting_down = False
+
+    super(EnterpriseManager, self).__init__(*args, **kwargs)
+
+  def initialize(self, manager_config):
     # Add a component which is used by build workers for dynamic registration. Unlike
     # production, build workers in enterprise are long-lived and register dynamically.
     self.register_component(REGISTRATION_REALM, DynamicRegistrationComponent)
@@ -48,31 +50,37 @@ class EnterpriseManager(BaseManager):
     """ Adds a new build component for an Enterprise Registry. """
     # Generate a new unique realm ID for the build worker.
     realm = str(uuid.uuid4())
-    component = self.register_component(realm, BuildComponent, token="")
-    self.build_components.append(component)
+    new_component = self.register_component(realm, BuildComponent, token="")
+    self.all_components.add(new_component)
     return realm
 
-  def schedule(self, build_job, loop):
+  @coroutine
+  def schedule(self, build_job):
     """ Schedules a build for an Enterprise Registry. """
-    if self.shutting_down:
-      return False
+    if self.shutting_down or not self.ready_components:
+      raise Return(False)
 
-    for component in self.build_components:
-      if component.is_ready():
-        loop.call_soon(component.start_build, build_job)
-        return True
+    component = self.ready_components.pop()
 
-    return False
+    yield From(component.start_build(build_job))
 
+    raise Return(True)
+
+  @coroutine
+  def build_component_ready(self, build_component):
+    self.ready_components.add(build_component)
 
   def shutdown(self):
     self.shutting_down = True
 
+  @coroutine
   def job_completed(self, build_job, job_status, build_component):
     self.job_complete_callback(build_job, job_status)
 
   def build_component_disposed(self, build_component, timed_out):
-    self.build_components.remove(build_component)
     self.unregister_component(build_component)
+    self.all_components.remove(build_component)
+    if build_component in self.ready_components:
+      self.ready_components.remove(build_component)
 
   def num_workers(self):
-    return len(self.build_components)
+    return len(self.all_components)

buildman/manager/ephemeral.py (new file, 328 lines added)

@@ -0,0 +1,328 @@
+import logging
+import etcd
+import uuid
+import calendar
+import os.path
+import json
+
+from datetime import datetime, timedelta
+from trollius import From, coroutine, Return, async
+from concurrent.futures import ThreadPoolExecutor
+from urllib3.exceptions import ReadTimeoutError, ProtocolError
+
+from buildman.manager.basemanager import BaseManager
+from buildman.manager.executor import PopenExecutor, EC2Executor
+from buildman.component.buildcomponent import BuildComponent
+from buildman.jobutil.buildjob import BuildJob
+from buildman.asyncutil import AsyncWrapper
+from util.morecollections import AttrDict
+
+
+logger = logging.getLogger(__name__)
+
+
+ETCD_DISABLE_TIMEOUT = 0
+
+
+class EtcdAction(object):
+  GET = 'get'
+  SET = 'set'
+  EXPIRE = 'expire'
+  UPDATE = 'update'
+  DELETE = 'delete'
+  CREATE = 'create'
+  COMPARE_AND_SWAP = 'compareAndSwap'
+  COMPARE_AND_DELETE = 'compareAndDelete'
+
+
+class EphemeralBuilderManager(BaseManager):
+  """ Build manager implementation for the Enterprise Registry. """
+  _executors = {
+    'popen': PopenExecutor,
+    'ec2': EC2Executor,
+  }
+
+  _etcd_client_klass = etcd.Client
+
+  def __init__(self, *args, **kwargs):
+    self._shutting_down = False
+
+    self._manager_config = None
+    self._async_thread_executor = None
+    self._etcd_client = None
+
+    self._etcd_realm_prefix = None
+    self._etcd_builder_prefix = None
+
+    self._component_to_job = {}
+    self._job_uuid_to_component = {}
+    self._component_to_builder = {}
+
+    self._executor = None
+
+    # Map of etcd keys being watched to the tasks watching them
+    self._watch_tasks = {}
+
+    super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
+  def _watch_etcd(self, etcd_key, change_callback, recursive=True):
+    watch_task_key = (etcd_key, recursive)
+    def callback_wrapper(changed_key_future):
+      if watch_task_key not in self._watch_tasks or self._watch_tasks[watch_task_key].done():
+        self._watch_etcd(etcd_key, change_callback)
+
+      if changed_key_future.cancelled():
+        # Due to lack of interest, tomorrow has been cancelled
+        return
+
+      try:
+        etcd_result = changed_key_future.result()
+      except (ReadTimeoutError, ProtocolError):
+        return
+
+      change_callback(etcd_result)
+
+    if not self._shutting_down:
+      watch_future = self._etcd_client.watch(etcd_key, recursive=recursive,
+                                             timeout=ETCD_DISABLE_TIMEOUT)
+      watch_future.add_done_callback(callback_wrapper)
+      logger.debug('Scheduling watch of key: %s%s', etcd_key, '/*' if recursive else '')
+      self._watch_tasks[watch_task_key] = async(watch_future)
+
+  def _handle_builder_expiration(self, etcd_result):
+    if etcd_result.action == EtcdAction.EXPIRE:
+      # Handle the expiration
+      logger.debug('Builder expired, clean up the old build node')
+      job_metadata = json.loads(etcd_result._prev_node.value)
+
+      if 'builder_id' in job_metadata:
+        logger.info('Terminating expired build node.')
+        async(self._executor.stop_builder(job_metadata['builder_id']))
+
+  def _handle_realm_change(self, etcd_result):
+    if etcd_result.action == EtcdAction.CREATE:
+      # We must listen on the realm created by ourselves or another worker
+      realm_spec = json.loads(etcd_result.value)
+      self._register_realm(realm_spec)
+
+    elif etcd_result.action == EtcdAction.DELETE or etcd_result.action == EtcdAction.EXPIRE:
+      # We must stop listening for new connections on the specified realm, if we did not get the
+      # connection
+      realm_spec = json.loads(etcd_result._prev_node.value)
+      build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
+      component = self._job_uuid_to_component.pop(build_job.job_details['build_uuid'], None)
+      if component is not None:
+        # We were not the manager which the worker connected to, remove the bookkeeping for it
+        logger.debug('Unregistering unused component on realm: %s', realm_spec['realm'])
+        del self._component_to_job[component]
+        del self._component_to_builder[component]
+        self.unregister_component(component)
+
+    else:
+      logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key)
+
+  def _register_realm(self, realm_spec):
+    logger.debug('Registering realm with manager: %s', realm_spec['realm'])
+    component = self.register_component(realm_spec['realm'], BuildComponent,
+                                        token=realm_spec['token'])
+    build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
+    self._component_to_job[component] = build_job
+    self._component_to_builder[component] = realm_spec['builder_id']
+    self._job_uuid_to_component[build_job.job_details['build_uuid']] = component
+
+  @coroutine
+  def _register_existing_realms(self):
+    try:
+      all_realms = yield From(self._etcd_client.read(self._etcd_realm_prefix, recursive=True))
+      for realm in all_realms.children:
+        if not realm.dir:
+          self._register_realm(json.loads(realm.value))
+    except KeyError:
+      # no realms have been registered yet
+      pass
+
+  def initialize(self, manager_config):
+    logger.debug('Calling initialize')
+    self._manager_config = manager_config
+
+    executor_klass = self._executors.get(manager_config.get('EXECUTOR', ''), PopenExecutor)
+    self._executor = executor_klass(manager_config.get('EXECUTOR_CONFIG', {}),
+                                    self.manager_hostname)
+
+    etcd_host = self._manager_config.get('ETCD_HOST', '127.0.0.1')
+    etcd_port = self._manager_config.get('ETCD_PORT', 2379)
+    etcd_auth = self._manager_config.get('ETCD_CERT_AND_KEY', None)
+    etcd_ca_cert = self._manager_config.get('ETCD_CA_CERT', None)
+    etcd_protocol = 'http' if etcd_auth is None else 'https'
+    logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
+
+    worker_threads = self._manager_config.get('ETCD_WORKER_THREADS', 5)
+    self._async_thread_executor = ThreadPoolExecutor(worker_threads)
+    self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port,
+                                                             cert=etcd_auth, ca_cert=etcd_ca_cert,
+                                                             protocol=etcd_protocol),
+                                     executor=self._async_thread_executor)
+
+    self._etcd_builder_prefix = self._manager_config.get('ETCD_BUILDER_PREFIX', 'building/')
+    self._watch_etcd(self._etcd_builder_prefix, self._handle_builder_expiration)
+
+    self._etcd_realm_prefix = self._manager_config.get('ETCD_REALM_PREFIX', 'realm/')
+    self._watch_etcd(self._etcd_realm_prefix, self._handle_realm_change)
+
+    # Load components for all realms currently known to the cluster
+    async(self._register_existing_realms())
+  def setup_time(self):
+    setup_time = self._manager_config.get('MACHINE_SETUP_TIME', 300)
+    return setup_time
+
+  def shutdown(self):
+    logger.debug('Shutting down worker.')
+    self._shutting_down = True
+
+    for (etcd_key, _), task in self._watch_tasks.items():
+      if not task.done():
+        logger.debug('Canceling watch task for %s', etcd_key)
+        task.cancel()
+
+    if self._async_thread_executor is not None:
+      logger.debug('Shutting down thread pool executor.')
+      self._async_thread_executor.shutdown()
+
+  @coroutine
+  def schedule(self, build_job):
+    build_uuid = build_job.job_details['build_uuid']
+    logger.debug('Calling schedule with job: %s', build_uuid)
+
+    # Check if there are worker slots avialable by checking the number of jobs in etcd
+    allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
+    try:
+      building = yield From(self._etcd_client.read(self._etcd_builder_prefix, recursive=True))
+      workers_alive = sum(1 for child in building.children if not child.dir)
+    except KeyError:
+      workers_alive = 0
+
+    logger.debug('Total jobs: %s', workers_alive)
+
+    if workers_alive >= allowed_worker_count:
+      logger.info('Too many workers alive, unable to start new worker. %s >= %s', workers_alive,
+                  allowed_worker_count)
+      raise Return(False)
+
+    job_key = self._etcd_job_key(build_job)
+
+    # First try to take a lock for this job, meaning we will be responsible for its lifeline
+    realm = str(uuid.uuid4())
+    token = str(uuid.uuid4())
+    ttl = self.setup_time()
+    expiration = datetime.utcnow() + timedelta(seconds=ttl)
+
+    machine_max_expiration = self._manager_config.get('MACHINE_MAX_TIME', 7200)
+    max_expiration = datetime.utcnow() + timedelta(seconds=machine_max_expiration)
+
+    payload = {
+      'expiration': calendar.timegm(expiration.timetuple()),
+      'max_expiration': calendar.timegm(max_expiration.timetuple()),
+    }
+
+    try:
+      yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=False, ttl=ttl))
+    except KeyError:
+      # The job was already taken by someone else, we are probably a retry
+      logger.error('Job already exists in etcd, are timeouts misconfigured or is the queue broken?')
+      raise Return(False)
+
+    logger.debug('Starting builder with executor: %s', self._executor)
+    builder_id = yield From(self._executor.start_builder(realm, token, build_uuid))
+
+    # Store the builder in etcd associated with the job id
+    payload['builder_id'] = builder_id
+    yield From(self._etcd_client.write(job_key, json.dumps(payload), prevExist=True, ttl=ttl))
+
+    # Store the realm spec which will allow any manager to accept this builder when it connects
+    realm_spec = json.dumps({
+      'realm': realm,
+      'token': token,
+      'builder_id': builder_id,
+      'job_queue_item': build_job.job_item,
+    })
+    try:
+      yield From(self._etcd_client.write(self._etcd_realm_key(realm), realm_spec, prevExist=False,
+                                         ttl=ttl))
+    except KeyError:
+      logger.error('Realm already exists in etcd. UUID collision or something is very very wrong.')
+      raise Return(False)
+
+    raise Return(True)
+  @coroutine
+  def build_component_ready(self, build_component):
+    try:
+      # Clean up the bookkeeping for allowing any manager to take the job
+      job = self._component_to_job.pop(build_component)
+      del self._job_uuid_to_component[job.job_details['build_uuid']]
+      yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm)))
+
+      logger.debug('Sending build %s to newly ready component on realm %s',
+                   job.job_details['build_uuid'], build_component.builder_realm)
+      yield From(build_component.start_build(job))
+    except KeyError:
+      logger.debug('Builder is asking for more work, but work already completed')
+
+  def build_component_disposed(self, build_component, timed_out):
+    logger.debug('Calling build_component_disposed.')
+
+    # TODO make it so that I don't have to unregister the component if it timed out
+    self.unregister_component(build_component)
+
+  @coroutine
+  def job_completed(self, build_job, job_status, build_component):
+    logger.debug('Calling job_completed with status: %s', job_status)
+
+    # Kill the ephmeral builder
+    yield From(self._executor.stop_builder(self._component_to_builder.pop(build_component)))
+
+    # Release the lock in etcd
+    job_key = self._etcd_job_key(build_job)
+    yield From(self._etcd_client.delete(job_key))
+
+    self.job_complete_callback(build_job, job_status)
+
+  @coroutine
+  def job_heartbeat(self, build_job):
+    # Extend the deadline in etcd
+    job_key = self._etcd_job_key(build_job)
+    build_job_metadata_response = yield From(self._etcd_client.read(job_key))
+    build_job_metadata = json.loads(build_job_metadata_response.value)
+
+    max_expiration = datetime.utcfromtimestamp(build_job_metadata['max_expiration'])
+    max_expiration_remaining = max_expiration - datetime.utcnow()
+    max_expiration_sec = max(0, int(max_expiration_remaining.total_seconds()))
+
+    ttl = min(self.heartbeat_period_sec * 2, max_expiration_sec)
+    new_expiration = datetime.utcnow() + timedelta(seconds=ttl)
+
+    payload = {
+      'expiration': calendar.timegm(new_expiration.timetuple()),
+      'builder_id': build_job_metadata['builder_id'],
+      'max_expiration': build_job_metadata['max_expiration'],
+    }
+
+    yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=ttl))
+
+    self.job_heartbeat_callback(build_job)
+
+  def _etcd_job_key(self, build_job):
+    """ Create a key which is used to track a job in etcd.
+    """
+    return os.path.join(self._etcd_builder_prefix, build_job.job_details['build_uuid'])
+
+  def _etcd_realm_key(self, realm):
+    """ Create a key which is used to track an incoming connection on a realm.
+    """
+    return os.path.join(self._etcd_realm_prefix, realm)
+
+  def num_workers(self):
+    """ Return the number of workers we're managing locally.
+    """
+    return len(self._component_to_builder)
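Note: putting the two key helpers together, the manager's etcd layout (under the default prefixes) looks roughly like this; the identifiers are placeholders:

    # building/<build_uuid>  (TTL-bound job lock, refreshed by job_heartbeat)
    #   {"expiration": 1419120000, "max_expiration": 1419127200, "builder_id": "i-0abc123"}
    #
    # realm/<realm_uuid>  (lets whichever manager the worker reaches adopt it)
    #   {"realm": "...", "token": "...", "builder_id": "i-0abc123", "job_queue_item": {...}}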

buildman/manager/executor.py (new file, 238 lines added)

@@ -0,0 +1,238 @@
+import logging
+import os
+import uuid
+import threading
+import boto.ec2
+import requests
+import cachetools
+
+from jinja2 import FileSystemLoader, Environment
+from trollius import coroutine, From, Return, get_event_loop
+from functools import partial
+
+from buildman.asyncutil import AsyncWrapper
+from container_cloud_config import CloudConfigContext
+
+
+logger = logging.getLogger(__name__)
+
+
+ONE_HOUR = 60*60
+
+ENV = Environment(loader=FileSystemLoader('buildman/templates'))
+TEMPLATE = ENV.get_template('cloudconfig.yaml')
+CloudConfigContext().populate_jinja_environment(ENV)
+
+class ExecutorException(Exception):
+  """ Exception raised when there is a problem starting or stopping a builder.
+  """
+  pass
+
+
+class BuilderExecutor(object):
+  def __init__(self, executor_config, manager_hostname):
+    self.executor_config = executor_config
+    self.manager_hostname = manager_hostname
+
+  """ Interface which can be plugged into the EphemeralNodeManager to provide a strategy for
+      starting and stopping builders.
+  """
+  @coroutine
+  def start_builder(self, realm, token, build_uuid):
+    """ Create a builder with the specified config. Returns a unique id which can be used to manage
+        the builder.
+    """
+    raise NotImplementedError
+
+  @coroutine
+  def stop_builder(self, builder_id):
+    """ Stop a builder which is currently running.
+    """
+    raise NotImplementedError
+
+  def get_manager_websocket_url(self):
+    return 'ws://{0}:'
+
+  def generate_cloud_config(self, realm, token, coreos_channel, manager_hostname,
+                            quay_username=None, quay_password=None):
+    if quay_username is None:
+      quay_username = self.executor_config['QUAY_USERNAME']
+
+    if quay_password is None:
+      quay_password = self.executor_config['QUAY_PASSWORD']
+
+    return TEMPLATE.render(
+      realm=realm,
+      token=token,
+      quay_username=quay_username,
+      quay_password=quay_password,
+      manager_hostname=manager_hostname,
+      coreos_channel=coreos_channel,
+      worker_tag=self.executor_config['WORKER_TAG'],
+    )
class EC2Executor(BuilderExecutor):
  """ Implementation of BuilderExecutor which uses boto to start builder machines on EC2.
  """
  COREOS_STACK_URL = 'http://%s.release.core-os.net/amd64-usr/current/coreos_production_ami_hvm.txt'

  def __init__(self, *args, **kwargs):
    self._loop = get_event_loop()
    super(EC2Executor, self).__init__(*args, **kwargs)

  def _get_conn(self):
    """ Creates an ec2 connection which can be used to manage instances.
    """
    return AsyncWrapper(boto.ec2.connect_to_region(
      self.executor_config['EC2_REGION'],
      aws_access_key_id=self.executor_config['AWS_ACCESS_KEY'],
      aws_secret_access_key=self.executor_config['AWS_SECRET_KEY'],
    ))

  @classmethod
  @cachetools.ttl_cache(ttl=ONE_HOUR)
  def _get_coreos_ami(cls, ec2_region, coreos_channel):
    """ Retrieve the CoreOS AMI id from the canonical listing.
    """
    stack_list_string = requests.get(EC2Executor.COREOS_STACK_URL % coreos_channel).text
    stack_amis = dict([stack.split('=') for stack in stack_list_string.split('|')])
    return stack_amis[ec2_region]
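The CoreOS release listing fetched above is a single line of pipe-delimited region=ami pairs, which is why a split on '|' followed by a split on '=' is enough to build the lookup table. A small sketch with made-up AMI ids (the real file's values change with every release):

# Illustrative listing only; real contents come from release.core-os.net.
sample_listing = 'us-east-1=ami-00000000|us-west-2=ami-11111111|eu-west-1=ami-22222222'

stack_amis = dict(entry.split('=') for entry in sample_listing.split('|'))
assert stack_amis['us-west-2'] == 'ami-11111111'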
  @coroutine
  def start_builder(self, realm, token, build_uuid):
    region = self.executor_config['EC2_REGION']
    channel = self.executor_config.get('COREOS_CHANNEL', 'stable')
    get_ami_callable = partial(self._get_coreos_ami, region, channel)
    coreos_ami = yield From(self._loop.run_in_executor(None, get_ami_callable))
    user_data = self.generate_cloud_config(realm, token, channel, self.manager_hostname)

    logger.debug('Generated cloud config: %s', user_data)

    ec2_conn = self._get_conn()

    ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType(
      size=32,
      volume_type='gp2',
      delete_on_termination=True,
    )
    block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping()
    block_devices['/dev/xvda'] = ssd_root_ebs

    interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
      subnet_id=self.executor_config['EC2_VPC_SUBNET_ID'],
      groups=self.executor_config['EC2_SECURITY_GROUP_IDS'],
      associate_public_ip_address=True,
    )
    interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(interface)

    reservation = yield From(ec2_conn.run_instances(
      coreos_ami,
      instance_type=self.executor_config['EC2_INSTANCE_TYPE'],
      key_name=self.executor_config.get('EC2_KEY_NAME', None),
      user_data=user_data,
      instance_initiated_shutdown_behavior='terminate',
      block_device_map=block_devices,
      network_interfaces=interfaces,
    ))

    if not reservation.instances:
      raise ExecutorException('Unable to spawn builder instance.')
    elif len(reservation.instances) != 1:
      raise ExecutorException('EC2 started wrong number of instances!')

    launched = AsyncWrapper(reservation.instances[0])
    yield From(launched.add_tags({
      'Name': 'Quay Ephemeral Builder',
      'Realm': realm,
      'Token': token,
      'BuildUUID': build_uuid,
    }))
    raise Return(launched.id)

  @coroutine
  def stop_builder(self, builder_id):
    ec2_conn = self._get_conn()
    terminated_instances = yield From(ec2_conn.terminate_instances([builder_id]))
    if builder_id not in [si.id for si in terminated_instances]:
      raise ExecutorException('Unable to terminate instance: %s' % builder_id)
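EC2Executor reads everything it needs from executor_config. A sketch of the expected keys with placeholder values (the key names are taken from the code above; every value is invented):

executor_config = {
  'EC2_REGION': 'us-east-1',
  'AWS_ACCESS_KEY': '...',                   # elided
  'AWS_SECRET_KEY': '...',                   # elided
  'EC2_INSTANCE_TYPE': 'm3.medium',          # hypothetical instance type
  'EC2_VPC_SUBNET_ID': 'subnet-00000000',    # placeholder subnet
  'EC2_SECURITY_GROUP_IDS': ['sg-00000000'], # placeholder security group
  'EC2_KEY_NAME': 'builder-key',             # optional; omit to launch without a key pair
  'COREOS_CHANNEL': 'stable',                # optional; 'stable' is the default
  'QUAY_USERNAME': '...',                    # elided; used to pull the worker image
  'QUAY_PASSWORD': '...',                    # elided
  'WORKER_TAG': 'latest',                    # hypothetical worker image tag
}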
class PopenExecutor(BuilderExecutor):
  """ Implementation of BuilderExecutor which uses Popen to fork a quay-builder process.
  """
  def __init__(self, executor_config, manager_hostname):
    self._jobs = {}

    super(PopenExecutor, self).__init__(executor_config, manager_hostname)

  @coroutine
  def start_builder(self, realm, token, build_uuid):
    # Now start a machine for this job, adding the machine id to the etcd information
    logger.debug('Forking process for build')
    builder_env = {
      'TOKEN': token,
      'REALM': realm,
      'ENDPOINT': 'ws://localhost:8787',
      'DOCKER_TLS_VERIFY': os.environ.get('DOCKER_TLS_VERIFY', ''),
      'DOCKER_CERT_PATH': os.environ.get('DOCKER_CERT_PATH', ''),
      'DOCKER_HOST': os.environ.get('DOCKER_HOST', ''),
    }

    logpipe = LogPipe(logging.INFO)
    # NOTE: the binary path below is hardcoded to a developer machine; this executor is
    # intended for local development rather than production use.
    spawned = subprocess.Popen('/Users/jake/bin/quay-builder', stdout=logpipe, stderr=logpipe,
                               env=builder_env)

    builder_id = str(uuid.uuid4())
    self._jobs[builder_id] = (spawned, logpipe)
    logger.debug('Builder spawned with id: %s', builder_id)
    raise Return(builder_id)

  @coroutine
  def stop_builder(self, builder_id):
    if builder_id not in self._jobs:
      raise ExecutorException('Builder id not being tracked by executor.')

    logger.debug('Killing builder with id: %s', builder_id)
    spawned, logpipe = self._jobs[builder_id]

    if spawned.poll() is None:
      spawned.kill()
    logpipe.close()
class LogPipe(threading.Thread):
  """ Adapted from http://codereview.stackexchange.com/a/17959
  """
  def __init__(self, level):
    """ Set up the object with a logger and a log level, and start the thread.
    """
    threading.Thread.__init__(self)
    self.daemon = False
    self.level = level
    self.fd_read, self.fd_write = os.pipe()
    self.pipe_reader = os.fdopen(self.fd_read)
    self.start()

  def fileno(self):
    """ Return the write file descriptor of the pipe.
    """
    return self.fd_write

  def run(self):
    """ Run the thread, logging everything read from the pipe.
    """
    for line in iter(self.pipe_reader.readline, ''):
      logging.log(self.level, line.strip('\n'))

    self.pipe_reader.close()

  def close(self):
    """ Close the write end of the pipe.
    """
    os.close(self.fd_write)
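LogPipe works because subprocess.Popen only needs an object exposing fileno(): the write end of the pipe goes to the child process, while the reader thread forwards each line to the logging module. A small usage sketch (the echoed command is illustrative):

import logging
import subprocess

logging.basicConfig(level=logging.INFO)

pipe = LogPipe(logging.INFO)
proc = subprocess.Popen(['echo', 'hello from the builder'], stdout=pipe, stderr=pipe)
proc.wait()
pipe.close()  # closing the write end lets the reader thread drain and exit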
@ -12,8 +12,11 @@ from threading import Event
from trollius.coroutines import From
from datetime import timedelta

from buildman.jobutil.buildstatus import StatusHandler
from buildman.jobutil.buildjob import BuildJob, BuildJobLoadException
from data import database
from data.queue import WorkQueue
from app import app

logger = logging.getLogger(__name__)

@ -22,8 +25,7 @@ TIMEOUT_PERIOD_MINUTES = 20
JOB_TIMEOUT_SECONDS = 300
MINIMUM_JOB_EXTENSION = timedelta(minutes=2)

- WEBSOCKET_PORT = 8787
- CONTROLLER_PORT = 8686
+ HEARTBEAT_PERIOD_SEC = 30

class BuildJobResult(object):
  """ Build job result enum """
@ -35,14 +37,15 @@ class BuilderServer(object):
  """ Server which handles both HTTP and WAMP requests, managing the full state of the build
      controller.
  """
-  def __init__(self, server_hostname, queue, build_logs, user_files, lifecycle_manager_klass):
+  def __init__(self, registry_hostname, queue, build_logs, user_files, lifecycle_manager_klass,
+               lifecycle_manager_config, manager_hostname):
    self._loop = None
    self._current_status = 'starting'
    self._current_components = []
    self._job_count = 0

    self._session_factory = RouterSessionFactory(RouterFactory())
-    self._server_hostname = server_hostname
+    self._registry_hostname = registry_hostname
    self._queue = queue
    self._build_logs = build_logs
    self._user_files = user_files
@ -50,8 +53,11 @@
      self._register_component,
      self._unregister_component,
      self._job_heartbeat,
-      self._job_complete
+      self._job_complete,
+      manager_hostname,
+      HEARTBEAT_PERIOD_SEC,
    )
+    self._lifecycle_manager_config = lifecycle_manager_config

    self._shutdown_event = Event()
    self._current_status = 'running'
@ -81,18 +87,17 @@

    self._controller_app = controller_app

-  def run(self, host, ssl=None):
+  def run(self, host, websocket_port, controller_port, ssl=None):
    logger.debug('Initializing the lifecycle manager')
-    self._lifecycle_manager.initialize()
+    self._lifecycle_manager.initialize(self._lifecycle_manager_config)

    logger.debug('Initializing all members of the event loop')
    loop = trollius.get_event_loop()
-    trollius.Task(self._initialize(loop, host, ssl))

-    logger.debug('Starting server on port %s, with controller on port %s', WEBSOCKET_PORT,
-                 CONTROLLER_PORT)
+    logger.debug('Starting server on port %s, with controller on port %s', websocket_port,
+                 controller_port)
    try:
-      loop.run_forever()
+      loop.run_until_complete(self._initialize(loop, host, websocket_port, controller_port, ssl))
    except KeyboardInterrupt:
      pass
    finally:
@ -116,7 +121,7 @@
    component.parent_manager = self._lifecycle_manager
    component.build_logs = self._build_logs
    component.user_files = self._user_files
-    component.server_hostname = self._server_hostname
+    component.registry_hostname = self._registry_hostname

    self._current_components.append(component)
    self._session_factory.add(component)
@ -130,32 +135,32 @@
    self._session_factory.remove(component)

  def _job_heartbeat(self, build_job):
-    WorkQueue.extend_processing(build_job.job_item(), seconds_from_now=JOB_TIMEOUT_SECONDS,
-                                retry_count=1, minimum_extension=MINIMUM_JOB_EXTENSION)
+    self._queue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS,
+                                  minimum_extension=MINIMUM_JOB_EXTENSION)

  def _job_complete(self, build_job, job_status):
    if job_status == BuildJobResult.INCOMPLETE:
      self._queue.incomplete(build_job.job_item(), restore_retry=True, retry_after=30)
    elif job_status == BuildJobResult.ERROR:
-      self._queue.incomplete(build_job.job_item(), restore_retry=False)
+      self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30)
    else:
-      self._queue.complete(build_job.job_item())
+      self._queue.complete(build_job.job_item)

    self._job_count = self._job_count - 1

    if self._current_status == 'shutting_down' and not self._job_count:
      self._shutdown_event.set()

    # TODO(jschorr): check for work here?

  @trollius.coroutine
  def _work_checker(self):
    while self._current_status == 'running':
-      logger.debug('Checking for more work for %d active workers', self._lifecycle_manager.num_workers())
+      with database.CloseForLongOperation(app.config):
+        yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
+
+      logger.debug('Checking for more work for %d active workers',
+                   self._lifecycle_manager.num_workers())

      job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time())
      if job_item is None:
        logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
-        yield From(trollius.sleep(WORK_CHECK_TIMEOUT))
        continue

      try:
@ -163,20 +168,24 @@
      except BuildJobLoadException as irbe:
        logger.exception(irbe)
        self._queue.incomplete(job_item, restore_retry=False)
        continue

      logger.debug('Build job found. Checking for an available worker.')
-      if self._lifecycle_manager.schedule(build_job, self._loop):
+      scheduled = yield From(self._lifecycle_manager.schedule(build_job))
+      if scheduled:
+        status_handler = StatusHandler(self._build_logs, build_job.repo_build.uuid)
+        status_handler.set_phase('build-scheduled')
+
        self._job_count = self._job_count + 1
        logger.debug('Build job scheduled. Running: %s', self._job_count)
      else:
        logger.debug('All workers are busy. Requeuing.')
        self._queue.incomplete(job_item, restore_retry=True, retry_after=0)

-      yield From(trollius.sleep(WORK_CHECK_TIMEOUT))

  @trollius.coroutine
-  def _initialize(self, loop, host, ssl=None):
+  def _initialize(self, loop, host, websocket_port, controller_port, ssl=None):
    self._loop = loop

    # Create the WAMP server.
@ -184,8 +193,8 @@
    transport_factory.setProtocolOptions(failByDrop=True)

    # Initialize the controller server and the WAMP server
-    create_wsgi_server(self._controller_app, loop=loop, host=host, port=CONTROLLER_PORT, ssl=ssl)
-    yield From(loop.create_server(transport_factory, host, WEBSOCKET_PORT, ssl=ssl))
+    create_wsgi_server(self._controller_app, loop=loop, host=host, port=controller_port, ssl=ssl)
+    yield From(loop.create_server(transport_factory, host, websocket_port, ssl=ssl))

    # Initialize the work queue checker.
    yield From(self._work_checker())

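The heartbeat path above keeps a job alive by repeatedly pushing its processing deadline forward, with MINIMUM_JOB_EXTENSION acting as a floor so the queue row isn't rewritten for trivially small extensions. A rough illustration of that rule under assumed names (the actual logic lives in WorkQueue.extend_processing, which this sketch does not reproduce):

from datetime import datetime, timedelta

JOB_TIMEOUT_SECONDS = 300
MINIMUM_JOB_EXTENSION = timedelta(minutes=2)

def extended_deadline(current_deadline):
  """ Hypothetical helper: return the new deadline a heartbeat would produce. """
  proposed = datetime.utcnow() + timedelta(seconds=JOB_TIMEOUT_SECONDS)
  # Skip the update when the extension would be smaller than the minimum.
  if proposed - current_deadline < MINIMUM_JOB_EXTENSION:
    return current_deadline
  return proposed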
31
buildman/templates/cloudconfig.yaml
Normal file

@ -0,0 +1,31 @@
#cloud-config

ssh_authorized_keys:
- ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCC0m+hVmyR3vn/xoxJe9+atRWBxSK+YXgyufNVDMcb7H00Jfnc341QH3kDVYZamUbhVh/nyc2RP7YbnZR5zORFtgOaNSdkMYrPozzBvxjnvSUokkCCWbLqXDHvIKiR12r+UTSijPJE/Yk702Mb2ejAFuae1C3Ec+qKAoOCagDjpQ3THyb5oaKE7VPHdwCWjWIQLRhC+plu77ObhoXIFJLD13gCi01L/rp4mYVCxIc2lX5A8rkK+bZHnIZwWUQ4t8SIjWxIaUo0FE7oZ83nKuNkYj5ngmLHQLY23Nx2WhE9H6NBthUpik9SmqQPtVYbhIG+bISPoH9Xs8CLrFb0VRjz Joey's Mac
- ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCo6FhAP7mFFOAzM91gtaKW7saahtaN4lur42FMMztz6aqUycIltCmvxo+3FmrXgCG30maMNU36Vm1+9QRtVQEd+eRuoIWP28t+8MT01Fh4zPuE2Wca3pOHSNo3X81FfWJLzmwEHiQKs9HPQqUhezR9PcVWVkbMyAzw85c0UycGmHGFNb0UiRd9HFY6XbgbxhZv/mvKLZ99xE3xkOzS1PNsdSNvjUKwZR7pSUPqNS5S/1NXyR4GhFTU24VPH/bTATOv2ATH+PSzsZ7Qyz9UHj38tKC+ALJHEDJ4HXGzobyOUP78cHGZOfCB5FYubq0zmOudAjKIAhwI8XTFvJ2DX1P3 jimmyzelinskie
- ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDNvw8qo9m8np7yQ/Smv/oklM8bo8VyNRZriGYBDuolWDL/mZpYCQnZJXphQo7RFdNABYistikjJlBuuwUohLf2uSq0iKoFa2TgwI43wViWzvuzU4nA02/ITD5BZdmWAFNyIoqeB50Ol4qUgDwLAZ+7Kv7uCi6chcgr9gTi99jY3GHyZjrMiXMHGVGi+FExFuzhVC2drKjbz5q6oRfQeLtNfG4psl5GU3MQU6FkX4fgoCx0r9R48/b7l4+TT7pWblJQiRfeldixu6308vyoTUEHasdkU3/X0OTaGz/h5XqTKnGQc6stvvoED3w+L3QFp0H5Z8sZ9stSsitmCBrmbcKZ jakemoshenko

write_files:
- path: /root/overrides.list
  permission: '0644'
  content: |
    REALM={{ realm }}
    TOKEN={{ token }}
    SERVER=wss://{{ manager_hostname }}

coreos:
  update:
    reboot-strategy: off
    group: {{ coreos_channel }}

  units:
    {{ dockersystemd('quay-builder',
                     'quay.io/coreos/registry-build-worker',
                     quay_username,
                     quay_password,
                     worker_tag,
                     extra_args='--net=host --privileged --env-file /root/overrides.list -v /var/run/docker.sock:/var/run/docker.sock -v /usr/share/ca-certificates:/etc/ssl/certs',
                     exec_stop_post=['/bin/sh -xc "/bin/sleep 120; /usr/bin/systemctl --no-block poweroff"'],
                     flattened=True,
                     restart_policy='no') | indent(4) }}
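This template is rendered through the module-level Jinja environment that CloudConfigContext populates (which is where the dockersystemd() helper comes from). A sketch of rendering it directly, mirroring what BuilderExecutor.generate_cloud_config does, with placeholder values throughout:

from jinja2 import Environment, FileSystemLoader
from container_cloud_config import CloudConfigContext

env = Environment(loader=FileSystemLoader('buildman/templates'))
CloudConfigContext().populate_jinja_environment(env)  # registers dockersystemd()
template = env.get_template('cloudconfig.yaml')

print(template.render(
  realm='some-realm',                       # placeholder values throughout
  token='some-token',
  quay_username='builder-user',
  quay_password='builder-pass',
  manager_hostname='buildman.example.com',
  coreos_channel='stable',
  worker_tag='latest',
))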