Bug fixes, refactoring and "new" tests for the build manager

- Fixes various bugs introduced in the most recent build system commit
- Refactors state management in the build manager to be cleaner and more contained
- Adds back in the mock-based tests, fixed to not use threads and adjusted for the refactoring
- Adds some more simplified unit tests around non-etch related flows
This commit is contained in:
Joseph Schorr 2016-07-15 18:28:48 -04:00
parent 9f6b47ad1f
commit 2c1880b944
4 changed files with 503 additions and 145 deletions

View file

@ -5,6 +5,7 @@ import calendar
import os.path
import json
from collections import namedtuple
from datetime import datetime, timedelta
from trollius import From, coroutine, Return, async
from concurrent.futures import ThreadPoolExecutor
@ -28,12 +29,6 @@ RETRY_IMMEDIATELY_TIMEOUT = 0
NO_WORKER_AVAILABLE_TIMEOUT = 10
DEFAULT_EPHEMERAL_API_TIMEOUT = 20
EXECUTORS = {
'popen': PopenExecutor,
'ec2': EC2Executor,
'kubernetes': KubernetesExecutor,
}
class EtcdAction(object):
GET = 'get'
SET = 'set'
@ -44,13 +39,28 @@ class EtcdAction(object):
COMPARE_AND_SWAP = 'compareAndSwap'
COMPARE_AND_DELETE = 'compareAndDelete'
BuildInfo = namedtuple('BuildInfo', ['component', 'build_job', 'execution_id', 'executor_name'])
def createAsyncEtcdClient(worker_threads=1, **kwargs):
client = etcd.Client(**kwargs)
async_executor = ThreadPoolExecutor(worker_threads)
return AsyncWrapper(client, executor=async_executor), async_executor
class EphemeralBuilderManager(BaseManager):
""" Build manager implementation for the Enterprise Registry. """
_etcd_client_klass = etcd.Client
EXECUTORS = {
'popen': PopenExecutor,
'ec2': EC2Executor,
'kubernetes': KubernetesExecutor,
}
def __init__(self, *args, **kwargs):
self._etcd_client_creator = kwargs.pop('etcd_creator', createAsyncEtcdClient)
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
self._shutting_down = False
self._manager_config = None
@ -58,22 +68,24 @@ class EphemeralBuilderManager(BaseManager):
self._etcd_client = None
self._etcd_realm_prefix = None
self._etcd_builder_prefix = None
self._etcd_job_prefix = None
self._etcd_lock_prefix = None
self._ephemeral_api_timeout = DEFAULT_EPHEMERAL_API_TIMEOUT
self._component_to_job = {}
self._job_uuid_to_component = {}
self._component_to_builder = {}
self._job_to_executor = {}
# The registered executors available for running jobs, in order.
self._ordered_executors = []
self._executors = []
# The registered executors, mapped by their unique name.
self._executor_name_to_executor = {}
# Map of etcd keys being watched to the tasks watching them
self._watch_tasks = {}
super(EphemeralBuilderManager, self).__init__(*args, **kwargs)
# Map from builder component to its associated job.
self._component_to_job = {}
# Map from build UUID to a BuildInfo tuple with information about the build.
self._build_uuid_to_info = {}
def _watch_etcd(self, etcd_key, change_callback, start_index=None, recursive=True,
restarter=None):
@ -129,7 +141,6 @@ class EphemeralBuilderManager(BaseManager):
if not self._shutting_down:
logger.debug('Scheduling watch of key: %s%s at start index %s', etcd_key,
'*' if recursive else '', start_index)
watch_future = self._etcd_client.watch(etcd_key, recursive=recursive, index=start_index,
timeout=ETCD_MAX_WATCH_TIMEOUT)
watch_future.add_done_callback(callback_wrapper)
@ -137,7 +148,8 @@ class EphemeralBuilderManager(BaseManager):
self._watch_tasks[watch_task_key] = async(watch_future)
@coroutine
def _handle_builder_expiration(self, etcd_result):
def _handle_job_expiration(self, etcd_result):
""" Handler invoked whenever a job expires in etcd. """
if etcd_result is None:
return
@ -145,26 +157,26 @@ class EphemeralBuilderManager(BaseManager):
# Handle the expiration
logger.debug('Builder expired, clean up the old build node')
job_metadata = json.loads(etcd_result._prev_node.value)
build_job = BuildJob(AttrDict(job_metadata['job_queue_item']))
build_info = self._build_uuid_to_info.get(build_job.build_uuid)
if build_info is None:
logger.error('Could not find build info for job %s under etcd expire with metadata: %s',
build_job.build_uuid, job_metadata)
return
if 'builder_id' in job_metadata:
builder_id = job_metadata['builder_id']
execution_id = build_info.execution_id
# Before we delete the build node, we take a lock to make sure that only one manager
# can terminate the node.
try:
lock_key = self._etcd_lock_key(builder_id)
yield From(self._etcd_client.write(lock_key, '', prevExist=False, ttl=self.setup_time()))
except (KeyError, etcd.EtcdKeyError):
logger.debug('Somebody else is cleaning up the build node: %s', builder_id)
return
# If we have not yet received a heartbeat, then the node failed to boot in some way. We mark
# the job as incomplete here.
if not job_metadata.get('had_heartbeat', True):
logger.warning('Build executor failed to successfully boot with execution id %s',
execution_id)
self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE)
if not job_metadata.get('had_heartbeat', True):
logger.warning('Build node failed to successfully boot: %s', builder_id)
build_job = BuildJob(AttrDict(job_metadata['job_queue_item']))
self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE)
logger.info('Terminating expired build node: %s', builder_id)
yield From(self._job_to_executor[builder_id].stop_builder(builder_id))
# Finally, we terminate the build execution for the job.
logger.info('Terminating expired build executor for job %s with execution id %s',
build_job.build_uuid, execution_id)
yield From(self.kill_builder_executor(build_job.build_uuid))
def _handle_realm_change(self, etcd_result):
if etcd_result is None:
@ -180,13 +192,14 @@ class EphemeralBuilderManager(BaseManager):
# connection
realm_spec = json.loads(etcd_result._prev_node.value)
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
component = self._job_uuid_to_component.pop(build_job.job_details['build_uuid'], None)
if component is not None:
build_uuid = build_job.build_uuid
build_info = self._build_uuid_to_info.pop(build_uuid, None)
if build_info is not None:
# We were not the manager which the worker connected to, remove the bookkeeping for it
logger.debug('Unregistering unused component on realm: %s', realm_spec['realm'])
del self._component_to_job[component]
del self._component_to_builder[component]
self.unregister_component(component)
logger.debug('Unregistering unused component for build %s', build_uuid)
self._component_to_job.pop(build_info.component, None)
self.unregister_component(build_info.component)
else:
logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key)
@ -200,15 +213,21 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Realm already registered with manager: %s', realm_spec['realm'])
return component
# Create the build information block for the registered realm.
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
execution_id = realm_spec['execution_id']
executor_name = realm_spec['executor_name']
build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id,
executor_name=executor_name)
self._component_to_job[component] = build_job
self._component_to_builder[component] = realm_spec['builder_id']
self._job_uuid_to_component[build_job.job_details['build_uuid']] = component
self._build_uuid_to_info[build_job.build_uuid] = build_info
return component
@property
def registered_executors(self):
return self._executors
return self._ordered_executors
@coroutine
def _register_existing_realms(self):
@ -223,22 +242,27 @@ class EphemeralBuilderManager(BaseManager):
encountered.add(component)
# Remove any components not encountered so we can clean up.
for found in list(self._component_to_job.keys()):
if not found in encountered:
self._component_to_job.pop(component)
self._component_to_builder.pop(component)
for component, job in list(self._component_to_job.items()):
if not component in encountered:
self._component_to_job.pop(component, None)
self._build_uuid_to_info.pop(job.build_uuid, None)
except (KeyError, etcd.EtcdKeyError):
# no realms have been registered yet
pass
def _load_executor(self, executor_class_name, executor_config):
executor_klass = EXECUTORS.get(executor_class_name)
def _load_executor(self, executor_kind_name, executor_config):
executor_klass = EphemeralBuilderManager.EXECUTORS.get(executor_kind_name)
if executor_klass is None:
logger.error('Unknown executor %s; skipping install', executor_class_name)
logger.error('Unknown executor %s; skipping install', executor_kind_name)
return
self._executors.append(executor_klass(executor_config, self.manager_hostname))
executor = executor_klass(executor_config, self.manager_hostname)
if executor.name in self._executor_name_to_executor:
raise Exception('Executor with name %s already registered' % executor.name)
self._ordered_executors.append(executor)
self._executor_name_to_executor[executor.name] = executor
def initialize(self, manager_config):
logger.debug('Calling initialize')
@ -265,21 +289,17 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Connecting to etcd on %s:%s', etcd_host, etcd_port)
worker_threads = self._manager_config.get('ETCD_WORKER_THREADS', 5)
self._async_thread_executor = ThreadPoolExecutor(worker_threads)
self._etcd_client = AsyncWrapper(self._etcd_client_klass(host=etcd_host, port=etcd_port,
cert=etcd_auth, ca_cert=etcd_ca_cert,
protocol=etcd_protocol,
read_timeout=5),
executor=self._async_thread_executor)
(self._etcd_client, self._async_thread_executor) = self._etcd_client_creator(worker_threads,
host=etcd_host, port=etcd_port, cert=etcd_auth, ca_cert=etcd_ca_cert,
protocol=etcd_protocol, read_timeout=5)
self._etcd_builder_prefix = self._manager_config.get('ETCD_BUILDER_PREFIX', 'building/')
self._watch_etcd(self._etcd_builder_prefix, self._handle_builder_expiration)
self._etcd_job_prefix = self._manager_config.get('ETCD_BUILDER_PREFIX', 'building/')
self._watch_etcd(self._etcd_job_prefix, self._handle_job_expiration)
self._etcd_realm_prefix = self._manager_config.get('ETCD_REALM_PREFIX', 'realm/')
self._watch_etcd(self._etcd_realm_prefix, self._handle_realm_change,
restarter=self._register_existing_realms)
self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'locks/')
self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT',
DEFAULT_EPHEMERAL_API_TIMEOUT)
@ -310,8 +330,8 @@ class EphemeralBuilderManager(BaseManager):
# Check if there are worker slots avialable by checking the number of jobs in etcd
allowed_worker_count = self._manager_config.get('ALLOWED_WORKER_COUNT', 1)
try:
building = yield From(self._etcd_client.read(self._etcd_builder_prefix, recursive=True))
workers_alive = sum(1 for child in building.children if not child.dir)
active_jobs = yield From(self._etcd_client.read(self._etcd_job_prefix, recursive=True))
workers_alive = sum(1 for child in active_jobs.children if not child.dir)
except (KeyError, etcd.EtcdKeyError):
workers_alive = 0
except etcd.EtcdException:
@ -359,40 +379,40 @@ class EphemeralBuilderManager(BaseManager):
raise Return(False, RETRY_IMMEDIATELY_TIMEOUT)
started_with_executor = None
builder_id = None
execution_id = None
logger.debug("Registered executors are: %s", [ex.__class__.__name__ for ex in self._executors])
for executor in self._executors:
executor_type = executor.__class__.__name__
logger.debug("Registered executors are: %s", [ex.name for ex in self._ordered_executors])
for executor in self._ordered_executors:
# Check if we can use this executor based on its whitelist, by namespace.
namespace = build_job.namespace
if not executor.allowed_for_namespace(namespace):
logger.debug('Job %s (namespace: %s) cannot use executor %s', build_uuid, namespace,
executor_type)
executor.name)
continue
# Check if we can use this executor based on the retries remaining.
if executor.minimum_retry_threshold > build_job.retries_remaining:
logger.debug('Job %s cannot use executor %s as it is below retry threshold (retry #: %s)',
build_uuid, executor_type, build_job.retries_remaining)
logger.debug('Job %s cannot use executor %s as it is below retry threshold %s (retry #%s)',
build_uuid, executor.name, executor.minimum_retry_threshold,
build_job.retries_remaining)
continue
logger.debug('Starting builder for job %s with selected executor: %s', build_uuid,
executor_type)
executor.name)
try:
builder_id = yield From(executor.start_builder(realm, token, build_uuid))
execution_id = yield From(executor.start_builder(realm, token, build_uuid))
except:
logger.exception('Exception when starting builder for job: %s', build_uuid)
continue
try:
metric_queue.put_deprecated('EphemeralBuilderStarted', 1, unit='Count')
metric_queue.ephemeral_build_workers.Inc(labelvalues=[builder_id, build_uuid])
metric_queue.ephemeral_build_workers.Inc(labelvalues=[execution_id, build_uuid])
except:
logger.exception('Exception when writing start metrics for builder %s for job %s',
builder_id, build_uuid)
logger.exception('Exception when writing start metrics for execution %s for job %s',
execution_id, build_uuid)
started_with_executor = executor
@ -403,23 +423,15 @@ class EphemeralBuilderManager(BaseManager):
logger.error('Could not start ephemeral worker for build %s', build_uuid)
raise Return(False, self._ephemeral_api_timeout)
logger.debug('Started builder with ID %s for job: %s with executor: %s', builder_id, build_uuid,
started_with_executor.__class__.__name__)
# Store the builder in etcd associated with the job id
try:
payload['builder_id'] = builder_id
yield From(self._etcd_client.write(job_key, json.dumps(payload), prevValue=lock_payload,
ttl=setup_time))
except etcd.EtcdException:
logger.exception('Exception when writing job %s to etcd', build_uuid)
raise Return(False, self._ephemeral_api_timeout)
logger.debug('Started execution with ID %s for job: %s with executor: %s',
execution_id, build_uuid, started_with_executor.name)
# Store the realm spec which will allow any manager to accept this builder when it connects
realm_spec = json.dumps({
'realm': realm,
'token': token,
'builder_id': builder_id,
'execution_id': execution_id,
'executor_name': started_with_executor.name,
'job_queue_item': build_job.job_item,
})
@ -434,22 +446,27 @@ class EphemeralBuilderManager(BaseManager):
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
raise Return(False, setup_time)
self._job_to_executor[builder_id] = started_with_executor
logger.debug('Builder spawn complete for job %s using executor %s with ID %s ', build_uuid,
started_with_executor.__class__.__name__, builder_id)
started_with_executor.name, execution_id)
raise Return(True, None)
@coroutine
def build_component_ready(self, build_component):
try:
# Clean up the bookkeeping for allowing any manager to take the job
# Pop off the job for the component. We do so before we send out the etcd watch below,
# as it will also remove this mapping.
job = self._component_to_job.pop(build_component)
del self._job_uuid_to_component[job.job_details['build_uuid']]
if job is None:
logger.error('Could not find job for the build component on realm %s',
build_component.builder_realm)
return
# Clean up the bookkeeping for allowing any manager to take the job.
yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm)))
# Start the build job.
logger.debug('Sending build %s to newly ready component on realm %s',
job.job_details['build_uuid'], build_component.builder_realm)
job.build_uuid, build_component.builder_realm)
yield From(build_component.start_build(job))
except (KeyError, etcd.EtcdKeyError):
logger.warning('Builder is asking for more work, but work already completed')
@ -460,21 +477,46 @@ class EphemeralBuilderManager(BaseManager):
@coroutine
def job_completed(self, build_job, job_status, build_component):
logger.debug('Calling job_completed with status: %s', job_status)
logger.debug('Calling job_completed for job %s with status: %s',
build_job.build_uuid, job_status)
# Kill the ephmeral builder
builder_id = self._component_to_builder.pop(build_component)
yield From(self._job_to_executor[builder_id].stop_builder(builder_id))
del self._job_to_executor[builder_id]
# Mark the job as completed.
self.job_complete_callback(build_job, job_status)
# Release the lock in etcd
# Kill the ephmeral builder.
yield From(self.kill_builder_executor(build_job.build_uuid))
# Delete the build job from etcd.
job_key = self._etcd_job_key(build_job)
try:
yield From(self._etcd_client.delete(job_key))
except (KeyError, etcd.EtcdKeyError):
logger.debug('Builder is asking for job to be removed, but work already completed')
self.job_complete_callback(build_job, job_status)
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
@coroutine
def kill_builder_executor(self, build_uuid):
logger.info('Starting termination of executor for job %s', build_uuid)
build_info = self._build_uuid_to_info.pop(build_uuid, None)
if build_info is None:
logger.error('Could not find build information for build %s', build_uuid)
return
# Remove the build's component.
self._component_to_job.pop(build_info.component, None)
# Stop the build node/executor itself.
executor = self._executor_name_to_executor.get(build_info.executor_name)
if executor is None:
logger.error('Could not find registered executor %s for build %s',
build_info.executor_name, build_uuid)
return
# Terminate the executor's execution.
logger.info('Terminating executor for job %s with execution id %s',
build_uuid, build_info.execution_id)
yield From(executor.stop_builder(build_info.execution_id))
@coroutine
def job_heartbeat(self, build_job):
@ -484,7 +526,7 @@ class EphemeralBuilderManager(BaseManager):
try:
build_job_metadata_response = yield From(self._etcd_client.read(job_key))
except (KeyError, etcd.EtcdKeyError):
logger.info('Job %s no longer exists in etcd', build_job.job_details['build_uuid'])
logger.info('Job %s no longer exists in etcd', build_job.build_uuid)
return
build_job_metadata = json.loads(build_job_metadata_response.value)
@ -498,25 +540,18 @@ class EphemeralBuilderManager(BaseManager):
payload = {
'expiration': calendar.timegm(new_expiration.timetuple()),
'builder_id': build_job_metadata['builder_id'],
'job_queue_item': build_job.job_item,
'max_expiration': build_job_metadata['max_expiration'],
'had_heartbeat': True,
}
yield From(self._etcd_client.write(job_key, json.dumps(payload), ttl=ttl))
self.job_heartbeat_callback(build_job)
def _etcd_job_key(self, build_job):
""" Create a key which is used to track a job in etcd.
"""
return os.path.join(self._etcd_builder_prefix, build_job.job_details['build_uuid'])
def _etcd_lock_key(self, unique_lock_id):
""" Create a key which is used to create a temporary lock in etcd.
"""
return os.path.join(self._etcd_lock_prefix, unique_lock_id)
return os.path.join(self._etcd_job_prefix, build_job.job_details['build_uuid'])
def _etcd_realm_key(self, realm):
""" Create a key which is used to track an incoming connection on a realm.
@ -526,4 +561,4 @@ class EphemeralBuilderManager(BaseManager):
def num_workers(self):
""" Return the number of workers we're managing locally.
"""
return len(self._component_to_builder)
return len(self._component_to_job)