Merge pull request #1787 from coreos-inc/build-order

Better handling and logging of exceptions in build manager
This commit is contained in:
josephschorr 2016-08-30 14:16:53 -04:00 committed by GitHub
commit bf6b0c9299
3 changed files with 32 additions and 26 deletions

View file

@ -376,7 +376,7 @@ class BuildComponent(BaseComponent):
if self._component_status == ComponentStatus.TIMED_OUT:
return
logger.debug('got heartbeat on realm %s', self.builder_realm)
logger.debug('Got heartbeat on realm %s', self.builder_realm)
self._last_heartbeat = datetime.datetime.utcnow()
@trollius.coroutine

View file

@ -29,7 +29,7 @@ ETCD_ATOMIC_OP_TIMEOUT = 120
RETRY_IMMEDIATELY_TIMEOUT = 0
NO_WORKER_AVAILABLE_TIMEOUT = 10
DEFAULT_EPHEMERAL_API_TIMEOUT = 20
DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 300
DEFAULT_EPHEMERAL_SETUP_TIMEOUT = 500
class EtcdAction(object):
@ -249,24 +249,24 @@ class EphemeralBuilderManager(BaseManager):
logger.warning('Unexpected action (%s) on realm key: %s', etcd_result.action, etcd_result.key)
def _register_realm(self, realm_spec):
logger.debug('Registering realm with manager: %s', realm_spec['realm'])
component = self.register_component(realm_spec['realm'], BuildComponent,
token=realm_spec['token'])
if component in self._component_to_job:
logger.debug('Realm already registered with manager: %s', realm_spec['realm'])
return component
logger.debug('Got call to register realm %s with manager', realm_spec['realm'])
# Create the build information block for the registered realm.
build_job = BuildJob(AttrDict(realm_spec['job_queue_item']))
execution_id = realm_spec.get('execution_id', None)
executor_name = realm_spec.get('executor_name', 'EC2Executor')
logger.debug('Registering realm %s with manager: %s', realm_spec['realm'], realm_spec)
component = self.register_component(realm_spec['realm'], BuildComponent,
token=realm_spec['token'])
build_info = BuildInfo(component=component, build_job=build_job, execution_id=execution_id,
executor_name=executor_name)
self._component_to_job[component] = build_job
self._build_uuid_to_info[build_job.build_uuid] = build_info
logger.debug('Registered realm %s with manager', realm_spec['realm'])
return component
@property
@ -362,7 +362,7 @@ class EphemeralBuilderManager(BaseManager):
async(self._register_existing_realms())
def setup_time(self):
return self._manager_config.get('MACHINE_SETUP_TIME', 300)
return self._ephemeral_setup_timeout
def shutdown(self):
logger.debug('Shutting down worker.')
@ -420,6 +420,8 @@ class EphemeralBuilderManager(BaseManager):
lock_payload = json.dumps(payload)
logger.debug('Writing key for job %s with expiration in %s seconds', build_uuid,
self._ephemeral_setup_timeout)
try:
yield From(self._etcd_client.write(job_key, lock_payload, prevExist=False,
ttl=self._ephemeral_setup_timeout))
@ -505,24 +507,26 @@ class EphemeralBuilderManager(BaseManager):
@coroutine
def build_component_ready(self, build_component):
try:
# Pop off the job for the component. We do so before we send out the etcd watch below,
# as it will also remove this mapping.
job = self._component_to_job.pop(build_component)
if job is None:
logger.error('Could not find job for the build component on realm %s',
build_component.builder_realm)
raise Return()
logger.debug('Got component ready for component with realm %s', build_component.builder_realm)
# Pop off the job for the component. We do so before we send out the etcd watch below,
# as it will also remove this mapping.
job = self._component_to_job.pop(build_component, None)
if job is None:
logger.error('Could not find job for the build component on realm %s; it may have expired',
build_component.builder_realm)
raise Return()
# Start the build job.
logger.debug('Sending build %s to newly ready component on realm %s',
job.build_uuid, build_component.builder_realm)
yield From(build_component.start_build(job))
try:
# Clean up the bookkeeping for allowing any manager to take the job.
yield From(self._etcd_client.delete(self._etcd_realm_key(build_component.builder_realm)))
# Start the build job.
logger.debug('Sending build %s to newly ready component on realm %s',
job.build_uuid, build_component.builder_realm)
yield From(build_component.start_build(job))
except (KeyError, etcd.EtcdKeyError):
logger.warning('Builder is asking for more work, but work already completed')
logger.warning('Could not delete realm key %s', build_component.builder_realm)
def build_component_disposed(self, build_component, timed_out):
logger.debug('Calling build_component_disposed.')

View file

@ -24,6 +24,7 @@ logger = logging.getLogger(__name__)
WORK_CHECK_TIMEOUT = 10
TIMEOUT_PERIOD_MINUTES = 20
JOB_TIMEOUT_SECONDS = 300
SETUP_LEEWAY_SECONDS = 10
MINIMUM_JOB_EXTENSION = timedelta(minutes=2)
HEARTBEAT_PERIOD_SEC = 30
@ -164,8 +165,8 @@ class BuilderServer(object):
logger.debug('Checking for more work for %d active workers',
self._lifecycle_manager.num_workers())
job_item = self._queue.get(processing_time=self._lifecycle_manager.setup_time(),
ordering_required=True)
processing_time = self._lifecycle_manager.setup_time() + SETUP_LEEWAY_SECONDS
job_item = self._queue.get(processing_time=processing_time, ordering_required=True)
if job_item is None:
logger.debug('No additional work found. Going to sleep for %s seconds', WORK_CHECK_TIMEOUT)
continue
@ -185,6 +186,7 @@ class BuilderServer(object):
except:
logger.exception('Exception when scheduling job: %s', build_job.repo_build.uuid)
self._current_status = BuildServerStatus.EXCEPTION
self._queue.incomplete(job_item, restore_retry=True, retry_after=WORK_CHECK_TIMEOUT)
return
if schedule_success: