Handle internal errors in a better fashion: If a build would be marked as internal error, only do so if there are retries remaining. Otherwise, we mark it as failed (since it won't be rebuilt anyway)

This commit is contained in:
Joseph Schorr 2015-02-12 16:19:44 -05:00
parent e8458267fd
commit f84d1bad45
8 changed files with 70 additions and 19 deletions

View file

@ -255,7 +255,8 @@ class BuildComponent(BaseComponent):
# Write the error to the log.
self._build_status.set_error(worker_error.public_message(), worker_error.extra_data(),
internal_error=worker_error.is_internal_error())
internal_error=worker_error.is_internal_error(),
requeued=self._current_job.has_retries_remaining())
# Send the notification that the build has failed.
self._current_job.send_notification('build_failure',
@ -363,7 +364,8 @@ class BuildComponent(BaseComponent):
# If we still have a running job, then it has not completed and we need to tell the parent
# manager.
if self._current_job is not None:
self._build_status.set_error('Build worker timed out', internal_error=True)
self._build_status.set_error('Build worker timed out', internal_error=True,
requeued=self._current_job.has_retries_remaining())
self.parent_manager.job_completed(self._current_job, BuildJobResult.INCOMPLETE, self)
self._build_status = None

View file

@ -25,6 +25,9 @@ class BuildJob(object):
'Could not parse build queue item config with ID %s' % self.job_details['build_uuid']
)
def has_retries_remaining(self):
return self.job_item.retries_remaining > 0
def send_notification(self, kind, error_message=None):
tags = self.build_config.get('docker_tags', ['latest'])
event_data = {

View file

@ -39,8 +39,8 @@ class StatusHandler(object):
self._current_command = command
self._append_log_message(command, self._build_logs.COMMAND, extra_data)
def set_error(self, error_message, extra_data=None, internal_error=False):
self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error else BUILD_PHASE.ERROR)
def set_error(self, error_message, extra_data=None, internal_error=False, requeued=False):
self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error and requeued else BUILD_PHASE.ERROR)
extra_data = extra_data or {}
extra_data['internal_error'] = internal_error

View file

@ -476,6 +476,15 @@ class BUILD_PHASE(object):
COMPLETE = 'complete'
class QueueItem(BaseModel):
queue_name = CharField(index=True, max_length=1024)
body = TextField()
available_after = DateTimeField(default=datetime.utcnow, index=True)
available = BooleanField(default=True, index=True)
processing_expires = DateTimeField(null=True, index=True)
retries_remaining = IntegerField(default=5)
class RepositoryBuild(BaseModel):
uuid = CharField(default=uuid_generator, index=True)
repository = ForeignKeyField(Repository, index=True)
@ -488,15 +497,7 @@ class RepositoryBuild(BaseModel):
trigger = ForeignKeyField(RepositoryBuildTrigger, null=True, index=True)
pull_robot = QuayUserField(null=True, related_name='buildpullrobot')
logs_archived = BooleanField(default=False)
class QueueItem(BaseModel):
queue_name = CharField(index=True, max_length=1024)
body = TextField()
available_after = DateTimeField(default=datetime.utcnow, index=True)
available = BooleanField(default=True, index=True)
processing_expires = DateTimeField(null=True, index=True)
retries_remaining = IntegerField(default=5)
queue_item = ForeignKeyField(QueueItem, null=True, index=True)
class LogEntryKind(BaseModel):

View file

@ -0,0 +1,30 @@
"""Add build queue item reference to the repositorybuild table
Revision ID: 14fe12ade3df
Revises: 5ad999136045
Create Date: 2015-02-12 16:11:57.814645
"""
# revision identifiers, used by Alembic.
revision = '14fe12ade3df'
down_revision = '5ad999136045'
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import mysql
def upgrade(tables):
### commands auto generated by Alembic - please adjust! ###
op.add_column('repositorybuild', sa.Column('queue_item_id', sa.Integer(), nullable=True))
op.create_index('repositorybuild_queue_item_id', 'repositorybuild', ['queue_item_id'], unique=False)
op.create_foreign_key(op.f('fk_repositorybuild_queue_item_id_queueitem'), 'repositorybuild', 'queueitem', ['queue_item_id'], ['id'])
### end Alembic commands ###
def downgrade(tables):
### commands auto generated by Alembic - please adjust! ###
op.drop_constraint(op.f('fk_repositorybuild_queue_item_id_queueitem'), 'repositorybuild', type_='foreignkey')
op.drop_index('repositorybuild_queue_item_id', table_name='repositorybuild')
op.drop_column('repositorybuild', 'queue_item_id')
### end Alembic commands ###

View file

@ -60,8 +60,8 @@ class WorkQueue(object):
running_query = self._running_jobs(now, name_match_query)
running_count = running_query.distinct().count()
avialable_query = self._available_jobs_not_running(now, name_match_query, running_query)
available_count = avialable_query.select(QueueItem.queue_name).distinct().count()
available_query = self._available_jobs_not_running(now, name_match_query, running_query)
available_count = available_query.select(QueueItem.queue_name).distinct().count()
self._reporter(self._currently_processing, running_count, running_count + available_count)
@ -81,7 +81,7 @@ class WorkQueue(object):
params['available_after'] = available_date
with self._transaction_factory(db):
QueueItem.create(**params)
return QueueItem.create(**params)
def get(self, processing_time=300):
"""
@ -114,6 +114,7 @@ class WorkQueue(object):
item = AttrDict({
'id': db_item.id,
'body': db_item.body,
'retries_remaining': db_item.retries_remaining
})
self._currently_processing = True
@ -141,6 +142,7 @@ class WorkQueue(object):
incomplete_item_obj.save()
self._currently_processing = False
return incomplete_item_obj.retries_remaining > 0
def extend_processing(self, item, seconds_from_now, minimum_extension=MINIMUM_EXTENSION):
with self._transaction_factory(db):

View file

@ -76,6 +76,12 @@ def build_status_view(build_obj, can_write=False):
if datetime.datetime.utcnow() - heartbeat > datetime.timedelta(minutes=1):
phase = database.BUILD_PHASE.INTERNAL_ERROR
# If the phase is internal error, return 'error' instead of the number if retries
# on the queue item is 0.
if phase == database.BUILD_PHASE.INTERNAL_ERROR:
if build_obj.queue_item is None or build_obj.queue_item.retries_remaining == 0:
phase = database.BUILD_PHASE.ERROR
logger.debug('Can write: %s job_config: %s', can_write, build_obj.job_config)
resp = {
'id': build_obj.uuid,
@ -87,7 +93,7 @@ def build_status_view(build_obj, can_write=False):
'is_writer': can_write,
'trigger': trigger_view(build_obj.trigger),
'resource_key': build_obj.resource_key,
'pull_robot': user_view(build_obj.pull_robot) if build_obj.pull_robot else None,
'pull_robot': user_view(build_obj.pull_robot) if build_obj.pull_robot else None
}
if can_write:

View file

@ -230,10 +230,17 @@ def start_build(repository, dockerfile_id, tags, build_name, subdir, manual,
dockerfile_id, build_name,
trigger, pull_robot_name=pull_robot_name)
dockerfile_build_queue.put([repository.namespace_user.username, repository.name], json.dumps({
json_data = json.dumps({
'build_uuid': build_request.uuid,
'pull_credentials': model.get_pull_credentials(pull_robot_name) if pull_robot_name else None
}), retries_remaining=3)
})
queue_item = dockerfile_build_queue.put([repository.namespace_user.username, repository.name],
json_data,
retries_remaining=3)
build_request.queue_item = queue_item
build_request.save()
# Add the build to the repo's log.
metadata = {