From f84d1bad4597755425eec3bc881a6a41ba8cd29e Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 12 Feb 2015 16:19:44 -0500 Subject: [PATCH] Handle internal errors in a better fashion: If a build would be marked as internal error, only do so if there are retries remaining. Otherwise, we mark it as failed (since it won't be rebuilt anyway) --- buildman/component/buildcomponent.py | 6 ++-- buildman/jobutil/buildjob.py | 3 ++ buildman/jobutil/buildstatus.py | 4 +-- data/database.py | 19 ++++++------ ..._add_build_queue_item_reference_to_the_.py | 30 +++++++++++++++++++ data/queue.py | 8 +++-- endpoints/api/build.py | 8 ++++- endpoints/common.py | 11 +++++-- 8 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 data/migrations/versions/14fe12ade3df_add_build_queue_item_reference_to_the_.py diff --git a/buildman/component/buildcomponent.py b/buildman/component/buildcomponent.py index 72caec215..e20ee822d 100644 --- a/buildman/component/buildcomponent.py +++ b/buildman/component/buildcomponent.py @@ -255,7 +255,8 @@ class BuildComponent(BaseComponent): # Write the error to the log. self._build_status.set_error(worker_error.public_message(), worker_error.extra_data(), - internal_error=worker_error.is_internal_error()) + internal_error=worker_error.is_internal_error(), + requeued=self._current_job.has_retries_remaining()) # Send the notification that the build has failed. self._current_job.send_notification('build_failure', @@ -363,7 +364,8 @@ class BuildComponent(BaseComponent): # If we still have a running job, then it has not completed and we need to tell the parent # manager. if self._current_job is not None: - self._build_status.set_error('Build worker timed out', internal_error=True) + self._build_status.set_error('Build worker timed out', internal_error=True, + requeued=self._current_job.has_retries_remaining()) self.parent_manager.job_completed(self._current_job, BuildJobResult.INCOMPLETE, self) self._build_status = None diff --git a/buildman/jobutil/buildjob.py b/buildman/jobutil/buildjob.py index 1710c3aab..a6361e83a 100644 --- a/buildman/jobutil/buildjob.py +++ b/buildman/jobutil/buildjob.py @@ -25,6 +25,9 @@ class BuildJob(object): 'Could not parse build queue item config with ID %s' % self.job_details['build_uuid'] ) + def has_retries_remaining(self): + return self.job_item.retries_remaining > 0 + def send_notification(self, kind, error_message=None): tags = self.build_config.get('docker_tags', ['latest']) event_data = { diff --git a/buildman/jobutil/buildstatus.py b/buildman/jobutil/buildstatus.py index 2ae127ee0..892f8f6c7 100644 --- a/buildman/jobutil/buildstatus.py +++ b/buildman/jobutil/buildstatus.py @@ -39,8 +39,8 @@ class StatusHandler(object): self._current_command = command self._append_log_message(command, self._build_logs.COMMAND, extra_data) - def set_error(self, error_message, extra_data=None, internal_error=False): - self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error else BUILD_PHASE.ERROR) + def set_error(self, error_message, extra_data=None, internal_error=False, requeued=False): + self.set_phase(BUILD_PHASE.INTERNAL_ERROR if internal_error and requeued else BUILD_PHASE.ERROR) extra_data = extra_data or {} extra_data['internal_error'] = internal_error diff --git a/data/database.py b/data/database.py index eb804688e..269b7bf67 100644 --- a/data/database.py +++ b/data/database.py @@ -476,6 +476,15 @@ class BUILD_PHASE(object): COMPLETE = 'complete' +class QueueItem(BaseModel): + queue_name = CharField(index=True, max_length=1024) + body = TextField() + available_after = DateTimeField(default=datetime.utcnow, index=True) + available = BooleanField(default=True, index=True) + processing_expires = DateTimeField(null=True, index=True) + retries_remaining = IntegerField(default=5) + + class RepositoryBuild(BaseModel): uuid = CharField(default=uuid_generator, index=True) repository = ForeignKeyField(Repository, index=True) @@ -488,15 +497,7 @@ class RepositoryBuild(BaseModel): trigger = ForeignKeyField(RepositoryBuildTrigger, null=True, index=True) pull_robot = QuayUserField(null=True, related_name='buildpullrobot') logs_archived = BooleanField(default=False) - - -class QueueItem(BaseModel): - queue_name = CharField(index=True, max_length=1024) - body = TextField() - available_after = DateTimeField(default=datetime.utcnow, index=True) - available = BooleanField(default=True, index=True) - processing_expires = DateTimeField(null=True, index=True) - retries_remaining = IntegerField(default=5) + queue_item = ForeignKeyField(QueueItem, null=True, index=True) class LogEntryKind(BaseModel): diff --git a/data/migrations/versions/14fe12ade3df_add_build_queue_item_reference_to_the_.py b/data/migrations/versions/14fe12ade3df_add_build_queue_item_reference_to_the_.py new file mode 100644 index 000000000..5e8d21211 --- /dev/null +++ b/data/migrations/versions/14fe12ade3df_add_build_queue_item_reference_to_the_.py @@ -0,0 +1,30 @@ +"""Add build queue item reference to the repositorybuild table + +Revision ID: 14fe12ade3df +Revises: 5ad999136045 +Create Date: 2015-02-12 16:11:57.814645 + +""" + +# revision identifiers, used by Alembic. +revision = '14fe12ade3df' +down_revision = '5ad999136045' + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import mysql + +def upgrade(tables): + ### commands auto generated by Alembic - please adjust! ### + op.add_column('repositorybuild', sa.Column('queue_item_id', sa.Integer(), nullable=True)) + op.create_index('repositorybuild_queue_item_id', 'repositorybuild', ['queue_item_id'], unique=False) + op.create_foreign_key(op.f('fk_repositorybuild_queue_item_id_queueitem'), 'repositorybuild', 'queueitem', ['queue_item_id'], ['id']) + ### end Alembic commands ### + + +def downgrade(tables): + ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint(op.f('fk_repositorybuild_queue_item_id_queueitem'), 'repositorybuild', type_='foreignkey') + op.drop_index('repositorybuild_queue_item_id', table_name='repositorybuild') + op.drop_column('repositorybuild', 'queue_item_id') + ### end Alembic commands ### diff --git a/data/queue.py b/data/queue.py index cdfa4e9f9..40a94c6e9 100644 --- a/data/queue.py +++ b/data/queue.py @@ -60,8 +60,8 @@ class WorkQueue(object): running_query = self._running_jobs(now, name_match_query) running_count = running_query.distinct().count() - avialable_query = self._available_jobs_not_running(now, name_match_query, running_query) - available_count = avialable_query.select(QueueItem.queue_name).distinct().count() + available_query = self._available_jobs_not_running(now, name_match_query, running_query) + available_count = available_query.select(QueueItem.queue_name).distinct().count() self._reporter(self._currently_processing, running_count, running_count + available_count) @@ -81,7 +81,7 @@ class WorkQueue(object): params['available_after'] = available_date with self._transaction_factory(db): - QueueItem.create(**params) + return QueueItem.create(**params) def get(self, processing_time=300): """ @@ -114,6 +114,7 @@ class WorkQueue(object): item = AttrDict({ 'id': db_item.id, 'body': db_item.body, + 'retries_remaining': db_item.retries_remaining }) self._currently_processing = True @@ -141,6 +142,7 @@ class WorkQueue(object): incomplete_item_obj.save() self._currently_processing = False + return incomplete_item_obj.retries_remaining > 0 def extend_processing(self, item, seconds_from_now, minimum_extension=MINIMUM_EXTENSION): with self._transaction_factory(db): diff --git a/endpoints/api/build.py b/endpoints/api/build.py index 506c250da..604c7258d 100644 --- a/endpoints/api/build.py +++ b/endpoints/api/build.py @@ -76,6 +76,12 @@ def build_status_view(build_obj, can_write=False): if datetime.datetime.utcnow() - heartbeat > datetime.timedelta(minutes=1): phase = database.BUILD_PHASE.INTERNAL_ERROR + # If the phase is internal error, return 'error' instead of the number if retries + # on the queue item is 0. + if phase == database.BUILD_PHASE.INTERNAL_ERROR: + if build_obj.queue_item is None or build_obj.queue_item.retries_remaining == 0: + phase = database.BUILD_PHASE.ERROR + logger.debug('Can write: %s job_config: %s', can_write, build_obj.job_config) resp = { 'id': build_obj.uuid, @@ -87,7 +93,7 @@ def build_status_view(build_obj, can_write=False): 'is_writer': can_write, 'trigger': trigger_view(build_obj.trigger), 'resource_key': build_obj.resource_key, - 'pull_robot': user_view(build_obj.pull_robot) if build_obj.pull_robot else None, + 'pull_robot': user_view(build_obj.pull_robot) if build_obj.pull_robot else None } if can_write: diff --git a/endpoints/common.py b/endpoints/common.py index 9301329e0..2f5ffc67c 100644 --- a/endpoints/common.py +++ b/endpoints/common.py @@ -230,10 +230,17 @@ def start_build(repository, dockerfile_id, tags, build_name, subdir, manual, dockerfile_id, build_name, trigger, pull_robot_name=pull_robot_name) - dockerfile_build_queue.put([repository.namespace_user.username, repository.name], json.dumps({ + json_data = json.dumps({ 'build_uuid': build_request.uuid, 'pull_credentials': model.get_pull_credentials(pull_robot_name) if pull_robot_name else None - }), retries_remaining=3) + }) + + queue_item = dockerfile_build_queue.put([repository.namespace_user.username, repository.name], + json_data, + retries_remaining=3) + + build_request.queue_item = queue_item + build_request.save() # Add the build to the repo's log. metadata = {