From 45c70080785a6fd62e14f1b23656f0dbb219f811 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 22 Jun 2017 18:09:17 -0400 Subject: [PATCH] Change Repo GC to be iterative This prevents us from creating a massive join when there are a large number of tags in the repository, which can result in locking the entire DB for long periods of time. Instead of the join, we just iteratively look up any images found to be directly referenced by a tag or found as a parent of another image, both of which should be indexed lookups. Once done, we only remove those images and then iterate until the working set stops changing. --- data/model/repository.py | 90 ++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/data/model/repository.py b/data/model/repository.py index 4684bc8af..6602bc081 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -181,49 +181,50 @@ def garbage_collect_repo(repo, extra_candidate_set=None): logger.debug('No candidate images for GC for repo: %s', repo.id) return True - candidates_orphans = list(candidate_orphan_image_set) + all_images_removed = set() + all_storage_id_whitelist = set() + all_unreferenced_candidates = set() - with db_transaction(): - Candidate = Image.alias() - Tagged = Image.alias() - ancestor_superset = Tagged.ancestors ** db_concat_func(Candidate.ancestors, Candidate.id, '/%') + # Iteratively try to remove images from the database. The only images we can remove are those + # that are not referenced by tags AND not the parents of other images. We continue removing images + # until no changes are found. 
+ iteration = 0 + while candidate_orphan_image_set: + iteration = iteration + 1 + logger.debug('Starting iteration #%s for GC of repository %s with candidates: %s', iteration, + repo.id, candidate_orphan_image_set) + candidates_orphans = list(candidate_orphan_image_set) - # We are going to compute all images which are being referenced in two ways: - # First, we will find all images which have their ancestor paths appear in - # another image. Secondly, we union in all of the candidate images which are - # directly referenced by a tag. This can be used in a subquery to directly - # find which candidates are being referenced without any client side - # computation or extra round trips. - direct_referenced = (RepositoryTag - .select(RepositoryTag.image) - .where(RepositoryTag.repository == repo.id, - RepositoryTag.image << candidates_orphans)) - - cloned = direct_referenced.clone().alias('direct_ref') - directly_referenced_subquery = Image.alias().select(cloned.c.image_id).from_(cloned) - - ancestor_referenced = (Candidate - .select(Candidate.id) - .join(Tagged, on=ancestor_superset) - .join(RepositoryTag, on=(Tagged.id == RepositoryTag.image)) + with db_transaction(): + # Any image directly referenced by a tag that still exists, cannot be GCed. + direct_referenced = (RepositoryTag + .select(RepositoryTag.image) .where(RepositoryTag.repository == repo.id, - Candidate.id << candidates_orphans, - ~(Candidate.id << directly_referenced_subquery))) + RepositoryTag.image << candidates_orphans)) - referenced_candidates = (direct_referenced | ancestor_referenced) + # Any image which is the parent of another image, cannot be GCed. + parent_referenced = (Image + .select(Image.parent) + .where(Image.repository == repo.id, + Image.parent << candidates_orphans)) - # We desire a few pieces of information from the database from the following - # query: all of the image ids which are associated with this repository, - # and the storages which are associated with those images. 
- unreferenced_candidates = (Image - .select(Image.id, Image.docker_image_id, - ImageStorage.id, ImageStorage.uuid) - .join(ImageStorage) - .where(Image.id << candidates_orphans, - ~(Image.id << referenced_candidates))) + referenced_candidates = (direct_referenced | parent_referenced) + + # We desire a few pieces of information from the database from the following + # query: all of the image ids which are associated with this repository, + # and the storages which are associated with those images. + unreferenced_candidates = (Image + .select(Image.id, Image.docker_image_id, + ImageStorage.id, ImageStorage.uuid) + .join(ImageStorage) + .where(Image.id << candidates_orphans, + ~(Image.id << referenced_candidates))) + + image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates] + if len(image_ids_to_remove) == 0: + # No more candidates to remove. + break - image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates] - if len(image_ids_to_remove) > 0: logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove) storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates]) @@ -253,15 +254,22 @@ def garbage_collect_repo(repo, extra_candidate_set=None): logger.info('Could not GC images %s; will try again soon', image_ids_to_remove) return False + # Add the images to the removed set and remove them from the candidate set. + all_images_removed.update(image_ids_to_remove) + all_storage_id_whitelist.update(storage_id_whitelist) + all_unreferenced_candidates.update(unreferenced_candidates) + + candidate_orphan_image_set.difference_update(image_ids_to_remove) + # If any images were removed, GC any orphaned storages. 
- if len(image_ids_to_remove) > 0: - logger.info('Garbage collecting storage for images: %s', image_ids_to_remove) - storage_ids_removed = set(storage.garbage_collect_storage(storage_id_whitelist)) + if len(all_images_removed) > 0: + logger.info('Garbage collecting storage for images: %s', all_images_removed) + storage_ids_removed = set(storage.garbage_collect_storage(all_storage_id_whitelist)) # If any storages were removed and cleanup callbacks are registered, call them with # the images+storages removed. if storage_ids_removed and config.image_cleanup_callbacks: - image_storages_removed = [candidate for candidate in unreferenced_candidates + image_storages_removed = [candidate for candidate in all_unreferenced_candidates if candidate.storage_id in storage_ids_removed] for callback in config.image_cleanup_callbacks: callback(image_storages_removed)