diff --git a/data/model/repository.py b/data/model/repository.py index 4684bc8af..6602bc081 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -181,49 +181,50 @@ def garbage_collect_repo(repo, extra_candidate_set=None): logger.debug('No candidate images for GC for repo: %s', repo.id) return True - candidates_orphans = list(candidate_orphan_image_set) + all_images_removed = set() + all_storage_id_whitelist = set() + all_unreferenced_candidates = set() - with db_transaction(): - Candidate = Image.alias() - Tagged = Image.alias() - ancestor_superset = Tagged.ancestors ** db_concat_func(Candidate.ancestors, Candidate.id, '/%') + # Iteratively try to remove images from the database. The only images we can remove are those + # that are not referenced by tags AND not the parents of other images. We continue removing images + # until no changes are found. + iteration = 0 + while candidate_orphan_image_set: + iteration = iteration + 1 + logger.debug('Starting iteration #%s for GC of repository %s with candidates: %s', iteration, + repo.id, candidate_orphan_image_set) + candidates_orphans = list(candidate_orphan_image_set) - # We are going to compute all images which are being referenced in two ways: - # First, we will find all images which have their ancestor paths appear in - # another image. Secondly, we union in all of the candidate images which are - # directly referenced by a tag. This can be used in a subquery to directly - # find which candidates are being referenced without any client side - # computation or extra round trips. - direct_referenced = (RepositoryTag - .select(RepositoryTag.image) - .where(RepositoryTag.repository == repo.id, - RepositoryTag.image << candidates_orphans)) - - cloned = direct_referenced.clone().alias('direct_ref') - directly_referenced_subquery = Image.alias().select(cloned.c.image_id).from_(cloned) - - ancestor_referenced = (Candidate - .select(Candidate.id) - .join(Tagged, on=ancestor_superset) - .join(RepositoryTag, on=(Tagged.id == RepositoryTag.image)) + with db_transaction(): + # Any image directly referenced by a tag that still exists, cannot be GCed. + direct_referenced = (RepositoryTag + .select(RepositoryTag.image) .where(RepositoryTag.repository == repo.id, - Candidate.id << candidates_orphans, - ~(Candidate.id << directly_referenced_subquery))) + RepositoryTag.image << candidates_orphans)) - referenced_candidates = (direct_referenced | ancestor_referenced) + # Any image which is the parent of another image, cannot be GCed. + parent_referenced = (Image + .select(Image.parent) + .where(Image.repository == repo.id, + Image.parent << candidates_orphans)) - # We desire a few pieces of information from the database from the following - # query: all of the image ids which are associated with this repository, - # and the storages which are associated with those images. - unreferenced_candidates = (Image - .select(Image.id, Image.docker_image_id, - ImageStorage.id, ImageStorage.uuid) - .join(ImageStorage) - .where(Image.id << candidates_orphans, - ~(Image.id << referenced_candidates))) + referenced_candidates = (direct_referenced | parent_referenced) + + # We desire a few pieces of information from the database from the following + # query: all of the image ids which are associated with this repository, + # and the storages which are associated with those images. + unreferenced_candidates = (Image + .select(Image.id, Image.docker_image_id, + ImageStorage.id, ImageStorage.uuid) + .join(ImageStorage) + .where(Image.id << candidates_orphans, + ~(Image.id << referenced_candidates))) + + image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates] + if len(image_ids_to_remove) == 0: + # No more candidates to remove. + break - image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates] - if len(image_ids_to_remove) > 0: logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove) storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates]) @@ -253,15 +254,22 @@ def garbage_collect_repo(repo, extra_candidate_set=None): logger.info('Could not GC images %s; will try again soon', image_ids_to_remove) return False + # Add the images to the removed set and remove them from the candidate set. + all_images_removed.update(image_ids_to_remove) + all_storage_id_whitelist.update(storage_id_whitelist) + all_unreferenced_candidates.update(unreferenced_candidates) + + candidate_orphan_image_set.difference_update(image_ids_to_remove) + # If any images were removed, GC any orphaned storages. - if len(image_ids_to_remove) > 0: - logger.info('Garbage collecting storage for images: %s', image_ids_to_remove) - storage_ids_removed = set(storage.garbage_collect_storage(storage_id_whitelist)) + if len(all_images_removed) > 0: + logger.info('Garbage collecting storage for images: %s', all_images_removed) + storage_ids_removed = set(storage.garbage_collect_storage(all_storage_id_whitelist)) # If any storages were removed and cleanup callbacks are registered, call them with # the images+storages removed. if storage_ids_removed and config.image_cleanup_callbacks: - image_storages_removed = [candidate for candidate in unreferenced_candidates + image_storages_removed = [candidate for candidate in all_unreferenced_candidates if candidate.storage_id in storage_ids_removed] for callback in config.image_cleanup_callbacks: callback(image_storages_removed)