diff --git a/data/model/repository.py b/data/model/repository.py index 2ef204862..7700dde4b 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -92,8 +92,7 @@ def purge_repository(namespace_name, repository_name): # Gc to remove the images and storage all_repo_images = previously_referenced | unreferenced_candidates - successful_gc = garbage_collect_repo(repo, all_repo_images) - + successful_gc = garbage_collect_repo(repo, all_repo_images, is_purge=True) if not successful_gc: return False @@ -151,7 +150,47 @@ def find_repository_with_garbage(limit_to_gc_policy_s): return None -def garbage_collect_repo(repo, extra_candidate_set=None): +def _all_images_for_gc(repo): + """ Returns all the images found in the given repository, for the purposes of GC. """ + images = (Image + .select(Image.id, Image.docker_image_id, + ImageStorage.id, ImageStorage.uuid) + .join(ImageStorage) + .where(Image.repository == repo)) + return list(images) + + +def _filter_to_unreferenced(repo, candidates_orphans): + """ Filters the given candidate orphan images into those unreferenced by any tag or + other image. """ + + # Any image directly referenced by a tag that still exists, cannot be GCed. + direct_referenced = (RepositoryTag + .select(RepositoryTag.image) + .where(RepositoryTag.repository == repo.id, + RepositoryTag.image << candidates_orphans)) + + # Any image which is the parent of another image, cannot be GCed. + parent_referenced = (Image + .select(Image.parent) + .where(Image.repository == repo.id, + Image.parent << candidates_orphans)) + + referenced_candidates = (direct_referenced | parent_referenced) + + # We desire a few pieces of information from the database from the following + # query: all of the image ids which are associated with this repository, + # and the storages which are associated with those images. + unreferenced_candidates = (Image + .select(Image.id, Image.docker_image_id, + ImageStorage.id, ImageStorage.uuid) + .join(ImageStorage) + .where(Image.id << candidates_orphans, + ~(Image.id << referenced_candidates))) + return list(unreferenced_candidates) + + +def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False): """ Garbage collect the specified repository object. This will remove all images, derived images, and other associated metadata, for images which are no longer referenced by a tag or another image which is itself @@ -162,8 +201,8 @@ def garbage_collect_repo(repo, extra_candidate_set=None): logger.debug('Garbage collecting repository %s', repo.id) storage_id_whitelist = set() - candidate_orphan_image_set = tag.garbage_collect_tags(repo) + candidate_orphan_image_set = tag.garbage_collect_tags(repo) if extra_candidate_set: candidate_orphan_image_set.update(extra_candidate_set) @@ -175,10 +214,11 @@ def garbage_collect_repo(repo, extra_candidate_set=None): all_storage_id_whitelist = set() all_unreferenced_candidates = set() - # Remove any images directly referenced by tags, to prune the working set. - direct_referenced = (RepositoryTag.select(RepositoryTag.image).where( - RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set))) - candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced]) + if not is_purge: + # Remove any images directly referenced by tags, to prune the working set. + direct_referenced = (RepositoryTag.select(RepositoryTag.image).where( + RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set))) + candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced]) # Iteratively try to remove images from the database. The only images we can remove are those # that are not referenced by tags AND not the parents of other images. We continue removing images @@ -192,32 +232,19 @@ def garbage_collect_repo(repo, extra_candidate_set=None): candidates_orphans = list(candidate_orphan_image_set) with db_transaction(): - # Any image directly referenced by a tag that still exists, cannot be GCed. - direct_referenced = (RepositoryTag.select(RepositoryTag.image).where( - RepositoryTag.repository == repo.id, RepositoryTag.image << candidates_orphans)) + # Find the images to delete. + images_to_gc = (_all_images_for_gc(repo) if is_purge + else _filter_to_unreferenced(repo, candidates_orphans)) - # Any image which is the parent of another image, cannot be GCed. - parent_referenced = (Image.select(Image.parent).where(Image.repository == repo.id, - Image.parent << candidates_orphans)) - - referenced_candidates = (direct_referenced | parent_referenced) - - # We desire a few pieces of information from the database from the following - # query: all of the image ids which are associated with this repository, - # and the storages which are associated with those images. - unreferenced_candidates = (Image.select(Image.id, Image.docker_image_id, ImageStorage.id, - ImageStorage.uuid).join(ImageStorage) - .where(Image.id << candidates_orphans, - ~(Image.id << referenced_candidates))) - - image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates] + # Make sure we are making progress. + image_ids_to_remove = [candidate.id for candidate in images_to_gc] making_progress = bool(len(image_ids_to_remove)) if len(image_ids_to_remove) == 0: - # No more candidates to remove. + # No more images to remove. break logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove) - storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates]) + storage_id_whitelist = set([candidate.storage_id for candidate in images_to_gc]) # Lookup any derived images for the images to remove. derived = DerivedStorageForImage.select().where(DerivedStorageForImage.source_image << @@ -246,7 +273,7 @@ def garbage_collect_repo(repo, extra_candidate_set=None): # Add the images to the removed set and remove them from the candidate set. all_images_removed.update(image_ids_to_remove) all_storage_id_whitelist.update(storage_id_whitelist) - all_unreferenced_candidates.update(unreferenced_candidates) + all_unreferenced_candidates.update(images_to_gc) candidate_orphan_image_set.difference_update(image_ids_to_remove)