Merge pull request #2791 from coreos-inc/purge-repo-optimization

Optimize purging of a repository by skipping the unreferenced check
This commit is contained in:
josephschorr 2017-07-31 18:02:28 -04:00 committed by GitHub
commit 6ce06942f0

View file

@ -92,8 +92,7 @@ def purge_repository(namespace_name, repository_name):
# Gc to remove the images and storage # Gc to remove the images and storage
all_repo_images = previously_referenced | unreferenced_candidates all_repo_images = previously_referenced | unreferenced_candidates
successful_gc = garbage_collect_repo(repo, all_repo_images) successful_gc = garbage_collect_repo(repo, all_repo_images, is_purge=True)
if not successful_gc: if not successful_gc:
return False return False
@ -151,7 +150,47 @@ def find_repository_with_garbage(limit_to_gc_policy_s):
return None return None
def garbage_collect_repo(repo, extra_candidate_set=None): def _all_images_for_gc(repo):
""" Returns all the images found in the given repository, for the purposes of GC. """
images = (Image
.select(Image.id, Image.docker_image_id,
ImageStorage.id, ImageStorage.uuid)
.join(ImageStorage)
.where(Image.repository == repo))
return list(images)
def _filter_to_unreferenced(repo, candidates_orphans):
""" Filters the given candidate orphan images into those unreferenced by any tag or
other image. """
# Any image directly referenced by a tag that still exists, cannot be GCed.
direct_referenced = (RepositoryTag
.select(RepositoryTag.image)
.where(RepositoryTag.repository == repo.id,
RepositoryTag.image << candidates_orphans))
# Any image which is the parent of another image, cannot be GCed.
parent_referenced = (Image
.select(Image.parent)
.where(Image.repository == repo.id,
Image.parent << candidates_orphans))
referenced_candidates = (direct_referenced | parent_referenced)
# We desire a few pieces of information from the database from the following
# query: all of the image ids which are associated with this repository,
# and the storages which are associated with those images.
unreferenced_candidates = (Image
.select(Image.id, Image.docker_image_id,
ImageStorage.id, ImageStorage.uuid)
.join(ImageStorage)
.where(Image.id << candidates_orphans,
~(Image.id << referenced_candidates)))
return list(unreferenced_candidates)
def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False):
""" Garbage collect the specified repository object. This will remove all """ Garbage collect the specified repository object. This will remove all
images, derived images, and other associated metadata, for images which images, derived images, and other associated metadata, for images which
are no longer referenced by a tag or another image which is itself are no longer referenced by a tag or another image which is itself
@ -162,8 +201,8 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
logger.debug('Garbage collecting repository %s', repo.id) logger.debug('Garbage collecting repository %s', repo.id)
storage_id_whitelist = set() storage_id_whitelist = set()
candidate_orphan_image_set = tag.garbage_collect_tags(repo)
candidate_orphan_image_set = tag.garbage_collect_tags(repo)
if extra_candidate_set: if extra_candidate_set:
candidate_orphan_image_set.update(extra_candidate_set) candidate_orphan_image_set.update(extra_candidate_set)
@ -175,10 +214,11 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
all_storage_id_whitelist = set() all_storage_id_whitelist = set()
all_unreferenced_candidates = set() all_unreferenced_candidates = set()
# Remove any images directly referenced by tags, to prune the working set. if not is_purge:
direct_referenced = (RepositoryTag.select(RepositoryTag.image).where( # Remove any images directly referenced by tags, to prune the working set.
RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set))) direct_referenced = (RepositoryTag.select(RepositoryTag.image).where(
candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced]) RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set)))
candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced])
# Iteratively try to remove images from the database. The only images we can remove are those # Iteratively try to remove images from the database. The only images we can remove are those
# that are not referenced by tags AND not the parents of other images. We continue removing images # that are not referenced by tags AND not the parents of other images. We continue removing images
@ -192,32 +232,19 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
candidates_orphans = list(candidate_orphan_image_set) candidates_orphans = list(candidate_orphan_image_set)
with db_transaction(): with db_transaction():
# Any image directly referenced by a tag that still exists, cannot be GCed. # Find the images to delete.
direct_referenced = (RepositoryTag.select(RepositoryTag.image).where( images_to_gc = (_all_images_for_gc(repo) if is_purge
RepositoryTag.repository == repo.id, RepositoryTag.image << candidates_orphans)) else _filter_to_unreferenced(repo, candidates_orphans))
# Any image which is the parent of another image, cannot be GCed. # Make sure we are making progress.
parent_referenced = (Image.select(Image.parent).where(Image.repository == repo.id, image_ids_to_remove = [candidate.id for candidate in images_to_gc]
Image.parent << candidates_orphans))
referenced_candidates = (direct_referenced | parent_referenced)
# We desire a few pieces of information from the database from the following
# query: all of the image ids which are associated with this repository,
# and the storages which are associated with those images.
unreferenced_candidates = (Image.select(Image.id, Image.docker_image_id, ImageStorage.id,
ImageStorage.uuid).join(ImageStorage)
.where(Image.id << candidates_orphans,
~(Image.id << referenced_candidates)))
image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates]
making_progress = bool(len(image_ids_to_remove)) making_progress = bool(len(image_ids_to_remove))
if len(image_ids_to_remove) == 0: if len(image_ids_to_remove) == 0:
# No more candidates to remove. # No more images to remove.
break break
logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove) logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove)
storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates]) storage_id_whitelist = set([candidate.storage_id for candidate in images_to_gc])
# Lookup any derived images for the images to remove. # Lookup any derived images for the images to remove.
derived = DerivedStorageForImage.select().where(DerivedStorageForImage.source_image << derived = DerivedStorageForImage.select().where(DerivedStorageForImage.source_image <<
@ -246,7 +273,7 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
# Add the images to the removed set and remove them from the candidate set. # Add the images to the removed set and remove them from the candidate set.
all_images_removed.update(image_ids_to_remove) all_images_removed.update(image_ids_to_remove)
all_storage_id_whitelist.update(storage_id_whitelist) all_storage_id_whitelist.update(storage_id_whitelist)
all_unreferenced_candidates.update(unreferenced_candidates) all_unreferenced_candidates.update(images_to_gc)
candidate_orphan_image_set.difference_update(image_ids_to_remove) candidate_orphan_image_set.difference_update(image_ids_to_remove)