Change Repo GC to be iterative

This prevents us from creating a massive join when there are a large number of tags in the repository, which can result in locking the entire DB for long periods of time. Instead of the join, we just iteratively look up any images found to be directly referenced by a tag or found to be the parent of another image, both of which should be indexed lookups. Once done, we remove only those images and then iterate until the working set stops changing.
This commit is contained in:
Joseph Schorr 2017-06-22 18:09:17 -04:00
parent e9a95874ee
commit 45c7008078

View file

@ -181,49 +181,50 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
logger.debug('No candidate images for GC for repo: %s', repo.id)
return True
candidates_orphans = list(candidate_orphan_image_set)
all_images_removed = set()
all_storage_id_whitelist = set()
all_unreferenced_candidates = set()
with db_transaction():
Candidate = Image.alias()
Tagged = Image.alias()
ancestor_superset = Tagged.ancestors ** db_concat_func(Candidate.ancestors, Candidate.id, '/%')
# Iteratively try to remove images from the database. The only images we can remove are those
# that are not referenced by tags AND not the parents of other images. We continue removing images
# until no changes are found.
iteration = 0
while candidate_orphan_image_set:
iteration = iteration + 1
logger.debug('Starting iteration #%s for GC of repository %s with candidates: %s', iteration,
repo.id, candidate_orphan_image_set)
candidates_orphans = list(candidate_orphan_image_set)
# We are going to compute all images which are being referenced in two ways:
# First, we will find all images which have their ancestor paths appear in
# another image. Secondly, we union in all of the candidate images which are
# directly referenced by a tag. This can be used in a subquery to directly
# find which candidates are being referenced without any client side
# computation or extra round trips.
direct_referenced = (RepositoryTag
.select(RepositoryTag.image)
.where(RepositoryTag.repository == repo.id,
RepositoryTag.image << candidates_orphans))
cloned = direct_referenced.clone().alias('direct_ref')
directly_referenced_subquery = Image.alias().select(cloned.c.image_id).from_(cloned)
ancestor_referenced = (Candidate
.select(Candidate.id)
.join(Tagged, on=ancestor_superset)
.join(RepositoryTag, on=(Tagged.id == RepositoryTag.image))
with db_transaction():
# Any image directly referenced by a tag that still exists, cannot be GCed.
direct_referenced = (RepositoryTag
.select(RepositoryTag.image)
.where(RepositoryTag.repository == repo.id,
Candidate.id << candidates_orphans,
~(Candidate.id << directly_referenced_subquery)))
RepositoryTag.image << candidates_orphans))
referenced_candidates = (direct_referenced | ancestor_referenced)
# Any image which is the parent of another image, cannot be GCed.
parent_referenced = (Image
.select(Image.parent)
.where(Image.repository == repo.id,
Image.parent << candidates_orphans))
# We desire a few pieces of information from the database from the following
# query: all of the image ids which are associated with this repository,
# and the storages which are associated with those images.
unreferenced_candidates = (Image
.select(Image.id, Image.docker_image_id,
ImageStorage.id, ImageStorage.uuid)
.join(ImageStorage)
.where(Image.id << candidates_orphans,
~(Image.id << referenced_candidates)))
referenced_candidates = (direct_referenced | parent_referenced)
# We desire a few pieces of information from the database from the following
# query: all of the image ids which are associated with this repository,
# and the storages which are associated with those images.
unreferenced_candidates = (Image
.select(Image.id, Image.docker_image_id,
ImageStorage.id, ImageStorage.uuid)
.join(ImageStorage)
.where(Image.id << candidates_orphans,
~(Image.id << referenced_candidates)))
image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates]
if len(image_ids_to_remove) == 0:
# No more candidates to remove.
break
image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates]
if len(image_ids_to_remove) > 0:
logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove)
storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates])
@ -253,15 +254,22 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
logger.info('Could not GC images %s; will try again soon', image_ids_to_remove)
return False
# Add the images to the removed set and remove them from the candidate set.
all_images_removed.update(image_ids_to_remove)
all_storage_id_whitelist.update(storage_id_whitelist)
all_unreferenced_candidates.update(unreferenced_candidates)
candidate_orphan_image_set.difference_update(image_ids_to_remove)
# If any images were removed, GC any orphaned storages.
if len(image_ids_to_remove) > 0:
logger.info('Garbage collecting storage for images: %s', image_ids_to_remove)
storage_ids_removed = set(storage.garbage_collect_storage(storage_id_whitelist))
if len(all_images_removed) > 0:
logger.info('Garbage collecting storage for images: %s', all_images_removed)
storage_ids_removed = set(storage.garbage_collect_storage(all_storage_id_whitelist))
# If any storages were removed and cleanup callbacks are registered, call them with
# the images+storages removed.
if storage_ids_removed and config.image_cleanup_callbacks:
image_storages_removed = [candidate for candidate in unreferenced_candidates
image_storages_removed = [candidate for candidate in all_unreferenced_candidates
if candidate.storage_id in storage_ids_removed]
for callback in config.image_cleanup_callbacks:
callback(image_storages_removed)