Remove directly referenced images from the candidate set before starting GC iteration

This makes the lookup query that runs inside the transaction smaller when many images are referenced directly by tags. We must still perform the direct-reference check within the transaction, but removing those images up front should reduce the search space somewhat.
This commit is contained in:
Joseph Schorr 2017-06-22 18:14:06 -04:00
parent 45c7008078
commit cdd7cb9321

View file

@ -185,11 +185,19 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
all_storage_id_whitelist = set()
all_unreferenced_candidates = set()
# Remove any images directly referenced by tags, to prune the working set.
direct_referenced = (RepositoryTag
.select(RepositoryTag.image)
.where(RepositoryTag.repository == repo.id,
RepositoryTag.image << list(candidate_orphan_image_set)))
candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced])
# Iteratively try to remove images from the database. The only images we can remove are those
# that are not referenced by tags AND not the parents of other images. We continue removing images
# until no changes are found.
iteration = 0
while candidate_orphan_image_set:
making_progress = True
while candidate_orphan_image_set and making_progress:
iteration = iteration + 1
logger.debug('Starting iteration #%s for GC of repository %s with candidates: %s', iteration,
repo.id, candidate_orphan_image_set)
@ -221,6 +229,7 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
~(Image.id << referenced_candidates)))
image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates]
making_progress = bool(len(image_ids_to_remove))
if len(image_ids_to_remove) == 0:
# No more candidates to remove.
break
@ -254,12 +263,12 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
logger.info('Could not GC images %s; will try again soon', image_ids_to_remove)
return False
# Add the images to the removed set and remove them from the candidate set.
all_images_removed.update(image_ids_to_remove)
all_storage_id_whitelist.update(storage_id_whitelist)
all_unreferenced_candidates.update(unreferenced_candidates)
# Add the images to the removed set and remove them from the candidate set.
all_images_removed.update(image_ids_to_remove)
all_storage_id_whitelist.update(storage_id_whitelist)
all_unreferenced_candidates.update(unreferenced_candidates)
candidate_orphan_image_set.difference_update(image_ids_to_remove)
candidate_orphan_image_set.difference_update(image_ids_to_remove)
# If any images were removed, GC any orphaned storages.
if len(all_images_removed) > 0: