Merge pull request #3288 from quay/faster-gc

Change garbage collection queries to be far smaller by GCing per tag and per image
This commit is contained in:
Joseph Schorr 2018-11-07 14:35:38 -05:00 committed by GitHub
commit 3e63b08731
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 33 deletions

View file

@@ -93,18 +93,8 @@ def purge_repository(namespace_name, repository_name):
 ApprTag.delete().where(ApprTag.repository == repo, ~(ApprTag.linked_tag >> None)).execute()
 ApprTag.delete().where(ApprTag.repository == repo).execute()
-# Delete all tags to allow gc to reclaim storage
-previously_referenced = tag.purge_all_tags(repo)
-unreferenced_image_q = Image.select(Image.id).where(Image.repository == repo)
-if len(previously_referenced) > 0:
-unreferenced_image_q = (unreferenced_image_q.where(~(Image.id << list(previously_referenced))))
-unreferenced_candidates = set(img[0] for img in unreferenced_image_q.tuples())
 # Gc to remove the images and storage
-all_repo_images = previously_referenced | unreferenced_candidates
-successful_gc = garbage_collect_repo(repo, all_repo_images, is_purge=True)
+successful_gc = garbage_collect_repo(repo, is_purge=True)
 if not successful_gc:
 return False
@@ -175,18 +165,23 @@ def _all_images_for_gc(repo):
 def _filter_to_unreferenced(repo, candidates_orphans):
 """ Filters the given candidate orphan images into those unreferenced by any tag or
 other image. """
+def _get_clause(field, candidates):
+if len(candidates) == 1:
+return field == candidates[0]
+return field << candidates
 # Any image directly referenced by a tag that still exists, cannot be GCed.
 direct_referenced = (RepositoryTag
 .select(RepositoryTag.image)
 .where(RepositoryTag.repository == repo.id,
-RepositoryTag.image << candidates_orphans))
+_get_clause(RepositoryTag.image, candidates_orphans)))
 # Any image which is the parent of another image, cannot be GCed.
 parent_referenced = (Image
 .select(Image.parent)
 .where(Image.repository == repo.id,
-Image.parent << candidates_orphans))
+_get_clause(Image.parent, candidates_orphans)))
 referenced_candidates = (direct_referenced | parent_referenced)
@@ -197,12 +192,12 @@ def _filter_to_unreferenced(repo, candidates_orphans):
 .select(Image.id, Image.docker_image_id,
 ImageStorage.id, ImageStorage.uuid)
 .join(ImageStorage)
-.where(Image.id << candidates_orphans,
-~(Image.id << referenced_candidates)))
+.where(_get_clause(Image.id, candidates_orphans),
+~(_get_clause(Image.id, referenced_candidates))))
 return list(unreferenced_candidates)
-def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False):
+def garbage_collect_repo(repo, is_purge=False):
 """ Garbage collect the specified repository object. This will remove all
 images, derived images, and other associated metadata, for images which
 are no longer referenced by a tag or another image which is itself
@@ -212,26 +207,32 @@ def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False):
 """
 logger.debug('Garbage collecting repository %s', repo.id)
-storage_id_whitelist = set()
+if is_purge:
+tag.purge_all_tags(repo)
+images_for_tags_removed = {i.id for i in Image.select().where(Image.repository == repo)}
+return _garbage_collect_from_image(repo, images_for_tags_removed, True)
-candidate_orphan_image_set = tag.garbage_collect_tags(repo)
-if extra_candidate_set:
-candidate_orphan_image_set.update(extra_candidate_set)
-if not len(candidate_orphan_image_set):
-logger.debug('No candidate images for GC for repo: %s', repo.id)
+images_for_tags_removed = tag.garbage_collect_tags(repo)
+if not len(images_for_tags_removed):
+logger.debug('No images for GC for repo: %s', repo.id)
 return True
+for image in images_for_tags_removed:
+candidate_list = [image.id] + list(reversed(image.ancestor_id_list()))
+for candidate_id in candidate_list:
+if not _garbage_collect_from_image(repo, {candidate_id}):
+return False
+return True
+def _garbage_collect_from_image(repo, candidate_orphan_image_set, is_purge=False):
+storage_id_whitelist = set()
 all_images_removed = set()
 all_storage_id_whitelist = set()
 all_unreferenced_candidates = set()
-if not is_purge:
-# Remove any images directly referenced by tags, to prune the working set.
-direct_referenced = (RepositoryTag.select(RepositoryTag.image).where(
-RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set)))
-candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced])
 # Iteratively try to remove images from the database. The only images we can remove are those
 # that are not referenced by tags AND not the parents of other images. We continue removing images
 # until no changes are found.

View file

@@ -517,10 +517,7 @@ def _delete_tags(repo, query_modifier=None):
 .execute())
 logger.debug('Removed %s tags with %s manifests', num_deleted_tags, num_deleted_manifests)
-ancestors = reduce(lambda r, l: r | l,
-(set(tag.image.ancestor_id_list()) for tag in tags_to_delete))
-direct_referenced = {tag.image.id for tag in tags_to_delete}
-return ancestors | direct_referenced
+return [tag.image for tag in tags_to_delete]
 def _get_repo_tag_image(tag_name, include_storage, modifier):