Change garbage collection queries to be far smaller by GCing per tag and per image

While this requires far more iterations and queries, each individual query is quite small, which prevents us from locking up the database.
This commit is contained in:
Joseph Schorr 2018-11-05 11:57:32 -05:00
parent d0e1f464ff
commit 5124422332
2 changed files with 31 additions and 33 deletions

View file

@ -93,18 +93,8 @@ def purge_repository(namespace_name, repository_name):
ApprTag.delete().where(ApprTag.repository == repo, ~(ApprTag.linked_tag >> None)).execute()
ApprTag.delete().where(ApprTag.repository == repo).execute()
# Delete all tags to allow gc to reclaim storage
previously_referenced = tag.purge_all_tags(repo)
unreferenced_image_q = Image.select(Image.id).where(Image.repository == repo)
if len(previously_referenced) > 0:
unreferenced_image_q = (unreferenced_image_q.where(~(Image.id << list(previously_referenced))))
unreferenced_candidates = set(img[0] for img in unreferenced_image_q.tuples())
# Gc to remove the images and storage
all_repo_images = previously_referenced | unreferenced_candidates
successful_gc = garbage_collect_repo(repo, all_repo_images, is_purge=True)
successful_gc = garbage_collect_repo(repo, is_purge=True)
if not successful_gc:
return False
@ -175,18 +165,23 @@ def _all_images_for_gc(repo):
def _filter_to_unreferenced(repo, candidates_orphans):
""" Filters the given candidate orphan images into those unreferenced by any tag or
other image. """
def _get_clause(field, candidates):
if len(candidates) == 1:
return field == candidates[0]
return field << candidates
# Any image directly referenced by a tag that still exists, cannot be GCed.
direct_referenced = (RepositoryTag
.select(RepositoryTag.image)
.where(RepositoryTag.repository == repo.id,
RepositoryTag.image << candidates_orphans))
_get_clause(RepositoryTag.image, candidates_orphans)))
# Any image which is the parent of another image, cannot be GCed.
parent_referenced = (Image
.select(Image.parent)
.where(Image.repository == repo.id,
Image.parent << candidates_orphans))
_get_clause(Image.parent, candidates_orphans)))
referenced_candidates = (direct_referenced | parent_referenced)
@ -197,12 +192,12 @@ def _filter_to_unreferenced(repo, candidates_orphans):
.select(Image.id, Image.docker_image_id,
ImageStorage.id, ImageStorage.uuid)
.join(ImageStorage)
.where(Image.id << candidates_orphans,
~(Image.id << referenced_candidates)))
.where(_get_clause(Image.id, candidates_orphans),
~(_get_clause(Image.id, referenced_candidates))))
return list(unreferenced_candidates)
def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False):
def garbage_collect_repo(repo, is_purge=False):
""" Garbage collect the specified repository object. This will remove all
images, derived images, and other associated metadata, for images which
are no longer referenced by a tag or another image which is itself
@ -212,26 +207,32 @@ def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False):
"""
logger.debug('Garbage collecting repository %s', repo.id)
storage_id_whitelist = set()
if is_purge:
tag.purge_all_tags(repo)
images_for_tags_removed = {i.id for i in Image.select().where(Image.repository == repo)}
return _garbage_collect_from_image(repo, images_for_tags_removed, True)
candidate_orphan_image_set = tag.garbage_collect_tags(repo)
if extra_candidate_set:
candidate_orphan_image_set.update(extra_candidate_set)
if not len(candidate_orphan_image_set):
logger.debug('No candidate images for GC for repo: %s', repo.id)
images_for_tags_removed = tag.garbage_collect_tags(repo)
if not len(images_for_tags_removed):
logger.debug('No images for GC for repo: %s', repo.id)
return True
for image in images_for_tags_removed:
candidate_list = [image.id] + list(reversed(image.ancestor_id_list()))
for candidate_id in candidate_list:
if not _garbage_collect_from_image(repo, {candidate_id}):
return False
return True
def _garbage_collect_from_image(repo, candidate_orphan_image_set, is_purge=False):
storage_id_whitelist = set()
all_images_removed = set()
all_storage_id_whitelist = set()
all_unreferenced_candidates = set()
if not is_purge:
# Remove any images directly referenced by tags, to prune the working set.
direct_referenced = (RepositoryTag.select(RepositoryTag.image).where(
RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set)))
candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced])
# Iteratively try to remove images from the database. The only images we can remove are those
# that are not referenced by tags AND not the parents of other images. We continue removing images
# until no changes are found.

View file

@ -508,10 +508,7 @@ def _delete_tags(repo, query_modifier=None):
.execute())
logger.debug('Removed %s tags with %s manifests', num_deleted_tags, num_deleted_manifests)
ancestors = reduce(lambda r, l: r | l,
(set(tag.image.ancestor_id_list()) for tag in tags_to_delete))
direct_referenced = {tag.image.id for tag in tags_to_delete}
return ancestors | direct_referenced
return [tag.image for tag in tags_to_delete]
def _get_repo_tag_image(tag_name, include_storage, modifier):