Change garbage collection queries to be far smaller by GCing per tag and per image
While this requires far more iterations and queries, each individual query is quite small, which prevents us from locking up the database.
This commit is contained in:
parent
d0e1f464ff
commit
5124422332
2 changed files with 31 additions and 33 deletions
|
@ -93,18 +93,8 @@ def purge_repository(namespace_name, repository_name):
|
|||
ApprTag.delete().where(ApprTag.repository == repo, ~(ApprTag.linked_tag >> None)).execute()
|
||||
ApprTag.delete().where(ApprTag.repository == repo).execute()
|
||||
|
||||
# Delete all tags to allow gc to reclaim storage
|
||||
previously_referenced = tag.purge_all_tags(repo)
|
||||
unreferenced_image_q = Image.select(Image.id).where(Image.repository == repo)
|
||||
|
||||
if len(previously_referenced) > 0:
|
||||
unreferenced_image_q = (unreferenced_image_q.where(~(Image.id << list(previously_referenced))))
|
||||
|
||||
unreferenced_candidates = set(img[0] for img in unreferenced_image_q.tuples())
|
||||
|
||||
# Gc to remove the images and storage
|
||||
all_repo_images = previously_referenced | unreferenced_candidates
|
||||
successful_gc = garbage_collect_repo(repo, all_repo_images, is_purge=True)
|
||||
successful_gc = garbage_collect_repo(repo, is_purge=True)
|
||||
if not successful_gc:
|
||||
return False
|
||||
|
||||
|
@ -175,18 +165,23 @@ def _all_images_for_gc(repo):
|
|||
def _filter_to_unreferenced(repo, candidates_orphans):
|
||||
""" Filters the given candidate orphan images into those unreferenced by any tag or
|
||||
other image. """
|
||||
def _get_clause(field, candidates):
|
||||
if len(candidates) == 1:
|
||||
return field == candidates[0]
|
||||
|
||||
return field << candidates
|
||||
|
||||
# Any image directly referenced by a tag that still exists, cannot be GCed.
|
||||
direct_referenced = (RepositoryTag
|
||||
.select(RepositoryTag.image)
|
||||
.where(RepositoryTag.repository == repo.id,
|
||||
RepositoryTag.image << candidates_orphans))
|
||||
_get_clause(RepositoryTag.image, candidates_orphans)))
|
||||
|
||||
# Any image which is the parent of another image, cannot be GCed.
|
||||
parent_referenced = (Image
|
||||
.select(Image.parent)
|
||||
.where(Image.repository == repo.id,
|
||||
Image.parent << candidates_orphans))
|
||||
_get_clause(Image.parent, candidates_orphans)))
|
||||
|
||||
referenced_candidates = (direct_referenced | parent_referenced)
|
||||
|
||||
|
@ -197,12 +192,12 @@ def _filter_to_unreferenced(repo, candidates_orphans):
|
|||
.select(Image.id, Image.docker_image_id,
|
||||
ImageStorage.id, ImageStorage.uuid)
|
||||
.join(ImageStorage)
|
||||
.where(Image.id << candidates_orphans,
|
||||
~(Image.id << referenced_candidates)))
|
||||
.where(_get_clause(Image.id, candidates_orphans),
|
||||
~(_get_clause(Image.id, referenced_candidates))))
|
||||
return list(unreferenced_candidates)
|
||||
|
||||
|
||||
def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False):
|
||||
def garbage_collect_repo(repo, is_purge=False):
|
||||
""" Garbage collect the specified repository object. This will remove all
|
||||
images, derived images, and other associated metadata, for images which
|
||||
are no longer referenced by a tag or another image which is itself
|
||||
|
@ -212,26 +207,32 @@ def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False):
|
|||
"""
|
||||
logger.debug('Garbage collecting repository %s', repo.id)
|
||||
|
||||
storage_id_whitelist = set()
|
||||
if is_purge:
|
||||
tag.purge_all_tags(repo)
|
||||
images_for_tags_removed = {i.id for i in Image.select().where(Image.repository == repo)}
|
||||
return _garbage_collect_from_image(repo, images_for_tags_removed, True)
|
||||
|
||||
candidate_orphan_image_set = tag.garbage_collect_tags(repo)
|
||||
if extra_candidate_set:
|
||||
candidate_orphan_image_set.update(extra_candidate_set)
|
||||
|
||||
if not len(candidate_orphan_image_set):
|
||||
logger.debug('No candidate images for GC for repo: %s', repo.id)
|
||||
images_for_tags_removed = tag.garbage_collect_tags(repo)
|
||||
if not len(images_for_tags_removed):
|
||||
logger.debug('No images for GC for repo: %s', repo.id)
|
||||
return True
|
||||
|
||||
for image in images_for_tags_removed:
|
||||
candidate_list = [image.id] + list(reversed(image.ancestor_id_list()))
|
||||
for candidate_id in candidate_list:
|
||||
if not _garbage_collect_from_image(repo, {candidate_id}):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _garbage_collect_from_image(repo, candidate_orphan_image_set, is_purge=False):
|
||||
storage_id_whitelist = set()
|
||||
|
||||
all_images_removed = set()
|
||||
all_storage_id_whitelist = set()
|
||||
all_unreferenced_candidates = set()
|
||||
|
||||
if not is_purge:
|
||||
# Remove any images directly referenced by tags, to prune the working set.
|
||||
direct_referenced = (RepositoryTag.select(RepositoryTag.image).where(
|
||||
RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set)))
|
||||
candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced])
|
||||
|
||||
# Iteratively try to remove images from the database. The only images we can remove are those
|
||||
# that are not referenced by tags AND not the parents of other images. We continue removing images
|
||||
# until no changes are found.
|
||||
|
|
|
@ -508,10 +508,7 @@ def _delete_tags(repo, query_modifier=None):
|
|||
.execute())
|
||||
|
||||
logger.debug('Removed %s tags with %s manifests', num_deleted_tags, num_deleted_manifests)
|
||||
ancestors = reduce(lambda r, l: r | l,
|
||||
(set(tag.image.ancestor_id_list()) for tag in tags_to_delete))
|
||||
direct_referenced = {tag.image.id for tag in tags_to_delete}
|
||||
return ancestors | direct_referenced
|
||||
return [tag.image for tag in tags_to_delete]
|
||||
|
||||
|
||||
def _get_repo_tag_image(tag_name, include_storage, modifier):
|
||||
|
|
Reference in a new issue