From 5124422332115cd3ce932bd3a6646b0ba1fdf52f Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Mon, 5 Nov 2018 11:57:32 -0500 Subject: [PATCH] Change garbage collection queries to be far smaller by GCing per tag and per image While this will require far more iterations and queries, each query itself will be quite small, thus preventing us from locking up the database --- data/model/repository.py | 59 ++++++++++++++++++++-------------------- data/model/tag.py | 5 +--- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/data/model/repository.py b/data/model/repository.py index 6bd833a5b..ea01c63a4 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -93,18 +93,8 @@ def purge_repository(namespace_name, repository_name): ApprTag.delete().where(ApprTag.repository == repo, ~(ApprTag.linked_tag >> None)).execute() ApprTag.delete().where(ApprTag.repository == repo).execute() - # Delete all tags to allow gc to reclaim storage - previously_referenced = tag.purge_all_tags(repo) - unreferenced_image_q = Image.select(Image.id).where(Image.repository == repo) - - if len(previously_referenced) > 0: - unreferenced_image_q = (unreferenced_image_q.where(~(Image.id << list(previously_referenced)))) - - unreferenced_candidates = set(img[0] for img in unreferenced_image_q.tuples()) - # Gc to remove the images and storage - all_repo_images = previously_referenced | unreferenced_candidates - successful_gc = garbage_collect_repo(repo, all_repo_images, is_purge=True) + successful_gc = garbage_collect_repo(repo, is_purge=True) if not successful_gc: return False @@ -175,18 +165,23 @@ def _all_images_for_gc(repo): def _filter_to_unreferenced(repo, candidates_orphans): """ Filters the given candidate orphan images into those unreferenced by any tag or other image. """ + def _get_clause(field, candidates): + if len(candidates) == 1: + return field == candidates[0] + + return field << candidates # Any image directly referenced by a tag that still exists, cannot be GCed. direct_referenced = (RepositoryTag .select(RepositoryTag.image) .where(RepositoryTag.repository == repo.id, - RepositoryTag.image << candidates_orphans)) + _get_clause(RepositoryTag.image, candidates_orphans))) # Any image which is the parent of another image, cannot be GCed. parent_referenced = (Image .select(Image.parent) .where(Image.repository == repo.id, - Image.parent << candidates_orphans)) + _get_clause(Image.parent, candidates_orphans))) referenced_candidates = (direct_referenced | parent_referenced) @@ -197,12 +192,12 @@ def _filter_to_unreferenced(repo, candidates_orphans): .select(Image.id, Image.docker_image_id, ImageStorage.id, ImageStorage.uuid) .join(ImageStorage) - .where(Image.id << candidates_orphans, - ~(Image.id << referenced_candidates))) + .where(_get_clause(Image.id, candidates_orphans), + ~(_get_clause(Image.id, referenced_candidates)))) return list(unreferenced_candidates) -def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False): +def garbage_collect_repo(repo, is_purge=False): """ Garbage collect the specified repository object. This will remove all images, derived images, and other associated metadata, for images which are no longer referenced by a tag or another image which is itself @@ -212,26 +207,32 @@ def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False): """ logger.debug('Garbage collecting repository %s', repo.id) - storage_id_whitelist = set() + if is_purge: + tag.purge_all_tags(repo) + images_for_tags_removed = {i.id for i in Image.select().where(Image.repository == repo)} + return _garbage_collect_from_image(repo, images_for_tags_removed, True) - candidate_orphan_image_set = tag.garbage_collect_tags(repo) - if extra_candidate_set: - candidate_orphan_image_set.update(extra_candidate_set) - - if not len(candidate_orphan_image_set): - logger.debug('No candidate images for GC for repo: %s', repo.id) + images_for_tags_removed = tag.garbage_collect_tags(repo) + if not len(images_for_tags_removed): + logger.debug('No images for GC for repo: %s', repo.id) return True + for image in images_for_tags_removed: + candidate_list = [image.id] + list(reversed(image.ancestor_id_list())) + for candidate_id in candidate_list: + if not _garbage_collect_from_image(repo, {candidate_id}): + return False + + return True + + +def _garbage_collect_from_image(repo, candidate_orphan_image_set, is_purge=False): + storage_id_whitelist = set() + all_images_removed = set() all_storage_id_whitelist = set() all_unreferenced_candidates = set() - if not is_purge: - # Remove any images directly referenced by tags, to prune the working set. - direct_referenced = (RepositoryTag.select(RepositoryTag.image).where( - RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set))) - candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced]) - # Iteratively try to remove images from the database. The only images we can remove are those # that are not referenced by tags AND not the parents of other images. We continue removing images # until no changes are found. diff --git a/data/model/tag.py b/data/model/tag.py index 3cd14f298..6998beec1 100644 --- a/data/model/tag.py +++ b/data/model/tag.py @@ -508,10 +508,7 @@ def _delete_tags(repo, query_modifier=None): .execute()) logger.debug('Removed %s tags with %s manifests', num_deleted_tags, num_deleted_manifests) - ancestors = reduce(lambda r, l: r | l, - (set(tag.image.ancestor_id_list()) for tag in tags_to_delete)) - direct_referenced = {tag.image.id for tag in tags_to_delete} - return ancestors | direct_referenced + return [tag.image for tag in tags_to_delete] def _get_repo_tag_image(tag_name, include_storage, modifier):