From dfe371286a4031d8b6de9cb176a4bf8d1b033f1a Mon Sep 17 00:00:00 2001
From: Joseph Schorr <josephschorr@users.noreply.github.com>
Date: Fri, 14 Jul 2017 20:09:19 +0300
Subject: [PATCH] Optimize purging of a repository by skipping the unreferenced
 check

---
 data/model/repository.py | 85 ++++++++++++++++++++++++++--------------
 1 file changed, 56 insertions(+), 29 deletions(-)

diff --git a/data/model/repository.py b/data/model/repository.py
index 2ef204862..7700dde4b 100644
--- a/data/model/repository.py
+++ b/data/model/repository.py
@@ -92,8 +92,7 @@ def purge_repository(namespace_name, repository_name):
 
   # Gc to remove the images and storage
   all_repo_images = previously_referenced | unreferenced_candidates
-  successful_gc = garbage_collect_repo(repo, all_repo_images)
-
+  successful_gc = garbage_collect_repo(repo, all_repo_images, is_purge=True)
   if not successful_gc:
     return False
 
@@ -151,7 +150,47 @@ def find_repository_with_garbage(limit_to_gc_policy_s):
     return None
 
 
-def garbage_collect_repo(repo, extra_candidate_set=None):
+def _all_images_for_gc(repo):
+  """ Returns all the images found in the given repository, for the purposes of GC. """
+  images = (Image
+            .select(Image.id, Image.docker_image_id,
+                    ImageStorage.id, ImageStorage.uuid)
+            .join(ImageStorage)
+            .where(Image.repository == repo))
+  return list(images)
+
+
+def _filter_to_unreferenced(repo, candidates_orphans):
+  """ Filters the given candidate orphan images into those unreferenced by any tag or
+      other image. """
+
+  # Any image directly referenced by a tag that still exists, cannot be GCed.
+  direct_referenced = (RepositoryTag
+                       .select(RepositoryTag.image)
+                       .where(RepositoryTag.repository == repo.id,
+                              RepositoryTag.image << candidates_orphans))
+
+  # Any image which is the parent of another image, cannot be GCed.
+  parent_referenced = (Image
+                       .select(Image.parent)
+                       .where(Image.repository == repo.id,
+                              Image.parent << candidates_orphans))
+
+  referenced_candidates = (direct_referenced | parent_referenced)
+
+  # We desire a few pieces of information from the database from the following
+  # query: all of the image ids which are associated with this repository,
+  # and the storages which are associated with those images.
+  unreferenced_candidates = (Image
+                             .select(Image.id, Image.docker_image_id,
+                                     ImageStorage.id, ImageStorage.uuid)
+                             .join(ImageStorage)
+                             .where(Image.id << candidates_orphans,
+                                    ~(Image.id << referenced_candidates)))
+  return list(unreferenced_candidates)
+
+
+def garbage_collect_repo(repo, extra_candidate_set=None, is_purge=False):
   """ Garbage collect the specified repository object. This will remove all
       images, derived images, and other associated metadata, for images which
       are no longer referenced by a tag or another image which is itself
@@ -162,8 +201,8 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
   logger.debug('Garbage collecting repository %s', repo.id)
 
   storage_id_whitelist = set()
-  candidate_orphan_image_set = tag.garbage_collect_tags(repo)
 
+  candidate_orphan_image_set = tag.garbage_collect_tags(repo)
   if extra_candidate_set:
     candidate_orphan_image_set.update(extra_candidate_set)
 
@@ -175,10 +214,11 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
   all_storage_id_whitelist = set()
   all_unreferenced_candidates = set()
 
-  # Remove any images directly referenced by tags, to prune the working set.
-  direct_referenced = (RepositoryTag.select(RepositoryTag.image).where(
-    RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set)))
-  candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced])
+  if not is_purge:
+    # Remove any images directly referenced by tags, to prune the working set.
+    direct_referenced = (RepositoryTag.select(RepositoryTag.image).where(
+      RepositoryTag.repository == repo.id, RepositoryTag.image << list(candidate_orphan_image_set)))
+    candidate_orphan_image_set.difference_update([t.image_id for t in direct_referenced])
 
   # Iteratively try to remove images from the database. The only images we can remove are those
   # that are not referenced by tags AND not the parents of other images. We continue removing images
@@ -192,32 +232,19 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
     candidates_orphans = list(candidate_orphan_image_set)
 
     with db_transaction():
-      # Any image directly referenced by a tag that still exists, cannot be GCed.
-      direct_referenced = (RepositoryTag.select(RepositoryTag.image).where(
-        RepositoryTag.repository == repo.id, RepositoryTag.image << candidates_orphans))
+      # Find the images to delete.
+      images_to_gc = (_all_images_for_gc(repo) if is_purge
+                      else _filter_to_unreferenced(repo, candidates_orphans))
 
-      # Any image which is the parent of another image, cannot be GCed.
-      parent_referenced = (Image.select(Image.parent).where(Image.repository == repo.id,
-                                                            Image.parent << candidates_orphans))
-
-      referenced_candidates = (direct_referenced | parent_referenced)
-
-      # We desire a few pieces of information from the database from the following
-      # query: all of the image ids which are associated with this repository,
-      # and the storages which are associated with those images.
-      unreferenced_candidates = (Image.select(Image.id, Image.docker_image_id, ImageStorage.id,
-                                              ImageStorage.uuid).join(ImageStorage)
-                                 .where(Image.id << candidates_orphans,
-                                        ~(Image.id << referenced_candidates)))
-
-      image_ids_to_remove = [candidate.id for candidate in unreferenced_candidates]
+      # Make sure we are making progress.
+      image_ids_to_remove = [candidate.id for candidate in images_to_gc]
       making_progress = bool(len(image_ids_to_remove))
       if len(image_ids_to_remove) == 0:
-        # No more candidates to remove.
+        # No more images to remove.
         break
 
       logger.info('Cleaning up unreferenced images: %s', image_ids_to_remove)
-      storage_id_whitelist = set([candidate.storage_id for candidate in unreferenced_candidates])
+      storage_id_whitelist = set([candidate.storage_id for candidate in images_to_gc])
 
       # Lookup any derived images for the images to remove.
       derived = DerivedStorageForImage.select().where(DerivedStorageForImage.source_image <<
@@ -246,7 +273,7 @@ def garbage_collect_repo(repo, extra_candidate_set=None):
     # Add the images to the removed set and remove them from the candidate set.
     all_images_removed.update(image_ids_to_remove)
     all_storage_id_whitelist.update(storage_id_whitelist)
-    all_unreferenced_candidates.update(unreferenced_candidates)
+    all_unreferenced_candidates.update(images_to_gc)
 
     candidate_orphan_image_set.difference_update(image_ids_to_remove)