Reduce database bandwidth by tracking gc candidate images.

2016-08-26 14:48:39 -04:00 · 2016-08-26 14:48:39 -04:00 · 584a5a7ddd
commit 584a5a7ddd
parent 0815f6b6c4
5 changed files with 161 additions and 107 deletions
--- a/data/model/repository.py
+++ b/data/model/repository.py
@@ -11,7 +11,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Us
                           Visibility, RepositoryPermission, RepositoryActionCount,
                           Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage,
                           Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
-                           db_random_func)
+                           db_random_func, db_concat_func)


 logger = logging.getLogger(__name__)
@@ -43,45 +43,21 @@ def get_repository(namespace_name, repository_name):
    return None


-def _purge_all_repository_tags(namespace_name, repository_name):
-  """ Immediately purge all repository tags without respecting the lifeline procedure """
-  try:
-    repo = _basequery.get_existing_repository(namespace_name, repository_name)
-  except Repository.DoesNotExist:
-    raise DataModelException('Invalid repository \'%s/%s\'' %
-                             (namespace_name, repository_name))
-
-  # Finds all the tags to delete.
-  repo_tags = list(RepositoryTag.select().where(RepositoryTag.repository == repo.id))
-  if not repo_tags:
-    return
-
-  # Find all labels to delete.
-  manifest_labels_query = (TagManifestLabel
-                           .select()
-                           .where(TagManifestLabel.repository == repo))
-
-  label_ids = [manifest_label.label_id for manifest_label in manifest_labels_query]
-  if label_ids:
-    # Delete all the mapping entries.
-    TagManifestLabel.delete().where(TagManifestLabel.repository == repo).execute()
-
-    # Delete all the matching labels.
-    Label.delete().where(Label.id << label_ids).execute()
-
-  # Delete all the manifests.
-  TagManifest.delete().where(TagManifest.tag << repo_tags).execute()
-
-  # Delete all tags.
-  RepositoryTag.delete().where(RepositoryTag.repository == repo.id).execute()
-
-
 def purge_repository(namespace_name, repository_name):
+  repo = _basequery.get_existing_repository(namespace_name, repository_name)
+
  # Delete all tags to allow gc to reclaim storage
-  _purge_all_repository_tags(namespace_name, repository_name)
+  previously_referenced = tag.purge_all_tags(repo)
+  unreferenced_image_q = Image.select(Image.id).where(Image.repository == repo)
+
+  if len(previously_referenced) > 0:
+    unreferenced_image_q = (unreferenced_image_q
+                            .where(~(Image.id << list(previously_referenced))))
+
+  unreferenced_candidates = set(img[0] for img in unreferenced_image_q.tuples())

  # Gc to remove the images and storage
-  garbage_collect_repository(namespace_name, repository_name)
+  garbage_collect_repo(repo, previously_referenced | unreferenced_candidates)

  # Delete the rest of the repository metadata
  fetched = _basequery.get_existing_repository(namespace_name, repository_name)
@@ -135,34 +111,46 @@ def find_repository_with_garbage(limit_to_gc_policy_s):
    return None


-def garbage_collect_repository(namespace_name, repository_name):
-  repo = get_repository(namespace_name, repository_name)
-  if repo is not None:
-    garbage_collect_repo(repo)
-
-
-def garbage_collect_repo(repo):
+def garbage_collect_repo(repo, extra_candidate_set=None):
  logger.debug('Garbage collecting repository %s', repo.id)

  storage_id_whitelist = set()
-  tag.garbage_collect_tags(repo)
+  candidate_orphan_image_set = tag.garbage_collect_tags(repo)
+
+  if extra_candidate_set:
+    candidate_orphan_image_set.update(extra_candidate_set)
+
+  if not len(candidate_orphan_image_set):
+    logger.debug('No candidate images for GC for repo: %s', repo.id)
+    return
+
+  candidates_orphans = list(candidate_orphan_image_set)

  with db_transaction():
-    # Get a list of all images used by tags in the repository
-    tagged_images = (Image
-                     .select(Image.id, Image.ancestors)
-                     .join(RepositoryTag)
-                     .where(Image.repository == repo))
+    Candidate = Image.alias()
+    Tagged = Image.alias()
+    ancestor_superset = Tagged.ancestors ** db_concat_func(Candidate.ancestors, Candidate.id, '/%')

-    def gen_referenced_ancestors():
-      for tagged_image in tagged_images:
-        # The ancestor list is in the format '/1/2/3/', extract just the ids
-        ancestor_id_strings = tagged_image.ancestor_list()
-        for img_id_str in ancestor_id_strings:
-          yield int(img_id_str)
-        yield tagged_image.id
+    # We compute all candidate images which are still being referenced in two ways:
+    # First, we find all candidates whose ancestor path (ancestors + id) is a prefix
+    # of a tagged image's ancestors, i.e. which are an ancestor of a tagged image.
+    # Second, we union in all of the candidate images which are directly referenced
+    # by a tag. This can be used in a subquery to directly find which candidates are
+    # being referenced without any client side computation or extra round trips.
+    ancestor_referenced = (Candidate
+                           .select(Candidate.id)
+                           .join(Tagged, on=ancestor_superset)
+                           .join(RepositoryTag, on=(Tagged.id == RepositoryTag.image))
+                           .where(RepositoryTag.repository == repo.id,
+                                  Candidate.id << candidates_orphans))

-    referenced_ancestors = set(gen_referenced_ancestors())
+    direct_referenced = (Candidate
+                         .select(Candidate.id)
+                         .join(RepositoryTag)
+                         .where(RepositoryTag.repository == repo.id,
+                                Candidate.id << candidates_orphans))
+
+    referenced_candidates = (direct_referenced | ancestor_referenced)

    # We desire two pieces of information from the database from the following
    # query: all of the image ids which are associated with this repository,
@@ -171,13 +159,18 @@ def garbage_collect_repo(repo):
    # code, which is overkill for just two fields, we use a tuple query, and
    # feed that directly to the dictionary tuple constructor which takes an
    # iterable of tuples containing [(k, v), (k, v), ...]
-    all_repo_images = Image.select(Image.id, Image.storage).where(Image.repository == repo).tuples()
-    images_to_storages = dict(all_repo_images)
-    to_remove = list(set(images_to_storages.keys()).difference(referenced_ancestors))
+    unreferenced_candidates = (Image
+                               .select(Image.id, Image.storage)
+                               .where(Image.id << candidates_orphans,
+                                      ~(Image.id << referenced_candidates))
+                               .tuples())
+
+    unreferecend_images_to_storages = dict(unreferenced_candidates)
+    to_remove = unreferecend_images_to_storages.keys()

    if len(to_remove) > 0:
      logger.info('Cleaning up unreferenced images: %s', to_remove)
-      storage_id_whitelist = {images_to_storages[to_remove_id] for to_remove_id in to_remove}
+      storage_id_whitelist = set(unreferecend_images_to_storages.values())

      # Lookup any derived images for the images to remove.
      derived = DerivedStorageForImage.select().where(