From f07b940bc57dc763a296b74c0ad9d63737bf2f81 Mon Sep 17 00:00:00 2001
From: Joseph Schorr <josephschorr@users.noreply.github.com>
Date: Thu, 3 Dec 2015 16:19:22 -0500
Subject: [PATCH] Optimize blob lookup

Fixes #1013
---
 data/model/blob.py    | 34 +++++++++++++++-------------------
 data/model/storage.py | 11 ++++++++++-
 test/test_queries.py  | 21 ++++++++++++++++++++-
 3 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/data/model/blob.py b/data/model/blob.py
index 5b8d38420..b05ce6100 100644
--- a/data/model/blob.py
+++ b/data/model/blob.py
@@ -1,6 +1,7 @@
 from uuid import uuid4
 
-from data.model import tag, _basequery, BlobDoesNotExist, InvalidBlobUpload, db_transaction
+from data.model import (tag, _basequery, BlobDoesNotExist, InvalidBlobUpload, db_transaction,
+                        storage as storage_model, InvalidImageException)
 from data.database import (Repository, Namespace, ImageStorage, Image, ImageStorageLocation,
                            ImageStoragePlacement, BlobUpload)
 
@@ -8,26 +9,21 @@ from data.database import (Repository, Namespace, ImageStorage, Image, ImageStor
 def get_repo_blob_by_digest(namespace, repo_name, blob_digest):
   """ Find the content-addressable blob linked to the specified repository.
   """
-  placements = list(ImageStoragePlacement
-                    .select(ImageStoragePlacement, ImageStorage, ImageStorageLocation)
-                    .join(ImageStorageLocation)
-                    .switch(ImageStoragePlacement)
-                    .join(ImageStorage)
-                    .join(Image)
-                    .join(Repository)
-                    .join(Namespace, on=(Namespace.id == Repository.namespace_user))
-                    .where(Repository.name == repo_name, Namespace.username == namespace,
-                           ImageStorage.content_checksum == blob_digest,
-                           ImageStorage.uploading == False))
-  if not placements:
+  try:
+    storage_id_query = (ImageStorage
+               .select(ImageStorage.id)
+               .join(Image)
+               .join(Repository)
+               .join(Namespace, on=(Namespace.id == Repository.namespace_user))
+               .where(Repository.name == repo_name, Namespace.username == namespace,
+                      ImageStorage.content_checksum == blob_digest,
+                      ImageStorage.uploading == False)
+               .limit(1))
+
+    return storage_model.get_storage_by_subquery(storage_id_query)
+  except InvalidImageException:
     raise BlobDoesNotExist('Blob does not exist with digest: {0}'.format(blob_digest))
 
-  found = placements[0].storage
-  found.locations = {placement.location.name for placement in placements
-                     if placement.storage.id == found.id}
-
-  return found
-
 
 def store_blob_record_and_temp_link(namespace, repo_name, blob_digest, location_obj, byte_count,
                                     link_expiration_s):
diff --git a/data/model/storage.py b/data/model/storage.py
index 5cc18b61f..f144d157a 100644
--- a/data/model/storage.py
+++ b/data/model/storage.py
@@ -131,6 +131,15 @@ def _get_storage(query_modifier):
   return found
 
 
+def get_storage_by_subquery(subquery):
+  """ Returns the storage (and its locations) for the storage id returned by the subquery. The
+      subquery must return at most 1 result, which is a storage ID. """
+  def filter_by_subquery(query):
+    return query.where(ImageStorage.id == subquery)
+
+  return _get_storage(filter_by_subquery)
+
+
 def get_storage_by_uuid(storage_uuid):
   def filter_to_uuid(query):
     return query.where(ImageStorage.uuid == storage_uuid)
@@ -202,7 +211,7 @@ def get_storage_locations(uuid):
            .select()
            .join(ImageStorageLocation)
            .switch(ImageStoragePlacement)
-           .join(ImageStorage, JOIN_LEFT_OUTER)
+           .join(ImageStorage)
            .where(ImageStorage.uuid == uuid))
 
   return [location.location.name for location in query]
diff --git a/test/test_queries.py b/test/test_queries.py
index d538bea74..17c383028 100644
--- a/test/test_queries.py
+++ b/test/test_queries.py
@@ -3,7 +3,7 @@ import unittest
 from app import app
 from initdb import setup_database_for_testing, finished_database_for_testing
 from data import model
-from data.database import RepositoryBuild
+from data.database import RepositoryBuild, Repository, Image, ImageStorage
 
 ADMIN_ACCESS_USER = 'devtable'
 SIMPLE_REPO = 'simple'
@@ -45,5 +45,24 @@ class TestSpecificQueries(unittest.TestCase):
     self.assertEquals(created.id, result.id)
     self.assertEquals(created.uuid, result.uuid)
 
+  def test_lookup_repo_blob(self):
+    repo = model.repository.get_repository(ADMIN_ACCESS_USER, SIMPLE_REPO)
+    expected = list(ImageStorage.select().join(Image).where(Image.repository == repo))
+    self.assertTrue(len(expected) > 0)
+
+    for storage in expected:
+      found = model.blob.get_repo_blob_by_digest(ADMIN_ACCESS_USER, SIMPLE_REPO,
+                                                 storage.content_checksum)
+      self.assertEquals(found.id, storage.id)
+
+    try:
+      model.blob.get_repo_blob_by_digest(ADMIN_ACCESS_USER, SIMPLE_REPO, 'invalidchecksum')
+    except model.BlobDoesNotExist:
+      return
+
+    self.fail('Expected BlobDoesNotExist exception')
+
+
+
 if __name__ == '__main__':
   unittest.main()
\ No newline at end of file