From b5bb76cdea7c43992e3c1ac42e0dae67b8b79887 Mon Sep 17 00:00:00 2001
From: Joseph Schorr <josephschorr@users.noreply.github.com>
Date: Mon, 27 Feb 2017 17:56:44 -0500
Subject: [PATCH] Optimize repository search by changing our lookup strategy

Previous to this change, repositories were looked up unfiltered in six different queries, and then filtered using the permissions model, which issued a query per repository found, making search incredibly slow. Instead, we now lookup a chunk of repositories unfiltered and then filter them via a single query to the database. By layering the filtering on top of the lookup, each as queries, we can minimize the number of queries necessary, without (at the same time) using a super expensive join.

Other changes:
- Remove the 5 page pre-lookup on V1 search and simply return that there is one more page available, until there isn't. While technically not correct, it is much more efficient, and no one should be using pagination with V1 search anyway.
- Remove the lookup for repos without entries in the RAC table. Instead, we now add a new RAC entry when the repository is created for *the day before*, with count 0, so that it is immediately searchable
- Remove lookup of results with a matching namespace; these aren't very relevant anyway, and it overly complicates sorting
---
 data/interfaces/v1.py      |  10 ++-
 data/model/repository.py   | 122 +++++++++++++++++++++++--------------
 data/users/externalldap.py |   8 +--
 data/users/keystone.py     |   9 +--
 endpoints/api/search.py    |  21 +------
 endpoints/v1/index.py      |  30 +++------
 test/registry_tests.py     |  20 +++---
 test/test_api_usage.py     |   8 +--
 util/itertoolrecipes.py    |   6 ++
 9 files changed, 114 insertions(+), 120 deletions(-)
 create mode 100644 util/itertoolrecipes.py

diff --git a/data/interfaces/v1.py b/data/interfaces/v1.py
index 055808f14..b59142975 100644
--- a/data/interfaces/v1.py
+++ b/data/interfaces/v1.py
@@ -211,11 +211,9 @@ class DockerRegistryV1DataInterface(object):
     pass
 
   @abstractmethod
-  def get_sorted_matching_repositories(self, search_term, only_public, can_read, limit):
+  def get_sorted_matching_repositories(self, search_term, filter_username=None, offset=0, limit=25):
     """
     Returns a sorted list of repositories matching the given search term.
-    can_read is a callback that will be invoked for each repository found, to
-    filter results to only those visible to the current user (if any).
     """
     pass
 
@@ -384,9 +382,9 @@ class PreOCIModel(DockerRegistryV1DataInterface):
   def validate_oauth_token(self, token):
     return bool(model.oauth.validate_access_token(token))
 
-  def get_sorted_matching_repositories(self, search_term, only_public, can_read, limit):
-    repos = model.repository.get_sorted_matching_repositories(search_term, only_public, can_read,
-                                                              limit=limit)
+  def get_sorted_matching_repositories(self, search_term, filter_username=None, offset=0, limit=25):
+    repos = model.repository.get_filtered_matching_repositories(search_term, filter_username,
+                                                                offset, limit)
     return [_repository_for_repo(repo) for repo in repos]
 
 
diff --git a/data/model/repository.py b/data/model/repository.py
index 46de62bf7..fe12ede45 100644
--- a/data/model/repository.py
+++ b/data/model/repository.py
@@ -13,6 +13,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Im
                            Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
                            db_random_func, db_concat_func)
 from data.text import prefix_search
+from util.itertoolrecipes import take
 
 
 logger = logging.getLogger(__name__)
@@ -31,6 +32,9 @@ def create_repository(namespace, name, creating_user, visibility='private'):
   repo = Repository.create(name=name, visibility=private, namespace_user=namespace_user)
   admin = Role.get(name='admin')
 
+  yesterday = datetime.now() - timedelta(days=1)
+  RepositoryActionCount.create(repository=repo, count=0, date=yesterday)
+
   if creating_user and not creating_user.organization:
     RepositoryPermission.create(user=creating_user, repository=repo, role=admin)
 
@@ -326,70 +330,94 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta
   return query
 
 
-def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10):
-  """ Returns repositories matching the given lookup string and passing the given checker
-      function.
+def get_filtered_matching_repositories(lookup_value, filter_username=None, offset=0, limit=25):
+  """ Returns an iterator of all repositories matching the given lookup value, with optional
+      filtering to a specific user. If the user is unspecified, only public repositories will
+      be returned.
   """
-  last_week = datetime.now() - timedelta(weeks=1)
-  results = []
-  existing_ids = []
 
-  def get_search_results(search_clause, with_count=False):
-    if len(results) >= limit:
-      return
+  # Build the unfiltered search query.
+  unfiltered_query = _get_sorted_matching_repositories(lookup_value,
+                                                       include_private=filter_username is not None)
 
-    select_items = [Repository, Namespace]
-    if with_count:
-      select_items.append(fn.Sum(RepositoryActionCount.count).alias('count'))
+  # Add a filter to the iterator, if necessary.
+  if filter_username is not None:
+    iterator = _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit)
+  else:
+    iterator = unfiltered_query
 
+  if offset > 0:
+    take(offset, iterator)
+
+  # Return the results.
+  return list(take(limit, iterator))
+
+
+def _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit):
+  encountered = set()
+  chunk_count = limit * 2
+  unfiltered_page = 0
+  iteration_count = 0
+
+  while iteration_count < 10: # Just to be safe
+    # Find the next chunk's worth of repository IDs, paginated by the chunk size.
+    unfiltered_page = unfiltered_page + 1
+    found_ids = [r.id for r in unfiltered_query.paginate(unfiltered_page, chunk_count)]
+
+    # Make sure we haven't encountered these results before. This code is used to handle
+    # the case where we've previously seen a result, as pagination is not necessary
+    # stable in SQL databases.
+    unfiltered_repository_ids = set(found_ids)
+    new_unfiltered_ids = unfiltered_repository_ids - encountered
+    if not new_unfiltered_ids:
+      break
+
+    encountered.update(new_unfiltered_ids)
+
+    # Filter the repositories found to only those visible to the current user.
     query = (Repository
-             .select(*select_items)
+             .select(Repository, Namespace)
+             .distinct()
              .join(Namespace, on=(Namespace.id == Repository.namespace_user))
              .switch(Repository)
-             .where(search_clause)
-             .group_by(Repository.id, Namespace.id))
+             .join(RepositoryPermission)
+             .where(Repository.id << list(new_unfiltered_ids)))
 
-    if only_public:
-      query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
+    filtered = _basequery.filter_to_repos_for_user(query, filter_username)
 
-    if existing_ids:
-      query = query.where(~(Repository.id << existing_ids))
+    for filtered_repo in filtered:
+      yield filtered_repo
 
-    if with_count:
-      query = (query
-               .switch(Repository)
-               .join(RepositoryActionCount)
-               .where(RepositoryActionCount.date >= last_week)
-               .order_by(fn.Sum(RepositoryActionCount.count).desc()))
+    # If the number of found IDs is less than the chunk count, then we're done.
+    if len(found_ids) < chunk_count:
+      break
 
-    for result in query:
-      if len(results) >= limit:
-        return results
+    iteration_count = iteration_count + 1
 
-      # Note: We compare IDs here, instead of objects, because calling .visibility on the
-      # Repository will kick off a new SQL query to retrieve that visibility enum value. We don't
-      # join the visibility table in SQL, as well, because it is ungodly slow in MySQL :-/
-      result.is_public = result.visibility_id == _basequery.get_public_repo_visibility().id
-      result.count = result.count if with_count else 0
 
-      if not checker(result):
-        continue
+def _get_sorted_matching_repositories(lookup_value, include_private=False):
+  """ Returns a query of repositories matching the given lookup string, with optional inclusion of
+      private repositories. Note that this method does *not* filter results based on visibility
+      to users.
+  """
+  last_week = datetime.now() - timedelta(weeks=1)
 
-      results.append(result)
-      existing_ids.append(result.id)
+  query = (Repository
+           .select(Repository, Namespace)
+           .join(Namespace, on=(Namespace.id == Repository.namespace_user))
+           .where(Repository.name.match(lookup_value) | Repository.description.match(lookup_value))
+           .group_by(Repository.id))
 
-  # For performance reasons, we conduct each set of searches on their own. This also affords us the
-  # ability to easily define an order precedence.
-  get_search_results(Repository.name.match(lookup_value), with_count=True)
-  get_search_results(Repository.name.match(lookup_value), with_count=False)
+  if not include_private:
+    query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
 
-  get_search_results(Repository.description.match(lookup_value), with_count=True)
-  get_search_results(Repository.description.match(lookup_value), with_count=False)
+  query = (query
+           .switch(Repository)
+           .join(RepositoryActionCount)
+           .where(RepositoryActionCount.date >= last_week)
+           .order_by(fn.Sum(RepositoryActionCount.count).desc()))
 
-  get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True)
-  get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False)
-
-  return results
+  return query
 
 
 def lookup_repository(repo_id):
diff --git a/data/users/externalldap.py b/data/users/externalldap.py
index cd6e460fb..8cd82ac28 100644
--- a/data/users/externalldap.py
+++ b/data/users/externalldap.py
@@ -1,10 +1,10 @@
 import ldap
 import logging
 import os
-import itertools
 
 from collections import namedtuple
 from data.users.federated import FederatedUsers, UserInformation
+from util.itertoolrecipes import take
 
 logger = logging.getLogger(__name__)
 
@@ -57,10 +57,6 @@ class LDAPConnection(object):
   def __exit__(self, exc_type, value, tb):
     self._conn.unbind_s()
 
-def _take(n, iterable):
-  "Return first n items of the iterable as a list"
-  return list(itertools.islice(iterable, n))
-
 
 class LDAPUsers(FederatedUsers):
   _LDAPResult = namedtuple('LDAPResult', ['dn', 'attrs'])
@@ -154,7 +150,7 @@ class LDAPUsers(FederatedUsers):
         return (None, err_msg)
 
       logger.debug('Found matching pairs: %s', pairs)
-      results = [LDAPUsers._LDAPResult(*pair) for pair in _take(limit, pairs)]
+      results = [LDAPUsers._LDAPResult(*pair) for pair in take(limit, pairs)]
 
       # Filter out pairs without DNs. Some LDAP impls will return such pairs.
       with_dns = [result for result in results if result.dn]
diff --git a/data/users/keystone.py b/data/users/keystone.py
index 6c8487a35..b3e7ad442 100644
--- a/data/users/keystone.py
+++ b/data/users/keystone.py
@@ -1,22 +1,17 @@
 import logging
 import os
-import itertools
 
 from keystoneclient.v2_0 import client as kclient
 from keystoneclient.v3 import client as kv3client
 from keystoneclient.exceptions import AuthorizationFailure as KeystoneAuthorizationFailure
 from keystoneclient.exceptions import Unauthorized as KeystoneUnauthorized
 from data.users.federated import FederatedUsers, UserInformation
+from util.itertoolrecipes import take
 
 logger = logging.getLogger(__name__)
 
 DEFAULT_TIMEOUT = 10 # seconds
 
-def _take(n, iterable):
-  "Return first n items of the iterable as a list"
-  return list(itertools.islice(iterable, n))
-
-
 def get_keystone_users(auth_version, auth_url, admin_username, admin_password, admin_tenant,
                        timeout=None, requires_email=True):
   if auth_version == 3:
@@ -134,7 +129,7 @@ class KeystoneV3Users(FederatedUsers):
       keystone_client = kv3client.Client(username=self.admin_username, password=self.admin_password,
                                          tenant_name=self.admin_tenant, auth_url=self.auth_url,
                                          timeout=self.timeout, debug=self.debug)
-      found_users = list(_take(limit, keystone_client.users.list(name=query)))
+      found_users = list(take(limit, keystone_client.users.list(name=query)))
       logger.debug('For Keystone query %s found users: %s', query, found_users)
       if not found_users:
         return ([], self.federated_service, None)
diff --git a/endpoints/api/search.py b/endpoints/api/search.py
index 018ab713c..c8a9be22d 100644
--- a/endpoints/api/search.py
+++ b/endpoints/api/search.py
@@ -229,31 +229,16 @@ def conduct_admined_team_search(username, query, encountered_teams, results):
 
 def conduct_repo_search(username, query, results):
   """ Finds matching repositories. """
-  def can_read(repo):
-    if repo.is_public:
-      return True
-
-    return ReadRepositoryPermission(repo.namespace_user.username, repo.name).can()
-
-  only_public = username is None
-  matching_repos = model.repository.get_sorted_matching_repositories(query, only_public, can_read,
-                                                                     limit=5)
+  matching_repos = model.repository.get_filtered_matching_repositories(query, username, limit=5)
 
   for repo in matching_repos:
-    repo_score = math.log(repo.count or 1, 10) or 1
-
-    # If the repository is under the user's namespace, give it 20% more weight.
-    namespace = repo.namespace_user.username
-    if OrganizationMemberPermission(namespace).can() or namespace == username:
-      repo_score = repo_score * 1.2
-
     results.append({
       'kind': 'repository',
       'namespace': search_entity_view(username, repo.namespace_user),
       'name': repo.name,
       'description': repo.description,
-      'is_public': repo.is_public,
-      'score': repo_score,
+      'is_public': model.repository.is_repository_public(repo),
+      'score': 4,
       'href': '/repository/' + repo.namespace_user.username + '/' + repo.name
     })
 
diff --git a/endpoints/v1/index.py b/endpoints/v1/index.py
index f70b77f5c..028379554 100644
--- a/endpoints/v1/index.py
+++ b/endpoints/v1/index.py
@@ -312,33 +312,19 @@ def get_search():
 
 def _conduct_repo_search(username, query, limit=25, page=1):
   """ Finds matching repositories. """
-  only_public = username is None
-
-  def can_read(repo):
-    if repo.is_public:
-      return True
-
-    if only_public:
-      return False
-
-    return ReadRepositoryPermission(repo.namespace_user.username, repo.name).can()
-
-  # Note: We put a max 5 page limit here. The Docker CLI doesn't seem to use the
-  # pagination and most customers hitting the API should be using V2 catalog, so this
-  # is a safety net for our slow search below, since we have to use the slow approach
-  # of finding *all* the results, and then slicing in-memory, because this old API requires
-  # the *full* page count in the returned results.
-  _MAX_PAGE_COUNT = 5
-  page = min(page, _MAX_PAGE_COUNT)
+  # Note that we put a maximum limit of five pages here, because this API should only really ever
+  # be used by the Docker CLI, and it doesn't even paginate.
+  page = min(page, 5)
+  offset = (page - 1) * limit
 
   if query:
-    matching_repos = model.get_sorted_matching_repositories(query, only_public, can_read,
-                                                            limit=limit*_MAX_PAGE_COUNT)
+    matching_repos = model.get_sorted_matching_repositories(query, username, limit=limit+1,
+                                                            offset=offset)
   else:
     matching_repos = []
 
   results = []
-  for repo in matching_repos[(page - 1) * _MAX_PAGE_COUNT:limit]:
+  for repo in matching_repos[0:limit]:
     results.append({
       'name': repo.namespace_name + '/' + repo.name,
       'description': repo.description,
@@ -350,7 +336,7 @@ def _conduct_repo_search(username, query, limit=25, page=1):
   return {
     'query': query,
     'num_results': len(results),
-    'num_pages': (len(matching_repos) / limit) + 1,
+    'num_pages': page + 1 if len(matching_repos) > limit else page,
     'page': page,
     'page_size': limit,
     'results': results,
diff --git a/test/registry_tests.py b/test/registry_tests.py
index 21cc7ad8e..a391535d2 100644
--- a/test/registry_tests.py
+++ b/test/registry_tests.py
@@ -1291,7 +1291,8 @@ class V1RegistryTests(V1RegistryPullMixin, V1RegistryPushMixin, RegistryTestsMix
 
   def test_search_pagination(self):
     # Check for the first page.
-    resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1'))
+    resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1'),
+                        auth=('devtable', 'password'))
     data = resp.json()
     self.assertEquals('s', data['query'])
 
@@ -1301,17 +1302,16 @@ class V1RegistryTests(V1RegistryPullMixin, V1RegistryPushMixin, RegistryTestsMix
     self.assertEquals(1, data['page'])
     self.assertTrue(data['num_pages'] > 1)
 
-    # Check for the followup pages.
-    for page_index in range(1, data['num_pages']):
-      resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1', page=page_index))
-      data = resp.json()
-      self.assertEquals('s', data['query'])
+    # Check for the followup page.
+    resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1', page=2),
+                        auth=('devtable', 'password'))
+    data = resp.json()
+    self.assertEquals('s', data['query'])
 
-      self.assertEquals(1, data['num_results'])
-      self.assertEquals(1, len(data['results']))
+    self.assertEquals(1, data['num_results'])
+    self.assertEquals(1, len(data['results']))
 
-      self.assertEquals(1, data['page'])
-      self.assertTrue(data['num_pages'] > 1)
+    self.assertEquals(2, data['page'])
 
 
   def test_users(self):
diff --git a/test/test_api_usage.py b/test/test_api_usage.py
index eead6487f..3a0ee42f0 100644
--- a/test/test_api_usage.py
+++ b/test/test_api_usage.py
@@ -971,11 +971,11 @@ class TestConductSearch(ApiTestCase):
                                 params=dict(query='public'))
 
     self.assertEquals(2, len(json['results']))
-    self.assertEquals(json['results'][0]['kind'], 'user')
-    self.assertEquals(json['results'][0]['name'], 'public')
+    self.assertEquals(json['results'][0]['kind'], 'repository')
+    self.assertEquals(json['results'][0]['name'], 'publicrepo')
 
-    self.assertEquals(json['results'][1]['kind'], 'repository')
-    self.assertEquals(json['results'][1]['name'], 'publicrepo')
+    self.assertEquals(json['results'][1]['kind'], 'user')
+    self.assertEquals(json['results'][1]['name'], 'public')
 
     json = self.getJsonResponse(ConductSearch,
                                 params=dict(query='owners'))
diff --git a/util/itertoolrecipes.py b/util/itertoolrecipes.py
new file mode 100644
index 000000000..1cd33386d
--- /dev/null
+++ b/util/itertoolrecipes.py
@@ -0,0 +1,6 @@
+from itertools import islice
+
+# From: https://docs.python.org/2/library/itertools.html
+def take(n, iterable):
+  """ Return first n items of the iterable as a list """
+  return list(islice(iterable, n))