Merge pull request #2392 from coreos-inc/search-optimization

Optimize repository search by changing our lookup strategy
2017-03-10 15:44:26 -05:00 · 2017-03-10 15:44:26 -05:00 · 432b2d3fe8
commit 432b2d3fe8
parent c724af12cd d42ec4e585
9 changed files with 123 additions and 123 deletions
--- a/data/interfaces/v1.py
+++ b/data/interfaces/v1.py
@ -211,11 +211,9 @@ class DockerRegistryV1DataInterface(object):
    pass

  @abstractmethod
-  def get_sorted_matching_repositories(self, search_term, only_public, can_read, limit):
+  def get_sorted_matching_repositories(self, search_term, filter_username=None, offset=0, limit=25):
    """
    Returns a sorted list of repositories matching the given search term.
-    can_read is a callback that will be invoked for each repository found, to
-    filter results to only those visible to the current user (if any).
    """
    pass

@ -384,9 +382,9 @@ class PreOCIModel(DockerRegistryV1DataInterface):
  def validate_oauth_token(self, token):
    return bool(model.oauth.validate_access_token(token))

-  def get_sorted_matching_repositories(self, search_term, only_public, can_read, limit):
-    repos = model.repository.get_sorted_matching_repositories(search_term, only_public, can_read,
-                                                              limit=limit)
+  def get_sorted_matching_repositories(self, search_term, filter_username=None, offset=0, limit=25):
+    repos = model.repository.get_filtered_matching_repositories(search_term, filter_username,
+                                                                offset, limit)
    return [_repository_for_repo(repo) for repo in repos]


--- a/data/model/repository.py
+++ b/data/model/repository.py
@ -13,6 +13,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Im
                           Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
                           db_random_func, db_concat_func)
 from data.text import prefix_search
+from util.itertoolrecipes import take


 logger = logging.getLogger(__name__)
@ -31,6 +32,9 @@ def create_repository(namespace, name, creating_user, visibility='private'):
  repo = Repository.create(name=name, visibility=private, namespace_user=namespace_user)
  admin = Role.get(name='admin')

+  yesterday = datetime.now() - timedelta(days=1)
+  RepositoryActionCount.create(repository=repo, count=0, date=yesterday)
+
  if creating_user and not creating_user.organization:
    RepositoryPermission.create(user=creating_user, repository=repo, role=admin)

@ -326,70 +330,94 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta
  return query


-def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10):
-  """ Returns repositories matching the given lookup string and passing the given checker
-      function.
+def get_filtered_matching_repositories(lookup_value, filter_username=None, offset=0, limit=25):
+  """ Returns an iterator of all repositories matching the given lookup value, with optional
+      filtering to a specific user. If the user is unspecified, only public repositories will
+      be returned.
  """
-  last_week = datetime.now() - timedelta(weeks=1)
-  results = []
-  existing_ids = []

-  def get_search_results(search_clause, with_count=False):
-    if len(results) >= limit:
-      return
+  # Build the unfiltered search query.
+  unfiltered_query = _get_sorted_matching_repositories(lookup_value,
+                                                       include_private=filter_username is not None)

-    select_items = [Repository, Namespace]
-    if with_count:
-      select_items.append(fn.Sum(RepositoryActionCount.count).alias('count'))
+  # Add a filter to the iterator, if necessary.
+  if filter_username is not None:
+    iterator = _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit)
+  else:
+    iterator = unfiltered_query

+  if offset > 0:
+    take(offset, iterator)
+
+  # Return the results.
+  return list(take(limit, iterator))
+
+
+def _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit):
+  encountered = set()
+  chunk_count = limit * 2
+  unfiltered_page = 0
+  iteration_count = 0
+
+  while iteration_count < 10: # Just to be safe
+    # Find the next chunk's worth of repository IDs, paginated by the chunk size.
+    unfiltered_page = unfiltered_page + 1
+    found_ids = [r.id for r in unfiltered_query.paginate(unfiltered_page, chunk_count)]
+
+    # Make sure we haven't encountered these results before. This code is used to handle
+    # the case where we've previously seen a result, as pagination is not necessary
+    # stable in SQL databases.
+    unfiltered_repository_ids = set(found_ids)
+    new_unfiltered_ids = unfiltered_repository_ids - encountered
+    if not new_unfiltered_ids:
+      break
+
+    encountered.update(new_unfiltered_ids)
+
+    # Filter the repositories found to only those visible to the current user.
    query = (Repository
-             .select(*select_items)
+             .select(Repository, Namespace)
+             .distinct()
             .join(Namespace, on=(Namespace.id == Repository.namespace_user))
             .switch(Repository)
-             .where(search_clause)
-             .group_by(Repository.id, Namespace.id))
+             .join(RepositoryPermission)
+             .where(Repository.id << list(new_unfiltered_ids)))

-    if only_public:
-      query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
+    filtered = _basequery.filter_to_repos_for_user(query, filter_username)

-    if existing_ids:
-      query = query.where(~(Repository.id << existing_ids))
+    for filtered_repo in filtered:
+      yield filtered_repo

-    if with_count:
-      query = (query
-               .switch(Repository)
-               .join(RepositoryActionCount)
-               .where(RepositoryActionCount.date >= last_week)
-               .order_by(fn.Sum(RepositoryActionCount.count).desc()))
+    # If the number of found IDs is less than the chunk count, then we're done.
+    if len(found_ids) < chunk_count:
+      break

-    for result in query:
-      if len(results) >= limit:
-        return results
+    iteration_count = iteration_count + 1

-      # Note: We compare IDs here, instead of objects, because calling .visibility on the
-      # Repository will kick off a new SQL query to retrieve that visibility enum value. We don't
-      # join the visibility table in SQL, as well, because it is ungodly slow in MySQL :-/
-      result.is_public = result.visibility_id == _basequery.get_public_repo_visibility().id
-      result.count = result.count if with_count else 0

-      if not checker(result):
-        continue
+def _get_sorted_matching_repositories(lookup_value, include_private=False):
+  """ Returns a query of repositories matching the given lookup string, with optional inclusion of
+      private repositories. Note that this method does *not* filter results based on visibility
+      to users.
+  """
+  last_week = datetime.now() - timedelta(weeks=1)

-      results.append(result)
-      existing_ids.append(result.id)
+  query = (Repository
+           .select(Repository, Namespace)
+           .join(Namespace, on=(Namespace.id == Repository.namespace_user))
+           .where(Repository.name.match(lookup_value) | Repository.description.match(lookup_value))
+           .group_by(Repository.id))

-  # For performance reasons, we conduct each set of searches on their own. This also affords us the
-  # ability to easily define an order precedence.
-  get_search_results(Repository.name.match(lookup_value), with_count=True)
-  get_search_results(Repository.name.match(lookup_value), with_count=False)
+  if not include_private:
+    query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())

-  get_search_results(Repository.description.match(lookup_value), with_count=True)
-  get_search_results(Repository.description.match(lookup_value), with_count=False)
+  query = (query
+           .switch(Repository)
+           .join(RepositoryActionCount)
+           .where(RepositoryActionCount.date >= last_week)
+           .order_by(fn.Sum(RepositoryActionCount.count).desc()))

-  get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True)
-  get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False)
-
-  return results
+  return query


 def lookup_repository(repo_id):
--- a/data/users/externalldap.py
+++ b/data/users/externalldap.py
@ -1,10 +1,10 @@
 import ldap
 import logging
 import os
-import itertools

 from collections import namedtuple
 from data.users.federated import FederatedUsers, UserInformation
+from util.itertoolrecipes import take

 logger = logging.getLogger(__name__)

@ -57,10 +57,6 @@ class LDAPConnection(object):
  def __exit__(self, exc_type, value, tb):
    self._conn.unbind_s()

-def _take(n, iterable):
-  "Return first n items of the iterable as a list"
-  return list(itertools.islice(iterable, n))
-

 class LDAPUsers(FederatedUsers):
  _LDAPResult = namedtuple('LDAPResult', ['dn', 'attrs'])
@ -154,7 +150,7 @@ class LDAPUsers(FederatedUsers):
        return (None, err_msg)

      logger.debug('Found matching pairs: %s', pairs)
-      results = [LDAPUsers._LDAPResult(*pair) for pair in _take(limit, pairs)]
+      results = [LDAPUsers._LDAPResult(*pair) for pair in take(limit, pairs)]

      # Filter out pairs without DNs. Some LDAP impls will return such pairs.
      with_dns = [result for result in results if result.dn]
--- a/data/users/keystone.py
+++ b/data/users/keystone.py
@ -1,22 +1,17 @@
 import logging
 import os
-import itertools

 from keystoneclient.v2_0 import client as kclient
 from keystoneclient.v3 import client as kv3client
 from keystoneclient.exceptions import AuthorizationFailure as KeystoneAuthorizationFailure
 from keystoneclient.exceptions import Unauthorized as KeystoneUnauthorized
 from data.users.federated import FederatedUsers, UserInformation
+from util.itertoolrecipes import take

 logger = logging.getLogger(__name__)

 DEFAULT_TIMEOUT = 10 # seconds

-def _take(n, iterable):
-  "Return first n items of the iterable as a list"
-  return list(itertools.islice(iterable, n))
-
-
 def get_keystone_users(auth_version, auth_url, admin_username, admin_password, admin_tenant,
                       timeout=None, requires_email=True):
  if auth_version == 3:
@ -134,7 +129,7 @@ class KeystoneV3Users(FederatedUsers):
      keystone_client = kv3client.Client(username=self.admin_username, password=self.admin_password,
                                         tenant_name=self.admin_tenant, auth_url=self.auth_url,
                                         timeout=self.timeout, debug=self.debug)
-      found_users = list(_take(limit, keystone_client.users.list(name=query)))
+      found_users = list(take(limit, keystone_client.users.list(name=query)))
      logger.debug('For Keystone query %s found users: %s', query, found_users)
      if not found_users:
        return ([], self.federated_service, None)