Merge pull request #2392 from coreos-inc/search-optimization

Optimize repository search by changing our lookup strategy
This commit is contained in:
josephschorr 2017-03-10 15:44:26 -05:00 committed by GitHub
commit 432b2d3fe8
9 changed files with 123 additions and 123 deletions

View file

@ -211,11 +211,9 @@ class DockerRegistryV1DataInterface(object):
pass
@abstractmethod
def get_sorted_matching_repositories(self, search_term, only_public, can_read, limit):
def get_sorted_matching_repositories(self, search_term, filter_username=None, offset=0, limit=25):
"""
Returns a sorted list of repositories matching the given search term.
can_read is a callback that will be invoked for each repository found, to
filter results to only those visible to the current user (if any).
"""
pass
@ -384,9 +382,9 @@ class PreOCIModel(DockerRegistryV1DataInterface):
def validate_oauth_token(self, token):
return bool(model.oauth.validate_access_token(token))
def get_sorted_matching_repositories(self, search_term, only_public, can_read, limit):
repos = model.repository.get_sorted_matching_repositories(search_term, only_public, can_read,
limit=limit)
def get_sorted_matching_repositories(self, search_term, filter_username=None, offset=0, limit=25):
repos = model.repository.get_filtered_matching_repositories(search_term, filter_username,
offset, limit)
return [_repository_for_repo(repo) for repo in repos]

View file

@ -13,6 +13,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Im
Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
db_random_func, db_concat_func)
from data.text import prefix_search
from util.itertoolrecipes import take
logger = logging.getLogger(__name__)
@ -31,6 +32,9 @@ def create_repository(namespace, name, creating_user, visibility='private'):
repo = Repository.create(name=name, visibility=private, namespace_user=namespace_user)
admin = Role.get(name='admin')
yesterday = datetime.now() - timedelta(days=1)
RepositoryActionCount.create(repository=repo, count=0, date=yesterday)
if creating_user and not creating_user.organization:
RepositoryPermission.create(user=creating_user, repository=repo, role=admin)
@ -326,70 +330,94 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta
return query
def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10):
""" Returns repositories matching the given lookup string and passing the given checker
function.
def get_filtered_matching_repositories(lookup_value, filter_username=None, offset=0, limit=25):
""" Returns an iterator of all repositories matching the given lookup value, with optional
filtering to a specific user. If the user is unspecified, only public repositories will
be returned.
"""
last_week = datetime.now() - timedelta(weeks=1)
results = []
existing_ids = []
def get_search_results(search_clause, with_count=False):
if len(results) >= limit:
return
# Build the unfiltered search query.
unfiltered_query = _get_sorted_matching_repositories(lookup_value,
include_private=filter_username is not None)
select_items = [Repository, Namespace]
if with_count:
select_items.append(fn.Sum(RepositoryActionCount.count).alias('count'))
# Add a filter to the iterator, if necessary.
if filter_username is not None:
iterator = _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit)
else:
iterator = unfiltered_query
if offset > 0:
take(offset, iterator)
# Return the results.
return list(take(limit, iterator))
def _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit):
encountered = set()
chunk_count = limit * 2
unfiltered_page = 0
iteration_count = 0
while iteration_count < 10: # Just to be safe
# Find the next chunk's worth of repository IDs, paginated by the chunk size.
unfiltered_page = unfiltered_page + 1
found_ids = [r.id for r in unfiltered_query.paginate(unfiltered_page, chunk_count)]
# Make sure we haven't encountered these results before. This code is used to handle
# the case where we've previously seen a result, as pagination is not necessary
# stable in SQL databases.
unfiltered_repository_ids = set(found_ids)
new_unfiltered_ids = unfiltered_repository_ids - encountered
if not new_unfiltered_ids:
break
encountered.update(new_unfiltered_ids)
# Filter the repositories found to only those visible to the current user.
query = (Repository
.select(*select_items)
.select(Repository, Namespace)
.distinct()
.join(Namespace, on=(Namespace.id == Repository.namespace_user))
.switch(Repository)
.where(search_clause)
.group_by(Repository.id, Namespace.id))
.join(RepositoryPermission)
.where(Repository.id << list(new_unfiltered_ids)))
if only_public:
query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
filtered = _basequery.filter_to_repos_for_user(query, filter_username)
if existing_ids:
query = query.where(~(Repository.id << existing_ids))
for filtered_repo in filtered:
yield filtered_repo
if with_count:
query = (query
.switch(Repository)
.join(RepositoryActionCount)
.where(RepositoryActionCount.date >= last_week)
.order_by(fn.Sum(RepositoryActionCount.count).desc()))
# If the number of found IDs is less than the chunk count, then we're done.
if len(found_ids) < chunk_count:
break
for result in query:
if len(results) >= limit:
return results
iteration_count = iteration_count + 1
# Note: We compare IDs here, instead of objects, because calling .visibility on the
# Repository will kick off a new SQL query to retrieve that visibility enum value. We don't
# join the visibility table in SQL, as well, because it is ungodly slow in MySQL :-/
result.is_public = result.visibility_id == _basequery.get_public_repo_visibility().id
result.count = result.count if with_count else 0
if not checker(result):
continue
def _get_sorted_matching_repositories(lookup_value, include_private=False):
""" Returns a query of repositories matching the given lookup string, with optional inclusion of
private repositories. Note that this method does *not* filter results based on visibility
to users.
"""
last_week = datetime.now() - timedelta(weeks=1)
results.append(result)
existing_ids.append(result.id)
query = (Repository
.select(Repository, Namespace)
.join(Namespace, on=(Namespace.id == Repository.namespace_user))
.where(Repository.name.match(lookup_value) | Repository.description.match(lookup_value))
.group_by(Repository.id))
# For performance reasons, we conduct each set of searches on their own. This also affords us the
# ability to easily define an order precedence.
get_search_results(Repository.name.match(lookup_value), with_count=True)
get_search_results(Repository.name.match(lookup_value), with_count=False)
if not include_private:
query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
get_search_results(Repository.description.match(lookup_value), with_count=True)
get_search_results(Repository.description.match(lookup_value), with_count=False)
query = (query
.switch(Repository)
.join(RepositoryActionCount)
.where(RepositoryActionCount.date >= last_week)
.order_by(fn.Sum(RepositoryActionCount.count).desc()))
get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True)
get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False)
return results
return query
def lookup_repository(repo_id):

View file

@ -1,10 +1,10 @@
import ldap
import logging
import os
import itertools
from collections import namedtuple
from data.users.federated import FederatedUsers, UserInformation
from util.itertoolrecipes import take
logger = logging.getLogger(__name__)
@ -57,10 +57,6 @@ class LDAPConnection(object):
def __exit__(self, exc_type, value, tb):
self._conn.unbind_s()
def _take(n, iterable):
"Return first n items of the iterable as a list"
return list(itertools.islice(iterable, n))
class LDAPUsers(FederatedUsers):
_LDAPResult = namedtuple('LDAPResult', ['dn', 'attrs'])
@ -154,7 +150,7 @@ class LDAPUsers(FederatedUsers):
return (None, err_msg)
logger.debug('Found matching pairs: %s', pairs)
results = [LDAPUsers._LDAPResult(*pair) for pair in _take(limit, pairs)]
results = [LDAPUsers._LDAPResult(*pair) for pair in take(limit, pairs)]
# Filter out pairs without DNs. Some LDAP impls will return such pairs.
with_dns = [result for result in results if result.dn]

View file

@ -1,22 +1,17 @@
import logging
import os
import itertools
from keystoneclient.v2_0 import client as kclient
from keystoneclient.v3 import client as kv3client
from keystoneclient.exceptions import AuthorizationFailure as KeystoneAuthorizationFailure
from keystoneclient.exceptions import Unauthorized as KeystoneUnauthorized
from data.users.federated import FederatedUsers, UserInformation
from util.itertoolrecipes import take
logger = logging.getLogger(__name__)
DEFAULT_TIMEOUT = 10 # seconds
def _take(n, iterable):
"Return first n items of the iterable as a list"
return list(itertools.islice(iterable, n))
def get_keystone_users(auth_version, auth_url, admin_username, admin_password, admin_tenant,
timeout=None, requires_email=True):
if auth_version == 3:
@ -134,7 +129,7 @@ class KeystoneV3Users(FederatedUsers):
keystone_client = kv3client.Client(username=self.admin_username, password=self.admin_password,
tenant_name=self.admin_tenant, auth_url=self.auth_url,
timeout=self.timeout, debug=self.debug)
found_users = list(_take(limit, keystone_client.users.list(name=query)))
found_users = list(take(limit, keystone_client.users.list(name=query)))
logger.debug('For Keystone query %s found users: %s', query, found_users)
if not found_users:
return ([], self.federated_service, None)

View file

@ -18,6 +18,12 @@ from util.names import parse_robot_username
import anunidecode # Don't listen to pylint's lies. This import is required.
import math
ENTITY_SEARCH_SCORE = 1
TEAM_SEARCH_SCORE = 2
REPOSITORY_SEARCH_SCORE = 4
@resource('/v1/entities/link/<username>')
@internal_only
class LinkExternalEntity(ApiResource):
@ -179,7 +185,7 @@ def search_entity_view(username, entity, get_short_name=None):
'kind': kind,
'avatar': avatar_data,
'name': entity.username,
'score': 1,
'score': ENTITY_SEARCH_SCORE,
'href': href
}
@ -203,7 +209,7 @@ def conduct_team_search(username, query, encountered_teams, results):
'name': team.name,
'organization': search_entity_view(username, team.organization),
'avatar': avatar.get_data_for_team(team),
'score': 2,
'score': TEAM_SEARCH_SCORE,
'href': '/organization/' + team.organization.username + '/teams/' + team.name
})
@ -222,38 +228,23 @@ def conduct_admined_team_search(username, query, encountered_teams, results):
'name': team.name,
'organization': search_entity_view(username, team.organization),
'avatar': avatar.get_data_for_team(team),
'score': 2,
'score': TEAM_SEARCH_SCORE,
'href': '/organization/' + team.organization.username + '/teams/' + team.name
})
def conduct_repo_search(username, query, results):
""" Finds matching repositories. """
def can_read(repo):
if repo.is_public:
return True
return ReadRepositoryPermission(repo.namespace_user.username, repo.name).can()
only_public = username is None
matching_repos = model.repository.get_sorted_matching_repositories(query, only_public, can_read,
limit=5)
matching_repos = model.repository.get_filtered_matching_repositories(query, username, limit=5)
for repo in matching_repos:
repo_score = math.log(repo.count or 1, 10) or 1
# If the repository is under the user's namespace, give it 20% more weight.
namespace = repo.namespace_user.username
if OrganizationMemberPermission(namespace).can() or namespace == username:
repo_score = repo_score * 1.2
results.append({
'kind': 'repository',
'namespace': search_entity_view(username, repo.namespace_user),
'name': repo.name,
'description': repo.description,
'is_public': repo.is_public,
'score': repo_score,
'is_public': model.repository.is_repository_public(repo),
'score': REPOSITORY_SEARCH_SCORE,
'href': '/repository/' + repo.namespace_user.username + '/' + repo.name
})

View file

@ -312,33 +312,19 @@ def get_search():
def _conduct_repo_search(username, query, limit=25, page=1):
""" Finds matching repositories. """
only_public = username is None
def can_read(repo):
if repo.is_public:
return True
if only_public:
return False
return ReadRepositoryPermission(repo.namespace_user.username, repo.name).can()
# Note: We put a max 5 page limit here. The Docker CLI doesn't seem to use the
# pagination and most customers hitting the API should be using V2 catalog, so this
# is a safety net for our slow search below, since we have to use the slow approach
# of finding *all* the results, and then slicing in-memory, because this old API requires
# the *full* page count in the returned results.
_MAX_PAGE_COUNT = 5
page = min(page, _MAX_PAGE_COUNT)
# Note that we put a maximum limit of five pages here, because this API should only really ever
# be used by the Docker CLI, and it doesn't even paginate.
page = min(page, 5)
offset = (page - 1) * limit
if query:
matching_repos = model.get_sorted_matching_repositories(query, only_public, can_read,
limit=limit*_MAX_PAGE_COUNT)
matching_repos = model.get_sorted_matching_repositories(query, username, limit=limit+1,
offset=offset)
else:
matching_repos = []
results = []
for repo in matching_repos[(page - 1) * _MAX_PAGE_COUNT:limit]:
for repo in matching_repos[0:limit]:
results.append({
'name': repo.namespace_name + '/' + repo.name,
'description': repo.description,
@ -350,7 +336,7 @@ def _conduct_repo_search(username, query, limit=25, page=1):
return {
'query': query,
'num_results': len(results),
'num_pages': (len(matching_repos) / limit) + 1,
'num_pages': page + 1 if len(matching_repos) > limit else page,
'page': page,
'page_size': limit,
'results': results,

View file

@ -1291,7 +1291,8 @@ class V1RegistryTests(V1RegistryPullMixin, V1RegistryPushMixin, RegistryTestsMix
def test_search_pagination(self):
# Check for the first page.
resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1'))
resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1'),
auth=('devtable', 'password'))
data = resp.json()
self.assertEquals('s', data['query'])
@ -1301,17 +1302,16 @@ class V1RegistryTests(V1RegistryPullMixin, V1RegistryPushMixin, RegistryTestsMix
self.assertEquals(1, data['page'])
self.assertTrue(data['num_pages'] > 1)
# Check for the followup pages.
for page_index in range(1, data['num_pages']):
resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1', page=page_index))
data = resp.json()
self.assertEquals('s', data['query'])
# Check for the followup page.
resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1', page=2),
auth=('devtable', 'password'))
data = resp.json()
self.assertEquals('s', data['query'])
self.assertEquals(1, data['num_results'])
self.assertEquals(1, len(data['results']))
self.assertEquals(1, data['num_results'])
self.assertEquals(1, len(data['results']))
self.assertEquals(1, data['page'])
self.assertTrue(data['num_pages'] > 1)
self.assertEquals(2, data['page'])
def test_users(self):

View file

@ -971,11 +971,11 @@ class TestConductSearch(ApiTestCase):
params=dict(query='public'))
self.assertEquals(2, len(json['results']))
self.assertEquals(json['results'][0]['kind'], 'user')
self.assertEquals(json['results'][0]['name'], 'public')
self.assertEquals(json['results'][0]['kind'], 'repository')
self.assertEquals(json['results'][0]['name'], 'publicrepo')
self.assertEquals(json['results'][1]['kind'], 'repository')
self.assertEquals(json['results'][1]['name'], 'publicrepo')
self.assertEquals(json['results'][1]['kind'], 'user')
self.assertEquals(json['results'][1]['name'], 'public')
json = self.getJsonResponse(ConductSearch,
params=dict(query='owners'))

6
util/itertoolrecipes.py Normal file
View file

@ -0,0 +1,6 @@
from itertools import islice
# From: https://docs.python.org/2/library/itertools.html
def take(n, iterable):
""" Return first n items of the iterable as a list """
return list(islice(iterable, n))