Merge pull request #2392 from coreos-inc/search-optimization
Optimize repository search by changing our lookup strategy
This commit is contained in:
commit
432b2d3fe8
9 changed files with 123 additions and 123 deletions
|
@ -211,11 +211,9 @@ class DockerRegistryV1DataInterface(object):
|
|||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_sorted_matching_repositories(self, search_term, only_public, can_read, limit):
|
||||
def get_sorted_matching_repositories(self, search_term, filter_username=None, offset=0, limit=25):
|
||||
"""
|
||||
Returns a sorted list of repositories matching the given search term.
|
||||
can_read is a callback that will be invoked for each repository found, to
|
||||
filter results to only those visible to the current user (if any).
|
||||
"""
|
||||
pass
|
||||
|
||||
|
@ -384,9 +382,9 @@ class PreOCIModel(DockerRegistryV1DataInterface):
|
|||
def validate_oauth_token(self, token):
|
||||
return bool(model.oauth.validate_access_token(token))
|
||||
|
||||
def get_sorted_matching_repositories(self, search_term, only_public, can_read, limit):
|
||||
repos = model.repository.get_sorted_matching_repositories(search_term, only_public, can_read,
|
||||
limit=limit)
|
||||
def get_sorted_matching_repositories(self, search_term, filter_username=None, offset=0, limit=25):
|
||||
repos = model.repository.get_filtered_matching_repositories(search_term, filter_username,
|
||||
offset, limit)
|
||||
return [_repository_for_repo(repo) for repo in repos]
|
||||
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Im
|
|||
Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
|
||||
db_random_func, db_concat_func)
|
||||
from data.text import prefix_search
|
||||
from util.itertoolrecipes import take
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -31,6 +32,9 @@ def create_repository(namespace, name, creating_user, visibility='private'):
|
|||
repo = Repository.create(name=name, visibility=private, namespace_user=namespace_user)
|
||||
admin = Role.get(name='admin')
|
||||
|
||||
yesterday = datetime.now() - timedelta(days=1)
|
||||
RepositoryActionCount.create(repository=repo, count=0, date=yesterday)
|
||||
|
||||
if creating_user and not creating_user.organization:
|
||||
RepositoryPermission.create(user=creating_user, repository=repo, role=admin)
|
||||
|
||||
|
@ -326,70 +330,94 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta
|
|||
return query
|
||||
|
||||
|
||||
def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10):
|
||||
""" Returns repositories matching the given lookup string and passing the given checker
|
||||
function.
|
||||
def get_filtered_matching_repositories(lookup_value, filter_username=None, offset=0, limit=25):
|
||||
""" Returns an iterator of all repositories matching the given lookup value, with optional
|
||||
filtering to a specific user. If the user is unspecified, only public repositories will
|
||||
be returned.
|
||||
"""
|
||||
last_week = datetime.now() - timedelta(weeks=1)
|
||||
results = []
|
||||
existing_ids = []
|
||||
|
||||
def get_search_results(search_clause, with_count=False):
|
||||
if len(results) >= limit:
|
||||
return
|
||||
# Build the unfiltered search query.
|
||||
unfiltered_query = _get_sorted_matching_repositories(lookup_value,
|
||||
include_private=filter_username is not None)
|
||||
|
||||
select_items = [Repository, Namespace]
|
||||
if with_count:
|
||||
select_items.append(fn.Sum(RepositoryActionCount.count).alias('count'))
|
||||
# Add a filter to the iterator, if necessary.
|
||||
if filter_username is not None:
|
||||
iterator = _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit)
|
||||
else:
|
||||
iterator = unfiltered_query
|
||||
|
||||
if offset > 0:
|
||||
take(offset, iterator)
|
||||
|
||||
# Return the results.
|
||||
return list(take(limit, iterator))
|
||||
|
||||
|
||||
def _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit):
|
||||
encountered = set()
|
||||
chunk_count = limit * 2
|
||||
unfiltered_page = 0
|
||||
iteration_count = 0
|
||||
|
||||
while iteration_count < 10: # Just to be safe
|
||||
# Find the next chunk's worth of repository IDs, paginated by the chunk size.
|
||||
unfiltered_page = unfiltered_page + 1
|
||||
found_ids = [r.id for r in unfiltered_query.paginate(unfiltered_page, chunk_count)]
|
||||
|
||||
# Make sure we haven't encountered these results before. This code is used to handle
|
||||
# the case where we've previously seen a result, as pagination is not necessary
|
||||
# stable in SQL databases.
|
||||
unfiltered_repository_ids = set(found_ids)
|
||||
new_unfiltered_ids = unfiltered_repository_ids - encountered
|
||||
if not new_unfiltered_ids:
|
||||
break
|
||||
|
||||
encountered.update(new_unfiltered_ids)
|
||||
|
||||
# Filter the repositories found to only those visible to the current user.
|
||||
query = (Repository
|
||||
.select(*select_items)
|
||||
.select(Repository, Namespace)
|
||||
.distinct()
|
||||
.join(Namespace, on=(Namespace.id == Repository.namespace_user))
|
||||
.switch(Repository)
|
||||
.where(search_clause)
|
||||
.group_by(Repository.id, Namespace.id))
|
||||
.join(RepositoryPermission)
|
||||
.where(Repository.id << list(new_unfiltered_ids)))
|
||||
|
||||
if only_public:
|
||||
query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
|
||||
filtered = _basequery.filter_to_repos_for_user(query, filter_username)
|
||||
|
||||
if existing_ids:
|
||||
query = query.where(~(Repository.id << existing_ids))
|
||||
for filtered_repo in filtered:
|
||||
yield filtered_repo
|
||||
|
||||
if with_count:
|
||||
query = (query
|
||||
.switch(Repository)
|
||||
.join(RepositoryActionCount)
|
||||
.where(RepositoryActionCount.date >= last_week)
|
||||
.order_by(fn.Sum(RepositoryActionCount.count).desc()))
|
||||
# If the number of found IDs is less than the chunk count, then we're done.
|
||||
if len(found_ids) < chunk_count:
|
||||
break
|
||||
|
||||
for result in query:
|
||||
if len(results) >= limit:
|
||||
return results
|
||||
iteration_count = iteration_count + 1
|
||||
|
||||
# Note: We compare IDs here, instead of objects, because calling .visibility on the
|
||||
# Repository will kick off a new SQL query to retrieve that visibility enum value. We don't
|
||||
# join the visibility table in SQL, as well, because it is ungodly slow in MySQL :-/
|
||||
result.is_public = result.visibility_id == _basequery.get_public_repo_visibility().id
|
||||
result.count = result.count if with_count else 0
|
||||
|
||||
if not checker(result):
|
||||
continue
|
||||
def _get_sorted_matching_repositories(lookup_value, include_private=False):
|
||||
""" Returns a query of repositories matching the given lookup string, with optional inclusion of
|
||||
private repositories. Note that this method does *not* filter results based on visibility
|
||||
to users.
|
||||
"""
|
||||
last_week = datetime.now() - timedelta(weeks=1)
|
||||
|
||||
results.append(result)
|
||||
existing_ids.append(result.id)
|
||||
query = (Repository
|
||||
.select(Repository, Namespace)
|
||||
.join(Namespace, on=(Namespace.id == Repository.namespace_user))
|
||||
.where(Repository.name.match(lookup_value) | Repository.description.match(lookup_value))
|
||||
.group_by(Repository.id))
|
||||
|
||||
# For performance reasons, we conduct each set of searches on their own. This also affords us the
|
||||
# ability to easily define an order precedence.
|
||||
get_search_results(Repository.name.match(lookup_value), with_count=True)
|
||||
get_search_results(Repository.name.match(lookup_value), with_count=False)
|
||||
if not include_private:
|
||||
query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
|
||||
|
||||
get_search_results(Repository.description.match(lookup_value), with_count=True)
|
||||
get_search_results(Repository.description.match(lookup_value), with_count=False)
|
||||
query = (query
|
||||
.switch(Repository)
|
||||
.join(RepositoryActionCount)
|
||||
.where(RepositoryActionCount.date >= last_week)
|
||||
.order_by(fn.Sum(RepositoryActionCount.count).desc()))
|
||||
|
||||
get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True)
|
||||
get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False)
|
||||
|
||||
return results
|
||||
return query
|
||||
|
||||
|
||||
def lookup_repository(repo_id):
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
import ldap
|
||||
import logging
|
||||
import os
|
||||
import itertools
|
||||
|
||||
from collections import namedtuple
|
||||
from data.users.federated import FederatedUsers, UserInformation
|
||||
from util.itertoolrecipes import take
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -57,10 +57,6 @@ class LDAPConnection(object):
|
|||
def __exit__(self, exc_type, value, tb):
|
||||
self._conn.unbind_s()
|
||||
|
||||
def _take(n, iterable):
|
||||
"Return first n items of the iterable as a list"
|
||||
return list(itertools.islice(iterable, n))
|
||||
|
||||
|
||||
class LDAPUsers(FederatedUsers):
|
||||
_LDAPResult = namedtuple('LDAPResult', ['dn', 'attrs'])
|
||||
|
@ -154,7 +150,7 @@ class LDAPUsers(FederatedUsers):
|
|||
return (None, err_msg)
|
||||
|
||||
logger.debug('Found matching pairs: %s', pairs)
|
||||
results = [LDAPUsers._LDAPResult(*pair) for pair in _take(limit, pairs)]
|
||||
results = [LDAPUsers._LDAPResult(*pair) for pair in take(limit, pairs)]
|
||||
|
||||
# Filter out pairs without DNs. Some LDAP impls will return such pairs.
|
||||
with_dns = [result for result in results if result.dn]
|
||||
|
|
|
@ -1,22 +1,17 @@
|
|||
import logging
|
||||
import os
|
||||
import itertools
|
||||
|
||||
from keystoneclient.v2_0 import client as kclient
|
||||
from keystoneclient.v3 import client as kv3client
|
||||
from keystoneclient.exceptions import AuthorizationFailure as KeystoneAuthorizationFailure
|
||||
from keystoneclient.exceptions import Unauthorized as KeystoneUnauthorized
|
||||
from data.users.federated import FederatedUsers, UserInformation
|
||||
from util.itertoolrecipes import take
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_TIMEOUT = 10 # seconds
|
||||
|
||||
def _take(n, iterable):
|
||||
"Return first n items of the iterable as a list"
|
||||
return list(itertools.islice(iterable, n))
|
||||
|
||||
|
||||
def get_keystone_users(auth_version, auth_url, admin_username, admin_password, admin_tenant,
|
||||
timeout=None, requires_email=True):
|
||||
if auth_version == 3:
|
||||
|
@ -134,7 +129,7 @@ class KeystoneV3Users(FederatedUsers):
|
|||
keystone_client = kv3client.Client(username=self.admin_username, password=self.admin_password,
|
||||
tenant_name=self.admin_tenant, auth_url=self.auth_url,
|
||||
timeout=self.timeout, debug=self.debug)
|
||||
found_users = list(_take(limit, keystone_client.users.list(name=query)))
|
||||
found_users = list(take(limit, keystone_client.users.list(name=query)))
|
||||
logger.debug('For Keystone query %s found users: %s', query, found_users)
|
||||
if not found_users:
|
||||
return ([], self.federated_service, None)
|
||||
|
|
|
@ -18,6 +18,12 @@ from util.names import parse_robot_username
|
|||
import anunidecode # Don't listen to pylint's lies. This import is required.
|
||||
import math
|
||||
|
||||
|
||||
ENTITY_SEARCH_SCORE = 1
|
||||
TEAM_SEARCH_SCORE = 2
|
||||
REPOSITORY_SEARCH_SCORE = 4
|
||||
|
||||
|
||||
@resource('/v1/entities/link/<username>')
|
||||
@internal_only
|
||||
class LinkExternalEntity(ApiResource):
|
||||
|
@ -179,7 +185,7 @@ def search_entity_view(username, entity, get_short_name=None):
|
|||
'kind': kind,
|
||||
'avatar': avatar_data,
|
||||
'name': entity.username,
|
||||
'score': 1,
|
||||
'score': ENTITY_SEARCH_SCORE,
|
||||
'href': href
|
||||
}
|
||||
|
||||
|
@ -203,7 +209,7 @@ def conduct_team_search(username, query, encountered_teams, results):
|
|||
'name': team.name,
|
||||
'organization': search_entity_view(username, team.organization),
|
||||
'avatar': avatar.get_data_for_team(team),
|
||||
'score': 2,
|
||||
'score': TEAM_SEARCH_SCORE,
|
||||
'href': '/organization/' + team.organization.username + '/teams/' + team.name
|
||||
})
|
||||
|
||||
|
@ -222,38 +228,23 @@ def conduct_admined_team_search(username, query, encountered_teams, results):
|
|||
'name': team.name,
|
||||
'organization': search_entity_view(username, team.organization),
|
||||
'avatar': avatar.get_data_for_team(team),
|
||||
'score': 2,
|
||||
'score': TEAM_SEARCH_SCORE,
|
||||
'href': '/organization/' + team.organization.username + '/teams/' + team.name
|
||||
})
|
||||
|
||||
|
||||
def conduct_repo_search(username, query, results):
|
||||
""" Finds matching repositories. """
|
||||
def can_read(repo):
|
||||
if repo.is_public:
|
||||
return True
|
||||
|
||||
return ReadRepositoryPermission(repo.namespace_user.username, repo.name).can()
|
||||
|
||||
only_public = username is None
|
||||
matching_repos = model.repository.get_sorted_matching_repositories(query, only_public, can_read,
|
||||
limit=5)
|
||||
matching_repos = model.repository.get_filtered_matching_repositories(query, username, limit=5)
|
||||
|
||||
for repo in matching_repos:
|
||||
repo_score = math.log(repo.count or 1, 10) or 1
|
||||
|
||||
# If the repository is under the user's namespace, give it 20% more weight.
|
||||
namespace = repo.namespace_user.username
|
||||
if OrganizationMemberPermission(namespace).can() or namespace == username:
|
||||
repo_score = repo_score * 1.2
|
||||
|
||||
results.append({
|
||||
'kind': 'repository',
|
||||
'namespace': search_entity_view(username, repo.namespace_user),
|
||||
'name': repo.name,
|
||||
'description': repo.description,
|
||||
'is_public': repo.is_public,
|
||||
'score': repo_score,
|
||||
'is_public': model.repository.is_repository_public(repo),
|
||||
'score': REPOSITORY_SEARCH_SCORE,
|
||||
'href': '/repository/' + repo.namespace_user.username + '/' + repo.name
|
||||
})
|
||||
|
||||
|
|
|
@ -312,33 +312,19 @@ def get_search():
|
|||
|
||||
def _conduct_repo_search(username, query, limit=25, page=1):
|
||||
""" Finds matching repositories. """
|
||||
only_public = username is None
|
||||
|
||||
def can_read(repo):
|
||||
if repo.is_public:
|
||||
return True
|
||||
|
||||
if only_public:
|
||||
return False
|
||||
|
||||
return ReadRepositoryPermission(repo.namespace_user.username, repo.name).can()
|
||||
|
||||
# Note: We put a max 5 page limit here. The Docker CLI doesn't seem to use the
|
||||
# pagination and most customers hitting the API should be using V2 catalog, so this
|
||||
# is a safety net for our slow search below, since we have to use the slow approach
|
||||
# of finding *all* the results, and then slicing in-memory, because this old API requires
|
||||
# the *full* page count in the returned results.
|
||||
_MAX_PAGE_COUNT = 5
|
||||
page = min(page, _MAX_PAGE_COUNT)
|
||||
# Note that we put a maximum limit of five pages here, because this API should only really ever
|
||||
# be used by the Docker CLI, and it doesn't even paginate.
|
||||
page = min(page, 5)
|
||||
offset = (page - 1) * limit
|
||||
|
||||
if query:
|
||||
matching_repos = model.get_sorted_matching_repositories(query, only_public, can_read,
|
||||
limit=limit*_MAX_PAGE_COUNT)
|
||||
matching_repos = model.get_sorted_matching_repositories(query, username, limit=limit+1,
|
||||
offset=offset)
|
||||
else:
|
||||
matching_repos = []
|
||||
|
||||
results = []
|
||||
for repo in matching_repos[(page - 1) * _MAX_PAGE_COUNT:limit]:
|
||||
for repo in matching_repos[0:limit]:
|
||||
results.append({
|
||||
'name': repo.namespace_name + '/' + repo.name,
|
||||
'description': repo.description,
|
||||
|
@ -350,7 +336,7 @@ def _conduct_repo_search(username, query, limit=25, page=1):
|
|||
return {
|
||||
'query': query,
|
||||
'num_results': len(results),
|
||||
'num_pages': (len(matching_repos) / limit) + 1,
|
||||
'num_pages': page + 1 if len(matching_repos) > limit else page,
|
||||
'page': page,
|
||||
'page_size': limit,
|
||||
'results': results,
|
||||
|
|
|
@ -1291,7 +1291,8 @@ class V1RegistryTests(V1RegistryPullMixin, V1RegistryPushMixin, RegistryTestsMix
|
|||
|
||||
def test_search_pagination(self):
|
||||
# Check for the first page.
|
||||
resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1'))
|
||||
resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1'),
|
||||
auth=('devtable', 'password'))
|
||||
data = resp.json()
|
||||
self.assertEquals('s', data['query'])
|
||||
|
||||
|
@ -1301,17 +1302,16 @@ class V1RegistryTests(V1RegistryPullMixin, V1RegistryPushMixin, RegistryTestsMix
|
|||
self.assertEquals(1, data['page'])
|
||||
self.assertTrue(data['num_pages'] > 1)
|
||||
|
||||
# Check for the followup pages.
|
||||
for page_index in range(1, data['num_pages']):
|
||||
resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1', page=page_index))
|
||||
data = resp.json()
|
||||
self.assertEquals('s', data['query'])
|
||||
# Check for the followup page.
|
||||
resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1', page=2),
|
||||
auth=('devtable', 'password'))
|
||||
data = resp.json()
|
||||
self.assertEquals('s', data['query'])
|
||||
|
||||
self.assertEquals(1, data['num_results'])
|
||||
self.assertEquals(1, len(data['results']))
|
||||
self.assertEquals(1, data['num_results'])
|
||||
self.assertEquals(1, len(data['results']))
|
||||
|
||||
self.assertEquals(1, data['page'])
|
||||
self.assertTrue(data['num_pages'] > 1)
|
||||
self.assertEquals(2, data['page'])
|
||||
|
||||
|
||||
def test_users(self):
|
||||
|
|
|
@ -971,11 +971,11 @@ class TestConductSearch(ApiTestCase):
|
|||
params=dict(query='public'))
|
||||
|
||||
self.assertEquals(2, len(json['results']))
|
||||
self.assertEquals(json['results'][0]['kind'], 'user')
|
||||
self.assertEquals(json['results'][0]['name'], 'public')
|
||||
self.assertEquals(json['results'][0]['kind'], 'repository')
|
||||
self.assertEquals(json['results'][0]['name'], 'publicrepo')
|
||||
|
||||
self.assertEquals(json['results'][1]['kind'], 'repository')
|
||||
self.assertEquals(json['results'][1]['name'], 'publicrepo')
|
||||
self.assertEquals(json['results'][1]['kind'], 'user')
|
||||
self.assertEquals(json['results'][1]['name'], 'public')
|
||||
|
||||
json = self.getJsonResponse(ConductSearch,
|
||||
params=dict(query='owners'))
|
||||
|
|
6
util/itertoolrecipes.py
Normal file
6
util/itertoolrecipes.py
Normal file
|
@ -0,0 +1,6 @@
|
|||
from itertools import islice
|
||||
|
||||
# From: https://docs.python.org/2/library/itertools.html
|
||||
def take(n, iterable):
|
||||
""" Return first n items of the iterable as a list """
|
||||
return list(islice(iterable, n))
|
Reference in a new issue