diff --git a/data/interfaces/v1.py b/data/interfaces/v1.py
index 055808f14..b59142975 100644
--- a/data/interfaces/v1.py
+++ b/data/interfaces/v1.py
@@ -211,11 +211,9 @@ class DockerRegistryV1DataInterface(object):
     pass
 
   @abstractmethod
-  def get_sorted_matching_repositories(self, search_term, only_public, can_read, limit):
+  def get_sorted_matching_repositories(self, search_term, filter_username=None, offset=0, limit=25):
     """ Returns a sorted list of repositories matching the given search term.
-        can_read is a callback that will be invoked for each repository found, to
-        filter results to only those visible to the current user (if any).
     """
     pass
 
@@ -384,9 +382,9 @@ class PreOCIModel(DockerRegistryV1DataInterface):
   def validate_oauth_token(self, token):
     return bool(model.oauth.validate_access_token(token))
 
-  def get_sorted_matching_repositories(self, search_term, only_public, can_read, limit):
-    repos = model.repository.get_sorted_matching_repositories(search_term, only_public, can_read,
-                                                              limit=limit)
+  def get_sorted_matching_repositories(self, search_term, filter_username=None, offset=0, limit=25):
+    repos = model.repository.get_filtered_matching_repositories(search_term, filter_username,
+                                                                offset, limit)
     return [_repository_for_repo(repo) for repo in repos]
 
diff --git a/data/model/repository.py b/data/model/repository.py
index 46de62bf7..fe12ede45 100644
--- a/data/model/repository.py
+++ b/data/model/repository.py
@@ -13,6 +13,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Im
                            Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
                            db_random_func, db_concat_func)
 from data.text import prefix_search
+from util.itertoolrecipes import take
 
 logger = logging.getLogger(__name__)
 
@@ -31,6 +32,9 @@ def create_repository(namespace, name, creating_user, visibility='private'):
   repo = Repository.create(name=name, visibility=private, namespace_user=namespace_user)
   admin = Role.get(name='admin')
 
+  yesterday = datetime.now() - timedelta(days=1)
+  RepositoryActionCount.create(repository=repo, count=0, date=yesterday)
+
   if creating_user and not creating_user.organization:
     RepositoryPermission.create(user=creating_user, repository=repo, role=admin)
 
@@ -326,70 +330,94 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta
   return query
 
 
-def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10):
-  """ Returns repositories matching the given lookup string and passing the given checker
-      function.
+def get_filtered_matching_repositories(lookup_value, filter_username=None, offset=0, limit=25):
+  """ Returns an iterator of all repositories matching the given lookup value, with optional
+      filtering to a specific user. If the user is unspecified, only public repositories will
+      be returned.
   """
-  last_week = datetime.now() - timedelta(weeks=1)
-  results = []
-  existing_ids = []
 
-  def get_search_results(search_clause, with_count=False):
-    if len(results) >= limit:
-      return
+  # Build the unfiltered search query.
+  unfiltered_query = _get_sorted_matching_repositories(lookup_value,
+                                                       include_private=filter_username is not None)
 
-    select_items = [Repository, Namespace]
-    if with_count:
-      select_items.append(fn.Sum(RepositoryActionCount.count).alias('count'))
+  # Add a filter to the iterator, if necessary.
+  if filter_username is not None:
+    iterator = _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit)
+  else:
+    iterator = unfiltered_query
+
+  if offset > 0:
+    take(offset, iterator)
+
+  # Return the results.
+  return list(take(limit, iterator))
+
+
+def _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit):
+  encountered = set()
+  chunk_count = limit * 2
+  unfiltered_page = 0
+  iteration_count = 0
+
+  while iteration_count < 10:  # Just to be safe
+    # Find the next chunk's worth of repository IDs, paginated by the chunk size.
+    unfiltered_page = unfiltered_page + 1
+    found_ids = [r.id for r in unfiltered_query.paginate(unfiltered_page, chunk_count)]
+
+    # Make sure we haven't encountered these results before. This code is used to handle
+    # the case where we've previously seen a result, as pagination is not necessarily
+    # stable in SQL databases.
+    unfiltered_repository_ids = set(found_ids)
+    new_unfiltered_ids = unfiltered_repository_ids - encountered
+    if not new_unfiltered_ids:
+      break
+
+    encountered.update(new_unfiltered_ids)
+
+    # Filter the repositories found to only those visible to the current user.
     query = (Repository
-             .select(*select_items)
+             .select(Repository, Namespace)
+             .distinct()
              .join(Namespace, on=(Namespace.id == Repository.namespace_user))
              .switch(Repository)
-             .where(search_clause)
-             .group_by(Repository.id, Namespace.id))
+             .join(RepositoryPermission)
+             .where(Repository.id << list(new_unfiltered_ids)))
 
-    if only_public:
-      query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
+    filtered = _basequery.filter_to_repos_for_user(query, filter_username)
 
-    if existing_ids:
-      query = query.where(~(Repository.id << existing_ids))
+    for filtered_repo in filtered:
+      yield filtered_repo
 
-    if with_count:
-      query = (query
-               .switch(Repository)
-               .join(RepositoryActionCount)
-               .where(RepositoryActionCount.date >= last_week)
-               .order_by(fn.Sum(RepositoryActionCount.count).desc()))
+    # If the number of found IDs is less than the chunk count, then we're done.
+    if len(found_ids) < chunk_count:
+      break
 
-    for result in query:
-      if len(results) >= limit:
-        return results
+    iteration_count = iteration_count + 1
 
-      # Note: We compare IDs here, instead of objects, because calling .visibility on the
-      # Repository will kick off a new SQL query to retrieve that visibility enum value. We don't
-      # join the visibility table in SQL, as well, because it is ungodly slow in MySQL :-/
-      result.is_public = result.visibility_id == _basequery.get_public_repo_visibility().id
-      result.count = result.count if with_count else 0
 
-      if not checker(result):
-        continue
+def _get_sorted_matching_repositories(lookup_value, include_private=False):
+  """ Returns a query of repositories matching the given lookup string, with optional inclusion of
+      private repositories. Note that this method does *not* filter results based on visibility
+      to users.
+  """
+  last_week = datetime.now() - timedelta(weeks=1)
 
-      results.append(result)
-      existing_ids.append(result.id)
+  query = (Repository
+           .select(Repository, Namespace)
+           .join(Namespace, on=(Namespace.id == Repository.namespace_user))
+           .where(Repository.name.match(lookup_value) | Repository.description.match(lookup_value))
+           .group_by(Repository.id))
 
-  # For performance reasons, we conduct each set of searches on their own. This also affords us the
-  # ability to easily define an order precedence.
-  get_search_results(Repository.name.match(lookup_value), with_count=True)
-  get_search_results(Repository.name.match(lookup_value), with_count=False)
+  if not include_private:
+    query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
 
-  get_search_results(Repository.description.match(lookup_value), with_count=True)
-  get_search_results(Repository.description.match(lookup_value), with_count=False)
+  query = (query
+           .switch(Repository)
+           .join(RepositoryActionCount)
+           .where(RepositoryActionCount.date >= last_week)
+           .order_by(fn.Sum(RepositoryActionCount.count).desc()))
 
-  get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True)
-  get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False)
-
-  return results
+  return query
 
 
 def lookup_repository(repo_id):
diff --git a/data/users/externalldap.py b/data/users/externalldap.py
index cd6e460fb..8cd82ac28 100644
--- a/data/users/externalldap.py
+++ b/data/users/externalldap.py
@@ -1,10 +1,10 @@
 import ldap
 import logging
 import os
-import itertools
 
 from collections import namedtuple
 from data.users.federated import FederatedUsers, UserInformation
+from util.itertoolrecipes import take
 
 logger = logging.getLogger(__name__)
 
@@ -57,10 +57,6 @@ class LDAPConnection(object):
   def __exit__(self, exc_type, value, tb):
     self._conn.unbind_s()
 
-def _take(n, iterable):
-  "Return first n items of the iterable as a list"
-  return list(itertools.islice(iterable, n))
-
 class LDAPUsers(FederatedUsers):
   _LDAPResult = namedtuple('LDAPResult', ['dn', 'attrs'])
 
@@ -154,7 +150,7 @@ class LDAPUsers(FederatedUsers):
       return (None, err_msg)
 
     logger.debug('Found matching pairs: %s', pairs)
-    results = [LDAPUsers._LDAPResult(*pair) for pair in _take(limit, pairs)]
+    results = [LDAPUsers._LDAPResult(*pair) for pair in take(limit, pairs)]
 
     # Filter out pairs without DNs. Some LDAP impls will return such pairs.
     with_dns = [result for result in results if result.dn]
diff --git a/data/users/keystone.py b/data/users/keystone.py
index 6c8487a35..b3e7ad442 100644
--- a/data/users/keystone.py
+++ b/data/users/keystone.py
@@ -1,22 +1,17 @@
 import logging
 import os
-import itertools
 
 from keystoneclient.v2_0 import client as kclient
 from keystoneclient.v3 import client as kv3client
 from keystoneclient.exceptions import AuthorizationFailure as KeystoneAuthorizationFailure
 from keystoneclient.exceptions import Unauthorized as KeystoneUnauthorized
 
 from data.users.federated import FederatedUsers, UserInformation
+from util.itertoolrecipes import take
 
 logger = logging.getLogger(__name__)
 
 DEFAULT_TIMEOUT = 10  # seconds
 
-def _take(n, iterable):
-  "Return first n items of the iterable as a list"
-  return list(itertools.islice(iterable, n))
-
-
 def get_keystone_users(auth_version, auth_url, admin_username, admin_password, admin_tenant,
                        timeout=None, requires_email=True):
   if auth_version == 3:
@@ -134,7 +129,7 @@ class KeystoneV3Users(FederatedUsers):
       keystone_client = kv3client.Client(username=self.admin_username, password=self.admin_password,
                                          tenant_name=self.admin_tenant, auth_url=self.auth_url,
                                          timeout=self.timeout, debug=self.debug)
-      found_users = list(_take(limit, keystone_client.users.list(name=query)))
+      found_users = list(take(limit, keystone_client.users.list(name=query)))
       logger.debug('For Keystone query %s found users: %s', query, found_users)
       if not found_users:
         return ([], self.federated_service, None)
diff --git a/endpoints/api/search.py b/endpoints/api/search.py
index 018ab713c..c8a9be22d 100644
--- a/endpoints/api/search.py
+++ b/endpoints/api/search.py
@@ -229,31 +229,16 @@ def conduct_admined_team_search(username, query, encountered_teams, results):
 
 def conduct_repo_search(username, query, results):
   """ Finds matching repositories. """
-  def can_read(repo):
-    if repo.is_public:
-      return True
-
-    return ReadRepositoryPermission(repo.namespace_user.username, repo.name).can()
-
-  only_public = username is None
-  matching_repos = model.repository.get_sorted_matching_repositories(query, only_public, can_read,
-                                                                     limit=5)
+  matching_repos = model.repository.get_filtered_matching_repositories(query, username, limit=5)
 
   for repo in matching_repos:
-    repo_score = math.log(repo.count or 1, 10) or 1
-
-    # If the repository is under the user's namespace, give it 20% more weight.
-    namespace = repo.namespace_user.username
-    if OrganizationMemberPermission(namespace).can() or namespace == username:
-      repo_score = repo_score * 1.2
-
     results.append({
       'kind': 'repository',
       'namespace': search_entity_view(username, repo.namespace_user),
       'name': repo.name,
       'description': repo.description,
-      'is_public': repo.is_public,
-      'score': repo_score,
+      'is_public': model.repository.is_repository_public(repo),
+      'score': 4,
       'href': '/repository/' + repo.namespace_user.username + '/' + repo.name
     })
diff --git a/endpoints/v1/index.py b/endpoints/v1/index.py
index f70b77f5c..028379554 100644
--- a/endpoints/v1/index.py
+++ b/endpoints/v1/index.py
@@ -312,33 +312,19 @@ def get_search():
 
 def _conduct_repo_search(username, query, limit=25, page=1):
   """ Finds matching repositories. """
-  only_public = username is None
-
-  def can_read(repo):
-    if repo.is_public:
-      return True
-
-    if only_public:
-      return False
-
-    return ReadRepositoryPermission(repo.namespace_user.username, repo.name).can()
-
-  # Note: We put a max 5 page limit here. The Docker CLI doesn't seem to use the
-  # pagination and most customers hitting the API should be using V2 catalog, so this
-  # is a safety net for our slow search below, since we have to use the slow approach
-  # of finding *all* the results, and then slicing in-memory, because this old API requires
-  # the *full* page count in the returned results.
-  _MAX_PAGE_COUNT = 5
-  page = min(page, _MAX_PAGE_COUNT)
+  # Note that we put a maximum limit of five pages here, because this API should only really ever
+  # be used by the Docker CLI, and it doesn't even paginate.
+  page = min(page, 5)
+  offset = (page - 1) * limit
 
   if query:
-    matching_repos = model.get_sorted_matching_repositories(query, only_public, can_read,
-                                                            limit=limit*_MAX_PAGE_COUNT)
+    matching_repos = model.get_sorted_matching_repositories(query, username, limit=limit+1,
+                                                            offset=offset)
   else:
     matching_repos = []
 
   results = []
-  for repo in matching_repos[(page - 1) * _MAX_PAGE_COUNT:limit]:
+  for repo in matching_repos[0:limit]:
     results.append({
       'name': repo.namespace_name + '/' + repo.name,
       'description': repo.description,
@@ -350,7 +336,7 @@ def _conduct_repo_search(username, query, limit=25, page=1):
   return {
     'query': query,
     'num_results': len(results),
-    'num_pages': (len(matching_repos) / limit) + 1,
+    'num_pages': page + 1 if len(matching_repos) > limit else page,
     'page': page,
     'page_size': limit,
     'results': results,
diff --git a/test/registry_tests.py b/test/registry_tests.py
index 21cc7ad8e..a391535d2 100644
--- a/test/registry_tests.py
+++ b/test/registry_tests.py
@@ -1291,7 +1291,8 @@ class V1RegistryTests(V1RegistryPullMixin, V1RegistryPushMixin, RegistryTestsMix
 
   def test_search_pagination(self):
     # Check for the first page.
-    resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1'))
+    resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1'),
+                        auth=('devtable', 'password'))
     data = resp.json()
     self.assertEquals('s', data['query'])
 
@@ -1301,17 +1302,16 @@ class V1RegistryTests(V1RegistryPullMixin, V1RegistryPushMixin, RegistryTestsMix
     self.assertEquals(1, data['page'])
     self.assertTrue(data['num_pages'] > 1)
 
-    # Check for the followup pages.
-    for page_index in range(1, data['num_pages']):
-      resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1', page=page_index))
-      data = resp.json()
-      self.assertEquals('s', data['query'])
+    # Check for the followup page.
+    resp = self.conduct('GET', '/v1/search', params=dict(q='s', n='1', page=2),
+                        auth=('devtable', 'password'))
+    data = resp.json()
+    self.assertEquals('s', data['query'])
 
-      self.assertEquals(1, data['num_results'])
-      self.assertEquals(1, len(data['results']))
+    self.assertEquals(1, data['num_results'])
+    self.assertEquals(1, len(data['results']))
 
-      self.assertEquals(1, data['page'])
-      self.assertTrue(data['num_pages'] > 1)
+    self.assertEquals(2, data['page'])
 
   def test_users(self):
diff --git a/test/test_api_usage.py b/test/test_api_usage.py
index eead6487f..3a0ee42f0 100644
--- a/test/test_api_usage.py
+++ b/test/test_api_usage.py
@@ -971,11 +971,11 @@ class TestConductSearch(ApiTestCase):
                                 params=dict(query='public'))
 
     self.assertEquals(2, len(json['results']))
-    self.assertEquals(json['results'][0]['kind'], 'user')
-    self.assertEquals(json['results'][0]['name'], 'public')
+    self.assertEquals(json['results'][0]['kind'], 'repository')
+    self.assertEquals(json['results'][0]['name'], 'publicrepo')
 
-    self.assertEquals(json['results'][1]['kind'], 'repository')
-    self.assertEquals(json['results'][1]['name'], 'publicrepo')
+    self.assertEquals(json['results'][1]['kind'], 'user')
+    self.assertEquals(json['results'][1]['name'], 'public')
 
     json = self.getJsonResponse(ConductSearch,
                                 params=dict(query='owners'))
diff --git a/util/itertoolrecipes.py b/util/itertoolrecipes.py
new file mode 100644
index 000000000..1cd33386d
--- /dev/null
+++ b/util/itertoolrecipes.py
@@ -0,0 +1,6 @@
+from itertools import islice
+
+# From: https://docs.python.org/2/library/itertools.html
+def take(n, iterable):
+  """ Return first n items of the iterable as a list """
+  return list(islice(iterable, n))
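
Note: the chunked filter-while-paginating pattern introduced above in _filter_repositories_visible_to_username can be sketched in isolation. The snippet below is an illustrative sketch only and is not part of the patch; filtered_results, candidates and is_visible are hypothetical stand-ins for the paginated peewee query and for _basequery.filter_to_repos_for_user, but the shape is the same: pull chunks of limit * 2 candidates, skip anything already seen (since pagination is not necessarily stable), filter, and stop once a short chunk signals the end.

from itertools import islice

def take(n, iterable):
  """ Return first n items of the iterable as a list (the itertools recipe added by the patch). """
  return list(islice(iterable, n))

def filtered_results(candidates, is_visible, limit):
  """ Yield visible candidates by scanning them in chunks of limit * 2, de-duplicating
      in case the underlying pagination repeats items. """
  encountered = set()
  chunk_count = limit * 2
  page = 0
  while page < 10:  # safety bound, mirroring the patch
    page += 1
    chunk = candidates[(page - 1) * chunk_count:page * chunk_count]
    new_items = [c for c in chunk if c not in encountered]
    if not new_items:
      break
    encountered.update(new_items)
    for item in new_items:
      if is_visible(item):
        yield item
    if len(chunk) < chunk_count:
      break

# Ask for the first five "visible" (here: even) candidates out of 0..99.
print(take(5, filtered_results(list(range(100)), lambda n: n % 2 == 0, limit=5)))
# -> [0, 2, 4, 6, 8]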