From e8cb359d968df435f65ff6bb756b075dcfdb213d Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 7 Oct 2015 10:00:12 -0700 Subject: [PATCH 1/2] Unionize the mega query - It needed more performance-based benefits --- data/model/_basequery.py | 64 +++++++++++++++++++++----------------- data/model/repository.py | 3 -- data/model/tag.py | 3 +- data/model/user.py | 3 +- test/test_image_sharing.py | 4 +++ 5 files changed, 41 insertions(+), 36 deletions(-) diff --git a/data/model/_basequery.py b/data/model/_basequery.py index 131f860e7..054e00eb5 100644 --- a/data/model/_basequery.py +++ b/data/model/_basequery.py @@ -25,7 +25,18 @@ def filter_to_repos_for_user(query, username=None, namespace=None, include_publi if not include_public and not username: return Repository.select().where(Repository.id == '-1') - where_clause = None + # Build a set of queries that, when unioned together, return the full set of visible repositories + # for the filters specified. + queries = [] + + where_clause = (True) + if namespace: + where_clause = (Namespace.username == namespace) + + if include_public: + queries.append(query.clone() + .where(Repository.visibility == get_public_repo_visibility(), where_clause)) + if username: UserThroughTeam = User.alias() Org = User.alias() @@ -33,37 +44,32 @@ def filter_to_repos_for_user(query, username=None, namespace=None, include_publi AdminTeamMember = TeamMember.alias() AdminUser = User.alias() - query = (query - .switch(RepositoryPermission) - .join(User, JOIN_LEFT_OUTER) - .switch(RepositoryPermission) - .join(Team, JOIN_LEFT_OUTER) - .join(TeamMember, JOIN_LEFT_OUTER) - .join(UserThroughTeam, JOIN_LEFT_OUTER, on=(UserThroughTeam.id == TeamMember.user)) - .switch(Repository) - .join(Org, JOIN_LEFT_OUTER, on=(Repository.namespace_user == Org.id)) - .join(AdminTeam, JOIN_LEFT_OUTER, on=(Org.id == AdminTeam.organization)) - .join(TeamRole, JOIN_LEFT_OUTER, on=(AdminTeam.role == TeamRole.id)) - .switch(AdminTeam) - .join(AdminTeamMember, JOIN_LEFT_OUTER, on=(AdminTeam.id == AdminTeamMember.team)) - .join(AdminUser, JOIN_LEFT_OUTER, on=(AdminTeamMember.user == AdminUser.id))) + # Add repositories in which the user has permission. + queries.append(query.clone() + .switch(RepositoryPermission) + .join(User) + .where(User.username == username, where_clause)) - where_clause = ((User.username == username) | (UserThroughTeam.username == username) | - ((AdminUser.username == username) & (TeamRole.name == 'admin'))) + # Add repositories in which the user is a member of a team that has permission. + queries.append(query.clone() + .switch(RepositoryPermission) + .join(Team) + .join(TeamMember) + .join(UserThroughTeam, on=(UserThroughTeam.id == TeamMember.user)) + .where(UserThroughTeam.username == username, where_clause)) - if namespace: - where_clause = where_clause & (Namespace.username == namespace) + # Add repositories under namespaces in which the user is the org admin. + queries.append(query.clone() + .switch(Repository) + .join(Org, on=(Repository.namespace_user == Org.id)) + .join(AdminTeam, on=(Org.id == AdminTeam.organization)) + .join(TeamRole, on=(AdminTeam.role == TeamRole.id)) + .switch(AdminTeam) + .join(AdminTeamMember, on=(AdminTeam.id == AdminTeamMember.team)) + .join(AdminUser, on=(AdminTeamMember.user == AdminUser.id)) + .where(AdminUser.username == username, where_clause)) - # TODO(jschorr, jake): Figure out why the old join on Visibility was so darn slow and - # remove this hack. - if include_public: - new_clause = (Repository.visibility == get_public_repo_visibility()) - if where_clause: - where_clause = where_clause | new_clause - else: - where_clause = new_clause - - return query.where(where_clause) + return reduce(lambda l, r: l | r, queries) def get_user_organizations(username): diff --git a/data/model/repository.py b/data/model/repository.py index 4a732d346..d5668b186 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -251,9 +251,6 @@ def get_visible_repositories(username, namespace=None, page=None, limit=None, in if limit: query = query.limit(limit) - if namespace: - query = query.where(Namespace.username == namespace) - return query diff --git a/data/model/tag.py b/data/model/tag.py index 5875c95b1..cf6ddf3c6 100644 --- a/data/model/tag.py +++ b/data/model/tag.py @@ -133,8 +133,7 @@ def list_repository_tag_history(repo_obj, page=1, size=100, specific_tag=None): .join(Image) .where(RepositoryTag.repository == repo_obj) .where(RepositoryTag.hidden == False) - .order_by(RepositoryTag.lifetime_start_ts.desc()) - .order_by(RepositoryTag.name) + .order_by(RepositoryTag.lifetime_start_ts.desc(), RepositoryTag.name) .paginate(page, size)) if specific_tag: diff --git a/data/model/user.py b/data/model/user.py index 1a7709ec7..29a2d88c1 100644 --- a/data/model/user.py +++ b/data/model/user.py @@ -496,12 +496,11 @@ def get_matching_user_namespaces(namespace_prefix, username, limit=10): base_query = (Namespace .select() .distinct() - .limit(limit) .join(Repository, on=(Repository.namespace_user == Namespace.id)) .join(RepositoryPermission, JOIN_LEFT_OUTER) .where(Namespace.username ** (namespace_prefix + '%'))) - return _basequery.filter_to_repos_for_user(base_query, username) + return _basequery.filter_to_repos_for_user(base_query, username).limit(limit) def get_matching_users(username_prefix, robot_namespace=None, organization=None): diff --git a/test/test_image_sharing.py b/test/test_image_sharing.py index d932f6dcc..b72b88875 100644 --- a/test/test_image_sharing.py +++ b/test/test_image_sharing.py @@ -217,3 +217,7 @@ class TestImageSharing(unittest.TestCase): still_uploading.save() self.assertDifferentStorage('an-image', still_uploading) + + +if __name__ == '__main__': + unittest.main() From c9daf7d8a991aa974b82f394efa6a7c0801a9ef4 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Tue, 13 Oct 2015 12:55:40 -0400 Subject: [PATCH 2/2] Add additional tests for repo visibility and further simplify the query for perf --- data/model/repository.py | 22 ++------- endpoints/api/repository.py | 2 +- test/test_visible_repos.py | 98 +++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 18 deletions(-) create mode 100644 test/test_visible_repos.py diff --git a/data/model/repository.py b/data/model/repository.py index d5668b186..e9c63d5f0 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -14,6 +14,10 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Im logger = logging.getLogger(__name__) +def get_public_repo_visibility(): + return _basequery.get_public_repo_visibility() + + def create_repository(namespace, name, creating_user, visibility='private'): private = Visibility.get(name=visibility) namespace_user = User.get(username=namespace) @@ -241,25 +245,9 @@ def get_visible_repositories(username, namespace=None, page=None, limit=None, in if not include_public and not username: return [] - fields = [Repository.name, Repository.id, Repository.description, Visibility.name, - Namespace.username] - - query = _visible_repository_query(username=username, page=page, - limit=limit, namespace=namespace, include_public=include_public, - select_models=fields) - - if limit: - query = query.limit(limit) - - return query - - -def _visible_repository_query(username=None, include_public=True, limit=None, - page=None, namespace=None, select_models=[]): query = (Repository - .select(*select_models) # MySQL/RDS complains is there are selected models for counts. + .select(Repository.name, Repository.id, Repository.description, Namespace.username) .distinct() - .join(Visibility) .switch(Repository) .join(Namespace, on=(Repository.namespace_user == Namespace.id)) .switch(Repository) diff --git a/endpoints/api/repository.py b/endpoints/api/repository.py index 131f1c02b..20ff89af3 100644 --- a/endpoints/api/repository.py +++ b/endpoints/api/repository.py @@ -203,7 +203,7 @@ class RepositoryList(ApiResource): 'namespace': repo_obj.namespace_user.username, 'name': repo_obj.name, 'description': repo_obj.description, - 'is_public': repo_obj.visibility.name == 'public' + 'is_public': repo_obj.visibility_id == model.repository.get_public_repo_visibility(), } repo_id = repo_obj.id diff --git a/test/test_visible_repos.py b/test/test_visible_repos.py new file mode 100644 index 000000000..243070473 --- /dev/null +++ b/test/test_visible_repos.py @@ -0,0 +1,98 @@ +import unittest + +from app import app +from initdb import setup_database_for_testing, finished_database_for_testing +from data import model + +NO_ACCESS_USER = 'freshuser' +READ_ACCESS_USER = 'reader' +ADMIN_ACCESS_USER = 'devtable' +PUBLIC_USER = 'public' +RANDOM_USER = 'randomuser' +OUTSIDE_ORG_USER = 'outsideorg' + +ADMIN_ROBOT_USER = 'devtable+dtrobot' + +ORGANIZATION = 'buynlarge' + +SIMPLE_REPO = 'simple' +PUBLIC_REPO = 'publicrepo' +RANDOM_REPO = 'randomrepo' + +OUTSIDE_ORG_REPO = 'coolrepo' + +ORG_REPO = 'orgrepo' +ANOTHER_ORG_REPO = 'anotherorgrepo' + +# Note: The shared repo has devtable as admin, public as a writer and reader as a reader. +SHARED_REPO = 'shared' + +class TestVisibleRepositories(unittest.TestCase): + def setUp(self): + setup_database_for_testing(self) + self.app = app.test_client() + self.ctx = app.test_request_context() + self.ctx.__enter__() + + def tearDown(self): + finished_database_for_testing(self) + self.ctx.__exit__(True, None, None) + + def assertDoesNotHaveRepo(self, username, name): + repos = list(model.repository.get_visible_repositories(username)) + names = [repo.name for repo in repos] + self.assertNotIn(name, names) + + def assertHasRepo(self, username, name): + repos = list(model.repository.get_visible_repositories(username)) + names = [repo.name for repo in repos] + self.assertIn(name, names) + + def test_noaccess(self): + repos = list(model.repository.get_visible_repositories(NO_ACCESS_USER)) + names = [repo.name for repo in repos] + self.assertEquals(0, len(names)) + + # Try retrieving public repos now. + repos = list(model.repository.get_visible_repositories(NO_ACCESS_USER, include_public=True)) + names = [repo.name for repo in repos] + self.assertIn(PUBLIC_REPO, names) + + + def test_public(self): + self.assertHasRepo(PUBLIC_USER, PUBLIC_REPO) + self.assertHasRepo(PUBLIC_USER, SHARED_REPO) + + self.assertDoesNotHaveRepo(PUBLIC_USER, SIMPLE_REPO) + self.assertDoesNotHaveRepo(PUBLIC_USER, RANDOM_REPO) + self.assertDoesNotHaveRepo(PUBLIC_USER, OUTSIDE_ORG_REPO) + + def test_reader(self): + self.assertHasRepo(READ_ACCESS_USER, SHARED_REPO) + self.assertHasRepo(READ_ACCESS_USER, ORG_REPO) + + self.assertDoesNotHaveRepo(READ_ACCESS_USER, SIMPLE_REPO) + self.assertDoesNotHaveRepo(READ_ACCESS_USER, RANDOM_REPO) + self.assertDoesNotHaveRepo(READ_ACCESS_USER, OUTSIDE_ORG_REPO) + self.assertDoesNotHaveRepo(READ_ACCESS_USER, PUBLIC_REPO) + + def test_random(self): + self.assertHasRepo(RANDOM_USER, RANDOM_REPO) + + self.assertDoesNotHaveRepo(RANDOM_USER, SIMPLE_REPO) + self.assertDoesNotHaveRepo(RANDOM_USER, SHARED_REPO) + self.assertDoesNotHaveRepo(RANDOM_USER, ORG_REPO) + self.assertDoesNotHaveRepo(RANDOM_USER, ANOTHER_ORG_REPO) + self.assertDoesNotHaveRepo(RANDOM_USER, PUBLIC_REPO) + + def test_admin(self): + self.assertHasRepo(ADMIN_ACCESS_USER, SIMPLE_REPO) + self.assertHasRepo(ADMIN_ACCESS_USER, SHARED_REPO) + + self.assertHasRepo(ADMIN_ACCESS_USER, ORG_REPO) + self.assertHasRepo(ADMIN_ACCESS_USER, ANOTHER_ORG_REPO) + + self.assertDoesNotHaveRepo(ADMIN_ACCESS_USER, OUTSIDE_ORG_REPO) + +if __name__ == '__main__': + unittest.main()