From 7a548ea1012364f1410b3b4b3bda5273fc7dfc05 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 16 Jul 2015 13:52:12 +0300 Subject: [PATCH] Fix queries for repository list popularity and action count Before this change, we used extremely inefficient outer joins as part of a single query of lookup, which was spiking our CPU usage to nearly 100% on the query. We now issue two separate queries for popularity and action account, by doing a lookup of the previously found IDs. Interestingly enough, because of the way the queries are now written, MySQL can actually do both queries *directly from the indicies*, which means they each occur in approx 20ms! Verified by local tests, postgres tests, and testing on staging with monitoring of our CPU usage during lookup --- data/model/legacy.py | 57 +++++++++++++++++++++---------------- endpoints/api/repository.py | 24 ++++++++++++---- 2 files changed, 50 insertions(+), 31 deletions(-) diff --git a/data/model/legacy.py b/data/model/legacy.py index bf42fb221..947dc188c 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -984,19 +984,43 @@ def get_user_teams_within_org(username, organization): User.username == username) +def get_when_last_modified(repository_ids): + tuples = (RepositoryTag + .select(RepositoryTag.repository, fn.Max(RepositoryTag.lifetime_start_ts)) + .where(RepositoryTag.repository << repository_ids) + .group_by(RepositoryTag.repository) + .tuples()) + + last_modified_map = {} + for record in tuples: + last_modified_map[record[0]] = record[1] + + return last_modified_map + + +def get_action_counts(repository_ids): + # Filter the join to recent entries only. + last_week = datetime.now() - timedelta(weeks=1) + tuples = (RepositoryActionCount + .select(RepositoryActionCount.repository, fn.Sum(RepositoryActionCount.count)) + .where(RepositoryActionCount.repository << repository_ids) + .where(RepositoryActionCount.date >= last_week) + .group_by(RepositoryActionCount.repository) + .tuples()) + + action_count_map = {} + for record in tuples: + action_count_map[record[0]] = record[1] + + return action_count_map + + def get_visible_repositories(username=None, include_public=True, page=None, - limit=None, namespace=None, namespace_only=False, - include_actions=False, include_latest_tag=False): + limit=None, namespace=None, namespace_only=False): fields = [Repository.name, Repository.id, Repository.description, Visibility.name, Namespace.username] - if include_actions: - fields.append(fn.Max(RepositoryActionCount.count)) - - if include_latest_tag: - fields.append(fn.Max(RepositoryTag.lifetime_start_ts)) - query = _visible_repository_query(username=username, include_public=include_public, page=page, limit=limit, namespace=namespace, select_models=fields) @@ -1007,23 +1031,6 @@ def get_visible_repositories(username=None, include_public=True, page=None, if namespace and namespace_only: query = query.where(Namespace.username == namespace) - if include_actions: - # Filter the join to recent entries only. - last_week = datetime.now() - timedelta(weeks=1) - join_query = ((RepositoryActionCount.repository == Repository.id) & - (RepositoryActionCount.date >= last_week)) - - query = (query.switch(Repository) - .join(RepositoryActionCount, JOIN_LEFT_OUTER, on=join_query) - .group_by(RepositoryActionCount.repository, Repository.name, Repository.id, - Repository.description, Visibility.name, Namespace.username)) - - if include_latest_tag: - query = (query.switch(Repository) - .join(RepositoryTag, JOIN_LEFT_OUTER) - .group_by(RepositoryTag.repository, Repository.name, Repository.id, - Repository.description, Visibility.name, Namespace.username)) - return TupleSelector(query, fields) diff --git a/endpoints/api/repository.py b/endpoints/api/repository.py index d2f3f4085..abf45327f 100644 --- a/endpoints/api/repository.py +++ b/endpoints/api/repository.py @@ -132,14 +132,24 @@ class RepositoryList(ApiResource): response = {} + # Find the matching repositories. repo_query = model.get_visible_repositories(username, limit=args['limit'], page=args['page'], include_public=args['public'], namespace=args['namespace'], - namespace_only=args['namespace_only'], - include_latest_tag=args['last_modified'], - include_actions=args['popularity']) + namespace_only=args['namespace_only']) + + # Collect the IDs of the repositories found for subequent lookup of popularity + # and/or last modified. + repository_ids = [repo.get(RepositoryTable.id) for repo in repo_query] + + if args['last_modified']: + last_modified_map = model.get_when_last_modified(repository_ids) + + if args['popularity']: + action_count_map = model.get_action_counts(repository_ids) + def repo_view(repo_obj): repo = { 'namespace': repo_obj.get(Namespace.username), @@ -148,14 +158,16 @@ class RepositoryList(ApiResource): 'is_public': repo_obj.get(Visibility.name) == 'public' } + repo_id = repo_obj.get(RepositoryTable.id) + if args['last_modified']: - repo['last_modified'] = repo_obj.get(fn.Max(RepositoryTag.lifetime_start_ts)) + repo['last_modified'] = last_modified_map.get(repo_id) if args['popularity']: - repo['popularity'] = repo_obj.get(fn.Max(RepositoryActionCount.count)) or 0 + repo['popularity'] = action_count_map.get(repo_id, 0) if get_authenticated_user(): - repo['is_starred'] = repo_obj.get(RepositoryTable.id) in star_lookup + repo['is_starred'] = repo_id in star_lookup return repo