Fix queries for repository list popularity and action count

Before this change, we used extremely inefficient outer joins as part of a single lookup query, which was spiking our CPU usage to nearly 100% on the query. We now issue two separate queries for popularity and action count, by doing a lookup of the previously found IDs. Interestingly enough, because of the way the queries are now written, MySQL can actually do both queries *directly from the indices*, which means they each occur in approx 20ms!

Verified by local tests, postgres tests, and testing on staging with monitoring of our CPU usage during lookup
This commit is contained in:
Joseph Schorr 2015-07-16 13:52:12 +03:00
parent d21251c910
commit 7a548ea101
2 changed files with 50 additions and 31 deletions

View file

@ -984,19 +984,43 @@ def get_user_teams_within_org(username, organization):
User.username == username)
def get_when_last_modified(repository_ids):
    """Return a map from repository to its latest tag start timestamp.

    Args:
        repository_ids: Collection of repository IDs to look up.

    Returns:
        A dict keyed by the ``RepositoryTag.repository`` column value, whose
        value is ``Max(RepositoryTag.lifetime_start_ts)`` for that repository.
        Repositories without any ``RepositoryTag`` row are absent from the
        dict, so callers should use ``.get()``.
    """
    # With nothing to look up, skip the round trip entirely. An empty list
    # would otherwise be sent as an empty IN clause, which MySQL and
    # PostgreSQL reject as a syntax error.
    if not repository_ids:
        return {}

    rows = (RepositoryTag
            .select(RepositoryTag.repository,
                    fn.Max(RepositoryTag.lifetime_start_ts))
            .where(RepositoryTag.repository << repository_ids)
            .group_by(RepositoryTag.repository)
            .tuples())

    # Each row is (repository, max lifetime_start_ts).
    return {row[0]: row[1] for row in rows}
def get_action_counts(repository_ids):
    """Return a map from repository to its summed action count for the past week.

    Args:
        repository_ids: Collection of repository IDs to look up.

    Returns:
        A dict keyed by the ``RepositoryActionCount.repository`` column value,
        whose value is ``Sum(RepositoryActionCount.count)`` over rows dated
        within the last week. Repositories with no such rows are absent from
        the dict, so callers should supply a default when reading it.
    """
    # With nothing to look up, skip the round trip entirely. An empty list
    # would otherwise be sent as an empty IN clause, which MySQL and
    # PostgreSQL reject as a syntax error.
    if not repository_ids:
        return {}

    # Only entries from the last seven days contribute to the total.
    last_week = datetime.now() - timedelta(weeks=1)
    rows = (RepositoryActionCount
            .select(RepositoryActionCount.repository,
                    fn.Sum(RepositoryActionCount.count))
            .where(RepositoryActionCount.repository << repository_ids)
            .where(RepositoryActionCount.date >= last_week)
            .group_by(RepositoryActionCount.repository)
            .tuples())

    # Each row is (repository, summed count).
    return {row[0]: row[1] for row in rows}
def get_visible_repositories(username=None, include_public=True, page=None,
limit=None, namespace=None, namespace_only=False,
include_actions=False, include_latest_tag=False):
limit=None, namespace=None, namespace_only=False):
fields = [Repository.name, Repository.id, Repository.description, Visibility.name,
Namespace.username]
if include_actions:
fields.append(fn.Max(RepositoryActionCount.count))
if include_latest_tag:
fields.append(fn.Max(RepositoryTag.lifetime_start_ts))
query = _visible_repository_query(username=username, include_public=include_public, page=page,
limit=limit, namespace=namespace,
select_models=fields)
@ -1007,23 +1031,6 @@ def get_visible_repositories(username=None, include_public=True, page=None,
if namespace and namespace_only:
query = query.where(Namespace.username == namespace)
if include_actions:
# Filter the join to recent entries only.
last_week = datetime.now() - timedelta(weeks=1)
join_query = ((RepositoryActionCount.repository == Repository.id) &
(RepositoryActionCount.date >= last_week))
query = (query.switch(Repository)
.join(RepositoryActionCount, JOIN_LEFT_OUTER, on=join_query)
.group_by(RepositoryActionCount.repository, Repository.name, Repository.id,
Repository.description, Visibility.name, Namespace.username))
if include_latest_tag:
query = (query.switch(Repository)
.join(RepositoryTag, JOIN_LEFT_OUTER)
.group_by(RepositoryTag.repository, Repository.name, Repository.id,
Repository.description, Visibility.name, Namespace.username))
return TupleSelector(query, fields)

View file

@ -132,14 +132,24 @@ class RepositoryList(ApiResource):
response = {}
# Find the matching repositories.
repo_query = model.get_visible_repositories(username,
limit=args['limit'],
page=args['page'],
include_public=args['public'],
namespace=args['namespace'],
namespace_only=args['namespace_only'],
include_latest_tag=args['last_modified'],
include_actions=args['popularity'])
namespace_only=args['namespace_only'])
# Collect the IDs of the repositories found for subsequent lookup of popularity
# and/or last modified.
repository_ids = [repo.get(RepositoryTable.id) for repo in repo_query]
if args['last_modified']:
last_modified_map = model.get_when_last_modified(repository_ids)
if args['popularity']:
action_count_map = model.get_action_counts(repository_ids)
def repo_view(repo_obj):
repo = {
'namespace': repo_obj.get(Namespace.username),
@ -148,14 +158,16 @@ class RepositoryList(ApiResource):
'is_public': repo_obj.get(Visibility.name) == 'public'
}
repo_id = repo_obj.get(RepositoryTable.id)
if args['last_modified']:
repo['last_modified'] = repo_obj.get(fn.Max(RepositoryTag.lifetime_start_ts))
repo['last_modified'] = last_modified_map.get(repo_id)
if args['popularity']:
repo['popularity'] = repo_obj.get(fn.Max(RepositoryActionCount.count)) or 0
repo['popularity'] = action_count_map.get(repo_id, 0)
if get_authenticated_user():
repo['is_starred'] = repo_obj.get(RepositoryTable.id) in star_lookup
repo['is_starred'] = repo_id in star_lookup
return repo