Fix queries for repository list popularity and action count
Before this change, we used extremely inefficient outer joins as part of a single query of lookup, which was spiking our CPU usage to nearly 100% on the query. We now issue two separate queries for popularity and action account, by doing a lookup of the previously found IDs. Interestingly enough, because of the way the queries are now written, MySQL can actually do both queries *directly from the indicies*, which means they each occur in approx 20ms! Verified by local tests, postgres tests, and testing on staging with monitoring of our CPU usage during lookup
This commit is contained in:
parent
d21251c910
commit
7a548ea101
2 changed files with 50 additions and 31 deletions
|
@ -984,19 +984,43 @@ def get_user_teams_within_org(username, organization):
|
|||
User.username == username)
|
||||
|
||||
|
||||
def get_when_last_modified(repository_ids):
|
||||
tuples = (RepositoryTag
|
||||
.select(RepositoryTag.repository, fn.Max(RepositoryTag.lifetime_start_ts))
|
||||
.where(RepositoryTag.repository << repository_ids)
|
||||
.group_by(RepositoryTag.repository)
|
||||
.tuples())
|
||||
|
||||
last_modified_map = {}
|
||||
for record in tuples:
|
||||
last_modified_map[record[0]] = record[1]
|
||||
|
||||
return last_modified_map
|
||||
|
||||
|
||||
def get_action_counts(repository_ids):
|
||||
# Filter the join to recent entries only.
|
||||
last_week = datetime.now() - timedelta(weeks=1)
|
||||
tuples = (RepositoryActionCount
|
||||
.select(RepositoryActionCount.repository, fn.Sum(RepositoryActionCount.count))
|
||||
.where(RepositoryActionCount.repository << repository_ids)
|
||||
.where(RepositoryActionCount.date >= last_week)
|
||||
.group_by(RepositoryActionCount.repository)
|
||||
.tuples())
|
||||
|
||||
action_count_map = {}
|
||||
for record in tuples:
|
||||
action_count_map[record[0]] = record[1]
|
||||
|
||||
return action_count_map
|
||||
|
||||
|
||||
def get_visible_repositories(username=None, include_public=True, page=None,
|
||||
limit=None, namespace=None, namespace_only=False,
|
||||
include_actions=False, include_latest_tag=False):
|
||||
limit=None, namespace=None, namespace_only=False):
|
||||
|
||||
fields = [Repository.name, Repository.id, Repository.description, Visibility.name,
|
||||
Namespace.username]
|
||||
|
||||
if include_actions:
|
||||
fields.append(fn.Max(RepositoryActionCount.count))
|
||||
|
||||
if include_latest_tag:
|
||||
fields.append(fn.Max(RepositoryTag.lifetime_start_ts))
|
||||
|
||||
query = _visible_repository_query(username=username, include_public=include_public, page=page,
|
||||
limit=limit, namespace=namespace,
|
||||
select_models=fields)
|
||||
|
@ -1007,23 +1031,6 @@ def get_visible_repositories(username=None, include_public=True, page=None,
|
|||
if namespace and namespace_only:
|
||||
query = query.where(Namespace.username == namespace)
|
||||
|
||||
if include_actions:
|
||||
# Filter the join to recent entries only.
|
||||
last_week = datetime.now() - timedelta(weeks=1)
|
||||
join_query = ((RepositoryActionCount.repository == Repository.id) &
|
||||
(RepositoryActionCount.date >= last_week))
|
||||
|
||||
query = (query.switch(Repository)
|
||||
.join(RepositoryActionCount, JOIN_LEFT_OUTER, on=join_query)
|
||||
.group_by(RepositoryActionCount.repository, Repository.name, Repository.id,
|
||||
Repository.description, Visibility.name, Namespace.username))
|
||||
|
||||
if include_latest_tag:
|
||||
query = (query.switch(Repository)
|
||||
.join(RepositoryTag, JOIN_LEFT_OUTER)
|
||||
.group_by(RepositoryTag.repository, Repository.name, Repository.id,
|
||||
Repository.description, Visibility.name, Namespace.username))
|
||||
|
||||
return TupleSelector(query, fields)
|
||||
|
||||
|
||||
|
|
|
@ -132,14 +132,24 @@ class RepositoryList(ApiResource):
|
|||
|
||||
response = {}
|
||||
|
||||
# Find the matching repositories.
|
||||
repo_query = model.get_visible_repositories(username,
|
||||
limit=args['limit'],
|
||||
page=args['page'],
|
||||
include_public=args['public'],
|
||||
namespace=args['namespace'],
|
||||
namespace_only=args['namespace_only'],
|
||||
include_latest_tag=args['last_modified'],
|
||||
include_actions=args['popularity'])
|
||||
namespace_only=args['namespace_only'])
|
||||
|
||||
# Collect the IDs of the repositories found for subequent lookup of popularity
|
||||
# and/or last modified.
|
||||
repository_ids = [repo.get(RepositoryTable.id) for repo in repo_query]
|
||||
|
||||
if args['last_modified']:
|
||||
last_modified_map = model.get_when_last_modified(repository_ids)
|
||||
|
||||
if args['popularity']:
|
||||
action_count_map = model.get_action_counts(repository_ids)
|
||||
|
||||
def repo_view(repo_obj):
|
||||
repo = {
|
||||
'namespace': repo_obj.get(Namespace.username),
|
||||
|
@ -148,14 +158,16 @@ class RepositoryList(ApiResource):
|
|||
'is_public': repo_obj.get(Visibility.name) == 'public'
|
||||
}
|
||||
|
||||
repo_id = repo_obj.get(RepositoryTable.id)
|
||||
|
||||
if args['last_modified']:
|
||||
repo['last_modified'] = repo_obj.get(fn.Max(RepositoryTag.lifetime_start_ts))
|
||||
repo['last_modified'] = last_modified_map.get(repo_id)
|
||||
|
||||
if args['popularity']:
|
||||
repo['popularity'] = repo_obj.get(fn.Max(RepositoryActionCount.count)) or 0
|
||||
repo['popularity'] = action_count_map.get(repo_id, 0)
|
||||
|
||||
if get_authenticated_user():
|
||||
repo['is_starred'] = repo_obj.get(RepositoryTable.id) in star_lookup
|
||||
repo['is_starred'] = repo_id in star_lookup
|
||||
|
||||
return repo
|
||||
|
||||
|
|
Reference in a new issue