From 0be0aed17d273eb88c25b6c294dd22ca3a74538c Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 9 Apr 2015 14:41:59 -0400 Subject: [PATCH] Move the repo sorting by pull count into the main matching query, to both make it more accurate and make the search faster --- data/model/legacy.py | 43 ++++++++++++++++++++++++++--------------- endpoints/api/search.py | 5 ++--- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/data/model/legacy.py b/data/model/legacy.py index 5cebbaac6..ad80ddaeb 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -688,10 +688,18 @@ def get_matching_user_namespaces(namespace_prefix, username, limit=10): .switch(Repository) .join(RepositoryPermission, JOIN_LEFT_OUTER) .where(Namespace.username ** (namespace_prefix + '%')) - .group_by(Repository.namespace_user, Repository) - .limit(limit)) + .group_by(Repository.namespace_user, Repository)) - return [r.namespace_user for r in _filter_to_repos_for_user(query, username)] + count = 0 + namespaces = {} + for repo in _filter_to_repos_for_user(query, username): + if not repo.namespace_user.username in namespaces: + namespaces[repo.namespace_user.username] = repo.namespace_user + count = count + 1 + if count >= limit: + break + + return namespaces.values() def get_matching_user_teams(team_prefix, user, limit=10): query = (Team.select() @@ -983,7 +991,8 @@ def _get_public_repo_visibility(): return _public_repo_visibility_cache -def get_matching_repositories(repo_term, username=None, limit=10, include_public=True): +def get_matching_repositories(repo_term, username=None, limit=10, include_public=True, + pull_count_sort=False): namespace_term = repo_term name_term = repo_term @@ -1001,21 +1010,23 @@ def get_matching_repositories(repo_term, username=None, limit=10, include_public search_clauses = (Repository.name ** ('%' + name_term + '%') & Namespace.username ** ('%' + namespace_term + '%')) - return visible.where(search_clauses).limit(limit) + query = visible.where(search_clauses).limit(limit) + if pull_count_sort: + repo_pull = LogEntryKind.get(name = 'pull_repo') + last_month = datetime.now() - timedelta(weeks=4) -def get_repository_pull_counts(repositories): - repo_pull = LogEntryKind.get(name = 'pull_repo') - if not repositories: - return [] + query = (query.switch(Repository) + .join(LogEntry, JOIN_LEFT_OUTER) + .where(((LogEntry.kind == repo_pull) & (LogEntry.datetime >= last_month)) | + (LogEntry.id >> None)) + .group_by(Repository, Namespace, Visibility) + .order_by(fn.Count(LogEntry.id).desc()) + .select(Repository, Namespace, Visibility, + fn.Count(LogEntry.id).alias('count'))) + + return query - last_month = datetime.now() - timedelta(weeks=4) - return (Repository.select(Repository.id, fn.Count(LogEntry.id)) - .where(Repository.id << [r.id for r in repositories]) - .join(LogEntry, JOIN_LEFT_OUTER) - .where(LogEntry.kind == repo_pull, LogEntry.datetime >= last_month) - .group_by(Repository.id, LogEntry.id) - .tuples()) def change_password(user, new_password): if not validate_password(new_password): diff --git a/endpoints/api/search.py b/endpoints/api/search.py index 5262fdad5..9619a5021 100644 --- a/endpoints/api/search.py +++ b/endpoints/api/search.py @@ -205,11 +205,10 @@ def conduct_admined_team_search(username, query, encountered_teams, results): def conduct_repo_search(username, query, results): """ Finds matching repositories. """ - matching_repos = list(model.get_matching_repositories(query, username, limit=5)) - matching_repo_counts = {t[0]: t[1] for t in model.get_repository_pull_counts(matching_repos)} + matching_repos = model.get_matching_repositories(query, username, limit=5, pull_count_sort=True) for repo in matching_repos: - repo_score = math.log(matching_repo_counts.get(repo.id, 1), 10) or 1 + repo_score = math.log(repo.count or 1, 10) or 1 # If the repository is under the user's namespace, give it 50% more weight. namespace = repo.namespace_user.username