Change search to use a set of queries for repo lookup rather than a single monolithic query, in the hope that this will make things significantly faster and actually usable. The individual queries have been tested by hand on MySQL, but the real test will be on staging.

This commit is contained in:
Joseph Schorr 2015-04-10 15:27:37 -04:00
parent dddab60058
commit 1df025b57e
3 changed files with 87 additions and 23 deletions

View file

@ -991,8 +991,69 @@ def _get_public_repo_visibility():
return _public_repo_visibility_cache
def get_matching_repositories(repo_term, username=None, limit=10, include_public=True,
pull_count_sort=False):
def get_sorted_matching_repositories(prefix, only_public, checker, limit=10):
    """ Collects up to `limit` repositories whose name or namespace username starts with
    `prefix` and which are accepted by `checker`.

    Four searches are run in order, each skipping repositories already collected:
    repository name with log-entry counts, repository name without counts, namespace
    username with counts, namespace username without counts. Searching stops as soon as
    `limit` repositories have been gathered.

    Every row handed to `checker` (and every row returned) carries two extra attributes:
    `is_public`, and `count` (the aggregated count for counted searches, 0 otherwise).
    If `only_public` is set, the queries are restricted to the public visibility.
    """
    week_ago = datetime.now() - timedelta(weeks=1)
    matches = []
    seen_ids = []

    def run_search(search_clause, with_count):
        # Earlier searches may already have filled the result list.
        if len(matches) >= limit:
            return

        columns = [Repository, Namespace]
        if with_count:
            columns.append(fn.Count(LogEntry.id).alias('count'))

        query = Repository.select(*columns)
        query = query.join(Namespace, JOIN_LEFT_OUTER,
                           on=(Namespace.id == Repository.namespace_user))
        query = query.switch(Repository)
        query = query.where(search_clause)
        query = query.group_by(Repository, Namespace)

        if only_public:
            query = query.where(Repository.visibility == _get_public_repo_visibility())

        # Leave out anything a previous search has already returned.
        if seen_ids:
            query = query.where(~(Repository.id << seen_ids))

        if with_count:
            query = query.join(LogEntry, JOIN_LEFT_OUTER)
            query = query.where(LogEntry.datetime >= week_ago)
            query = query.order_by(fn.Count(LogEntry.id).desc())

        for repo in query:
            if len(matches) >= limit:
                break

            # Compare the raw visibility ID rather than the related object: touching
            # .visibility would issue an extra SQL query per row, and joining the visibility
            # table in SQL instead is extremely slow on MySQL.
            repo.is_public = repo.visibility_id == _get_public_repo_visibility().id
            repo.count = repo.count if with_count else 0

            if checker(repo):
                matches.append(repo)
                seen_ids.append(repo.id)

    # The name and namespace searches are issued separately, and each is split into a counted
    # and an uncounted pass, for performance. Running the name searches first also ranks
    # repository-name matches ahead of namespace matches.
    pattern = prefix + '%'
    search_plan = (
        (Repository.name, True),
        (Repository.name, False),
        (Namespace.username, True),
        (Namespace.username, False),
    )
    for field, counted in search_plan:
        run_search((field ** pattern), with_count=counted)

    return matches
def get_matching_repositories(repo_term, username=None, limit=10, include_public=True):
namespace_term = repo_term
name_term = repo_term
@ -1010,22 +1071,7 @@ def get_matching_repositories(repo_term, username=None, limit=10, include_public
search_clauses = (Repository.name ** ('%' + name_term + '%') &
Namespace.username ** ('%' + namespace_term + '%'))
query = visible.where(search_clauses).limit(limit)
if pull_count_sort:
repo_pull = LogEntryKind.get(name = 'pull_repo')
last_month = datetime.now() - timedelta(weeks=4)
query = (query.switch(Repository)
.join(LogEntry, JOIN_LEFT_OUTER)
.where(((LogEntry.kind == repo_pull) & (LogEntry.datetime >= last_month)) |
(LogEntry.id >> None))
.group_by(Repository, Namespace, Visibility)
.order_by(fn.Count(LogEntry.id).desc())
.select(Repository, Namespace, Visibility,
fn.Count(LogEntry.id).alias('count')))
return query
return visible.where(search_clauses).limit(limit)
def change_password(user, new_password):