Merge pull request #2392 from coreos-inc/search-optimization

Optimize repository search by changing our lookup strategy
This commit is contained in:
josephschorr 2017-03-10 15:44:26 -05:00 committed by GitHub
commit 432b2d3fe8
9 changed files with 123 additions and 123 deletions

View file

@ -13,6 +13,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Im
Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
db_random_func, db_concat_func)
from data.text import prefix_search
from util.itertoolrecipes import take
logger = logging.getLogger(__name__)
@ -31,6 +32,9 @@ def create_repository(namespace, name, creating_user, visibility='private'):
repo = Repository.create(name=name, visibility=private, namespace_user=namespace_user)
admin = Role.get(name='admin')
yesterday = datetime.now() - timedelta(days=1)
RepositoryActionCount.create(repository=repo, count=0, date=yesterday)
if creating_user and not creating_user.organization:
RepositoryPermission.create(user=creating_user, repository=repo, role=admin)
@ -326,70 +330,94 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta
return query
def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10):
""" Returns repositories matching the given lookup string and passing the given checker
function.
def get_filtered_matching_repositories(lookup_value, filter_username=None, offset=0, limit=25):
""" Returns an iterator of all repositories matching the given lookup value, with optional
filtering to a specific user. If the user is unspecified, only public repositories will
be returned.
"""
last_week = datetime.now() - timedelta(weeks=1)
results = []
existing_ids = []
def get_search_results(search_clause, with_count=False):
if len(results) >= limit:
return
# Build the unfiltered search query.
unfiltered_query = _get_sorted_matching_repositories(lookup_value,
include_private=filter_username is not None)
select_items = [Repository, Namespace]
if with_count:
select_items.append(fn.Sum(RepositoryActionCount.count).alias('count'))
# Add a filter to the iterator, if necessary.
if filter_username is not None:
iterator = _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit)
else:
iterator = unfiltered_query
if offset > 0:
take(offset, iterator)
# Return the results.
return list(take(limit, iterator))
def _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit):
encountered = set()
chunk_count = limit * 2
unfiltered_page = 0
iteration_count = 0
while iteration_count < 10: # Just to be safe
# Find the next chunk's worth of repository IDs, paginated by the chunk size.
unfiltered_page = unfiltered_page + 1
found_ids = [r.id for r in unfiltered_query.paginate(unfiltered_page, chunk_count)]
# Make sure we haven't encountered these results before. This code is used to handle
# the case where we've previously seen a result, as pagination is not necessarily
# stable in SQL databases.
unfiltered_repository_ids = set(found_ids)
new_unfiltered_ids = unfiltered_repository_ids - encountered
if not new_unfiltered_ids:
break
encountered.update(new_unfiltered_ids)
# Filter the repositories found to only those visible to the current user.
query = (Repository
.select(*select_items)
.select(Repository, Namespace)
.distinct()
.join(Namespace, on=(Namespace.id == Repository.namespace_user))
.switch(Repository)
.where(search_clause)
.group_by(Repository.id, Namespace.id))
.join(RepositoryPermission)
.where(Repository.id << list(new_unfiltered_ids)))
if only_public:
query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
filtered = _basequery.filter_to_repos_for_user(query, filter_username)
if existing_ids:
query = query.where(~(Repository.id << existing_ids))
for filtered_repo in filtered:
yield filtered_repo
if with_count:
query = (query
.switch(Repository)
.join(RepositoryActionCount)
.where(RepositoryActionCount.date >= last_week)
.order_by(fn.Sum(RepositoryActionCount.count).desc()))
# If the number of found IDs is less than the chunk count, then we're done.
if len(found_ids) < chunk_count:
break
for result in query:
if len(results) >= limit:
return results
iteration_count = iteration_count + 1
# Note: We compare IDs here, instead of objects, because calling .visibility on the
# Repository will kick off a new SQL query to retrieve that visibility enum value. We also
# avoid joining the visibility table in SQL, because that join is ungodly slow in MySQL :-/
result.is_public = result.visibility_id == _basequery.get_public_repo_visibility().id
result.count = result.count if with_count else 0
if not checker(result):
continue
def _get_sorted_matching_repositories(lookup_value, include_private=False):
""" Returns a query of repositories matching the given lookup string, with optional inclusion of
private repositories. Note that this method does *not* filter results based on visibility
to users.
"""
last_week = datetime.now() - timedelta(weeks=1)
results.append(result)
existing_ids.append(result.id)
query = (Repository
.select(Repository, Namespace)
.join(Namespace, on=(Namespace.id == Repository.namespace_user))
.where(Repository.name.match(lookup_value) | Repository.description.match(lookup_value))
.group_by(Repository.id))
# For performance reasons, we conduct each set of searches on their own. This also affords us the
# ability to easily define an order precedence.
get_search_results(Repository.name.match(lookup_value), with_count=True)
get_search_results(Repository.name.match(lookup_value), with_count=False)
if not include_private:
query = query.where(Repository.visibility == _basequery.get_public_repo_visibility())
get_search_results(Repository.description.match(lookup_value), with_count=True)
get_search_results(Repository.description.match(lookup_value), with_count=False)
query = (query
.switch(Repository)
.join(RepositoryActionCount)
.where(RepositoryActionCount.date >= last_week)
.order_by(fn.Sum(RepositoryActionCount.count).desc()))
get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True)
get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False)
return results
return query
def lookup_repository(repo_id):