Optimize repository search by changing our lookup strategy
Prior to this change, repositories were looked up unfiltered in six different queries and then filtered using the permissions model, which issued a query per repository found, making search incredibly slow. Instead, we now look up a chunk of repositories unfiltered and then filter them via a single query to the database. By layering the filtering on top of the lookup, each as its own query, we can minimize the number of queries necessary without (at the same time) using a super expensive join.

Other changes:
- Remove the 5-page pre-lookup on V1 search and simply report that there is one more page available, until there isn't. While technically not correct, it is much more efficient, and no one should be using pagination with V1 search anyway.
- Remove the lookup for repos without entries in the RAC (RepositoryActionCount) table. Instead, we now add a new RAC entry when the repository is created, dated *the day before* and with count 0, so that it is immediately searchable.
- Remove the lookup of results with a matching namespace; these aren't very relevant anyway, and it overly complicates sorting.
This commit is contained in:
		
							parent
							
								
									b45dc07dce
								
							
						
					
					
						commit
						b5bb76cdea
					
				
					 9 changed files with 114 additions and 120 deletions
				
			
		|  | @ -13,6 +13,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Im | |||
|                            Label, TagManifestLabel, db_for_update, get_epoch_timestamp, | ||||
|                            db_random_func, db_concat_func) | ||||
| from data.text import prefix_search | ||||
| from util.itertoolrecipes import take | ||||
| 
 | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
|  | @ -31,6 +32,9 @@ def create_repository(namespace, name, creating_user, visibility='private'): | |||
|   repo = Repository.create(name=name, visibility=private, namespace_user=namespace_user) | ||||
|   admin = Role.get(name='admin') | ||||
| 
 | ||||
|   yesterday = datetime.now() - timedelta(days=1) | ||||
|   RepositoryActionCount.create(repository=repo, count=0, date=yesterday) | ||||
| 
 | ||||
|   if creating_user and not creating_user.organization: | ||||
|     RepositoryPermission.create(user=creating_user, repository=repo, role=admin) | ||||
| 
 | ||||
|  | @ -326,70 +330,94 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta | |||
|   return query | ||||
| 
 | ||||
| 
 | ||||
| def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10): | ||||
|   """ Returns repositories matching the given lookup string and passing the given checker | ||||
|       function. | ||||
| def get_filtered_matching_repositories(lookup_value, filter_username=None, offset=0, limit=25): | ||||
|   """ Returns an iterator of all repositories matching the given lookup value, with optional | ||||
|       filtering to a specific user. If the user is unspecified, only public repositories will | ||||
|       be returned. | ||||
|   """ | ||||
|   last_week = datetime.now() - timedelta(weeks=1) | ||||
|   results = [] | ||||
|   existing_ids = [] | ||||
| 
 | ||||
|   def get_search_results(search_clause, with_count=False): | ||||
|     if len(results) >= limit: | ||||
|       return | ||||
|   # Build the unfiltered search query. | ||||
|   unfiltered_query = _get_sorted_matching_repositories(lookup_value, | ||||
|                                                        include_private=filter_username is not None) | ||||
| 
 | ||||
|     select_items = [Repository, Namespace] | ||||
|     if with_count: | ||||
|       select_items.append(fn.Sum(RepositoryActionCount.count).alias('count')) | ||||
|   # Add a filter to the iterator, if necessary. | ||||
|   if filter_username is not None: | ||||
|     iterator = _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit) | ||||
|   else: | ||||
|     iterator = unfiltered_query | ||||
| 
 | ||||
|   if offset > 0: | ||||
|     take(offset, iterator) | ||||
| 
 | ||||
|   # Return the results. | ||||
|   return list(take(limit, iterator)) | ||||
| 
 | ||||
| 
 | ||||
| def _filter_repositories_visible_to_username(unfiltered_query, filter_username, limit): | ||||
|   encountered = set() | ||||
|   chunk_count = limit * 2 | ||||
|   unfiltered_page = 0 | ||||
|   iteration_count = 0 | ||||
| 
 | ||||
|   while iteration_count < 10: # Just to be safe | ||||
|     # Find the next chunk's worth of repository IDs, paginated by the chunk size. | ||||
|     unfiltered_page = unfiltered_page + 1 | ||||
|     found_ids = [r.id for r in unfiltered_query.paginate(unfiltered_page, chunk_count)] | ||||
| 
 | ||||
|     # Make sure we haven't encountered these results before. This code is used to handle | ||||
|     # the case where we've previously seen a result, as pagination is not necessarily | ||||
|     # stable in SQL databases. | ||||
|     unfiltered_repository_ids = set(found_ids) | ||||
|     new_unfiltered_ids = unfiltered_repository_ids - encountered | ||||
|     if not new_unfiltered_ids: | ||||
|       break | ||||
| 
 | ||||
|     encountered.update(new_unfiltered_ids) | ||||
| 
 | ||||
|     # Filter the repositories found to only those visible to the current user. | ||||
|     query = (Repository | ||||
|              .select(*select_items) | ||||
|              .select(Repository, Namespace) | ||||
|              .distinct() | ||||
|              .join(Namespace, on=(Namespace.id == Repository.namespace_user)) | ||||
|              .switch(Repository) | ||||
|              .where(search_clause) | ||||
|              .group_by(Repository.id, Namespace.id)) | ||||
|              .join(RepositoryPermission) | ||||
|              .where(Repository.id << list(new_unfiltered_ids))) | ||||
| 
 | ||||
|     if only_public: | ||||
|       query = query.where(Repository.visibility == _basequery.get_public_repo_visibility()) | ||||
|     filtered = _basequery.filter_to_repos_for_user(query, filter_username) | ||||
| 
 | ||||
|     if existing_ids: | ||||
|       query = query.where(~(Repository.id << existing_ids)) | ||||
|     for filtered_repo in filtered: | ||||
|       yield filtered_repo | ||||
| 
 | ||||
|     if with_count: | ||||
|       query = (query | ||||
|                .switch(Repository) | ||||
|                .join(RepositoryActionCount) | ||||
|                .where(RepositoryActionCount.date >= last_week) | ||||
|                .order_by(fn.Sum(RepositoryActionCount.count).desc())) | ||||
|     # If the number of found IDs is less than the chunk count, then we're done. | ||||
|     if len(found_ids) < chunk_count: | ||||
|       break | ||||
| 
 | ||||
|     for result in query: | ||||
|       if len(results) >= limit: | ||||
|         return results | ||||
|     iteration_count = iteration_count + 1 | ||||
| 
 | ||||
|       # Note: We compare IDs here, instead of objects, because calling .visibility on the | ||||
|       # Repository will kick off a new SQL query to retrieve that visibility enum value. We don't | ||||
|       # join the visibility table in SQL, as well, because it is ungodly slow in MySQL :-/ | ||||
|       result.is_public = result.visibility_id == _basequery.get_public_repo_visibility().id | ||||
|       result.count = result.count if with_count else 0 | ||||
| 
 | ||||
|       if not checker(result): | ||||
|         continue | ||||
| def _get_sorted_matching_repositories(lookup_value, include_private=False): | ||||
|   """ Returns a query of repositories matching the given lookup string, with optional inclusion of | ||||
|       private repositories. Note that this method does *not* filter results based on visibility | ||||
|       to users. | ||||
|   """ | ||||
|   last_week = datetime.now() - timedelta(weeks=1) | ||||
| 
 | ||||
|       results.append(result) | ||||
|       existing_ids.append(result.id) | ||||
|   query = (Repository | ||||
|            .select(Repository, Namespace) | ||||
|            .join(Namespace, on=(Namespace.id == Repository.namespace_user)) | ||||
|            .where(Repository.name.match(lookup_value) | Repository.description.match(lookup_value)) | ||||
|            .group_by(Repository.id)) | ||||
| 
 | ||||
|   # For performance reasons, we conduct each set of searches on their own. This also affords us the | ||||
|   # ability to easily define an order precedence. | ||||
|   get_search_results(Repository.name.match(lookup_value), with_count=True) | ||||
|   get_search_results(Repository.name.match(lookup_value), with_count=False) | ||||
|   if not include_private: | ||||
|     query = query.where(Repository.visibility == _basequery.get_public_repo_visibility()) | ||||
| 
 | ||||
|   get_search_results(Repository.description.match(lookup_value), with_count=True) | ||||
|   get_search_results(Repository.description.match(lookup_value), with_count=False) | ||||
|   query = (query | ||||
|            .switch(Repository) | ||||
|            .join(RepositoryActionCount) | ||||
|            .where(RepositoryActionCount.date >= last_week) | ||||
|            .order_by(fn.Sum(RepositoryActionCount.count).desc())) | ||||
| 
 | ||||
|   get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True) | ||||
|   get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False) | ||||
| 
 | ||||
|   return results | ||||
|   return query | ||||
| 
 | ||||
| 
 | ||||
| def lookup_repository(repo_id): | ||||
|  |  | |||
		Reference in a new issue