Full text search for repository name and description

Adds support for full-text search against the name and description of a repository [Delivers #134867401]
2017-01-11 15:03:14 -05:00 · 2017-01-11 15:03:14 -05:00 · 973a110ac7
commit 973a110ac7
parent d65d32b284
5 changed files with 73 additions and 12 deletions
--- a/data/database.py
+++ b/data/database.py
@ -21,7 +21,8 @@ from sqlalchemy.engine.url import make_url

 import resumablehashlib

-from data.fields import ResumableSHA256Field, ResumableSHA1Field, JSONField, Base64BinaryField
+from data.fields import (ResumableSHA256Field, ResumableSHA1Field, JSONField, Base64BinaryField,
+                         FullIndexedTextField, FullIndexedCharField)
 from data.text import match_mysql, match_like
 from data.read_slave import ReadSlaveModel
 from util.names import urn_generator
@ -31,10 +32,12 @@ logger = logging.getLogger(__name__)

 DEFAULT_DB_CONNECT_TIMEOUT = 10 # seconds

+
 # IMAGE_NOT_SCANNED_ENGINE_VERSION is the version found in security_indexed_engine when the
 # image has not yet been scanned.
 IMAGE_NOT_SCANNED_ENGINE_VERSION = -1

+
 _SCHEME_DRIVERS = {
  'mysql': MySQLDatabase,
  'mysql+pymysql': MySQLDatabase,
@ -43,6 +46,7 @@ _SCHEME_DRIVERS = {
  'postgresql+psycopg2': PostgresqlDatabase,
 }

+
 SCHEME_MATCH_FUNCTION = {
  'mysql': match_mysql,
  'mysql+pymysql': match_mysql,
@ -51,6 +55,7 @@ SCHEME_MATCH_FUNCTION = {
  'postgresql+psycopg2': match_like,
 }

+
 SCHEME_RANDOM_FUNCTION = {
  'mysql': fn.Rand,
  'mysql+pymysql': fn.Rand,
@ -59,6 +64,7 @@ SCHEME_RANDOM_FUNCTION = {
  'postgresql+psycopg2': fn.Random,
 }

+
 def pipes_concat(arg1, arg2, *extra_args):
  """ Concat function for sqlite, since it doesn't support fn.Concat.
      Concatenates clauses with || characters.
@ -482,9 +488,9 @@ class Visibility(BaseModel):

 class Repository(BaseModel):
  namespace_user = QuayUserField(null=True)
-  name = CharField()
+  name = FullIndexedCharField(match_function=db_match_func)
  visibility = ForeignKeyField(Visibility)
-  description = TextField(null=True)
+  description = FullIndexedTextField(match_function=db_match_func, null=True)
  badge_token = CharField(default=uuid_generator)

  class Meta:
--- a/data/migrations/versions/e2894a3a3c19_add_full_text_search_indexing_for_repo_.py
+++ b/data/migrations/versions/e2894a3a3c19_add_full_text_search_indexing_for_repo_.py
@ -0,0 +1,31 @@
+"""Add full text search indexing for repo name and description
+
+Revision ID: e2894a3a3c19
+Revises: 45fd8b9869d4
+Create Date: 2017-01-11 13:55:54.890774
+
+"""
+
+# revision identifiers, used by Alembic.
+revision = 'e2894a3a3c19'
+down_revision = '45fd8b9869d4'
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import mysql
+
+def upgrade(tables):
+    """Create full-text search indexes on `repository.description` and `repository.name`.
+
+    `tables` is accepted but not used by this migration.
+    """
+    # The PostgreSQL form of the indexes below uses the `gin_trgm_ops` operator class, so the
+    # pg_trgm extension is installed first (a no-op if it already exists).
+    if op.get_bind().engine.name == 'postgresql':
+      op.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm')

+    # ### commands auto generated by Alembic - please adjust! ###
+    # Each index is requested as a GIN trigram index on PostgreSQL and with the FULLTEXT prefix
+    # on MySQL, via the dialect-specific keyword arguments.
+    # NOTE(review): other backends presumably ignore those arguments and get a plain index - confirm.
+    op.create_index('repository_description__fulltext', 'repository', ['description'], unique=False, postgresql_using='gin', postgresql_ops={'description': 'gin_trgm_ops'}, mysql_prefix='FULLTEXT')
+    op.create_index('repository_name__fulltext', 'repository', ['name'], unique=False, postgresql_using='gin', postgresql_ops={'name': 'gin_trgm_ops'}, mysql_prefix='FULLTEXT')
+    # ### end Alembic commands ###
+
+
+def downgrade(tables):
+    """Drop the `repository_name__fulltext` and `repository_description__fulltext` indexes.
+
+    The pg_trgm extension is not removed here. `tables` is accepted but not used by this
+    migration.
+    """
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_index('repository_name__fulltext', table_name='repository')
+    op.drop_index('repository_description__fulltext', table_name='repository')
+    # ### end Alembic commands ###
--- a/data/model/repository.py
+++ b/data/model/repository.py
@ -319,8 +319,8 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta
  return query


-def get_sorted_matching_repositories(prefix, only_public, checker, limit=10):
-  """ Returns repositories matching the given prefix string and passing the given checker
+def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10):
+  """ Returns repositories matching the given lookup string and passing the given checker
      function.
  """
  last_week = datetime.now() - timedelta(weeks=1)
@ -371,14 +371,16 @@ def get_sorted_matching_repositories(prefix, only_public, checker, limit=10):
      results.append(result)
      existing_ids.append(result.id)

-  # For performance reasons, we conduct the repo name and repo namespace searches on their
-  # own. This also affords us the ability to give higher precedence to repository names matching
-  # over namespaces, which is semantically correct.
-  get_search_results(_basequery.prefix_search(Repository.name, prefix), with_count=True)
-  get_search_results(_basequery.prefix_search(Repository.name, prefix), with_count=False)
+  # For performance reasons, we conduct each set of searches on its own. This also affords us the
+  # ability to easily define an order of precedence.
+  get_search_results(Repository.name.match(lookup_value), with_count=True)
+  get_search_results(Repository.name.match(lookup_value), with_count=False)

-  get_search_results(_basequery.prefix_search(Namespace.username, prefix), with_count=True)
-  get_search_results(_basequery.prefix_search(Namespace.username, prefix), with_count=False)
+  get_search_results(Repository.description.match(lookup_value), with_count=True)
+  get_search_results(Repository.description.match(lookup_value), with_count=False)
+
+  get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True)
+  get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False)

  return results

--- a/initdb.py
+++ b/initdb.py
@ -568,6 +568,11 @@ def populate_database(minimal=False, with_storage=False):
                        [(new_user_2, 'write'), (reader, 'read')],
                        (5, [], 'latest'))

+  __generate_repository(with_storage, new_user_1, 'text-full-repo',
+                        'This is a repository for testing text search', False,
+                        [(new_user_2, 'write'), (reader, 'read')],
+                        (5, [], 'latest'))
+
  building = __generate_repository(with_storage, new_user_1, 'building',
                                   'Empty repository which is building.',
                                   False, [], (0, [], None))
--- a/test/test_api_usage.py
+++ b/test/test_api_usage.py
@ -1001,6 +1001,23 @@ class TestConductSearch(ApiTestCase):
    self.assertEquals(json['results'][0]['name'], 'shared')


+  def test_full_text(self):
+    self.login(ADMIN_ACCESS_USER)
+
+    # Both the `full` and the `text search` queries must return exactly one result, and that
+    # result must be the `text-full-repo` repository.
+    for search_term in ('full', 'text search'):
+      response = self.getJsonResponse(ConductSearch, params={'query': search_term})
+      self.assertEquals(1, len(response['results']))
+
+      top_hit = response['results'][0]
+      self.assertEquals(top_hit['kind'], 'repository')
+      self.assertEquals(top_hit['name'], 'text-full-repo')
+
+
 class TestGetMatchingEntities(ApiTestCase):
  def test_simple_lookup(self):
    self.login(ADMIN_ACCESS_USER)