From 973a110ac7f89953325973fc343380d9da3b21a6 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Wed, 11 Jan 2017 15:03:14 -0500 Subject: [PATCH] Full text search for repository name and description Adds support for searching full text against the name and description of a repository [Delivers #134867401] --- data/database.py | 12 +++++-- ...add_full_text_search_indexing_for_repo_.py | 31 +++++++++++++++++++ data/model/repository.py | 20 ++++++------ initdb.py | 5 +++ test/test_api_usage.py | 17 ++++++++++ 5 files changed, 73 insertions(+), 12 deletions(-) create mode 100644 data/migrations/versions/e2894a3a3c19_add_full_text_search_indexing_for_repo_.py diff --git a/data/database.py b/data/database.py index 21692fb92..83dbe74de 100644 --- a/data/database.py +++ b/data/database.py @@ -21,7 +21,8 @@ from sqlalchemy.engine.url import make_url import resumablehashlib -from data.fields import ResumableSHA256Field, ResumableSHA1Field, JSONField, Base64BinaryField +from data.fields import (ResumableSHA256Field, ResumableSHA1Field, JSONField, Base64BinaryField, + FullIndexedTextField, FullIndexedCharField) from data.text import match_mysql, match_like from data.read_slave import ReadSlaveModel from util.names import urn_generator @@ -31,10 +32,12 @@ logger = logging.getLogger(__name__) DEFAULT_DB_CONNECT_TIMEOUT = 10 # seconds + # IMAGE_NOT_SCANNED_ENGINE_VERSION is the version found in security_indexed_engine when the # image has not yet been scanned. IMAGE_NOT_SCANNED_ENGINE_VERSION = -1 + _SCHEME_DRIVERS = { 'mysql': MySQLDatabase, 'mysql+pymysql': MySQLDatabase, @@ -43,6 +46,7 @@ _SCHEME_DRIVERS = { 'postgresql+psycopg2': PostgresqlDatabase, } + SCHEME_MATCH_FUNCTION = { 'mysql': match_mysql, 'mysql+pymysql': match_mysql, @@ -51,6 +55,7 @@ SCHEME_MATCH_FUNCTION = { 'postgresql+psycopg2': match_like, } + SCHEME_RANDOM_FUNCTION = { 'mysql': fn.Rand, 'mysql+pymysql': fn.Rand, @@ -59,6 +64,7 @@ SCHEME_RANDOM_FUNCTION = { 'postgresql+psycopg2': fn.Random, } + def pipes_concat(arg1, arg2, *extra_args): """ Concat function for sqlite, since it doesn't support fn.Concat. Concatenates clauses with || characters. @@ -482,9 +488,9 @@ class Visibility(BaseModel): class Repository(BaseModel): namespace_user = QuayUserField(null=True) - name = CharField() + name = FullIndexedCharField(match_function=db_match_func) visibility = ForeignKeyField(Visibility) - description = TextField(null=True) + description = FullIndexedTextField(match_function=db_match_func, null=True) badge_token = CharField(default=uuid_generator) class Meta: diff --git a/data/migrations/versions/e2894a3a3c19_add_full_text_search_indexing_for_repo_.py b/data/migrations/versions/e2894a3a3c19_add_full_text_search_indexing_for_repo_.py new file mode 100644 index 000000000..94264078b --- /dev/null +++ b/data/migrations/versions/e2894a3a3c19_add_full_text_search_indexing_for_repo_.py @@ -0,0 +1,31 @@ +"""Add full text search indexing for repo name and description + +Revision ID: e2894a3a3c19 +Revises: 45fd8b9869d4 +Create Date: 2017-01-11 13:55:54.890774 + +""" + +# revision identifiers, used by Alembic. +revision = 'e2894a3a3c19' +down_revision = '45fd8b9869d4' + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import mysql + +def upgrade(tables): + if op.get_bind().engine.name == 'postgresql': + op.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm') + + # ### commands auto generated by Alembic - please adjust! ### + op.create_index('repository_description__fulltext', 'repository', ['description'], unique=False, postgresql_using='gin', postgresql_ops={'description': 'gin_trgm_ops'}, mysql_prefix='FULLTEXT') + op.create_index('repository_name__fulltext', 'repository', ['name'], unique=False, postgresql_using='gin', postgresql_ops={'name': 'gin_trgm_ops'}, mysql_prefix='FULLTEXT') + # ### end Alembic commands ### + + +def downgrade(tables): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index('repository_name__fulltext', table_name='repository') + op.drop_index('repository_description__fulltext', table_name='repository') + # ### end Alembic commands ### diff --git a/data/model/repository.py b/data/model/repository.py index 5621266dc..910ae358a 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -319,8 +319,8 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta return query -def get_sorted_matching_repositories(prefix, only_public, checker, limit=10): - """ Returns repositories matching the given prefix string and passing the given checker +def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10): + """ Returns repositories matching the given lookup string and passing the given checker function. """ last_week = datetime.now() - timedelta(weeks=1) @@ -371,14 +371,16 @@ def get_sorted_matching_repositories(prefix, only_public, checker, limit=10): results.append(result) existing_ids.append(result.id) - # For performance reasons, we conduct the repo name and repo namespace searches on their - # own. This also affords us the ability to give higher precedence to repository names matching - # over namespaces, which is semantically correct. - get_search_results(_basequery.prefix_search(Repository.name, prefix), with_count=True) - get_search_results(_basequery.prefix_search(Repository.name, prefix), with_count=False) + # For performance reasons, we conduct each set of searches on their own. This also affords us the + # ability to easily define an order precedence. + get_search_results(Repository.name.match(lookup_value), with_count=True) + get_search_results(Repository.name.match(lookup_value), with_count=False) - get_search_results(_basequery.prefix_search(Namespace.username, prefix), with_count=True) - get_search_results(_basequery.prefix_search(Namespace.username, prefix), with_count=False) + get_search_results(Repository.description.match(lookup_value), with_count=True) + get_search_results(Repository.description.match(lookup_value), with_count=False) + + get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True) + get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False) return results diff --git a/initdb.py b/initdb.py index 0e92aec80..2c1615cf7 100644 --- a/initdb.py +++ b/initdb.py @@ -568,6 +568,11 @@ def populate_database(minimal=False, with_storage=False): [(new_user_2, 'write'), (reader, 'read')], (5, [], 'latest')) + __generate_repository(with_storage, new_user_1, 'text-full-repo', + 'This is a repository for testing text search', False, + [(new_user_2, 'write'), (reader, 'read')], + (5, [], 'latest')) + building = __generate_repository(with_storage, new_user_1, 'building', 'Empty repository which is building.', False, [], (0, [], None)) diff --git a/test/test_api_usage.py b/test/test_api_usage.py index 35c322ee3..a3cdf7eb0 100644 --- a/test/test_api_usage.py +++ b/test/test_api_usage.py @@ -1001,6 +1001,23 @@ class TestConductSearch(ApiTestCase): self.assertEquals(json['results'][0]['name'], 'shared') + def test_full_text(self): + self.login(ADMIN_ACCESS_USER) + + # Make sure the repository is found via `full` and `text search`. + json = self.getJsonResponse(ConductSearch, + params=dict(query='full')) + self.assertEquals(1, len(json['results'])) + self.assertEquals(json['results'][0]['kind'], 'repository') + self.assertEquals(json['results'][0]['name'], 'text-full-repo') + + json = self.getJsonResponse(ConductSearch, + params=dict(query='text search')) + self.assertEquals(1, len(json['results'])) + self.assertEquals(json['results'][0]['kind'], 'repository') + self.assertEquals(json['results'][0]['name'], 'text-full-repo') + + class TestGetMatchingEntities(ApiTestCase): def test_simple_lookup(self): self.login(ADMIN_ACCESS_USER)