Merge pull request #2272 from coreos-inc/fulltext-search

Full text search support in Quay
This commit is contained in:
josephschorr 2017-01-31 11:51:47 -05:00 committed by GitHub
commit 356530110c
14 changed files with 193 additions and 38 deletions

View file

@ -21,7 +21,9 @@ from sqlalchemy.engine.url import make_url
import resumablehashlib import resumablehashlib
from data.fields import ResumableSHA256Field, ResumableSHA1Field, JSONField, Base64BinaryField from data.fields import (ResumableSHA256Field, ResumableSHA1Field, JSONField, Base64BinaryField,
FullIndexedTextField, FullIndexedCharField)
from data.text import match_mysql, match_like
from data.read_slave import ReadSlaveModel from data.read_slave import ReadSlaveModel
from util.names import urn_generator from util.names import urn_generator
@ -30,10 +32,12 @@ logger = logging.getLogger(__name__)
DEFAULT_DB_CONNECT_TIMEOUT = 10 # seconds DEFAULT_DB_CONNECT_TIMEOUT = 10 # seconds
# IMAGE_NOT_SCANNED_ENGINE_VERSION is the version found in security_indexed_engine when the # IMAGE_NOT_SCANNED_ENGINE_VERSION is the version found in security_indexed_engine when the
# image has not yet been scanned. # image has not yet been scanned.
IMAGE_NOT_SCANNED_ENGINE_VERSION = -1 IMAGE_NOT_SCANNED_ENGINE_VERSION = -1
_SCHEME_DRIVERS = { _SCHEME_DRIVERS = {
'mysql': MySQLDatabase, 'mysql': MySQLDatabase,
'mysql+pymysql': MySQLDatabase, 'mysql+pymysql': MySQLDatabase,
@ -42,6 +46,16 @@ _SCHEME_DRIVERS = {
'postgresql+psycopg2': PostgresqlDatabase, 'postgresql+psycopg2': PostgresqlDatabase,
} }
SCHEME_MATCH_FUNCTION = {
'mysql': match_mysql,
'mysql+pymysql': match_mysql,
'sqlite': match_like,
'postgresql': match_like,
'postgresql+psycopg2': match_like,
}
SCHEME_RANDOM_FUNCTION = { SCHEME_RANDOM_FUNCTION = {
'mysql': fn.Rand, 'mysql': fn.Rand,
'mysql+pymysql': fn.Rand, 'mysql+pymysql': fn.Rand,
@ -50,6 +64,7 @@ SCHEME_RANDOM_FUNCTION = {
'postgresql+psycopg2': fn.Random, 'postgresql+psycopg2': fn.Random,
} }
def pipes_concat(arg1, arg2, *extra_args): def pipes_concat(arg1, arg2, *extra_args):
""" Concat function for sqlite, since it doesn't support fn.Concat. """ Concat function for sqlite, since it doesn't support fn.Concat.
Concatenates clauses with || characters. Concatenates clauses with || characters.
@ -211,6 +226,7 @@ class TupleSelector(object):
db = Proxy() db = Proxy()
read_slave = Proxy() read_slave = Proxy()
db_random_func = CallableProxy() db_random_func = CallableProxy()
db_match_func = CallableProxy()
db_for_update = CallableProxy() db_for_update = CallableProxy()
db_transaction = CallableProxy() db_transaction = CallableProxy()
db_concat_func = CallableProxy() db_concat_func = CallableProxy()
@ -257,6 +273,7 @@ def configure(config_object):
parsed_write_uri = make_url(write_db_uri) parsed_write_uri = make_url(write_db_uri)
db_random_func.initialize(SCHEME_RANDOM_FUNCTION[parsed_write_uri.drivername]) db_random_func.initialize(SCHEME_RANDOM_FUNCTION[parsed_write_uri.drivername])
db_match_func.initialize(SCHEME_MATCH_FUNCTION[parsed_write_uri.drivername])
db_for_update.initialize(SCHEME_SPECIALIZED_FOR_UPDATE.get(parsed_write_uri.drivername, db_for_update.initialize(SCHEME_SPECIALIZED_FOR_UPDATE.get(parsed_write_uri.drivername,
real_for_update)) real_for_update))
db_concat_func.initialize(SCHEME_SPECIALIZED_CONCAT.get(parsed_write_uri.drivername, db_concat_func.initialize(SCHEME_SPECIALIZED_CONCAT.get(parsed_write_uri.drivername,
@ -471,9 +488,9 @@ class Visibility(BaseModel):
class Repository(BaseModel): class Repository(BaseModel):
namespace_user = QuayUserField(null=True) namespace_user = QuayUserField(null=True)
name = CharField() name = FullIndexedCharField(match_function=db_match_func)
visibility = ForeignKeyField(Visibility) visibility = ForeignKeyField(Visibility)
description = TextField(null=True) description = FullIndexedTextField(match_function=db_match_func, null=True)
badge_token = CharField(default=uuid_generator) badge_token = CharField(default=uuid_generator)
class Meta: class Meta:

View file

@ -2,7 +2,8 @@ import base64
import resumablehashlib import resumablehashlib
import json import json
from peewee import TextField from peewee import TextField, CharField, Clause
from data.text import prefix_search
class _ResumableSHAField(TextField): class _ResumableSHAField(TextField):
@ -64,3 +65,44 @@ class Base64BinaryField(TextField):
if value is None: if value is None:
return None return None
return base64.b64decode(value) return base64.b64decode(value)
def _add_fulltext(field_class):
""" Adds support for full text indexing and lookup to the given field class. """
class indexed_class(field_class):
# Marker used by SQLAlchemy translation layer to add the proper index for full text searching.
__fulltext__ = True
def __init__(self, match_function, *args, **kwargs):
field_class.__init__(self, *args, **kwargs)
self.match_function = match_function
def match(self, query):
return self.match_function(self, query)
def match_prefix(self, query):
return prefix_search(self, query)
def __mod__(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
def __pow__(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
def __contains__(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
def contains(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
def startswith(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
def endswith(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
return indexed_class
FullIndexedCharField = _add_fulltext(CharField)
FullIndexedTextField = _add_fulltext(TextField)

View file

@ -0,0 +1,31 @@
"""Add full text search indexing for repo name and description
Revision ID: e2894a3a3c19
Revises: d42c175b439a
Create Date: 2017-01-11 13:55:54.890774
"""
# revision identifiers, used by Alembic.
revision = 'e2894a3a3c19'
down_revision = 'd42c175b439a'
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import mysql
def upgrade(tables):
  # pg_trgm provides trigram-based GIN indexes, which Postgres requires for the
  # LIKE/ILIKE-backed full text search used here; MySQL instead uses its native
  # FULLTEXT index (see mysql_prefix below). No-op on other engines.
  if op.get_bind().engine.name == 'postgresql':
    op.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm')

  # ### commands auto generated by Alembic - please adjust! ###
  op.create_index('repository_description__fulltext', 'repository', ['description'], unique=False, postgresql_using='gin', postgresql_ops={'description': 'gin_trgm_ops'}, mysql_prefix='FULLTEXT')
  op.create_index('repository_name__fulltext', 'repository', ['name'], unique=False, postgresql_using='gin', postgresql_ops={'name': 'gin_trgm_ops'}, mysql_prefix='FULLTEXT')
  # ### end Alembic commands ###
def downgrade(tables):
  # Drops the two full text indexes added by upgrade(). The pg_trgm extension is
  # deliberately left installed, since other objects may depend on it.
  # ### commands auto generated by Alembic - please adjust! ###
  op.drop_index('repository_name__fulltext', table_name='repository')
  op.drop_index('repository_description__fulltext', table_name='repository')
  # ### end Alembic commands ###

View file

@ -1,4 +1,4 @@
from peewee import Clause, SQL, fn from peewee import fn
from cachetools import lru_cache from cachetools import lru_cache
from data.model import DataModelException from data.model import DataModelException
@ -6,18 +6,6 @@ from data.database import (Repository, User, Team, TeamMember, RepositoryPermiss
Namespace, Visibility, ImageStorage, Image, db_for_update) Namespace, Visibility, ImageStorage, Image, db_for_update)
def prefix_search(field, prefix_query):
  """ Returns the wildcard match for searching for the given prefix query. """
  # Escape the escape character itself first, then each LIKE wildcard, so all of
  # them are matched literally under the ESCAPE '!' clause.
  for wildcard in ('!', '%', '_', '['):
    prefix_query = prefix_query.replace(wildcard, '!' + wildcard)

  return field ** Clause(prefix_query + '%', SQL("ESCAPE '!'"))
def get_existing_repository(namespace_name, repository_name, for_update=False): def get_existing_repository(namespace_name, repository_name, for_update=False):
query = (Repository query = (Repository
.select(Repository, Namespace) .select(Repository, Namespace)

View file

@ -4,7 +4,7 @@ from cachetools import lru_cache
from data.database import Label, TagManifestLabel, MediaType, LabelSourceType, db_transaction from data.database import Label, TagManifestLabel, MediaType, LabelSourceType, db_transaction
from data.model import InvalidLabelKeyException, InvalidMediaTypeException, DataModelException from data.model import InvalidLabelKeyException, InvalidMediaTypeException, DataModelException
from data.model._basequery import prefix_search from data.text import prefix_search
from util.validation import validate_label_key from util.validation import validate_label_key
from util.validation import is_json from util.validation import is_json

View file

@ -12,6 +12,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Im
Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage, Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage,
Label, TagManifestLabel, db_for_update, get_epoch_timestamp, Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
db_random_func, db_concat_func) db_random_func, db_concat_func)
from data.text import prefix_search
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -318,8 +319,8 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta
return query return query
def get_sorted_matching_repositories(prefix, only_public, checker, limit=10): def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10):
""" Returns repositories matching the given prefix string and passing the given checker """ Returns repositories matching the given lookup string and passing the given checker
function. function.
""" """
last_week = datetime.now() - timedelta(weeks=1) last_week = datetime.now() - timedelta(weeks=1)
@ -370,14 +371,16 @@ def get_sorted_matching_repositories(prefix, only_public, checker, limit=10):
results.append(result) results.append(result)
existing_ids.append(result.id) existing_ids.append(result.id)
# For performance reasons, we conduct the repo name and repo namespace searches on their # For performance reasons, we conduct each set of searches on their own. This also affords us the
# own. This also affords us the ability to give higher precedence to repository names matching # ability to easily define an order precedence.
# over namespaces, which is semantically correct. get_search_results(Repository.name.match(lookup_value), with_count=True)
get_search_results(_basequery.prefix_search(Repository.name, prefix), with_count=True) get_search_results(Repository.name.match(lookup_value), with_count=False)
get_search_results(_basequery.prefix_search(Repository.name, prefix), with_count=False)
get_search_results(_basequery.prefix_search(Namespace.username, prefix), with_count=True) get_search_results(Repository.description.match(lookup_value), with_count=True)
get_search_results(_basequery.prefix_search(Namespace.username, prefix), with_count=False) get_search_results(Repository.description.match(lookup_value), with_count=False)
get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True)
get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False)
return results return results

View file

@ -1,5 +1,5 @@
from sqlalchemy import (Table, MetaData, Column, ForeignKey, Integer, String, Boolean, Text, from sqlalchemy import (Table, MetaData, Column, ForeignKey, Integer, String, Boolean, Text,
DateTime, Date, BigInteger, Index) DateTime, Date, BigInteger, Index, text)
from peewee import (PrimaryKeyField, CharField, BooleanField, DateTimeField, TextField, from peewee import (PrimaryKeyField, CharField, BooleanField, DateTimeField, TextField,
ForeignKeyField, BigIntegerField, IntegerField, DateField) ForeignKeyField, BigIntegerField, IntegerField, DateField)
@ -28,6 +28,7 @@ def gen_sqlalchemy_metadata(peewee_model_list):
meta = model._meta meta = model._meta
all_indexes = set(meta.indexes) all_indexes = set(meta.indexes)
fulltext_indexes = []
columns = [] columns = []
for field in meta.sorted_fields: for field in meta.sorted_fields:
@ -60,6 +61,10 @@ def gen_sqlalchemy_metadata(peewee_model_list):
else: else:
raise RuntimeError('Unknown column type: %s' % field) raise RuntimeError('Unknown column type: %s' % field)
if hasattr(field, '__fulltext__'):
# Add the fulltext index for the field, based on whether we are under MySQL or Postgres.
fulltext_indexes.append(field.name)
for option_name in OPTIONS_TO_COPY: for option_name in OPTIONS_TO_COPY:
alchemy_option_name = (OPTION_TRANSLATIONS[option_name] alchemy_option_name = (OPTION_TRANSLATIONS[option_name]
if option_name in OPTION_TRANSLATIONS else option_name) if option_name in OPTION_TRANSLATIONS else option_name)
@ -81,4 +86,11 @@ def gen_sqlalchemy_metadata(peewee_model_list):
col_refs = [getattr(new_table.c, col_name) for col_name in col_names] col_refs = [getattr(new_table.c, col_name) for col_name in col_names]
Index(index_name, *col_refs, unique=unique) Index(index_name, *col_refs, unique=unique)
for col_field_name in fulltext_indexes:
index_name = '%s_%s__fulltext' % (meta.db_table, col_field_name)
col_ref = getattr(new_table.c, col_field_name)
Index(index_name, col_ref, postgresql_ops={col_field_name: 'gin_trgm_ops'},
postgresql_using='gin',
mysql_prefix='FULLTEXT')
return metadata return metadata

View file

@ -1,6 +1,7 @@
from data.database import Team, TeamMember, TeamRole, User, TeamMemberInvite, RepositoryPermission from data.database import Team, TeamMember, TeamRole, User, TeamMemberInvite, RepositoryPermission
from data.model import (DataModelException, InvalidTeamException, UserAlreadyInTeam, from data.model import (DataModelException, InvalidTeamException, UserAlreadyInTeam,
InvalidTeamMemberException, user, _basequery) InvalidTeamMemberException, user, _basequery)
from data.text import prefix_search
from util.validation import validate_username from util.validation import validate_username
from peewee import fn, JOIN_LEFT_OUTER from peewee import fn, JOIN_LEFT_OUTER
from util.morecollections import AttrDict from util.morecollections import AttrDict
@ -137,7 +138,7 @@ def add_or_invite_to_team(inviter, team, user_obj=None, email=None, requires_inv
def get_matching_user_teams(team_prefix, user_obj, limit=10): def get_matching_user_teams(team_prefix, user_obj, limit=10):
team_prefix_search = _basequery.prefix_search(Team.name, team_prefix) team_prefix_search = prefix_search(Team.name, team_prefix)
query = (Team query = (Team
.select() .select()
.join(User) .join(User)
@ -163,7 +164,7 @@ def get_organization_team(orgname, teamname):
def get_matching_admined_teams(team_prefix, user_obj, limit=10): def get_matching_admined_teams(team_prefix, user_obj, limit=10):
team_prefix_search = _basequery.prefix_search(Team.name, team_prefix) team_prefix_search = prefix_search(Team.name, team_prefix)
admined_orgs = (_basequery.get_user_organizations(user_obj.username) admined_orgs = (_basequery.get_user_organizations(user_obj.username)
.switch(Team) .switch(Team)
.join(TeamRole) .join(TeamRole)
@ -182,7 +183,7 @@ def get_matching_admined_teams(team_prefix, user_obj, limit=10):
def get_matching_teams(team_prefix, organization): def get_matching_teams(team_prefix, organization):
team_prefix_search = _basequery.prefix_search(Team.name, team_prefix) team_prefix_search = prefix_search(Team.name, team_prefix)
query = Team.select().where(team_prefix_search, Team.organization == organization) query = Team.select().where(team_prefix_search, Team.organization == organization)
return query.limit(10) return query.limit(10)

View file

@ -18,6 +18,7 @@ from data.model import (DataModelException, InvalidPasswordException, InvalidRob
InvalidUsernameException, InvalidEmailAddressException, InvalidUsernameException, InvalidEmailAddressException,
TooManyLoginAttemptsException, db_transaction, TooManyLoginAttemptsException, db_transaction,
notification, config, repository, _basequery) notification, config, repository, _basequery)
from data.text import prefix_search
from util.names import format_robot_username, parse_robot_username from util.names import format_robot_username, parse_robot_username
from util.validation import (validate_username, validate_email, validate_password, from util.validation import (validate_username, validate_email, validate_password,
INVALID_PASSWORD_MESSAGE) INVALID_PASSWORD_MESSAGE)
@ -259,10 +260,10 @@ def get_matching_robots(name_prefix, username, limit=10):
prefix_checks = False prefix_checks = False
for org in admined_orgs: for org in admined_orgs:
org_search = _basequery.prefix_search(User.username, org.username + '+' + name_prefix) org_search = prefix_search(User.username, org.username + '+' + name_prefix)
prefix_checks = prefix_checks | org_search prefix_checks = prefix_checks | org_search
user_search = _basequery.prefix_search(User.username, username + '+' + name_prefix) user_search = prefix_search(User.username, username + '+' + name_prefix)
prefix_checks = prefix_checks | user_search prefix_checks = prefix_checks | user_search
return User.select().where(prefix_checks).limit(limit) return User.select().where(prefix_checks).limit(limit)
@ -562,7 +563,7 @@ def get_user_or_org_by_customer_id(customer_id):
def get_matching_user_namespaces(namespace_prefix, username, limit=10): def get_matching_user_namespaces(namespace_prefix, username, limit=10):
namespace_search = _basequery.prefix_search(Namespace.username, namespace_prefix) namespace_search = prefix_search(Namespace.username, namespace_prefix)
base_query = (Namespace base_query = (Namespace
.select() .select()
.distinct() .distinct()
@ -573,12 +574,12 @@ def get_matching_user_namespaces(namespace_prefix, username, limit=10):
return _basequery.filter_to_repos_for_user(base_query, username).limit(limit) return _basequery.filter_to_repos_for_user(base_query, username).limit(limit)
def get_matching_users(username_prefix, robot_namespace=None, organization=None, limit=20): def get_matching_users(username_prefix, robot_namespace=None, organization=None, limit=20):
user_search = _basequery.prefix_search(User.username, username_prefix) user_search = prefix_search(User.username, username_prefix)
direct_user_query = (user_search & (User.organization == False) & (User.robot == False)) direct_user_query = (user_search & (User.organization == False) & (User.robot == False))
if robot_namespace: if robot_namespace:
robot_prefix = format_robot_username(robot_namespace, username_prefix) robot_prefix = format_robot_username(robot_namespace, username_prefix)
robot_search = _basequery.prefix_search(User.username, robot_prefix) robot_search = prefix_search(User.username, robot_prefix)
direct_user_query = ((robot_search & (User.robot == True)) | direct_user_query) direct_user_query = ((robot_search & (User.robot == True)) | direct_user_query)
query = (User query = (User

38
data/text.py Normal file
View file

@ -0,0 +1,38 @@
from peewee import Clause, SQL, fn, TextField, Field
def _escape_wildcard(search_query):
""" Escapes the wildcards found in the given search query so that they are treated as *characters*
rather than wildcards when passed to a LIKE or ILIKE clause with an ESCAPE '!'.
"""
search_query = (search_query
.replace('!', '!!')
.replace('%', '!%')
.replace('_', '!_')
.replace('[', '!['))
return search_query
def prefix_search(field, prefix_query):
  """ Returns the wildcard match for searching for the given prefix query. """
  # Escape wildcard characters, then anchor the pattern at the start of the value.
  pattern = _escape_wildcard(prefix_query) + '%'
  return Field.__pow__(field, Clause(pattern, SQL("ESCAPE '!'")))
def match_mysql(field, search_query):
  """ Generates a full-text match query using a Match operation, which is needed for MySQL.

      The field name is interpolated into raw SQL (backtick-quoted), so it is guarded
      against containing a backtick; the search query itself is passed as a bound
      parameter.
  """
  # Idiomatic membership test instead of `.find(...) >= 0`. Just to be safe.
  if '`' in field.name:
    raise Exception("How did field name '%s' end up containing a backtick?" % field.name)

  return Clause(fn.MATCH(SQL("`%s`" % field.name)), fn.AGAINST(SQL('%s', search_query)),
                parens=True)
def match_like(field, search_query):
  """ Generates a full-text match query using an ILIKE operation, which is needed for SQLite and
      Postgres.
  """
  # Wrap the escaped query in wildcards so it matches anywhere within the field's
  # value; ESCAPE '!' pairs with the escaping done by _escape_wildcard.
  pattern = '%' + _escape_wildcard(search_query) + '%'
  return Field.__pow__(field, Clause(pattern, SQL("ESCAPE '!'")))

View file

@ -568,6 +568,11 @@ def populate_database(minimal=False, with_storage=False):
[(new_user_2, 'write'), (reader, 'read')], [(new_user_2, 'write'), (reader, 'read')],
(5, [], 'latest')) (5, [], 'latest'))
__generate_repository(with_storage, new_user_1, 'text-full-repo',
'This is a repository for testing text search', False,
[(new_user_2, 'write'), (reader, 'read')],
(5, [], 'latest'))
building = __generate_repository(with_storage, new_user_1, 'building', building = __generate_repository(with_storage, new_user_1, 'building',
'Empty repository which is building.', 'Empty repository which is building.',
False, [], (0, [], None)) False, [], (0, [], None))

View file

@ -60,7 +60,7 @@ redis
redlock redlock
reportlab==2.7 reportlab==2.7
semantic-version semantic-version
sqlalchemy sqlalchemy==1.1.5
stringscore stringscore
stripe stripe
toposort toposort

View file

@ -109,7 +109,7 @@ requests-oauthlib==0.7.0
rfc3986==0.4.1 rfc3986==0.4.1
semantic-version==2.6.0 semantic-version==2.6.0
six==1.10.0 six==1.10.0
SQLAlchemy==1.1.2 SQLAlchemy==1.1.5
stevedore==1.17.1 stevedore==1.17.1
stringscore==0.1.0 stringscore==0.1.0
stripe==1.41.0 stripe==1.41.0

View file

@ -1001,6 +1001,23 @@ class TestConductSearch(ApiTestCase):
self.assertEquals(json['results'][0]['name'], 'shared') self.assertEquals(json['results'][0]['name'], 'shared')
def test_full_text(self):
  self.login(ADMIN_ACCESS_USER)

  # The `text-full-repo` repository should be found both via a token from its name
  # (`full`) and via words that appear only in its description (`text search`).
  for search_query in ('full', 'text search'):
    json = self.getJsonResponse(ConductSearch,
                                params=dict(query=search_query))
    self.assertEquals(1, len(json['results']))
    self.assertEquals(json['results'][0]['kind'], 'repository')
    self.assertEquals(json['results'][0]['name'], 'text-full-repo')
class TestGetMatchingEntities(ApiTestCase): class TestGetMatchingEntities(ApiTestCase):
def test_simple_lookup(self): def test_simple_lookup(self):
self.login(ADMIN_ACCESS_USER) self.login(ADMIN_ACCESS_USER)