Merge pull request #2272 from coreos-inc/fulltext-search

Full text search support in Quay
This commit is contained in:
josephschorr 2017-01-31 11:51:47 -05:00 committed by GitHub
commit 356530110c
14 changed files with 193 additions and 38 deletions

View file

@ -21,7 +21,9 @@ from sqlalchemy.engine.url import make_url
import resumablehashlib
from data.fields import ResumableSHA256Field, ResumableSHA1Field, JSONField, Base64BinaryField
from data.fields import (ResumableSHA256Field, ResumableSHA1Field, JSONField, Base64BinaryField,
FullIndexedTextField, FullIndexedCharField)
from data.text import match_mysql, match_like
from data.read_slave import ReadSlaveModel
from util.names import urn_generator
@ -30,10 +32,12 @@ logger = logging.getLogger(__name__)
DEFAULT_DB_CONNECT_TIMEOUT = 10 # seconds
# IMAGE_NOT_SCANNED_ENGINE_VERSION is the version found in security_indexed_engine when the
# image has not yet been scanned.
IMAGE_NOT_SCANNED_ENGINE_VERSION = -1
_SCHEME_DRIVERS = {
'mysql': MySQLDatabase,
'mysql+pymysql': MySQLDatabase,
@ -42,6 +46,16 @@ _SCHEME_DRIVERS = {
'postgresql+psycopg2': PostgresqlDatabase,
}
SCHEME_MATCH_FUNCTION = {
'mysql': match_mysql,
'mysql+pymysql': match_mysql,
'sqlite': match_like,
'postgresql': match_like,
'postgresql+psycopg2': match_like,
}
SCHEME_RANDOM_FUNCTION = {
'mysql': fn.Rand,
'mysql+pymysql': fn.Rand,
@ -50,6 +64,7 @@ SCHEME_RANDOM_FUNCTION = {
'postgresql+psycopg2': fn.Random,
}
def pipes_concat(arg1, arg2, *extra_args):
""" Concat function for sqlite, since it doesn't support fn.Concat.
Concatenates clauses with || characters.
@ -211,6 +226,7 @@ class TupleSelector(object):
db = Proxy()
read_slave = Proxy()
db_random_func = CallableProxy()
db_match_func = CallableProxy()
db_for_update = CallableProxy()
db_transaction = CallableProxy()
db_concat_func = CallableProxy()
@ -257,6 +273,7 @@ def configure(config_object):
parsed_write_uri = make_url(write_db_uri)
db_random_func.initialize(SCHEME_RANDOM_FUNCTION[parsed_write_uri.drivername])
db_match_func.initialize(SCHEME_MATCH_FUNCTION[parsed_write_uri.drivername])
db_for_update.initialize(SCHEME_SPECIALIZED_FOR_UPDATE.get(parsed_write_uri.drivername,
real_for_update))
db_concat_func.initialize(SCHEME_SPECIALIZED_CONCAT.get(parsed_write_uri.drivername,
@ -471,9 +488,9 @@ class Visibility(BaseModel):
class Repository(BaseModel):
namespace_user = QuayUserField(null=True)
name = CharField()
name = FullIndexedCharField(match_function=db_match_func)
visibility = ForeignKeyField(Visibility)
description = TextField(null=True)
description = FullIndexedTextField(match_function=db_match_func, null=True)
badge_token = CharField(default=uuid_generator)
class Meta:

View file

@ -2,7 +2,8 @@ import base64
import resumablehashlib
import json
from peewee import TextField
from peewee import TextField, CharField, Clause
from data.text import prefix_search
class _ResumableSHAField(TextField):
@ -64,3 +65,44 @@ class Base64BinaryField(TextField):
if value is None:
return None
return base64.b64decode(value)
def _add_fulltext(field_class):
""" Adds support for full text indexing and lookup to the given field class. """
class indexed_class(field_class):
# Marker used by SQLAlchemy translation layer to add the proper index for full text searching.
__fulltext__ = True
def __init__(self, match_function, *args, **kwargs):
field_class.__init__(self, *args, **kwargs)
self.match_function = match_function
def match(self, query):
return self.match_function(self, query)
def match_prefix(self, query):
return prefix_search(self, query)
def __mod__(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
def __pow__(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
def __contains__(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
def contains(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
def startswith(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
def endswith(self, _):
raise Exception('Unsafe operation: Use `match` or `match_prefix`')
return indexed_class
# Concrete full-text-searchable variants of peewee's CharField and TextField,
# built via the _add_fulltext class factory above.
FullIndexedCharField = _add_fulltext(CharField)
FullIndexedTextField = _add_fulltext(TextField)

View file

@ -0,0 +1,31 @@
"""Add full text search indexing for repo name and description
Revision ID: e2894a3a3c19
Revises: d42c175b439a
Create Date: 2017-01-11 13:55:54.890774
"""
# revision identifiers, used by Alembic.
revision = 'e2894a3a3c19'
down_revision = 'd42c175b439a'
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import mysql
def upgrade(tables):
    """ Adds full text indexes over the repository name and description columns. """
    # Postgres needs the pg_trgm extension for the trigram-based GIN indexes below.
    if op.get_bind().engine.name == 'postgresql':
        op.execute('CREATE EXTENSION IF NOT EXISTS pg_trgm')

    # ### commands auto generated by Alembic - please adjust! ###
    for column in ('description', 'name'):
        op.create_index('repository_%s__fulltext' % column, 'repository', [column],
                        unique=False, postgresql_using='gin',
                        postgresql_ops={column: 'gin_trgm_ops'}, mysql_prefix='FULLTEXT')
    # ### end Alembic commands ###
def downgrade(tables):
    """ Removes the full text indexes added by this revision. """
    # ### commands auto generated by Alembic - please adjust! ###
    for index_name in ('repository_name__fulltext', 'repository_description__fulltext'):
        op.drop_index(index_name, table_name='repository')
    # ### end Alembic commands ###

View file

@ -1,4 +1,4 @@
from peewee import Clause, SQL, fn
from peewee import fn
from cachetools import lru_cache
from data.model import DataModelException
@ -6,18 +6,6 @@ from data.database import (Repository, User, Team, TeamMember, RepositoryPermiss
Namespace, Visibility, ImageStorage, Image, db_for_update)
def prefix_search(field, prefix_query):
    """ Returns the wildcard match for searching for the given prefix query. """
    # Escape the known wildcard characters so LIKE treats them literally. '!' is
    # escaped first so the escapes added below are not themselves double-escaped.
    prefix_query = (prefix_query
                    .replace('!', '!!')
                    .replace('%', '!%')
                    .replace('_', '!_')
                    .replace('[', '!['))

    # peewee's `**` operator is a case-insensitive LIKE; the ESCAPE '!' SQL makes
    # '!' the escape character used above.
    return field ** Clause(prefix_query + '%', SQL("ESCAPE '!'"))
def get_existing_repository(namespace_name, repository_name, for_update=False):
query = (Repository
.select(Repository, Namespace)

View file

@ -4,7 +4,7 @@ from cachetools import lru_cache
from data.database import Label, TagManifestLabel, MediaType, LabelSourceType, db_transaction
from data.model import InvalidLabelKeyException, InvalidMediaTypeException, DataModelException
from data.model._basequery import prefix_search
from data.text import prefix_search
from util.validation import validate_label_key
from util.validation import is_json

View file

@ -12,6 +12,7 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Im
Role, RepositoryAuthorizedEmail, TagManifest, DerivedStorageForImage,
Label, TagManifestLabel, db_for_update, get_epoch_timestamp,
db_random_func, db_concat_func)
from data.text import prefix_search
logger = logging.getLogger(__name__)
@ -318,8 +319,8 @@ def get_visible_repositories(username, namespace=None, include_public=False, sta
return query
def get_sorted_matching_repositories(prefix, only_public, checker, limit=10):
""" Returns repositories matching the given prefix string and passing the given checker
def get_sorted_matching_repositories(lookup_value, only_public, checker, limit=10):
""" Returns repositories matching the given lookup string and passing the given checker
function.
"""
last_week = datetime.now() - timedelta(weeks=1)
@ -370,14 +371,16 @@ def get_sorted_matching_repositories(prefix, only_public, checker, limit=10):
results.append(result)
existing_ids.append(result.id)
# For performance reasons, we conduct the repo name and repo namespace searches on their
# own. This also affords us the ability to give higher precedence to repository names matching
# over namespaces, which is semantically correct.
get_search_results(_basequery.prefix_search(Repository.name, prefix), with_count=True)
get_search_results(_basequery.prefix_search(Repository.name, prefix), with_count=False)
# For performance reasons, we conduct each set of searches on their own. This also affords us the
# ability to easily define an order precedence.
get_search_results(Repository.name.match(lookup_value), with_count=True)
get_search_results(Repository.name.match(lookup_value), with_count=False)
get_search_results(_basequery.prefix_search(Namespace.username, prefix), with_count=True)
get_search_results(_basequery.prefix_search(Namespace.username, prefix), with_count=False)
get_search_results(Repository.description.match(lookup_value), with_count=True)
get_search_results(Repository.description.match(lookup_value), with_count=False)
get_search_results(prefix_search(Namespace.username, lookup_value), with_count=True)
get_search_results(prefix_search(Namespace.username, lookup_value), with_count=False)
return results

View file

@ -1,5 +1,5 @@
from sqlalchemy import (Table, MetaData, Column, ForeignKey, Integer, String, Boolean, Text,
DateTime, Date, BigInteger, Index)
DateTime, Date, BigInteger, Index, text)
from peewee import (PrimaryKeyField, CharField, BooleanField, DateTimeField, TextField,
ForeignKeyField, BigIntegerField, IntegerField, DateField)
@ -28,6 +28,7 @@ def gen_sqlalchemy_metadata(peewee_model_list):
meta = model._meta
all_indexes = set(meta.indexes)
fulltext_indexes = []
columns = []
for field in meta.sorted_fields:
@ -60,6 +61,10 @@ def gen_sqlalchemy_metadata(peewee_model_list):
else:
raise RuntimeError('Unknown column type: %s' % field)
if hasattr(field, '__fulltext__'):
# Add the fulltext index for the field, based on whether we are under MySQL or Postgres.
fulltext_indexes.append(field.name)
for option_name in OPTIONS_TO_COPY:
alchemy_option_name = (OPTION_TRANSLATIONS[option_name]
if option_name in OPTION_TRANSLATIONS else option_name)
@ -81,4 +86,11 @@ def gen_sqlalchemy_metadata(peewee_model_list):
col_refs = [getattr(new_table.c, col_name) for col_name in col_names]
Index(index_name, *col_refs, unique=unique)
for col_field_name in fulltext_indexes:
index_name = '%s_%s__fulltext' % (meta.db_table, col_field_name)
col_ref = getattr(new_table.c, col_field_name)
Index(index_name, col_ref, postgresql_ops={col_field_name: 'gin_trgm_ops'},
postgresql_using='gin',
mysql_prefix='FULLTEXT')
return metadata

View file

@ -1,6 +1,7 @@
from data.database import Team, TeamMember, TeamRole, User, TeamMemberInvite, RepositoryPermission
from data.model import (DataModelException, InvalidTeamException, UserAlreadyInTeam,
InvalidTeamMemberException, user, _basequery)
from data.text import prefix_search
from util.validation import validate_username
from peewee import fn, JOIN_LEFT_OUTER
from util.morecollections import AttrDict
@ -137,7 +138,7 @@ def add_or_invite_to_team(inviter, team, user_obj=None, email=None, requires_inv
def get_matching_user_teams(team_prefix, user_obj, limit=10):
team_prefix_search = _basequery.prefix_search(Team.name, team_prefix)
team_prefix_search = prefix_search(Team.name, team_prefix)
query = (Team
.select()
.join(User)
@ -163,7 +164,7 @@ def get_organization_team(orgname, teamname):
def get_matching_admined_teams(team_prefix, user_obj, limit=10):
team_prefix_search = _basequery.prefix_search(Team.name, team_prefix)
team_prefix_search = prefix_search(Team.name, team_prefix)
admined_orgs = (_basequery.get_user_organizations(user_obj.username)
.switch(Team)
.join(TeamRole)
@ -182,7 +183,7 @@ def get_matching_admined_teams(team_prefix, user_obj, limit=10):
def get_matching_teams(team_prefix, organization):
team_prefix_search = _basequery.prefix_search(Team.name, team_prefix)
team_prefix_search = prefix_search(Team.name, team_prefix)
query = Team.select().where(team_prefix_search, Team.organization == organization)
return query.limit(10)

View file

@ -18,6 +18,7 @@ from data.model import (DataModelException, InvalidPasswordException, InvalidRob
InvalidUsernameException, InvalidEmailAddressException,
TooManyLoginAttemptsException, db_transaction,
notification, config, repository, _basequery)
from data.text import prefix_search
from util.names import format_robot_username, parse_robot_username
from util.validation import (validate_username, validate_email, validate_password,
INVALID_PASSWORD_MESSAGE)
@ -259,10 +260,10 @@ def get_matching_robots(name_prefix, username, limit=10):
prefix_checks = False
for org in admined_orgs:
org_search = _basequery.prefix_search(User.username, org.username + '+' + name_prefix)
org_search = prefix_search(User.username, org.username + '+' + name_prefix)
prefix_checks = prefix_checks | org_search
user_search = _basequery.prefix_search(User.username, username + '+' + name_prefix)
user_search = prefix_search(User.username, username + '+' + name_prefix)
prefix_checks = prefix_checks | user_search
return User.select().where(prefix_checks).limit(limit)
@ -562,7 +563,7 @@ def get_user_or_org_by_customer_id(customer_id):
def get_matching_user_namespaces(namespace_prefix, username, limit=10):
namespace_search = _basequery.prefix_search(Namespace.username, namespace_prefix)
namespace_search = prefix_search(Namespace.username, namespace_prefix)
base_query = (Namespace
.select()
.distinct()
@ -573,12 +574,12 @@ def get_matching_user_namespaces(namespace_prefix, username, limit=10):
return _basequery.filter_to_repos_for_user(base_query, username).limit(limit)
def get_matching_users(username_prefix, robot_namespace=None, organization=None, limit=20):
user_search = _basequery.prefix_search(User.username, username_prefix)
user_search = prefix_search(User.username, username_prefix)
direct_user_query = (user_search & (User.organization == False) & (User.robot == False))
if robot_namespace:
robot_prefix = format_robot_username(robot_namespace, username_prefix)
robot_search = _basequery.prefix_search(User.username, robot_prefix)
robot_search = prefix_search(User.username, robot_prefix)
direct_user_query = ((robot_search & (User.robot == True)) | direct_user_query)
query = (User

38
data/text.py Normal file
View file

@ -0,0 +1,38 @@
from peewee import Clause, SQL, fn, TextField, Field
def _escape_wildcard(search_query):
""" Escapes the wildcards found in the given search query so that they are treated as *characters*
rather than wildcards when passed to a LIKE or ILIKE clause with an ESCAPE '!'.
"""
search_query = (search_query
.replace('!', '!!')
.replace('%', '!%')
.replace('_', '!_')
.replace('[', '!['))
return search_query
def prefix_search(field, prefix_query):
    """ Returns the wildcard match for searching for the given prefix query. """
    # Escape wildcard characters in the query, then anchor with a trailing '%'.
    # The ESCAPE '!' SQL marks '!' as the escape character used by _escape_wildcard.
    pattern = _escape_wildcard(prefix_query) + '%'
    return Field.__pow__(field, Clause(pattern, SQL("ESCAPE '!'")))
def match_mysql(field, search_query):
    """ Generates a full-text match query using a Match operation, which is needed for MySQL.
    """
    # The column name is interpolated into raw SQL below, so refuse anything that
    # could break out of the backtick quoting. Just to be safe.
    if '`' in field.name:
        raise Exception("How did field name '%s' end up containing a backtick?" % field.name)

    column = SQL("`%s`" % field.name)
    # The search query itself is passed as a bound parameter, never interpolated.
    return Clause(fn.MATCH(column), fn.AGAINST(SQL('%s', search_query)), parens=True)
def match_like(field, search_query):
    """ Generates a full-text match query using an ILIKE operation, which is needed for SQLite and
        Postgres.
    """
    # Surround the escaped query with '%' wildcards for a substring match;
    # Field.__pow__ is peewee's (I)LIKE operator, and the ESCAPE '!' clause
    # matches the escape character produced by _escape_wildcard.
    pattern = '%%%s%%' % _escape_wildcard(search_query)
    return Field.__pow__(field, Clause(pattern, SQL("ESCAPE '!'")))

View file

@ -568,6 +568,11 @@ def populate_database(minimal=False, with_storage=False):
[(new_user_2, 'write'), (reader, 'read')],
(5, [], 'latest'))
__generate_repository(with_storage, new_user_1, 'text-full-repo',
'This is a repository for testing text search', False,
[(new_user_2, 'write'), (reader, 'read')],
(5, [], 'latest'))
building = __generate_repository(with_storage, new_user_1, 'building',
'Empty repository which is building.',
False, [], (0, [], None))

View file

@ -60,7 +60,7 @@ redis
redlock
reportlab==2.7
semantic-version
sqlalchemy
sqlalchemy==1.1.5
stringscore
stripe
toposort

View file

@ -109,7 +109,7 @@ requests-oauthlib==0.7.0
rfc3986==0.4.1
semantic-version==2.6.0
six==1.10.0
SQLAlchemy==1.1.2
SQLAlchemy==1.1.5
stevedore==1.17.1
stringscore==0.1.0
stripe==1.41.0

View file

@ -1001,6 +1001,23 @@ class TestConductSearch(ApiTestCase):
self.assertEquals(json['results'][0]['name'], 'shared')
def test_full_text(self):
    self.login(ADMIN_ACCESS_USER)

    # Both a repo-name token ('full') and a description phrase ('text search')
    # should surface the `text-full-repo` repository and nothing else.
    for search_query in ('full', 'text search'):
        json = self.getJsonResponse(ConductSearch,
                                    params=dict(query=search_query))
        self.assertEquals(1, len(json['results']))
        self.assertEquals(json['results'][0]['kind'], 'repository')
        self.assertEquals(json['results'][0]['name'], 'text-full-repo')
class TestGetMatchingEntities(ApiTestCase):
def test_simple_lookup(self):
self.login(ADMIN_ACCESS_USER)