Add a RepositoryActionCount table so we can use it (instead of LogEntry) when scoring repo search results

This commit is contained in:
Joseph Schorr 2015-04-13 13:31:07 -04:00
parent 703f48f194
commit 3f1e8f3c27
8 changed files with 137 additions and 19 deletions

View file

@ -0,0 +1,2 @@
#!/bin/sh
exec logger -i -t repositoryactioncounter

View file

@ -0,0 +1,8 @@
#! /bin/bash
echo 'Starting repository action count worker'
cd /
venv/bin/python -m workers.repositoryactioncounter 2>&1
echo 'Repository action worker exited'

View file

@ -299,7 +299,7 @@ class Repository(BaseModel):
# Therefore, we define our own deletion order here and use the dependency system to verify it.
ordered_dependencies = [RepositoryAuthorizedEmail, RepositoryTag, Image, LogEntry,
RepositoryBuild, RepositoryBuildTrigger, RepositoryNotification,
RepositoryPermission, AccessToken, Star]
RepositoryPermission, AccessToken, Star, RepositoryActionCount]
for query, fk in self.dependencies(search_nullable=True):
model = fk.model_class
@ -560,6 +560,20 @@ class LogEntry(BaseModel):
metadata_json = TextField(default='{}')
class RepositoryActionCount(BaseModel):
repository = ForeignKeyField(Repository, index=True)
count = IntegerField()
date = DateField(index=True)
class Meta:
database = db
read_slaves = (read_slave,)
indexes = (
# create a unique index on repository and date
(('repository', 'date'), True),
)
class OAuthApplication(BaseModel):
client_id = CharField(index=True, default=random_string_generator(length=20))
client_secret = CharField(default=random_string_generator(length=40))
@ -645,4 +659,4 @@ all_models = [User, Repository, Image, AccessToken, Role, RepositoryPermission,
ExternalNotificationEvent, ExternalNotificationMethod, RepositoryNotification,
RepositoryAuthorizedEmail, ImageStorageTransformation, DerivedImageStorage,
TeamMemberInvite, ImageStorageSignature, ImageStorageSignatureKind,
AccessTokenKind, Star]
AccessTokenKind, Star, RepositoryActionCount]

View file

@ -0,0 +1,36 @@
"""Add RepositoryActionCount table
Revision ID: 30c044b75632
Revises: 2b4dc0818a5e
Create Date: 2015-04-13 13:21:18.159602
"""
# revision identifiers, used by Alembic.
revision = '30c044b75632'
down_revision = '2b4dc0818a5e'
from alembic import op
import sqlalchemy as sa
def upgrade(tables):
### commands auto generated by Alembic - please adjust! ###
op.create_table('repositoryactioncount',
sa.Column('id', sa.Integer(), nullable=False),
sa.Column('repository_id', sa.Integer(), nullable=False),
sa.Column('count', sa.Integer(), nullable=False),
sa.Column('date', sa.Date(), nullable=False),
sa.ForeignKeyConstraint(['repository_id'], ['repository.id'], name=op.f('fk_repositoryactioncount_repository_id_repository')),
sa.PrimaryKeyConstraint('id', name=op.f('pk_repositoryactioncount'))
)
op.create_index('repositoryactioncount_date', 'repositoryactioncount', ['date'], unique=False)
op.create_index('repositoryactioncount_repository_id', 'repositoryactioncount', ['repository_id'], unique=False)
op.create_index('repositoryactioncount_repository_id_date', 'repositoryactioncount', ['repository_id', 'date'], unique=True)
### end Alembic commands ###
def downgrade(tables):
### commands auto generated by Alembic - please adjust! ###
op.drop_table('repositoryactioncount')
### end Alembic commands ###

View file

@ -18,7 +18,7 @@ from data.database import (User, Repository, Image, AccessToken, Role, Repositor
DerivedImageStorage, ImageStorageTransformation, random_string_generator,
db, BUILD_PHASE, QuayUserField, ImageStorageSignature, QueueItem,
ImageStorageSignatureKind, validate_database_url, db_for_update,
AccessTokenKind, Star, get_epoch_timestamp)
AccessTokenKind, Star, get_epoch_timestamp, RepositoryActionCount)
from peewee import JOIN_LEFT_OUTER, fn
from util.validation import (validate_username, validate_email, validate_password,
INVALID_PASSWORD_MESSAGE)
@ -995,20 +995,19 @@ def get_sorted_matching_repositories(prefix, only_public, checker, limit=10):
""" Returns repositories matching the given prefix string and passing the given checker
function.
"""
last_week = datetime.now() - timedelta(weeks=1)
results = []
existing_ids = []
def get_search_results(search_clause, with_count):
def get_search_results(search_clause, with_count=False):
if len(results) >= limit:
return
selected = [Repository, Namespace]
select_items = [Repository, Namespace]
if with_count:
selected.append(fn.Count(LogEntry.id).alias('count'))
select_items.append(fn.Sum(RepositoryActionCount.count).alias('count'))
query = (Repository.select(*selected)
query = (Repository.select(*select_items)
.join(Namespace, JOIN_LEFT_OUTER, on=(Namespace.id == Repository.namespace_user))
.switch(Repository)
.where(search_clause)
@ -1021,9 +1020,10 @@ def get_sorted_matching_repositories(prefix, only_public, checker, limit=10):
query = query.where(~(Repository.id << existing_ids))
if with_count:
query = (query.join(LogEntry, JOIN_LEFT_OUTER)
.where(LogEntry.datetime >= last_week)
.order_by(fn.Count(LogEntry.id).desc()))
query = (query.switch(Repository)
.join(RepositoryActionCount)
.where(RepositoryActionCount.date >= last_week)
.order_by(fn.Sum(RepositoryActionCount.count).desc()))
for result in query:
if len(results) >= limit:
@ -1042,13 +1042,13 @@ def get_sorted_matching_repositories(prefix, only_public, checker, limit=10):
existing_ids.append(result.id)
# For performance reasons, we conduct the repo name and repo namespace searches on their
# own, and with and without counts on their own. This also affords us the ability to give
# higher precedence to repository names matching over namespaces, which is semantically correct.
get_search_results((Repository.name ** (prefix + '%')), with_count=True)
get_search_results((Repository.name ** (prefix + '%')), with_count=False)
# own. This also affords us the ability to give higher precedence to repository names matching
# over namespaces, which is semantically correct.
get_search_results(Repository.name ** (prefix + '%'), with_count=True)
get_search_results(Repository.name ** (prefix + '%'), with_count=False)
get_search_results((Namespace.username ** (prefix + '%')), with_count=True)
get_search_results((Namespace.username ** (prefix + '%')), with_count=False)
get_search_results(Namespace.username ** (prefix + '%'), with_count=True)
get_search_results(Namespace.username ** (prefix + '%'), with_count=False)
return results

View file

@ -1,7 +1,7 @@
from sqlalchemy import (Table, MetaData, Column, ForeignKey, Integer, String, Boolean, Text,
DateTime, BigInteger, Index)
DateTime, Date, BigInteger, Index)
from peewee import (PrimaryKeyField, CharField, BooleanField, DateTimeField, TextField,
ForeignKeyField, BigIntegerField, IntegerField)
ForeignKeyField, BigIntegerField, IntegerField, DateField)
OPTIONS_TO_COPY = [
@ -42,6 +42,8 @@ def gen_sqlalchemy_metadata(peewee_model_list):
alchemy_type = Boolean
elif isinstance(field, DateTimeField):
alchemy_type = DateTime
elif isinstance(field, DateField):
alchemy_type = Date
elif isinstance(field, TextField):
alchemy_type = Text
elif isinstance(field, ForeignKeyField):

View file

@ -16,6 +16,8 @@ from data import model
from data.model import oauth
from app import app, storage as store
from workers import repositoryactioncounter
logger = logging.getLogger(__name__)
@ -582,6 +584,9 @@ def populate_database():
'trigger_id': trigger.uuid, 'config': json.loads(trigger.config),
'service': trigger.service.name})
while repositoryactioncounter.count_repository_actions():
pass
if __name__ == '__main__':
log_level = getattr(logging, app.config['LOGGING_LEVEL'])
logging.basicConfig(level=log_level)

View file

@ -0,0 +1,51 @@
import logging
from apscheduler.schedulers.blocking import BlockingScheduler
from data.database import Repository, LogEntry, RepositoryActionCount, db_random_func, fn
from datetime import date, datetime, timedelta
POLL_PERIOD_SECONDS = 30
logger = logging.getLogger(__name__)
sched = BlockingScheduler()
@sched.scheduled_job(trigger='interval', seconds=10)
def count_repository_actions():
""" Counts actions for a random repository for the previous day. """
try:
# Get a random repository to count.
today = date.today()
yesterday = today - timedelta(days=1)
has_yesterday_actions = (RepositoryActionCount.select(RepositoryActionCount.repository)
.where(RepositoryActionCount.date == yesterday))
to_count = (Repository.select()
.where(~(Repository.id << (has_yesterday_actions)))
.order_by(db_random_func()).get())
logger.debug('Counting: %s', to_count.id)
actions = (LogEntry.select()
.where(LogEntry.repository == to_count,
LogEntry.datetime >= yesterday,
LogEntry.datetime < today)
.count())
# Create the row.
try:
RepositoryActionCount.create(repository=to_count, date=yesterday, count=actions)
except:
logger.exception('Exception when writing count')
return True
except Repository.DoesNotExist:
logger.debug('No further repositories to count')
return False
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)
sched.start()