From 3f1e8f3c27f83972431ab5e35e4ba2f00b6b030e Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Mon, 13 Apr 2015 13:31:07 -0400 Subject: [PATCH] Add a RepositoryActionCount table so we can use it (instead of LogEntry) when scoring repo search results --- .../service/repositoryactioncounter/log/run | 2 + conf/init/service/repositoryactioncounter/run | 8 +++ data/database.py | 18 ++++++- ...4b75632_add_repositoryactioncount_table.py | 36 +++++++++++++ data/model/legacy.py | 30 +++++------ data/model/sqlalchemybridge.py | 6 ++- initdb.py | 5 ++ workers/repositoryactioncounter.py | 51 +++++++++++++++++++ 8 files changed, 137 insertions(+), 19 deletions(-) create mode 100755 conf/init/service/repositoryactioncounter/log/run create mode 100755 conf/init/service/repositoryactioncounter/run create mode 100644 data/migrations/versions/30c044b75632_add_repositoryactioncount_table.py create mode 100644 workers/repositoryactioncounter.py diff --git a/conf/init/service/repositoryactioncounter/log/run b/conf/init/service/repositoryactioncounter/log/run new file mode 100755 index 000000000..d86d5766f --- /dev/null +++ b/conf/init/service/repositoryactioncounter/log/run @@ -0,0 +1,2 @@ +#!/bin/sh +exec logger -i -t repositoryactioncounter \ No newline at end of file diff --git a/conf/init/service/repositoryactioncounter/run b/conf/init/service/repositoryactioncounter/run new file mode 100755 index 000000000..08e0e3164 --- /dev/null +++ b/conf/init/service/repositoryactioncounter/run @@ -0,0 +1,8 @@ +#! /bin/bash + +echo 'Starting repository action count worker' + +cd / +venv/bin/python -m workers.repositoryactioncounter 2>&1 + +echo 'Repository action worker exited' \ No newline at end of file diff --git a/data/database.py b/data/database.py index 8bc0488a7..b039cf099 100644 --- a/data/database.py +++ b/data/database.py @@ -299,7 +299,7 @@ class Repository(BaseModel): # Therefore, we define our own deletion order here and use the dependency system to verify it. 
ordered_dependencies = [RepositoryAuthorizedEmail, RepositoryTag, Image, LogEntry, RepositoryBuild, RepositoryBuildTrigger, RepositoryNotification, - RepositoryPermission, AccessToken, Star] + RepositoryPermission, AccessToken, Star, RepositoryActionCount] for query, fk in self.dependencies(search_nullable=True): model = fk.model_class @@ -560,6 +560,20 @@ class LogEntry(BaseModel): metadata_json = TextField(default='{}') +class RepositoryActionCount(BaseModel): + repository = ForeignKeyField(Repository, index=True) + count = IntegerField() + date = DateField(index=True) + + class Meta: + database = db + read_slaves = (read_slave,) + indexes = ( + # create a unique index on repository and date + (('repository', 'date'), True), + ) + + class OAuthApplication(BaseModel): client_id = CharField(index=True, default=random_string_generator(length=20)) client_secret = CharField(default=random_string_generator(length=40)) @@ -645,4 +659,4 @@ all_models = [User, Repository, Image, AccessToken, Role, RepositoryPermission, ExternalNotificationEvent, ExternalNotificationMethod, RepositoryNotification, RepositoryAuthorizedEmail, ImageStorageTransformation, DerivedImageStorage, TeamMemberInvite, ImageStorageSignature, ImageStorageSignatureKind, - AccessTokenKind, Star] + AccessTokenKind, Star, RepositoryActionCount] diff --git a/data/migrations/versions/30c044b75632_add_repositoryactioncount_table.py b/data/migrations/versions/30c044b75632_add_repositoryactioncount_table.py new file mode 100644 index 000000000..8df45958e --- /dev/null +++ b/data/migrations/versions/30c044b75632_add_repositoryactioncount_table.py @@ -0,0 +1,36 @@ +"""Add RepositoryActionCount table + +Revision ID: 30c044b75632 +Revises: 2b4dc0818a5e +Create Date: 2015-04-13 13:21:18.159602 + +""" + +# revision identifiers, used by Alembic. 
+revision = '30c044b75632' +down_revision = '2b4dc0818a5e' + +from alembic import op +import sqlalchemy as sa + + +def upgrade(tables): + ### commands auto generated by Alembic - please adjust! ### + op.create_table('repositoryactioncount', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('repository_id', sa.Integer(), nullable=False), + sa.Column('count', sa.Integer(), nullable=False), + sa.Column('date', sa.Date(), nullable=False), + sa.ForeignKeyConstraint(['repository_id'], ['repository.id'], name=op.f('fk_repositoryactioncount_repository_id_repository')), + sa.PrimaryKeyConstraint('id', name=op.f('pk_repositoryactioncount')) + ) + op.create_index('repositoryactioncount_date', 'repositoryactioncount', ['date'], unique=False) + op.create_index('repositoryactioncount_repository_id', 'repositoryactioncount', ['repository_id'], unique=False) + op.create_index('repositoryactioncount_repository_id_date', 'repositoryactioncount', ['repository_id', 'date'], unique=True) + ### end Alembic commands ### + + +def downgrade(tables): + ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('repositoryactioncount') + ### end Alembic commands ### diff --git a/data/model/legacy.py b/data/model/legacy.py index 6f8ded0a7..ed7f1f8d1 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -18,7 +18,7 @@ from data.database import (User, Repository, Image, AccessToken, Role, Repositor DerivedImageStorage, ImageStorageTransformation, random_string_generator, db, BUILD_PHASE, QuayUserField, ImageStorageSignature, QueueItem, ImageStorageSignatureKind, validate_database_url, db_for_update, - AccessTokenKind, Star, get_epoch_timestamp) + AccessTokenKind, Star, get_epoch_timestamp, RepositoryActionCount) from peewee import JOIN_LEFT_OUTER, fn from util.validation import (validate_username, validate_email, validate_password, INVALID_PASSWORD_MESSAGE) @@ -995,20 +995,19 @@ def get_sorted_matching_repositories(prefix, only_public, checker, limit=10): """ Returns repositories matching the given prefix string and passing the given checker function. """ - last_week = datetime.now() - timedelta(weeks=1) results = [] existing_ids = [] - def get_search_results(search_clause, with_count): + def get_search_results(search_clause, with_count=False): if len(results) >= limit: return - selected = [Repository, Namespace] + select_items = [Repository, Namespace] if with_count: - selected.append(fn.Count(LogEntry.id).alias('count')) + select_items.append(fn.Sum(RepositoryActionCount.count).alias('count')) - query = (Repository.select(*selected) + query = (Repository.select(*select_items) .join(Namespace, JOIN_LEFT_OUTER, on=(Namespace.id == Repository.namespace_user)) .switch(Repository) .where(search_clause) @@ -1021,9 +1020,10 @@ def get_sorted_matching_repositories(prefix, only_public, checker, limit=10): query = query.where(~(Repository.id << existing_ids)) if with_count: - query = (query.join(LogEntry, JOIN_LEFT_OUTER) - .where(LogEntry.datetime >= last_week) - .order_by(fn.Count(LogEntry.id).desc())) + query = (query.switch(Repository) + 
.join(RepositoryActionCount) + .where(RepositoryActionCount.date >= (datetime.now() - timedelta(weeks=1)).date()) + .order_by(fn.Sum(RepositoryActionCount.count).desc())) for result in query: if len(results) >= limit: @@ -1042,13 +1042,13 @@ def get_sorted_matching_repositories(prefix, only_public, checker, limit=10): existing_ids.append(result.id) # For performance reasons, we conduct the repo name and repo namespace searches on their - # own, and with and without counts on their own. This also affords us the ability to give - # higher precedence to repository names matching over namespaces, which is semantically correct. - get_search_results((Repository.name ** (prefix + '%')), with_count=True) - get_search_results((Repository.name ** (prefix + '%')), with_count=False) + # own. This also affords us the ability to give higher precedence to repository names matching + # over namespaces, which is semantically correct. + get_search_results(Repository.name ** (prefix + '%'), with_count=True) + get_search_results(Repository.name ** (prefix + '%'), with_count=False) - get_search_results((Namespace.username ** (prefix + '%')), with_count=True) - get_search_results((Namespace.username ** (prefix + '%')), with_count=False) + get_search_results(Namespace.username ** (prefix + '%'), with_count=True) + get_search_results(Namespace.username ** (prefix + '%'), with_count=False) return results diff --git a/data/model/sqlalchemybridge.py b/data/model/sqlalchemybridge.py index 8b7d8b664..43248b55a 100644 --- a/data/model/sqlalchemybridge.py +++ b/data/model/sqlalchemybridge.py @@ -1,7 +1,7 @@ from sqlalchemy import (Table, MetaData, Column, ForeignKey, Integer, String, Boolean, Text, - DateTime, BigInteger, Index) + DateTime, Date, BigInteger, Index) from peewee import (PrimaryKeyField, CharField, BooleanField, DateTimeField, TextField, - ForeignKeyField, BigIntegerField, IntegerField) + ForeignKeyField, BigIntegerField, IntegerField, DateField) OPTIONS_TO_COPY = [ @@ -42,6 +42,8 @@ def 
gen_sqlalchemy_metadata(peewee_model_list): alchemy_type = Boolean elif isinstance(field, DateTimeField): alchemy_type = DateTime + elif isinstance(field, DateField): + alchemy_type = Date elif isinstance(field, TextField): alchemy_type = Text elif isinstance(field, ForeignKeyField): diff --git a/initdb.py b/initdb.py index 104e0fc19..402a9e186 100644 --- a/initdb.py +++ b/initdb.py @@ -16,6 +16,8 @@ from data import model from data.model import oauth from app import app, storage as store +from workers import repositoryactioncounter + logger = logging.getLogger(__name__) @@ -582,6 +584,9 @@ def populate_database(): 'trigger_id': trigger.uuid, 'config': json.loads(trigger.config), 'service': trigger.service.name}) + while repositoryactioncounter.count_repository_actions(): + pass + if __name__ == '__main__': log_level = getattr(logging, app.config['LOGGING_LEVEL']) logging.basicConfig(level=log_level) diff --git a/workers/repositoryactioncounter.py b/workers/repositoryactioncounter.py new file mode 100644 index 000000000..1341cdfc9 --- /dev/null +++ b/workers/repositoryactioncounter.py @@ -0,0 +1,51 @@ +import logging + +from apscheduler.schedulers.blocking import BlockingScheduler + +from data.database import Repository, LogEntry, RepositoryActionCount, db_random_func, fn +from datetime import date, datetime, timedelta + +POLL_PERIOD_SECONDS = 30 + +logger = logging.getLogger(__name__) +sched = BlockingScheduler() + +@sched.scheduled_job(trigger='interval', seconds=10) +def count_repository_actions(): + """ Counts actions for a random repository for the previous day. """ + + try: + # Get a random repository to count. 
+ today = date.today() + yesterday = today - timedelta(days=1) + has_yesterday_actions = (RepositoryActionCount.select(RepositoryActionCount.repository) + .where(RepositoryActionCount.date == yesterday)) + + to_count = (Repository.select() + .where(~(Repository.id << (has_yesterday_actions))) + .order_by(db_random_func()).get()) + + logger.debug('Counting: %s', to_count.id) + + actions = (LogEntry.select() + .where(LogEntry.repository == to_count, + LogEntry.datetime >= yesterday, + LogEntry.datetime < today) + .count()) + + # Create the row. + try: + RepositoryActionCount.create(repository=to_count, date=yesterday, count=actions) + except Exception: + logger.exception('Exception when writing count') + + return True + + except Repository.DoesNotExist: + logger.debug('No further repositories to count') + return False + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + sched.start()