Add a RepositorySearchScore table and calculation to the RAC worker

This will be used in a follow-up PR to order search results, replacing the RAC join. Currently, joining against the RAC table during search causes a lookup of ~600K rows, which makes searching take ~6s. This PR denormalizes the data we need and allows us to score based on a wider band (6 months vs. the current 1 week).
This commit is contained in:
Joseph Schorr 2017-03-17 13:51:45 -04:00
parent 1bfca871ec
commit df3f47c79a
10 changed files with 243 additions and 50 deletions

View file

@ -1,62 +1,39 @@
import logging
from datetime import date, timedelta
from app import app # This is required to initialize the database.
from data.database import Repository, LogEntry, RepositoryActionCount, db_random_func
from data import model
from workers.worker import Worker
# Passed to Worker.add_operation together with the counting operation;
# judging by the name, this is the polling interval in seconds (confirm
# against Worker.add_operation).
POLL_PERIOD_SECONDS = 10
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def count_repository_actions():
    """ Aggregates repository actions from the LogEntry table and writes them to
        the RepositoryActionCount table.

        A single repository is handled per call: one that has no
        RepositoryActionCount row dated yesterday is picked at random, its
        LogEntry rows with a datetime in [yesterday, today) are counted, and a
        RepositoryActionCount row is created with that count.

        Returns 1 when a row was written. Returns 0 when there is no more work
        (no repository without a row for yesterday) or when writing the row
        failed; the failure is logged rather than raised.
    """
    today = date.today()
    yesterday = today - timedelta(days=1)

    # Repositories that already have a count recorded for yesterday.
    already_counted = (RepositoryActionCount
                       .select(RepositoryActionCount.repository)
                       .where(RepositoryActionCount.date == yesterday))

    # Get a random repository to count. Only this lookup is expected to raise
    # DoesNotExist, so it is the only statement guarded for it.
    try:
        to_count = (Repository
                    .select()
                    .where(~(Repository.id << (already_counted)))
                    .order_by(db_random_func())
                    .get())
    except Repository.DoesNotExist:
        logger.debug('No further repositories to count')
        return 0

    logger.debug('Counting: %s', to_count.id)

    actions = (LogEntry
               .select()
               .where(LogEntry.repository == to_count,
                      LogEntry.datetime >= yesterday,
                      LogEntry.datetime < today)
               .count())

    # Create the row. Writing is best-effort: failures are logged and reported
    # as "nothing counted". `Exception` (rather than a bare `except:`) keeps
    # SystemExit / KeyboardInterrupt propagating so the worker can be stopped.
    try:
        RepositoryActionCount.create(repository=to_count, date=yesterday, count=actions)
    except Exception:
        logger.exception('Exception when writing count')
        return 0

    return 1
class RepositoryActionCountWorker(Worker):
    """ Worker that counts repository actions and updates repository search scores.

        On construction it registers `_count_repository_actions` through
        `add_operation`, passing POLL_PERIOD_SECONDS as the second argument.
    """

    def __init__(self):
        super(RepositoryActionCountWorker, self).__init__()
        self.add_operation(self._count_repository_actions, POLL_PERIOD_SECONDS)

    def _count_repository_actions(self):
        """ Counts actions and aggregates search scores for a random repository for the
            previous day.

            Each step is delegated to `model.repositoryactioncount`; the method
            stops early (after logging) when there is nothing to count or when a
            step reports a falsy result, which is logged as the work having been
            done by another worker. Always returns None.
        """
        to_count = model.repositoryactioncount.find_uncounted_repository()
        if to_count is None:
            logger.debug('No further repositories to count')
            return

        logger.debug('Found repository #%s to count', to_count.id)
        was_counted = model.repositoryactioncount.count_repository_actions(to_count)
        if not was_counted:
            logger.debug('Repository #%s was counted by another worker', to_count.id)
            return

        # Only refresh the score after this worker has recorded the count itself.
        logger.debug('Updating search score for repository #%s', to_count.id)
        was_updated = model.repositoryactioncount.update_repository_score(to_count)
        if not was_updated:
            logger.debug('Repository #%s had its search score updated by another worker', to_count.id)
            return

        logger.debug('Repository #%s search score updated', to_count.id)
if __name__ == "__main__":
worker = RepositoryActionCountWorker()