Add a RepositorySearchScore table and calculation to the RAC worker
This will be used in a follow-up PR to order search results in place of the RAC join. Currently, joining against the RAC (RepositoryActionCount) table during search looks up ~600K rows, which makes searches take ~6s. This PR denormalizes the data we need and lets us score over a wider window (6 months vs. the current 1 week).
This commit is contained in:
parent
1bfca871ec
commit
df3f47c79a
10 changed files with 243 additions and 50 deletions
|
@ -1,62 +1,39 @@
|
|||
import logging
|
||||
|
||||
from datetime import date, timedelta
|
||||
|
||||
from app import app # This is required to initialize the database.
|
||||
from data.database import Repository, LogEntry, RepositoryActionCount, db_random_func
|
||||
from data import model
|
||||
from workers.worker import Worker
|
||||
|
||||
POLL_PERIOD_SECONDS = 10
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def count_repository_actions():
    """ Aggregates repository actions from the LogEntry table and writes them to
        the RepositoryActionCount table.

        Picks one repository that has no RepositoryActionCount row for yesterday,
        counts its LogEntry rows dated within yesterday, and stores that count.

        Returns 1 when a count row was written. Returns 0 when there is no
        repository left to count, or when writing the row failed (the failure is
        logged and not re-raised).
    """
    today = date.today()
    yesterday = today - timedelta(days=1)

    # Repositories which already have a count row for yesterday.
    has_yesterday_actions = (RepositoryActionCount
                             .select(RepositoryActionCount.repository)
                             .where(RepositoryActionCount.date == yesterday))

    # Get a random repository to count. Only this lookup can raise
    # Repository.DoesNotExist, so it is the only statement guarded for it.
    try:
        to_count = (Repository
                    .select()
                    .where(~(Repository.id << (has_yesterday_actions)))
                    .order_by(db_random_func())
                    .get())
    except Repository.DoesNotExist:
        logger.debug('No further repositories to count')
        return 0

    logger.debug('Counting: %s', to_count.id)

    actions = (LogEntry
               .select()
               .where(LogEntry.repository == to_count,
                      LogEntry.datetime >= yesterday,
                      LogEntry.datetime < today)
               .count())

    # Create the row. The write stays best-effort, but only Exception is caught
    # so that SystemExit and KeyboardInterrupt still propagate.
    try:
        RepositoryActionCount.create(repository=to_count, date=yesterday, count=actions)
    except Exception:
        logger.exception('Exception when writing count')
        return 0

    return 1
|
||||
|
||||
|
||||
class RepositoryActionCountWorker(Worker):
    """ Worker which repeatedly counts the actions of, and then updates the search
        score for, a single repository per run. """

    def __init__(self):
        super(RepositoryActionCountWorker, self).__init__()
        # Register the counting pass with the base Worker, passing
        # POLL_PERIOD_SECONDS as its period.
        self.add_operation(self._count_repository_actions, POLL_PERIOD_SECONDS)

    def _count_repository_actions(self):
        """ Counts actions and aggregates search scores for a random repository for the
            previous day.

            Each step returns early (after a debug log) when the model layer reports
            there is nothing to do: no uncounted repository was found, the count
            was not recorded, or the score was not updated.
        """
        to_count = model.repositoryactioncount.find_uncounted_repository()
        if to_count is None:
            logger.debug('No further repositories to count')
            return

        logger.debug('Found repository #%s to count', to_count.id)
        was_counted = model.repositoryactioncount.count_repository_actions(to_count)
        if not was_counted:
            # A falsy result is treated as another worker having handled it.
            logger.debug('Repository #%s was counted by another worker', to_count.id)
            return

        logger.debug('Updating search score for repository #%s', to_count.id)
        was_updated = model.repositoryactioncount.update_repository_score(to_count)
        if not was_updated:
            logger.debug('Repository #%s had its search score updated by another worker', to_count.id)
            return

        logger.debug('Repository #%s search score updated', to_count.id)
|
||||
|
||||
if __name__ == "__main__":
|
||||
worker = RepositoryActionCountWorker()
|
||||
|
|
Reference in a new issue