This repository was archived on 2020-03-24. You can view files and clone it, but you cannot make any changes to its state, such as pushing or creating new issues, pull requests, or comments.
quay/data/model/repositoryactioncount.py
Joseph Schorr df3f47c79a Add a RepositorySearchScore table and calculation to the RAC worker
This will be used in a followup PR to order search results instead of the RAC join. Currently, the join with the RAC table in search results in a lookup of ~600K rows, which causes searching to take ~6s. This PR denormalizes the data we need, as well as allowing us to score based on a wider band (6 months vs the current 1 week).
2017-04-10 14:29:02 -04:00

118 lines
4.4 KiB
Python

import logging
from collections import namedtuple
from peewee import IntegrityError
from datetime import date, timedelta, datetime
from data.database import (Repository, LogEntry, RepositoryActionCount, RepositorySearchScore,
db_random_func, fn)
logger = logging.getLogger(__name__)

# One scoring bucket:
#   delta  - how far back from today the bucket reaches.
#   days   - how many days the bucket itself spans (its delta minus the previous bucket's delta).
#   weight - the multiplier applied to the actions collected for the bucket.
search_bucket = namedtuple('SearchBucket', ['delta', 'days', 'weight'])

# The buckets used for search scoring, ordered from most recent to oldest. Each bucket covers
# the period reaching `delta` back from today, *excluding the period already covered by the
# previous bucket*. The actions collected over a bucket's period are multiplied by the bucket's
# weight. Each weight is the integral of (2/((x/183)+1)^2)/183 over the days the bucket spans;
# that integral over 0..183 is 1, so the weights sum to (approximately) 1 and the resulting
# score is normalized.
SEARCH_BUCKETS = [
    search_bucket(delta=timedelta(days=1), days=1, weight=0.010870),
    search_bucket(delta=timedelta(days=7), days=6, weight=0.062815),
    search_bucket(delta=timedelta(days=31), days=24, weight=0.21604),
    search_bucket(delta=timedelta(days=183), days=152, weight=0.71028),
]
def find_uncounted_repository():
    """ Returns a randomly ordered pick of a repository that has not yet had an entry added into
        the RepositoryActionCount table for yesterday, or None if no such repository exists.
    """
    yesterday = date.today() - timedelta(days=1)

    # Subquery selecting the repositories that already have a count row dated yesterday.
    has_yesterday_actions = (RepositoryActionCount
                             .select(RepositoryActionCount.repository)
                             .where(RepositoryActionCount.date == yesterday))

    try:
        # Get a random repository to count: any repository whose id is not in the subquery.
        return (Repository
                .select()
                .where(~(Repository.id << (has_yesterday_actions)))
                .order_by(db_random_func())
                .get())
    except Repository.DoesNotExist:
        # Nothing left to count. Signal that with None rather than an integer: 0 is not a
        # repository, fails `is None` checks, and could be mistaken for a repository id if
        # handed on to count_repository_actions().
        return None
def count_repository_actions(to_count):
    """ Counts the LogEntry rows recorded for the given repository during yesterday (from
        yesterday's date up to, but not including, today's date) and stores that count as a new
        RepositoryActionCount row dated yesterday. Returns True if the row was written and False
        if writing it raised an IntegrityError.
    """
    window_end = date.today()
    window_start = window_end - timedelta(days=1)

    # All log entries for the repository that fall inside the [yesterday, today) window.
    entries_in_window = LogEntry.select().where(
        LogEntry.repository == to_count,
        LogEntry.datetime >= window_start,
        LogEntry.datetime < window_end)
    action_total = entries_in_window.count()

    try:
        RepositoryActionCount.create(repository=to_count, date=window_start, count=action_total)
    except IntegrityError:
        logger.exception('Exception when writing count for repository')
        return False

    return True
def update_repository_score(repo):
    """ Updates the search score entry for the given repository by retrieving information from
        the RepositoryActionCount table. Note that count_repository_actions for the repo should
        be called first.

        For every bucket in SEARCH_BUCKETS, the action counts recorded in the bucket's date range
        are averaged over the number of rows found and multiplied by the bucket's weight; the
        weighted values are summed, scaled by 100 and truncated to an int.

        Returns True if the RepositorySearchScore row was updated or created, and False if
        writing it raised an IntegrityError.
    """
    today = date.today()

    # Retrieve the counts for each bucket and calculate the final score.
    final_score = 0.0
    last_end_timedelta = timedelta(days=0)

    for bucket in SEARCH_BUCKETS:
        # Each bucket spans from its own delta up to (but excluding) where the previous,
        # more recent bucket started.
        start_date = today - bucket.delta
        end_date = today - last_end_timedelta
        last_end_timedelta = bucket.delta

        query = (RepositoryActionCount
                 .select(fn.Sum(RepositoryActionCount.count),
                         fn.Count(RepositoryActionCount.id))
                 .where(RepositoryActionCount.date >= start_date,
                        RepositoryActionCount.date < end_date,
                        RepositoryActionCount.repository == repo))

        bucket_tuple = query.tuples()[0]
        logger.debug('Got bucket tuple %s for bucket %s for repository %s', bucket_tuple, bucket,
                     repo.id)

        bucket_sum = bucket_tuple[0]
        bucket_count = bucket_tuple[1]
        if not bucket_count:
            # No rows recorded in this bucket's range, so it contributes nothing.
            continue

        # Some database drivers return SUM() as a Decimal, and Decimal cannot be mixed with
        # float in arithmetic (it raises TypeError), so coerce both operands to float first.
        bucket_score = float(bucket_sum) / float(bucket_count)
        final_score += bucket_score * bucket.weight

    # Update the existing repo search score row or create a new one.
    normalized_score = int(final_score * 100.0)

    # Use the same timestamp type on both the update and the create path.
    # NOTE(review): assumes last_updated is a datetime column, as the update path implies.
    now = datetime.now()

    try:
        try:
            search_score_row = RepositorySearchScore.get(repository=repo)
            search_score_row.last_updated = now
            search_score_row.score = normalized_score
            search_score_row.save()
            return True
        except RepositorySearchScore.DoesNotExist:
            RepositorySearchScore.create(repository=repo, score=normalized_score,
                                         last_updated=now)
            return True
    except IntegrityError:
        logger.debug('RepositorySearchScore row already existed; skipping')
        return False