2017-03-17 17:51:45 +00:00
|
|
|
import logging
|
|
|
|
|
|
|
|
from collections import namedtuple
|
|
|
|
from peewee import IntegrityError
|
|
|
|
|
|
|
|
from datetime import date, timedelta, datetime
|
2018-12-11 20:54:16 +00:00
|
|
|
from data.database import (Repository, LogEntry, RepositoryActionCount,
|
2018-05-18 16:54:38 +00:00
|
|
|
RepositorySearchScore, db_random_func, fn)
|
2017-03-17 17:51:45 +00:00
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)

search_bucket = namedtuple('SearchBucket', ['delta', 'days', 'weight'])

# Buckets used for search scoring. A bucket spans the time from its own `delta` before today up
# to the previous bucket's `delta` before today (so the buckets do not overlap); `days` is the
# length of that span. The value computed from the actions that fall inside a bucket is
# multiplied by the bucket's `weight`.
#
# Each weight is the integral of (2/((x/183)+1)^2)/183 over the bucket's range of days. Over the
# full range 0..183 that integral equals 1, so the weights sum to (approximately) 1 and the
# combined score comes out normalized.
SEARCH_BUCKETS = [
    search_bucket(delta=timedelta(days=1), days=1, weight=0.010870),
    search_bucket(delta=timedelta(days=7), days=6, weight=0.062815),
    search_bucket(delta=timedelta(days=31), days=24, weight=0.21604),
    search_bucket(delta=timedelta(days=183), days=152, weight=0.71028),
]
|
|
|
|
|
|
|
|
def find_uncounted_repository():
    """ Picks, in random order, a repository that has no RepositoryActionCount row dated
        yesterday and returns it. Returns None when no such repository exists.
    """
    day_before = date.today() - timedelta(days=1)

    # Subquery: the repositories that already have a count row for yesterday.
    already_counted = (RepositoryActionCount
                       .select(RepositoryActionCount.repository)
                       .where(RepositoryActionCount.date == day_before))

    # Everything not in that subquery, shuffled by the database.
    candidates = (Repository
                  .select()
                  .where(~(Repository.id << (already_counted)))
                  .order_by(db_random_func()))

    try:
        return candidates.get()
    except Repository.DoesNotExist:
        return None
|
2017-03-17 17:51:45 +00:00
|
|
|
|
|
|
|
|
|
|
|
def count_repository_actions(to_count):
    """ Counts the LogEntry rows of the given repository whose datetime falls within yesterday
        and stores that total as a new RepositoryActionCount row dated yesterday.

        Returns True when the row was created, and False when creating it raised an
        IntegrityError (logged as the count having already been written).
    """
    window_end = date.today()
    window_start = window_end - timedelta(days=1)

    # Half-open window: yesterday <= datetime < today.
    entries = LogEntry.select().where(LogEntry.repository == to_count,
                                      LogEntry.datetime >= window_start,
                                      LogEntry.datetime < window_end)
    total = entries.count()

    try:
        RepositoryActionCount.create(repository=to_count, date=window_start, count=total)
    except IntegrityError:
        logger.debug('Count already written for repository %s', to_count.id)
        return False

    return True
|
|
|
|
|
|
|
|
|
|
|
|
def update_repository_score(repo):
    """ Recomputes the search score of the given repository from its RepositoryActionCount rows
        and stores it in the repository's RepositorySearchScore row, creating that row when it
        does not exist yet. Note that count_repository_actions for the repo should be called
        first.

        For every bucket in SEARCH_BUCKETS the average of the recorded daily counts inside the
        bucket's date window is multiplied by the bucket's weight; the weighted averages are
        summed, multiplied by 100 and truncated to an int.

        Returns True if the row was updated or created, and False if an IntegrityError was
        raised while writing it.
    """
    today = date.today()

    # Retrieve the counts for each bucket and calculate the final score.
    final_score = 0.0
    last_end_timedelta = timedelta(days=0)

    for bucket in SEARCH_BUCKETS:
        # Each bucket covers [today - bucket.delta, today - previous bucket's delta), so the
        # windows are adjacent and never overlap.
        start_date = today - bucket.delta
        end_date = today - last_end_timedelta
        last_end_timedelta = bucket.delta

        query = (RepositoryActionCount
                 .select(fn.Sum(RepositoryActionCount.count), fn.Count(RepositoryActionCount.id))
                 .where(RepositoryActionCount.date >= start_date,
                        RepositoryActionCount.date < end_date,
                        RepositoryActionCount.repository == repo))

        bucket_tuple = query.tuples()[0]
        logger.debug('Got bucket tuple %s for bucket %s for repository %s', bucket_tuple, bucket,
                     repo.id)

        # The sum comes back as None when no rows fall inside the window.
        if bucket_tuple[0] is None:
            continue

        bucket_sum = float(bucket_tuple[0])
        bucket_count = int(bucket_tuple[1])
        if not bucket_count:
            continue

        # Average over the rows that were actually recorded in the window.
        bucket_score = bucket_sum / bucket_count
        final_score += bucket_score * bucket.weight

    # Update the existing repo search score row or create a new one.
    normalized_score = int(final_score * 100.0)

    # Use one timestamp for both paths; the create path previously stored the bare date, which
    # was inconsistent with the full timestamp written on update.
    now = datetime.now()

    try:
        try:
            search_score_row = RepositorySearchScore.get(repository=repo)
        except RepositorySearchScore.DoesNotExist:
            RepositorySearchScore.create(repository=repo, score=normalized_score,
                                         last_updated=now)
            return True

        search_score_row.last_updated = now
        search_score_row.score = normalized_score
        search_score_row.save()
        return True
    except IntegrityError:
        # Raised e.g. when the row was created concurrently between the get and the create.
        logger.debug('RepositorySearchScore row already existed; skipping')
        return False
|