Add a RepositorySearchScore table and calculation to the RAC worker

This will be used in a followup PR to order search results instead of the RAC join. Currently, the join with the RAC table in search results in a lookup of ~600K rows, which causes searching to take ~6s. This PR denormalizes the data we need, as well as allowing us to score based on a wider band (6 months vs the current 1 week).
2017-03-17 13:51:45 -04:00 · 2017-03-17 13:51:45 -04:00 · df3f47c79a
commit df3f47c79a
parent 1bfca871ec
10 changed files with 243 additions and 50 deletions
--- a/data/model/repositoryactioncount.py
+++ b/data/model/repositoryactioncount.py
@ -0,0 +1,118 @@
+import logging
+
+from collections import namedtuple
+from peewee import IntegrityError
+
+from datetime import date, timedelta, datetime
+from data.database import (Repository, LogEntry, RepositoryActionCount, RepositorySearchScore,
+                           db_random_func, fn)
+
+logger = logging.getLogger(__name__)
+
+search_bucket = namedtuple('SearchBucket', ['delta', 'days', 'weight'])
+
+# Defines the various buckets for search scoring. Each bucket is computed using the given time
+# delta from today *minus the previous bucket's time period*. Once all the actions over the
+# bucket's time period have been collected, they are multiplied by the given modifier. The modifiers
+# for this bucket were determined via the integral of (2/((x/183)+1)^2)/183 over the period of days
+# in the bucket; this integral over 0..183 has a sum of 1, so we get a good normalize score result.
+SEARCH_BUCKETS = [
+  search_bucket(timedelta(days=1), 1, 0.010870),
+  search_bucket(timedelta(days=7), 6, 0.062815),
+  search_bucket(timedelta(days=31), 24, 0.21604),
+  search_bucket(timedelta(days=183), 152, 0.71028),
+]
+
+def find_uncounted_repository():
+  """ Returns a repository that has not yet had an entry added into the RepositoryActionCount
+      table for yesterday.
+  """
+  try:
+    # Get a random repository to count.
+    today = date.today()
+    yesterday = today - timedelta(days=1)
+    has_yesterday_actions = (RepositoryActionCount
+                             .select(RepositoryActionCount.repository)
+                             .where(RepositoryActionCount.date == yesterday))
+
+    to_count = (Repository
+                .select()
+                .where(~(Repository.id << (has_yesterday_actions)))
+                .order_by(db_random_func()).get())
+    return to_count
+  except Repository.DoesNotExist:
+    return 0
+
+
+def count_repository_actions(to_count):
+  """ Aggregates repository actions from the LogEntry table for the last day and writes them to
+      the RepositoryActionCount table. Return True if the repository was updated and False
+      otherwise.
+  """
+  today = date.today()
+  yesterday = today - timedelta(days=1)
+
+  actions = (LogEntry
+             .select()
+             .where(LogEntry.repository == to_count,
+                    LogEntry.datetime >= yesterday,
+                    LogEntry.datetime < today)
+             .count())
+
+  try:
+    RepositoryActionCount.create(repository=to_count, date=yesterday, count=actions)
+    return True
+  except IntegrityError:
+    logger.exception('Exception when writing count for repository')
+    return False
+
+
+def update_repository_score(repo):
+  """ Updates the repository score entry for the given table by retrieving information from
+      the RepositoryActionCount table. Note that count_repository_actions for the repo should
+      be called first. Returns True if the row was updated and False otherwise.
+  """
+  today = date.today()
+
+  # Retrieve the counts for each bucket and calculate the final score.
+  final_score = 0.0
+  last_end_timedelta = timedelta(days=0)
+
+  for bucket in SEARCH_BUCKETS:
+    start_date = today - bucket.delta
+    end_date = today - last_end_timedelta
+    last_end_timedelta = bucket.delta
+
+    query = (RepositoryActionCount
+             .select(fn.Sum(RepositoryActionCount.count), fn.Count(RepositoryActionCount.id))
+             .where(RepositoryActionCount.date >= start_date,
+                    RepositoryActionCount.date < end_date,
+                    RepositoryActionCount.repository == repo))
+
+    bucket_tuple = query.tuples()[0]
+    logger.debug('Got bucket tuple %s for bucket %s for repository %s', bucket_tuple, bucket,
+                 repo.id)
+
+    bucket_sum = bucket_tuple[0]
+    bucket_count = bucket_tuple[1]
+    if not bucket_count:
+      continue
+
+    bucket_score = bucket_sum / (bucket_count * 1.0)
+    final_score += bucket_score * bucket.weight
+
+  # Update the existing repo search score row or create a new one.
+  normalized_score = int(final_score * 100.0)
+  try:
+    try:
+      search_score_row = RepositorySearchScore.get(repository=repo)
+      search_score_row.last_updated = datetime.now()
+      search_score_row.score = normalized_score
+      search_score_row.save()
+      return True
+    except RepositorySearchScore.DoesNotExist:
+      RepositorySearchScore.create(repository=repo, score=normalized_score, last_updated=today)
+      return True
+  except IntegrityError:
+    logger.debug('RepositorySearchScore row already existed; skipping')
+    return False