From c767aafcd606f11438adb673b61e2d3c012ff856 Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Thu, 7 May 2015 22:49:11 -0400 Subject: [PATCH 1/2] Make the repository API faster by only checking the log entries table once for each kind of entry, rather than twice. We make use of a special subquery-like syntax, which allows us to count those entries that are both 30 days only and 1 day old in the same query. This was tested successfully on MySQL, Postgres and Sqlite. --- data/model/legacy.py | 44 ++++++++++++++++++++++++------------- endpoints/api/repository.py | 17 ++++++++------ 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/data/model/legacy.py b/data/model/legacy.py index dd6a10121..94386cb53 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -19,7 +19,7 @@ from data.database import (User, Repository, Image, AccessToken, Role, Repositor db, BUILD_PHASE, QuayUserField, ImageStorageSignature, QueueItem, ImageStorageSignatureKind, validate_database_url, db_for_update, AccessTokenKind, Star, get_epoch_timestamp, RepositoryActionCount) -from peewee import JOIN_LEFT_OUTER, fn +from peewee import JOIN_LEFT_OUTER, fn, SQL from util.validation import (validate_username, validate_email, validate_password, INVALID_PASSWORD_MESSAGE) from util.names import format_robot_username, parse_robot_username @@ -2777,24 +2777,38 @@ def cancel_repository_build(build, work_queue): build.delete_instance() return True -def get_repository_pushes(repository, time_delta): +def _get_repository_events(repository, time_delta, time_delta_earlier, clause): since = date.today() - time_delta - push_repo = LogEntryKind.get(name = 'push_repo') - return (LogEntry.select() - .where(LogEntry.repository == repository) - .where(LogEntry.kind == push_repo) - .where(LogEntry.datetime >= since) - .count()) + since_earlier = date.today() - time_delta_earlier -def get_repository_pulls(repository, time_delta): - since = date.today() - time_delta + if since_earlier >= since: + raise Exception('time_delta_earlier must be greater than time_delta') + + # This uses a CASE WHEN inner clause to further filter the count. + formatted = since.strftime('%Y-%m-%d') + case_query = 'CASE WHEN datetime >= \'%s\' THEN 1 ELSE 0 END' % formatted + + result = (LogEntry.select(fn.Sum(SQL(case_query)), fn.Count(SQL('*'))) + .where(LogEntry.repository == repository) + .where(clause) + .where(LogEntry.datetime >= since_earlier) + .tuples() + .get()) + + return (result[0] or 0, result[1] or 0) + + +def get_repository_pushes(repository, time_delta, time_delta_earlier): + push_repo = LogEntryKind.get(name = 'push_repo') + clauses = (LogEntry.kind == push_repo) + return _get_repository_events(repository, time_delta, time_delta_earlier, clauses) + + +def get_repository_pulls(repository, time_delta, time_delta_earlier): repo_pull = LogEntryKind.get(name = 'pull_repo') repo_verb = LogEntryKind.get(name = 'repo_verb') - return (LogEntry.select() - .where(LogEntry.repository == repository) - .where((LogEntry.kind == repo_pull) | (LogEntry.kind == repo_verb)) - .where(LogEntry.datetime >= since) - .count()) + clauses = ((LogEntry.kind == repo_pull) | (LogEntry.kind == repo_verb)) + return _get_repository_events(repository, time_delta, time_delta_earlier, clauses) def get_repository_usage(): diff --git a/endpoints/api/repository.py b/endpoints/api/repository.py index c66a294ad..bbc1b49de 100644 --- a/endpoints/api/repository.py +++ b/endpoints/api/repository.py @@ -194,13 +194,17 @@ class Repository(RepositoryParamResource): tag_dict = {tag.name: tag_view(tag) for tag in tags} can_write = ModifyRepositoryPermission(namespace, repository).can() can_admin = AdministerRepositoryPermission(namespace, repository).can() - active_builds = model.list_repository_builds(namespace, repository, 1, - include_inactive=False) is_starred = (model.repository_is_starred(get_authenticated_user(), repo) if get_authenticated_user() else False) is_public = model.is_repository_public(repo) + (pull_today, pull_thirty_day) = model.get_repository_pulls(repo, timedelta(days=1), + timedelta(days=30)) + + (push_today, push_thirty_day) = model.get_repository_pushes(repo, timedelta(days=1), + timedelta(days=30)) + return { 'namespace': namespace, 'name': repository, @@ -209,18 +213,17 @@ class Repository(RepositoryParamResource): 'can_write': can_write, 'can_admin': can_admin, 'is_public': is_public, - 'is_building': len(list(active_builds)) > 0, 'is_organization': repo.namespace_user.organization, 'is_starred': is_starred, 'status_token': repo.badge_token if not is_public else '', 'stats': { 'pulls': { - 'today': model.get_repository_pulls(repo, timedelta(days=1)), - 'thirty_day': model.get_repository_pulls(repo, timedelta(days=30)) + 'today': pull_today, + 'thirty_day': pull_thirty_day }, 'pushes': { - 'today': model.get_repository_pushes(repo, timedelta(days=1)), - 'thirty_day': model.get_repository_pushes(repo, timedelta(days=30)) + 'today': push_today, + 'thirty_day': push_thirty_day } } } From 8ed83674047c4b0f376c14f0c95e8634cda4cd8c Mon Sep 17 00:00:00 2001 From: Joseph Schorr Date: Fri, 8 May 2015 13:38:34 -0400 Subject: [PATCH 2/2] PR changes in response to comments --- data/model/legacy.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/data/model/legacy.py b/data/model/legacy.py index 94386cb53..5c632fbad 100644 --- a/data/model/legacy.py +++ b/data/model/legacy.py @@ -2778,11 +2778,19 @@ def cancel_repository_build(build, work_queue): return True def _get_repository_events(repository, time_delta, time_delta_earlier, clause): + """ Returns a pair representing the count of the number of events for the given + repository in each of the specified time deltas. The date ranges are calculated by + taking the current time today and subtracting the time delta given. Since + we want to grab *two* ranges, we restrict the second range to be greater + than the first (i.e. referring to an earlier time), so we can conduct the + lookup in a single query. The clause is used to further filter the kind of + events being found. + """ since = date.today() - time_delta since_earlier = date.today() - time_delta_earlier if since_earlier >= since: - raise Exception('time_delta_earlier must be greater than time_delta') + raise ValueError('time_delta_earlier must be greater than time_delta') # This uses a CASE WHEN inner clause to further filter the count. formatted = since.strftime('%Y-%m-%d') @@ -2799,22 +2807,22 @@ def _get_repository_events(repository, time_delta, time_delta_earlier, clause): def get_repository_pushes(repository, time_delta, time_delta_earlier): - push_repo = LogEntryKind.get(name = 'push_repo') + push_repo = LogEntryKind.get(name='push_repo') clauses = (LogEntry.kind == push_repo) return _get_repository_events(repository, time_delta, time_delta_earlier, clauses) def get_repository_pulls(repository, time_delta, time_delta_earlier): - repo_pull = LogEntryKind.get(name = 'pull_repo') - repo_verb = LogEntryKind.get(name = 'repo_verb') + repo_pull = LogEntryKind.get(name='pull_repo') + repo_verb = LogEntryKind.get(name='repo_verb') clauses = ((LogEntry.kind == repo_pull) | (LogEntry.kind == repo_verb)) return _get_repository_events(repository, time_delta, time_delta_earlier, clauses) def get_repository_usage(): one_month_ago = date.today() - timedelta(weeks=4) - repo_pull = LogEntryKind.get(name = 'pull_repo') - repo_verb = LogEntryKind.get(name = 'repo_verb') + repo_pull = LogEntryKind.get(name='pull_repo') + repo_verb = LogEntryKind.get(name='repo_verb') return (LogEntry.select(LogEntry.ip, LogEntry.repository) .where((LogEntry.kind == repo_pull) | (LogEntry.kind == repo_verb)) .where(~(LogEntry.repository >> None))