Implement a worker for batch exporting of usage logs
This will allow customers to request their usage logs for a repository or an entire namespace, and we can export the logs in a manner that doesn't absolutely destroy the database, with every step along the way timed.
parent b8d2e1be9c
commit 8a212728a3

18 changed files with 768 additions and 15 deletions
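The "every step along the way timed" part of the description implies instrumentation around each phase of the export. The timing helper itself isn't visible in the hunks below, so the following is only a sketch of the idea, with a hypothetical `_timed` name:

import logging
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)

@contextmanager
def _timed(step_name):
  # Hypothetical helper: log the wall-clock duration of one export step.
  start = time.time()
  try:
    yield
  finally:
    logger.debug('Export step `%s` took %.2fs', step_name, time.time() - start)

# Example: with _timed('lookup_min_id'): min_id = get_minimum_id_for_logs(...)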
@@ -15,11 +15,16 @@ logger = logging.getLogger(__name__)
 
 ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING = ['pull_repo']
 
-def _logs_query(selections, start_time, end_time, performer=None, repository=None, namespace=None,
-                ignore=None, model=LogEntry):
+def _logs_query(selections, start_time=None, end_time=None, performer=None, repository=None,
+                namespace=None, ignore=None, model=LogEntry, id_range=None):
   """ Returns a query for selecting logs from the table, with various options and filters. """
-  joined = (model.select(*selections).switch(model)
-            .where(model.datetime >= start_time, model.datetime < end_time))
+  assert (start_time is not None and end_time is not None) or (id_range is not None)
+  joined = (model.select(*selections).switch(model))
+
+  if id_range is not None:
+    joined = joined.where(model.id >= id_range[0], model.id <= id_range[1])
+  else:
+    joined = joined.where(model.datetime >= start_time, model.datetime < end_time)
 
   if repository:
     joined = joined.where(model.repository == repository)
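After this hunk, `_logs_query` has two mutually exclusive modes, enforced by the new assert: the original datetime window, or an inclusive primary-key range that can be satisfied from the `id` index alone. A minimal illustration of the two call shapes (the `[LogEntry]` selections value and the bounds here are just examples):

# Time-bounded (the pre-existing behavior):
query = _logs_query([LogEntry], start_time=start, end_time=end, repository=repo)

# ID-bounded (new in this commit); the range is inclusive on both ends,
# so (5, 10) matches ids 5 through 10.
query = _logs_query([LogEntry], id_range=(5, 10), repository=repo)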
@@ -67,8 +72,8 @@ def get_aggregated_logs(start_time, end_time, performer=None, repository=None, n
   return query.group_by(date, model.kind)
 
 
-def get_logs_query(start_time, end_time, performer=None, repository=None, namespace=None,
-                   ignore=None, model=LogEntry):
+def get_logs_query(start_time=None, end_time=None, performer=None, repository=None, namespace=None,
+                   ignore=None, model=LogEntry, id_range=None):
   """ Returns the logs matching the given filters. """
   Performer = User.alias()
   Account = User.alias()
@@ -78,13 +83,13 @@ def get_logs_query(start_time, end_time, performer=None, repository=None, namesp
   selections.append(Account)
 
   query = _logs_query(selections, start_time, end_time, performer, repository, namespace, ignore,
-                      model=model)
+                      model=model, id_range=id_range)
   query = (query.switch(model).join(Performer, JOIN.LEFT_OUTER,
                                     on=(model.performer == Performer.id).alias('performer')))
 
   if namespace is None and repository is None:
     query = (query.switch(model).join(Account, JOIN.LEFT_OUTER,
                                       on=(model.account == Account.id).alias('account')))
 
   return query
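With `id_range` forwarded through the public `get_logs_query`, callers can page through logs by primary key without any datetime filter at all; for example (bounds illustrative):

# One page of a repository's logs by ID range. The Performer join still
# applies; the Account join is skipped because a repository filter is set.
page = get_logs_query(repository=repo, id_range=(min_id, min_id + 999))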
@@ -191,3 +196,54 @@ def get_repositories_action_sums(repository_ids):
     action_count_map[record[0]] = record[1]
 
   return action_count_map
+
+
+def get_minimum_id_for_logs(start_time, repository_id=None, namespace_id=None):
+  """ Returns the minimum ID for logs matching the given repository or namespace in
+      the logs table, starting at the given start time.
+  """
+  # First try bounded by a day. Most repositories will meet this criteria, and therefore
+  # can make a much faster query.
+  day_after = start_time + timedelta(days=1)
+  result = _get_bounded_id(fn.Min, LogEntry.datetime >= start_time,
+                           repository_id, namespace_id, LogEntry.datetime < day_after)
+  if result is not None:
+    return result
+
+  return _get_bounded_id(fn.Min, LogEntry.datetime >= start_time, repository_id, namespace_id)
+
+
+def get_maximum_id_for_logs(end_time, repository_id=None, namespace_id=None):
+  """ Returns the maximum ID for logs matching the given repository or namespace in
+      the logs table, ending at the given end time.
+  """
+  # First try bounded by a day. Most repositories will meet this criteria, and therefore
+  # can make a much faster query.
+  day_before = end_time - timedelta(days=1)
+  result = _get_bounded_id(fn.Max, LogEntry.datetime <= end_time,
+                           repository_id, namespace_id, LogEntry.datetime > day_before)
+  if result is not None:
+    return result
+
+  return _get_bounded_id(fn.Max, LogEntry.datetime <= end_time, repository_id, namespace_id)
+
+
+def _get_bounded_id(fn, filter_clause, repository_id, namespace_id, reduction_clause=None):
+  assert (namespace_id is not None) or (repository_id is not None)
+  query = (LogEntry
+           .select(fn(LogEntry.id))
+           .where(filter_clause))
+
+  if reduction_clause is not None:
+    query = query.where(reduction_clause)
+
+  if repository_id is not None:
+    query = query.where(LogEntry.repository == repository_id)
+  else:
+    query = query.where(LogEntry.account == namespace_id)
+
+  row = query.tuples()[0]
+  if not row:
+    return None
+
+  return row[0]
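Taken together, these helpers let the export worker resolve a customer's time window into a primary-key range once, then stream the logs in bounded chunks so no single query has to walk the whole table. A rough sketch of that loop (the function name and `EXPORT_CHUNK_SIZE` are hypothetical; only the `get_*_id_for_logs` and `get_logs_query` calls come from this commit, and namespace handling is elided):

EXPORT_CHUNK_SIZE = 1000  # hypothetical; the real worker would tune/configure this

def _export_repository_logs(repository, start_time, end_time):
  # Resolve the time window to an ID range once, then page by primary key.
  min_id = get_minimum_id_for_logs(start_time, repository_id=repository.id)
  max_id = get_maximum_id_for_logs(end_time, repository_id=repository.id)
  if min_id is None or max_id is None:
    return  # no logs in the window

  current_id = min_id
  while current_id <= max_id:
    chunk_end = min(current_id + EXPORT_CHUNK_SIZE - 1, max_id)
    # id_range is inclusive on both ends (see _logs_query above).
    for entry in get_logs_query(repository=repository, id_range=(current_id, chunk_end)):
      yield entry
    current_id = chunk_end + 1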