Merge pull request #3323 from quay/joseph.schorr/QUAY-1282/log-interfacing

Interface out all action log data model operations
This commit is contained in:
Joseph Schorr 2019-01-28 15:09:25 -05:00 committed by GitHub
commit 9f09d68ad8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
26 changed files with 714 additions and 902 deletions

View file

@@ -0,0 +1,20 @@
import os
import logging
from data.logs_model.table_logs_model import table_logs_model
logger = logging.getLogger(__name__)
class LogsModelProxy(object):
  """ Thin proxy that forwards every attribute access to the configured logs model.

      Callers import this proxy rather than a concrete model, so the backing
      implementation can be swapped without touching import sites.
  """
  def __init__(self):
    self._model = table_logs_model

  def __getattr__(self, name):
    # Only invoked for attributes not found on the proxy itself; delegate to the model.
    return getattr(self._model, name)
# Application-wide logs model instance; the banner below makes the active model
# implementation visible at startup.
logs_model = LogsModelProxy()
logger.info('===============================')
logger.info('Using logs model `%s`', logs_model._model)
logger.info('===============================')

View file

@@ -0,0 +1,120 @@
import json
from calendar import timegm
from collections import namedtuple
from email.utils import formatdate
from cachetools import lru_cache
from data import model
from util.morecollections import AttrDict
def _format_date(date):
""" Output an RFC822 date format. """
if date is None:
return None
return formatdate(timegm(date.utctimetuple()))
@lru_cache(maxsize=1)
def _kinds():
  """ Returns the mapping of log entry kind IDs to kind data, cached for the
      lifetime of the process since log kinds are static.
  """
  return model.log.get_log_entry_kinds()
class LogEntriesPage(namedtuple('LogEntriesPage', ['logs', 'next_page_token'])):
  """ Represents a page returned by the lookup_logs call. `logs` contains the logs
      found for the page. `next_page_token`, if not None, contains the opaque token
      to be encoded and handed back on the followup call to retrieve the next page.
  """
class Log(namedtuple('Log', [
    'metadata_json', 'ip', 'datetime', 'performer_email', 'performer_username', 'performer_robot',
    'account_organization', 'account_username', 'account_email', 'account_robot', 'kind_id'])):
  """ Represents a single log entry returned by the logs model. """

  @classmethod
  def for_logentry(cls, log):
    """ Builds a Log from a database log entry row. The joined account and performer
        attributes are copied defensively: if an attribute access fails (e.g. the joined
        row is missing), the remaining fields of that group are left as None.
    """
    account_organization = None
    account_username = None
    account_email = None
    account_robot = None
    try:
      # NOTE: if a later access raises, fields assigned before it keep their values.
      account_organization = log.account.organization
      account_username = log.account.username
      account_email = log.account.email
      account_robot = log.account.robot
    except AttributeError:
      pass
    performer_robot = None
    performer_username = None
    performer_email = None
    try:
      performer_robot = log.performer.robot
      performer_username = log.performer.username
      performer_email = log.performer.email
    except AttributeError:
      pass
    return Log(log.metadata_json, log.ip, log.datetime, performer_email, performer_username,
               performer_robot, account_organization, account_username,
               account_email, account_robot, log.kind_id)

  def to_dict(self, avatar, include_namespace=False):
    """ Renders this log entry as a dict view for the logs API.

        avatar: helper used to compute avatar data for users and orgs.
        include_namespace: if True, also includes the owning namespace in the view.
    """
    view = {
      'kind': _kinds()[self.kind_id],
      'metadata': json.loads(self.metadata_json),
      'ip': self.ip,
      'datetime': _format_date(self.datetime),
    }
    if self.performer_username:
      performer = AttrDict({'username': self.performer_username, 'email': self.performer_email})
      performer.robot = None
      if self.performer_robot:
        performer.robot = self.performer_robot
      view['performer'] = {
        'kind': 'user',
        'name': self.performer_username,
        'is_robot': self.performer_robot,
        'avatar': avatar.get_data_for_user(performer),
      }
    if include_namespace:
      if self.account_username:
        account = AttrDict({'username': self.account_username, 'email': self.account_email})
        # Organizations and users get different view kinds and avatar lookups.
        if self.account_organization:
          view['namespace'] = {
            'kind': 'org',
            'name': self.account_username,
            'avatar': avatar.get_data_for_org(account),
          }
        else:
          account.robot = None
          if self.account_robot:
            account.robot = self.account_robot
          view['namespace'] = {
            'kind': 'user',
            'name': self.account_username,
            'avatar': avatar.get_data_for_user(account),
          }
    return view
class AggregatedLogCount(namedtuple('AggregatedLogCount', ['kind_id', 'count', 'datetime'])):
  """ Represents the aggregated count of the number of logs, of a particular kind, on a day. """
  def to_dict(self):
    """ Renders this aggregated count as a dict view for the logs API. """
    return {
      'kind': _kinds()[self.kind_id],
      'count': self.count,
      'datetime': _format_date(self.datetime),
    }

View file

@@ -0,0 +1,62 @@
from abc import ABCMeta, abstractmethod
from six import add_metaclass
class LogsIterationTimeout(Exception):
  """ Exception raised if an iteration over logs (e.g. during export) times out. """
@add_metaclass(ABCMeta)
class ActionLogsDataInterface(object):
  """ Interface for code to work with the logs data model. The logs data model consists
      of all access for reading and writing action logs.
  """
  @abstractmethod
  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    """ Looks up all logs between the start_datetime and end_datetime, filtered
        by performer (a user), repository or namespace. Note that one (and only one) of the
        three can be specified. Returns a LogEntriesPage. filter_kinds, if specified, is a
        set/list of the kinds of logs to be filtered out of the results.
    """
  @abstractmethod
  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    """ Returns the aggregated count of logs, by kind, between the start_datetime and
        end_datetime, filtered by performer (a user), repository or namespace. Note that one
        (and only one) of the three can be specified. Returns a list of AggregatedLogCount.
    """
  @abstractmethod
  def count_repository_actions(self, repository, day):
    """ Returns the total number of repository actions over the given day, in the given
        repository, or None on error.
    """
  @abstractmethod
  def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
                        namespace_name=None, repository_name=None, callback_url=None,
                        callback_email=None, filter_kinds=None):
    """ Queues logs between the start_datetime and end_datetime, filtered by a repository or
        namespace, for export to the specified URL and/or email address. Returns the ID of
        the queued export job, or None on error.
    """
  @abstractmethod
  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None):
    """ Logs a single action as having taken place. """
  @abstractmethod
  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    """ Returns an iterator that yields bundles of all logs found between the start_datetime
        and end_datetime, optionally filtered by the repository or namespace. This function
        should be used for any bulk lookup operations, and should be implemented by
        implementors to put minimal strain on the backing storage for large operations. If
        there was an error in setting up, returns None.

        If max_query_time is specified, each iteration that yields a log bundle will have its
        queries run with a maximum timeout of that specified, and, if any exceed that
        threshold, LogsIterationTimeout will be raised instead of returning the logs bundle.
    """

View file

@@ -0,0 +1,227 @@
# pylint: disable=protected-access
import logging
import json
import uuid
from datetime import datetime, timedelta
from tzlocal import get_localzone
from dateutil.relativedelta import relativedelta
from data import model
from data.database import LogEntry, LogEntry2, LogEntry3
from data.logs_model.interface import ActionLogsDataInterface, LogsIterationTimeout
from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage, _format_date
logger = logging.getLogger(__name__)
# Bounds (in log-entry IDs) on the adaptive batch size used by yield_logs_for_export.
MINIMUM_RANGE_SIZE = 1000
MAXIMUM_RANGE_SIZE = 100000

# When a batch yields fewer rows than this, the export doubles its ID-range batch size.
EXPECTED_ITERATION_LOG_COUNT = 1000

# Log tables consulted for reads, newest first.
LOG_MODELS = [LogEntry3, LogEntry2, LogEntry]
class TableLogsModel(ActionLogsDataInterface):
  """
  TableLogsModel implements the data model for the logs API backed by a single table
  in the database.

  Reads consult the three log tables listed in LOG_MODELS (newest first); writes go
  through model.log.log_action.
  """
  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    """ Returns a LogEntriesPage of logs in the given range, filtered by performer,
        repository or namespace. Pagination spans the multiple log tables: when one
        table is exhausted, the returned token points at the next (older) table.
    """
    assert start_datetime is not None
    assert end_datetime is not None
    repository = None
    if repository_name and namespace_name:
      repository = model.repository.get_repository(namespace_name, repository_name)
    performer = None
    if performer_name:
      performer = model.user.get_user(performer_name)

    def get_logs(m):
      # Runs the filtered logs query against a single log table and paginates it.
      logs_query = model.log.get_logs_query(start_datetime, end_datetime, performer=performer,
                                            repository=repository, namespace=namespace_name,
                                            ignore=filter_kinds, model=m)
      logs, next_page_token = model.modelutil.paginate(logs_query, m,
                                                       descending=True, page_token=page_token,
                                                       limit=20,
                                                       max_page=max_page_count)
      return LogEntriesPage([Log.for_logentry(log) for log in logs], next_page_token)

    # First check the LogEntry3 table for the most recent logs, unless we've been expressly told
    # to look inside the other tables.
    TOKEN_TABLE_ID = 'tti'
    table_index = 0
    table_specified = page_token is not None and page_token.get(TOKEN_TABLE_ID) is not None
    if table_specified:
      table_index = page_token.get(TOKEN_TABLE_ID)
    page_result = get_logs(LOG_MODELS[table_index])
    # If the current table is exhausted, direct the next lookup at the following (older) table.
    if page_result.next_page_token is None and table_index < len(LOG_MODELS) - 1:
      page_result = page_result._replace(next_page_token={TOKEN_TABLE_ID: table_index + 1})
    return page_result

  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    """ Returns AggregatedLogCount entries (per kind, per day) over the given range,
        summed across all of the log tables. Raises if the range spans a month or
        more, since aggregation entries are keyed by day-of-month only.
    """
    if end_datetime - start_datetime >= timedelta(weeks=4):
      raise Exception('Cannot lookup aggregated logs over a period longer than a month')
    repository = None
    if repository_name and namespace_name:
      repository = model.repository.get_repository(namespace_name, repository_name)
    performer = None
    if performer_name:
      performer = model.user.get_user(performer_name)
    entries = {}
    for log_model in LOG_MODELS:
      aggregated = model.log.get_aggregated_logs(start_datetime, end_datetime,
                                                 performer=performer,
                                                 repository=repository,
                                                 namespace=namespace_name,
                                                 ignore=filter_kinds,
                                                 model=log_model)
      for entry in aggregated:
        # Rebuild a full date from the day-of-month returned by the aggregation; a day
        # earlier than the range's start day must belong to the following month.
        synthetic_date = datetime(start_datetime.year, start_datetime.month, int(entry.day),
                                  tzinfo=get_localzone())
        if synthetic_date.day < start_datetime.day:
          synthetic_date = synthetic_date + relativedelta(months=1)
        # Merge counts for the same (kind, day) coming from the different log tables.
        key = '%s-%s' % (entry.kind_id, entry.day)
        if key in entries:
          entries[key] = AggregatedLogCount(entry.kind_id, entry.count + entries[key].count,
                                            synthetic_date)
        else:
          entries[key] = AggregatedLogCount(entry.kind_id, entry.count, synthetic_date)
    return entries.values()

  def count_repository_actions(self, repository, day):
    """ Returns the total number of actions in the given repository over the given day. """
    return model.repositoryactioncount.count_repository_actions(repository, day)

  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None):
    """ Logs a single action as having taken place. repository_name may be given instead
        of repository, in which case the repository is looked up under namespace_name.
    """
    if repository_name is not None:
      assert repository is None
      assert namespace_name is not None
      repository = model.repository.get_repository(namespace_name, repository_name)
    model.log.log_action(kind_name, namespace_name, performer=performer, repository=repository,
                         ip=ip, metadata=metadata or {}, timestamp=timestamp)

  def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
                        namespace_name=None, repository_name=None, callback_url=None,
                        callback_email=None, filter_kinds=None):
    """ Queues an export of the logs in the given range for delivery to the given URL
        and/or email. Returns the ID of the queued export job, or None if the namespace
        or repository could not be found.

        NOTE(review): filter_kinds is accepted but not included in the queued work item;
        confirm whether the export worker is expected to receive it.
    """
    export_id = str(uuid.uuid4())
    namespace = model.user.get_namespace_user(namespace_name)
    if namespace is None:
      return None
    repository = None
    if repository_name is not None:
      repository = model.repository.get_repository(namespace_name, repository_name)
      if repository is None:
        return None
    export_action_logs_queue.put([namespace_name], json.dumps({
      'export_id': export_id,
      'repository_id': repository.id if repository else None,
      'namespace_id': namespace.id,
      'namespace_name': namespace.username,
      'repository_name': repository.name if repository else None,
      'start_time': _format_date(start_datetime),
      'end_time': _format_date(end_datetime),
      'callback_url': callback_url,
      'callback_email': callback_email,
    }), retries_remaining=3)
    return export_id

  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    """ Yields batches of Log tuples for all logs in the given range, walking the log
        table by ID in adaptively-sized batches. Raises LogsIterationTimeout if any
        individual lookup, or the overall operation, exceeds max_query_time.

        NOTE(review): the elapsed-time comparisons assume max_query_time is a timedelta;
        confirm callers never pass None here.
    """
    # Lookup the starting and ending IDs for the log range in the table. This operation is quite
    # quick, so we use it as a bounding box for the later lookups.
    min_id, elapsed = _run_and_time(lambda: model.log.get_minimum_id_for_logs(start_datetime,
                                                                              repository_id,
                                                                              namespace_id))
    if elapsed > max_query_time:
      logger.error('Retrieval of min ID for export logs `%s/%s` timed out with time of `%s`',
                   namespace_id, repository_id, elapsed)
      raise LogsIterationTimeout()

    max_id, elapsed = _run_and_time(lambda: model.log.get_maximum_id_for_logs(end_datetime,
                                                                              repository_id,
                                                                              namespace_id))
    if elapsed > max_query_time:
      logger.error('Retrieval of max ID for export logs `%s/%s` timed out with time of `%s`',
                   namespace_id, repository_id, elapsed)
      raise LogsIterationTimeout()

    # Default to ID 1 when the range is empty so the loop below terminates immediately.
    min_id = min_id or 1
    max_id = max_id or 1
    logger.info('Found log range of %s to %s for export logs `%s/%s`', min_id, max_id,
                namespace_id, repository_id)

    # Using an adjusting scale, start downloading log rows in batches, starting at
    # MINIMUM_RANGE_SIZE and doubling until we've reached EXPECTED_ITERATION_LOG_COUNT or
    # the lookup range has reached MAXIMUM_RANGE_SIZE. If at any point this operation takes
    # longer than the MAXIMUM_WORK_PERIOD_SECONDS, terminate the batch operation as timed out.
    batch_start_time = datetime.utcnow()
    current_start_id = min_id
    current_batch_size = MINIMUM_RANGE_SIZE
    while current_start_id <= max_id:
      # Verify we haven't been working for too long.
      work_elapsed = datetime.utcnow() - batch_start_time
      if work_elapsed > max_query_time:
        logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`',
                     namespace_id, repository_id, work_elapsed)
        raise LogsIterationTimeout()

      # Inclusive-start, inclusive-end ID window for this batch, clamped to max_id.
      id_range = [current_start_id, min(max_id, current_start_id + current_batch_size)]

      # Load the next set of logs.
      def load_logs():
        logger.debug('Retrieving logs over range %s with namespace %s and repository %s',
                     id_range, namespace_id, repository_id)
        logs_query = model.log.get_logs_query(namespace=namespace_id,
                                              repository=repository_id,
                                              id_range=id_range)
        return [Log.for_logentry(log) for log in logs_query]

      logs, elapsed = _run_and_time(load_logs)
      if elapsed > max_query_time:
        logger.error('Retrieval of logs for export logs `%s/%s` with range `%s` timed out at `%s`',
                     namespace_id, repository_id, id_range, elapsed)
        raise LogsIterationTimeout()

      yield logs

      # Move forward.
      current_start_id = id_range[1] + 1

      # Increase the batch size if necessary.
      if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
        current_batch_size = min(MAXIMUM_RANGE_SIZE, current_batch_size * 2)
def _run_and_time(fn):
start_time = datetime.utcnow()
result = fn()
return result, datetime.utcnow() - start_time
# Shared singleton instance of the table-backed logs model.
table_logs_model = TableLogsModel()

View file

@@ -44,30 +44,34 @@ def find_uncounted_repository():
return None
def count_repository_actions(to_count):
""" Aggregates repository actions from the LogEntry table for the last day and writes them to
the RepositoryActionCount table. Return True if the repository was updated and False
otherwise.
def count_repository_actions(to_count, day):
""" Aggregates repository actions from the LogEntry table for the specified day. Returns the
count or None on error.
"""
today = date.today()
yesterday = today - timedelta(days=1)
# TODO(LogMigrate): Remove the branch once we're back on a single table.
def lookup_action_count(model):
return (model
.select()
.where(model.repository == to_count,
model.datetime >= yesterday,
model.datetime < today)
model.datetime >= day,
model.datetime < (day + timedelta(days=1)))
.count())
actions = (lookup_action_count(LogEntry3) + lookup_action_count(LogEntry2) +
lookup_action_count(LogEntry))
return actions
def store_repository_action_count(repository, day, action_count):
""" Stores the action count for a repository for a specific day. Returns False if the
repository already has an entry for the specified day.
"""
try:
RepositoryActionCount.create(repository=to_count, date=yesterday, count=actions)
RepositoryActionCount.create(repository=repository, date=day, count=action_count)
return True
except IntegrityError:
logger.debug('Count already written for repository %s', to_count.id)
logger.debug('Count already written for repository %s', repository.id)
return False