Merge pull request #3323 from quay/joseph.schorr/QUAY-1282/log-interfacing
Interface out all action log data model operations
Commit 9f09d68ad8
26 changed files with 714 additions and 902 deletions
20  data/logs_model/__init__.py  Normal file
@@ -0,0 +1,20 @@
import os
import logging

from data.logs_model.table_logs_model import table_logs_model

logger = logging.getLogger(__name__)


class LogsModelProxy(object):
  def __init__(self):
    self._model = table_logs_model

  def __getattr__(self, attr):
    return getattr(self._model, attr)


logs_model = LogsModelProxy()
logger.info('===============================')
logger.info('Using logs model `%s`', logs_model._model)
logger.info('===============================')
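For callers, the proxy means a single import site regardless of which backend is wired in; attribute access is forwarded to the concrete model through `__getattr__`. A minimal usage sketch (the kind name, namespace, repository, and metadata values below are illustrative, not taken from this PR):

from data.logs_model import logs_model

# Forwards to TableLogsModel.log_action through the proxy.
logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
                      ip='127.0.0.1', metadata={'tag': 'latest'})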
120  data/logs_model/datatypes.py  Normal file
@@ -0,0 +1,120 @@
import json

from calendar import timegm
from collections import namedtuple
from email.utils import formatdate

from cachetools import lru_cache

from data import model
from util.morecollections import AttrDict


def _format_date(date):
  """ Output an RFC822 date format. """
  if date is None:
    return None

  return formatdate(timegm(date.utctimetuple()))


@lru_cache(maxsize=1)
def _kinds():
  return model.log.get_log_entry_kinds()


class LogEntriesPage(namedtuple('LogEntriesPage', ['logs', 'next_page_token'])):
  """ Represents a page returned by the lookup_logs call. The `logs` contains the logs
      found for the page and `next_page_token`, if not None, contains the token to be
      encoded and returned for the followup call.
  """


class Log(namedtuple('Log', [
    'metadata_json', 'ip', 'datetime', 'performer_email', 'performer_username', 'performer_robot',
    'account_organization', 'account_username', 'account_email', 'account_robot', 'kind_id'])):
  """ Represents a single log entry returned by the logs model. """

  @classmethod
  def for_logentry(cls, log):
    account_organization = None
    account_username = None
    account_email = None
    account_robot = None
    try:
      account_organization = log.account.organization
      account_username = log.account.username
      account_email = log.account.email
      account_robot = log.account.robot
    except AttributeError:
      pass

    performer_robot = None
    performer_username = None
    performer_email = None

    try:
      performer_robot = log.performer.robot
      performer_username = log.performer.username
      performer_email = log.performer.email
    except AttributeError:
      pass

    return Log(log.metadata_json, log.ip, log.datetime, performer_email, performer_username,
               performer_robot, account_organization, account_username,
               account_email, account_robot, log.kind_id)

  def to_dict(self, avatar, include_namespace=False):
    view = {
      'kind': _kinds()[self.kind_id],
      'metadata': json.loads(self.metadata_json),
      'ip': self.ip,
      'datetime': _format_date(self.datetime),
    }

    if self.performer_username:
      performer = AttrDict({'username': self.performer_username, 'email': self.performer_email})
      performer.robot = None
      if self.performer_robot:
        performer.robot = self.performer_robot

      view['performer'] = {
        'kind': 'user',
        'name': self.performer_username,
        'is_robot': self.performer_robot,
        'avatar': avatar.get_data_for_user(performer),
      }

    if include_namespace:
      if self.account_username:
        account = AttrDict({'username': self.account_username, 'email': self.account_email})
        if self.account_organization:

          view['namespace'] = {
            'kind': 'org',
            'name': self.account_username,
            'avatar': avatar.get_data_for_org(account),
          }
        else:
          account.robot = None
          if self.account_robot:
            account.robot = self.account_robot
          view['namespace'] = {
            'kind': 'user',
            'name': self.account_username,
            'avatar': avatar.get_data_for_user(account),
          }

    return view


class AggregatedLogCount(namedtuple('AggregatedLogCount', ['kind_id', 'count', 'datetime'])):
  """ Represents the aggregated count of the number of logs, of a particular kind, on a day. """
  def to_dict(self):
    view = {
      'kind': _kinds()[self.kind_id],
      'count': self.count,
      'datetime': _format_date(self.datetime),
    }

    return view
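As a quick illustration of how these datatypes flow through an API handler, the sketch below converts a database row into a serializable view. `db_row` and `avatar` are hypothetical stand-ins (a LogEntry-style row object and the application's avatar helper); neither name is introduced by this PR:

# Hypothetical: `db_row` is any object shaped like a LogEntry row and `avatar`
# is the application's avatar generator; both come from outside this module.
log = Log.for_logentry(db_row)    # flattens performer/account into plain fields
page = LogEntriesPage(logs=[log], next_page_token=None)

views = [entry.to_dict(avatar, include_namespace=True) for entry in page.logs]
# Each view contains 'kind', 'metadata', 'ip' and 'datetime', plus optional
# 'performer' and 'namespace' sub-dicts when that information is present.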
62  data/logs_model/interface.py  Normal file
@@ -0,0 +1,62 @@
from abc import ABCMeta, abstractmethod
from six import add_metaclass

class LogsIterationTimeout(Exception):
  """ Exception raised if logs iteration times out. """


@add_metaclass(ABCMeta)
class ActionLogsDataInterface(object):
  """ Interface for code to work with the logs data model. The logs data model consists
      of all access for reading and writing action logs.
  """
  @abstractmethod
  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    """ Looks up all logs between the start_datetime and end_datetime, filtered
        by performer (a user), repository or namespace. Note that one (and only one) of the three
        can be specified. Returns a LogEntriesPage. `filter_kinds`, if specified, is a set/list of
        the kinds of logs to filter out.
    """

  @abstractmethod
  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    """ Returns the aggregated count of logs, by kind, between the start_datetime and end_datetime,
        filtered by performer (a user), repository or namespace. Note that one (and only one) of
        the three can be specified. Returns a list of AggregatedLogCount.
    """

  @abstractmethod
  def count_repository_actions(self, repository, day):
    """ Returns the total number of repository actions over the given day, in the given repository,
        or None on error.
    """

  @abstractmethod
  def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
                        namespace_name=None, repository_name=None, callback_url=None,
                        callback_email=None, filter_kinds=None):
    """ Queues logs between the start_datetime and end_datetime, filtered by a repository or
        namespace, for export to the specified URL and/or email address. Returns the ID of the
        queued export job or None on error.
    """

  @abstractmethod
  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None):
    """ Logs a single action as having taken place. """

  @abstractmethod
  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    """ Returns an iterator that yields bundles of all logs found between the start_datetime and
        end_datetime, optionally filtered by the repository or namespace. This function should be
        used for any bulk lookup operations; implementations should put minimal strain on the
        backing storage for large operations. If there was an error in setting up, returns None.

        If max_query_time is specified, each iteration that yields a log bundle will have its
        queries run with a maximum timeout of that specified, and, if any exceed that threshold,
        LogsIterationTimeout will be raised instead of returning the logs bundle.
    """
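To make the contract concrete, here is a rough sketch of an alternate backend satisfying the same interface. It is not part of this PR; the class name, the in-memory list, and the dict-shaped records are purely hypothetical (a real backend would return Log tuples and honor every filter):

from datetime import datetime

from data.logs_model.datatypes import LogEntriesPage
from data.logs_model.interface import ActionLogsDataInterface


class InMemoryLogsModel(ActionLogsDataInterface):
  """ Hypothetical test double: keeps log records in a list instead of the database. """

  def __init__(self):
    self._records = []

  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None):
    self._records.append({'kind': kind_name, 'namespace': namespace_name, 'ip': ip,
                          'metadata': metadata or {},
                          'datetime': timestamp or datetime.utcnow()})

  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    # Plain dicts keep the sketch short; a real implementation would return Log tuples.
    found = [r for r in self._records if start_datetime <= r['datetime'] <= end_datetime]
    return LogEntriesPage(found, None)

  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    return []

  def count_repository_actions(self, repository, day):
    return 0

  def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
                        namespace_name=None, repository_name=None, callback_url=None,
                        callback_email=None, filter_kinds=None):
    return None

  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    yield list(self._records)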
227  data/logs_model/table_logs_model.py  Normal file
@@ -0,0 +1,227 @@
# pylint: disable=protected-access

import logging
import json
import uuid

from datetime import datetime, timedelta

from tzlocal import get_localzone
from dateutil.relativedelta import relativedelta

from data import model
from data.database import LogEntry, LogEntry2, LogEntry3
from data.logs_model.interface import ActionLogsDataInterface, LogsIterationTimeout
from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage, _format_date

logger = logging.getLogger(__name__)

MINIMUM_RANGE_SIZE = 1000
MAXIMUM_RANGE_SIZE = 100000
EXPECTED_ITERATION_LOG_COUNT = 1000


LOG_MODELS = [LogEntry3, LogEntry2, LogEntry]


class TableLogsModel(ActionLogsDataInterface):
  """
  TableLogsModel implements the data model for the logs API backed by a single table
  in the database.
  """
  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    assert start_datetime is not None
    assert end_datetime is not None

    repository = None
    if repository_name and namespace_name:
      repository = model.repository.get_repository(namespace_name, repository_name)

    performer = None
    if performer_name:
      performer = model.user.get_user(performer_name)

    def get_logs(m):
      logs_query = model.log.get_logs_query(start_datetime, end_datetime, performer=performer,
                                            repository=repository, namespace=namespace_name,
                                            ignore=filter_kinds, model=m)

      logs, next_page_token = model.modelutil.paginate(logs_query, m,
                                                       descending=True, page_token=page_token,
                                                       limit=20,
                                                       max_page=max_page_count)
      return LogEntriesPage([Log.for_logentry(log) for log in logs], next_page_token)

    # First check the LogEntry3 table for the most recent logs, unless we've been expressly told
    # to look inside the other tables.
    TOKEN_TABLE_ID = 'tti'

    table_index = 0
    table_specified = page_token is not None and page_token.get(TOKEN_TABLE_ID) is not None
    if table_specified:
      table_index = page_token.get(TOKEN_TABLE_ID)

    page_result = get_logs(LOG_MODELS[table_index])
    if page_result.next_page_token is None and table_index < len(LOG_MODELS) - 1:
      page_result = page_result._replace(next_page_token={TOKEN_TABLE_ID: table_index + 1})

    return page_result

  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    if end_datetime - start_datetime >= timedelta(weeks=4):
      raise Exception('Cannot lookup aggregated logs over a period longer than a month')

    repository = None
    if repository_name and namespace_name:
      repository = model.repository.get_repository(namespace_name, repository_name)

    performer = None
    if performer_name:
      performer = model.user.get_user(performer_name)

    entries = {}
    for log_model in LOG_MODELS:
      aggregated = model.log.get_aggregated_logs(start_datetime, end_datetime,
                                                 performer=performer,
                                                 repository=repository,
                                                 namespace=namespace_name,
                                                 ignore=filter_kinds,
                                                 model=log_model)

      for entry in aggregated:
        synthetic_date = datetime(start_datetime.year, start_datetime.month, int(entry.day),
                                  tzinfo=get_localzone())
        if synthetic_date.day < start_datetime.day:
          synthetic_date = synthetic_date + relativedelta(months=1)

        key = '%s-%s' % (entry.kind_id, entry.day)

        if key in entries:
          entries[key] = AggregatedLogCount(entry.kind_id, entry.count + entries[key].count,
                                            synthetic_date)
        else:
          entries[key] = AggregatedLogCount(entry.kind_id, entry.count, synthetic_date)

    return entries.values()

  def count_repository_actions(self, repository, day):
    return model.repositoryactioncount.count_repository_actions(repository, day)

  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None):
    if repository_name is not None:
      assert repository is None
      assert namespace_name is not None
      repository = model.repository.get_repository(namespace_name, repository_name)

    model.log.log_action(kind_name, namespace_name, performer=performer, repository=repository,
                         ip=ip, metadata=metadata or {}, timestamp=timestamp)

  def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
                        namespace_name=None, repository_name=None, callback_url=None,
                        callback_email=None, filter_kinds=None):
    export_id = str(uuid.uuid4())
    namespace = model.user.get_namespace_user(namespace_name)
    if namespace is None:
      return None

    repository = None
    if repository_name is not None:
      repository = model.repository.get_repository(namespace_name, repository_name)
      if repository is None:
        return None

    export_action_logs_queue.put([namespace_name], json.dumps({
      'export_id': export_id,
      'repository_id': repository.id if repository else None,
      'namespace_id': namespace.id,
      'namespace_name': namespace.username,
      'repository_name': repository.name if repository else None,
      'start_time': _format_date(start_datetime),
      'end_time': _format_date(end_datetime),
      'callback_url': callback_url,
      'callback_email': callback_email,
    }), retries_remaining=3)

    return export_id

  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    # Lookup the starting and ending IDs for the log range in the table. This operation is quite
    # quick, so we use it as a bounding box for the later lookups.
    min_id, elapsed = _run_and_time(lambda: model.log.get_minimum_id_for_logs(start_datetime,
                                                                              repository_id,
                                                                              namespace_id))
    if elapsed > max_query_time:
      logger.error('Retrieval of min ID for export logs `%s/%s` timed out with time of `%s`',
                   namespace_id, repository_id, elapsed)
      raise LogsIterationTimeout()

    max_id, elapsed = _run_and_time(lambda: model.log.get_maximum_id_for_logs(end_datetime,
                                                                              repository_id,
                                                                              namespace_id))
    if elapsed > max_query_time:
      logger.error('Retrieval of max ID for export logs `%s/%s` timed out with time of `%s`',
                   namespace_id, repository_id, elapsed)
      raise LogsIterationTimeout()

    min_id = min_id or 1
    max_id = max_id or 1

    logger.info('Found log range of %s to %s for export logs `%s/%s`', min_id, max_id,
                namespace_id, repository_id)

    # Using an adjusting scale, start downloading log rows in batches, starting at
    # MINIMUM_RANGE_SIZE and doubling until we've reached EXPECTED_ITERATION_LOG_COUNT or
    # the lookup range has reached MAXIMUM_RANGE_SIZE. If at any point this operation takes
    # longer than the MAXIMUM_WORK_PERIOD_SECONDS, terminate the batch operation as timed out.
    batch_start_time = datetime.utcnow()

    current_start_id = min_id
    current_batch_size = MINIMUM_RANGE_SIZE

    while current_start_id <= max_id:
      # Verify we haven't been working for too long.
      work_elapsed = datetime.utcnow() - batch_start_time
      if work_elapsed > max_query_time:
        logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`',
                     namespace_id, repository_id, work_elapsed)
        raise LogsIterationTimeout()

      id_range = [current_start_id, min(max_id, current_start_id + current_batch_size)]

      # Load the next set of logs.
      def load_logs():
        logger.debug('Retrieving logs over range %s with namespace %s and repository %s',
                     id_range, namespace_id, repository_id)

        logs_query = model.log.get_logs_query(namespace=namespace_id,
                                              repository=repository_id,
                                              id_range=id_range)
        return [Log.for_logentry(log) for log in logs_query]

      logs, elapsed = _run_and_time(load_logs)
      if elapsed > max_query_time:
        logger.error('Retrieval of logs for export logs `%s/%s` with range `%s` timed out at `%s`',
                     namespace_id, repository_id, id_range, elapsed)
        raise LogsIterationTimeout()

      yield logs

      # Move forward.
      current_start_id = id_range[1] + 1

      # Increase the batch size if necessary.
      if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
        current_batch_size = min(MAXIMUM_RANGE_SIZE, current_batch_size * 2)


def _run_and_time(fn):
  start_time = datetime.utcnow()
  result = fn()
  return result, datetime.utcnow() - start_time


table_logs_model = TableLogsModel()
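One behavior worth noting from lookup_logs: when a page in LogEntry3 is exhausted, the returned token carries a table index ('tti'), so the next call transparently falls through to LogEntry2 and then LogEntry. A rough sketch of a caller draining every page; the date range and namespace below are illustrative only:

from datetime import datetime, timedelta

from data.logs_model import logs_model

end = datetime.utcnow()
start = end - timedelta(days=7)

page_token = None
while True:
  page = logs_model.lookup_logs(start, end, namespace_name='devtable', page_token=page_token)
  for log in page.logs:
    pass  # each entry is a Log namedtuple; render with log.to_dict(...) as needed

  page_token = page.next_page_token
  if page_token is None:
    break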
@@ -44,30 +44,34 @@ def find_uncounted_repository():
   return None


-def count_repository_actions(to_count):
-  """ Aggregates repository actions from the LogEntry table for the last day and writes them to
-      the RepositoryActionCount table. Return True if the repository was updated and False
-      otherwise.
+def count_repository_actions(to_count, day):
+  """ Aggregates repository actions from the LogEntry table for the specified day. Returns the
+      count or None on error.
   """
-  today = date.today()
-  yesterday = today - timedelta(days=1)
-
   # TODO(LogMigrate): Remove the branch once we're back on a single table.
   def lookup_action_count(model):
     return (model
             .select()
             .where(model.repository == to_count,
-                   model.datetime >= yesterday,
-                   model.datetime < today)
+                   model.datetime >= day,
+                   model.datetime < (day + timedelta(days=1)))
             .count())

   actions = (lookup_action_count(LogEntry3) + lookup_action_count(LogEntry2) +
              lookup_action_count(LogEntry))
+
+  return actions
+
+
+def store_repository_action_count(repository, day, action_count):
+  """ Stores the action count for a repository for a specific day. Returns False if the
+      repository already has an entry for the specified day.
+  """
   try:
-    RepositoryActionCount.create(repository=to_count, date=yesterday, count=actions)
+    RepositoryActionCount.create(repository=repository, date=day, count=action_count)
     return True
   except IntegrityError:
-    logger.debug('Count already written for repository %s', to_count.id)
+    logger.debug('Count already written for repository %s', repository.id)
     return False
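With counting and writing now split, a worker can compute a day's count and persist it as two separate steps. A hedged sketch of how the two calls combine; the import path, the `count_and_store` helper, and the `repo` object are assumptions for illustration and are not shown in this hunk:

from datetime import date, timedelta

# Assumed import path; the hunk above does not name its containing file.
from data.model import repositoryactioncount

def count_and_store(repo):
  """ Compute yesterday's action count for `repo` and record it once. """
  yesterday = date.today() - timedelta(days=1)
  count = repositoryactioncount.count_repository_actions(repo, yesterday)
  if count is not None:
    return repositoryactioncount.store_repository_action_count(repo, yesterday, count)
  return False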