quay/data/logs_model/table_logs_model.py

# pylint: disable=protected-access
import logging
from datetime import datetime, timedelta
from tzlocal import get_localzone
from dateutil.relativedelta import relativedelta
from data import model
from data.model import config
from data.database import LogEntry, LogEntry2, LogEntry3, UseThenDisconnect
from data.logs_model.interface import ActionLogsDataInterface, LogsIterationTimeout, \
LogRotationContextInterface
from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage
from data.logs_model.shared import SharedModel
from data.model.log import get_stale_logs, get_stale_logs_start_id, delete_stale_logs

logger = logging.getLogger(__name__)

MINIMUM_RANGE_SIZE = 1 # second
MAXIMUM_RANGE_SIZE = 60 * 60 * 24 * 30 # seconds ~= 1 month
EXPECTED_ITERATION_LOG_COUNT = 1000
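
# Log tables, newest first. Lookups start with the newest table and fall back to the
# older tables only when a page of results turns up empty.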
LOG_MODELS = [LogEntry3, LogEntry2, LogEntry]


class TableLogsModel(SharedModel, ActionLogsDataInterface):
"""
TableLogsModel implements the data model for the logs API backed by a single table
in the database.
"""
def __init__(self, should_skip_logging=None, **kwargs):
self._should_skip_logging = should_skip_logging
def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
if filter_kinds is not None:
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
assert start_datetime is not None
assert end_datetime is not None
repository = None
if repository_name and namespace_name:
repository = model.repository.get_repository(namespace_name, repository_name)
assert repository
performer = None
if performer_name:
performer = model.user.get_user(performer_name)
assert performer
def get_logs(m, page_token):
logs_query = model.log.get_logs_query(start_datetime, end_datetime, performer=performer,
repository=repository, namespace=namespace_name,
ignore=filter_kinds, model=m)
logs, next_page_token = model.modelutil.paginate(logs_query, m,
descending=True,
page_token=page_token,
limit=20,
max_page=max_page_count,
sort_field_name='datetime')
return logs, next_page_token
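
    # The returned page token also records which log table the page came from, so a
    # follow-up request can resume iteration against the correct (possibly older) table.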
TOKEN_TABLE_ID = 'tti'
table_index = 0
logs = []
next_page_token = page_token or None
# Skip empty pages (empty table)
while len(logs) == 0 and table_index < len(LOG_MODELS) - 1:
table_specified = next_page_token is not None and next_page_token.get(TOKEN_TABLE_ID) is not None
if table_specified:
table_index = next_page_token.get(TOKEN_TABLE_ID)
logs_result, next_page_token = get_logs(LOG_MODELS[table_index], next_page_token)
logs.extend(logs_result)
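
      # When the current table is exhausted, point the token at the next (older) table so
      # the following page continues there.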
if next_page_token is None and table_index < len(LOG_MODELS) - 1:
next_page_token = {TOKEN_TABLE_ID: table_index + 1}
return LogEntriesPage([Log.for_logentry(log) for log in logs], next_page_token)
def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
filter_kinds=None, size=20):
if filter_kinds is not None:
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
repository = None
if repository_name and namespace_name:
repository = model.repository.get_repository(namespace_name, repository_name)
assert repository
performer = None
if performer_name:
performer = model.user.get_user(performer_name)
assert performer
def get_latest_logs(m):
logs_query = model.log.get_latest_logs_query(performer=performer, repository=repository,
namespace=namespace_name, ignore=filter_kinds,
model=m, size=size)
logs = list(logs_query)
return [Log.for_logentry(log) for log in logs]
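
    # Only the newest log table is consulted for the latest entries.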
return get_latest_logs(LOG_MODELS[0])
def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
repository_name=None, namespace_name=None, filter_kinds=None):
if filter_kinds is not None:
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
if end_datetime - start_datetime >= timedelta(weeks=4):
raise Exception('Cannot lookup aggregated logs over a period longer than a month')
repository = None
if repository_name and namespace_name:
repository = model.repository.get_repository(namespace_name, repository_name)
performer = None
if performer_name:
performer = model.user.get_user(performer_name)
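
    # Aggregate across all log tables, merging counts that share the same kind and day.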
entries = {}
for log_model in LOG_MODELS:
aggregated = model.log.get_aggregated_logs(start_datetime, end_datetime,
performer=performer,
repository=repository,
namespace=namespace_name,
ignore=filter_kinds,
model=log_model)
for entry in aggregated:
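        # Aggregated rows carry only a day-of-month. Since the range is under a month long,
        # a day smaller than the start day must belong to the month after start_datetime.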
synthetic_date = datetime(start_datetime.year, start_datetime.month, int(entry.day),
tzinfo=get_localzone())
if synthetic_date.day < start_datetime.day:
synthetic_date = synthetic_date + relativedelta(months=1)
key = '%s-%s' % (entry.kind_id, entry.day)
if key in entries:
entries[key] = AggregatedLogCount(entry.kind_id, entry.count + entries[key].count,
synthetic_date)
else:
entries[key] = AggregatedLogCount(entry.kind_id, entry.count, synthetic_date)
return entries.values()
def count_repository_actions(self, repository, day):
return model.repositoryactioncount.count_repository_actions(repository, day)
def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
if self._should_skip_logging and self._should_skip_logging(kind_name, namespace_name,
is_free_namespace):
return
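
    # Callers supply either a repository object or a repository_name (plus namespace_name),
    # never both.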
if repository_name is not None:
assert repository is None
assert namespace_name is not None
repository = model.repository.get_repository(namespace_name, repository_name)
model.log.log_action(kind_name, namespace_name, performer=performer, repository=repository,
ip=ip, metadata=metadata or {}, timestamp=timestamp)
def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
namespace_id=None, max_query_time=None):
    # Using an adjusting scale, download log rows in batches: start with a range of
    # MINIMUM_RANGE_SIZE seconds and double it until a batch yields at least
    # EXPECTED_ITERATION_LOG_COUNT rows or the range reaches MAXIMUM_RANGE_SIZE. If at any
    # point this takes longer than max_query_time, abort the export as timed out.
batch_start_time = datetime.utcnow()
current_start_datetime = start_datetime
current_batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE)
while current_start_datetime < end_datetime:
# Verify we haven't been working for too long.
work_elapsed = datetime.utcnow() - batch_start_time
if max_query_time is not None and work_elapsed > max_query_time:
logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`',
namespace_id, repository_id, work_elapsed)
raise LogsIterationTimeout()
current_end_datetime = current_start_datetime + current_batch_size
current_end_datetime = min(current_end_datetime, end_datetime)
# Load the next set of logs.
def load_logs():
logger.debug('Retrieving logs over range %s -> %s with namespace %s and repository %s',
current_start_datetime, current_end_datetime, namespace_id, repository_id)
logs_query = model.log.get_logs_query(namespace=namespace_id,
repository=repository_id,
start_time=current_start_datetime,
end_time=current_end_datetime)
logs = list(logs_query)
for log in logs:
if namespace_id is not None:
assert log.account_id == namespace_id
if repository_id is not None:
assert log.repository_id == repository_id
logs = [Log.for_logentry(log) for log in logs]
return logs
logs, elapsed = _run_and_time(load_logs)
if max_query_time is not None and elapsed > max_query_time:
logger.error('Retrieval of logs for export `%s/%s` with range `%s-%s` timed out at `%s`',
namespace_id, repository_id, current_start_datetime, current_end_datetime,
elapsed)
raise LogsIterationTimeout()
yield logs
# Move forward.
current_start_datetime = current_end_datetime
# Increase the batch size if necessary.
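      # (e.g. 1s -> 2s -> 4s -> ... for sparse ranges, capped at MAXIMUM_RANGE_SIZE.)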
if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
seconds = min(MAXIMUM_RANGE_SIZE, current_batch_size.total_seconds() * 2)
current_batch_size = timedelta(seconds=seconds)
def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
""" Yield a context manager for a group of outdated logs. """
for log_model in LOG_MODELS:
while True:
with UseThenDisconnect(config.app_config):
start_id = get_stale_logs_start_id(log_model)
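
          # Without a starting ID there is nothing left to examine in this table; fall
          # through to the next (older) log table.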
if start_id is None:
logger.warning('Failed to find start id')
break
logger.debug('Found starting ID %s', start_id)
lookup_end_id = start_id + min_logs_per_rotation
logs = [log for log in get_stale_logs(start_id, lookup_end_id,
log_model, cutoff_date)]
if not logs:
logger.debug('No further logs found')
break
end_id = max([log.id for log in logs])
context = DatabaseLogRotationContext(logs, log_model, start_id, end_id)
yield context


def _run_and_time(fn):
start_time = datetime.utcnow()
result = fn()
return result, datetime.utcnow() - start_time


table_logs_model = TableLogsModel()


class DatabaseLogRotationContext(LogRotationContextInterface):
"""
DatabaseLogRotationContext represents a batch of logs to be archived together.
  i.e. a set of logs to be archived in the same file (based on the number of logs per rotation).
When completed without exceptions, this context will delete the stale logs
from rows `start_id` to `end_id`.
"""
def __init__(self, logs, log_model, start_id, end_id):
self.logs = logs
self.log_model = log_model
self.start_id = start_id
self.end_id = end_id
def __enter__(self):
return self
def __exit__(self, ex_type, ex_value, ex_traceback):
if ex_type is None and ex_value is None and ex_traceback is None:
with UseThenDisconnect(config.app_config):
logger.debug('Deleting logs from IDs %s to %s', self.start_id, self.end_id)
delete_stale_logs(self.start_id, self.end_id, self.log_model)
def yield_logs_batch(self):
""" Yield a batch of logs and a filename for that batch. """
filename = '%d-%d-%s.txt.gz' % (self.start_id, self.end_id,
self.log_model.__name__.lower())
yield self.logs, filename
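

# A minimal sketch of how a rotation worker might consume the contexts above (the
# `archive` call and the `cutoff` value are illustrative placeholders, not part of this module):
#
#   cutoff = datetime.utcnow() - timedelta(days=30)
#   for ctx in table_logs_model.yield_log_rotation_context(cutoff, min_logs_per_rotation=10000):
#     with ctx as batch:
#       for logs, filename in batch.yield_logs_batch():
#         archive(logs, filename)  # e.g. upload the batch before the rows are deleted on exit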