# quay/data/logs_model/document_logs_model.py
# pylint: disable=protected-access
import json
import logging
import uuid
from time import time
from datetime import timedelta, datetime, date
from dateutil.parser import parse as parse_datetime
from abc import ABCMeta, abstractmethod
from six import add_metaclass
from elasticsearch.exceptions import ConnectionTimeout, NotFoundError
from data import model
from data.database import CloseForLongOperation
from data.model import config
from data.model.log import (_json_serialize, ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING,
DataModelException)
from data.logs_model.elastic_logs import LogEntry, configure_es
from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage
from data.logs_model.interface import (ActionLogsDataInterface, LogRotationContextInterface,
LogsIterationTimeout)
from data.logs_model.shared import SharedModel, epoch_ms
from data.logs_model.logs_producer import LogProducerProxy, LogSendException
from data.logs_model.logs_producer.kafka_logs_producer import KafkaLogsProducer
from data.logs_model.logs_producer.elasticsearch_logs_producer import ElasticsearchLogsProducer
from data.logs_model.logs_producer.kinesis_stream_logs_producer import KinesisStreamLogsProducer
logger = logging.getLogger(__name__)
PAGE_SIZE = 20
DEFAULT_RESULT_WINDOW = 5000
MAX_RESULT_WINDOW = 10000
# DATE_RANGE_LIMIT limits the query datetime range to at most 1 month.
DATE_RANGE_LIMIT = 32
# Timeout for count_repository_actions
COUNT_REPOSITORY_ACTION_TIMEOUT = 30
def _date_range_descending(start_datetime, end_datetime, includes_end_datetime=False):
""" Generate the dates between `end_datetime` and `start_datetime`.
If `includes_end_datetime` is set, the generator starts at `end_datetime`,
otherwise, starts the generator at `end_datetime` minus 1 second.
"""
assert end_datetime >= start_datetime
start_date = start_datetime.date()
if includes_end_datetime:
current_date = end_datetime.date()
else:
current_date = (end_datetime - timedelta(seconds=1)).date()
while current_date >= start_date:
yield current_date
current_date = current_date - timedelta(days=1)
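# For example, given the generator above:
#   list(_date_range_descending(datetime(2019, 11, 1), datetime(2019, 11, 3)))
#     -> [date(2019, 11, 2), date(2019, 11, 1)]
#   list(_date_range_descending(datetime(2019, 11, 1), datetime(2019, 11, 3), includes_end_datetime=True))
#     -> [date(2019, 11, 3), date(2019, 11, 2), date(2019, 11, 1)]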
def _date_range_in_single_index(dt1, dt2):
""" Determine whether a single index can be searched given a range
of dates or datetimes. If date instances are given, difference should be 1 day.
NOTE: dt2 is exclusive to the search result set.
i.e. The date range is larger or equal to dt1 and strictly smaller than dt2
"""
assert isinstance(dt1, date) and isinstance(dt2, date)
dt = dt2 - dt1
# Check if date or datetime
if not isinstance(dt1, datetime) and not isinstance(dt2, datetime):
return dt == timedelta(days=1)
if dt < timedelta(days=1) and dt >= timedelta(days=0):
return dt2.day == dt1.day
# Check if the datetimes can be interpreted as dates: hour, minute, second and microsecond all set to 0
if dt == timedelta(days=1):
return dt1.hour == 0 and dt1.minute == 0 and dt1.second == 0 and dt1.microsecond == 0
return False
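# For example, with the checks above:
#   _date_range_in_single_index(date(2019, 11, 1), date(2019, 11, 2))                -> True
#   _date_range_in_single_index(datetime(2019, 11, 1), datetime(2019, 11, 2))        -> True
#   _date_range_in_single_index(datetime(2019, 11, 1, 6), datetime(2019, 11, 1, 18)) -> True
#   _date_range_in_single_index(datetime(2019, 11, 1, 12), datetime(2019, 11, 2, 12)) -> False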
def _for_elasticsearch_logs(logs, repository_id=None, namespace_id=None):
namespace_ids = set()
for log in logs:
namespace_ids.add(log.account_id)
namespace_ids.add(log.performer_id)
assert namespace_id is None or log.account_id == namespace_id
assert repository_id is None or log.repository_id == repository_id
id_user_map = model.user.get_user_map_by_ids(namespace_ids)
return [Log.for_elasticsearch_log(log, id_user_map) for log in logs]
def _random_id():
""" Generates a unique uuid4 string for the random_id field in LogEntry.
It is used as a tie-breaker for sorting logs based on datetime:
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-after.html
"""
return str(uuid.uuid4())
@add_metaclass(ABCMeta)
class ElasticsearchLogsModelInterface(object):
"""
Interface for Elasticsearch specific operations with the logs model.
These operations are usually index based.
"""
@abstractmethod
def can_delete_index(self, index, cutoff_date):
""" Return whether the given index is older than the given cutoff date. """
@abstractmethod
def list_indices(self):
""" List the logs model's indices. """
class DocumentLogsModel(SharedModel, ActionLogsDataInterface, ElasticsearchLogsModelInterface):
"""
DocumentLogsModel implements the data model for the logs API backed by an
elasticsearch service.
"""
def __init__(self, should_skip_logging=None, elasticsearch_config=None, producer=None, **kwargs):
self._should_skip_logging = should_skip_logging
self._logs_producer = LogProducerProxy()
self._es_client = configure_es(**elasticsearch_config)
if producer == 'kafka':
kafka_config = kwargs['kafka_config']
self._logs_producer.initialize(KafkaLogsProducer(**kafka_config))
elif producer == 'elasticsearch':
self._logs_producer.initialize(ElasticsearchLogsProducer())
elif producer == 'kinesis_stream':
kinesis_stream_config = kwargs['kinesis_stream_config']
self._logs_producer.initialize(KinesisStreamLogsProducer(**kinesis_stream_config))
else:
raise Exception('Invalid log producer: %s' % producer)
@staticmethod
def _get_ids_by_names(repository_name, namespace_name, performer_name):
""" Retrieve repository/namespace/performer ids based on their names.
throws DataModelException when the namespace_name does not match any
user in the database.
returns database ID or None if not exists.
"""
repository_id = None
account_id = None
performer_id = None
if repository_name and namespace_name:
repository = model.repository.get_repository(namespace_name, repository_name)
if repository:
repository_id = repository.id
account_id = repository.namespace_user.id
if namespace_name and account_id is None:
account = model.user.get_user_or_org(namespace_name)
if account is None:
raise DataModelException('Invalid namespace requested')
account_id = account.id
if performer_name:
performer = model.user.get_user(performer_name)
if performer:
performer_id = performer.id
return repository_id, account_id, performer_id
def _base_query(self, performer_id=None, repository_id=None, account_id=None, filter_kinds=None,
index=None):
if filter_kinds is not None:
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
if index is not None:
search = LogEntry.search(index=index)
else:
search = LogEntry.search()
if performer_id is not None:
assert isinstance(performer_id, int)
search = search.filter('term', performer_id=performer_id)
if repository_id is not None:
assert isinstance(repository_id, int)
search = search.filter('term', repository_id=repository_id)
if account_id is not None and repository_id is None:
assert isinstance(account_id, int)
search = search.filter('term', account_id=account_id)
if filter_kinds is not None:
kind_map = model.log.get_log_entry_kinds()
ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds]
search = search.exclude('terms', kind_id=ignore_ids)
return search
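# The search built above is roughly equivalent to the following Elasticsearch request body,
# with only the clauses for the arguments that were provided (and account_id only when
# repository_id is None). The shape is approximated from the elasticsearch_dsl calls; the
# exact serialization may differ slightly:
#   {"query": {"bool": {"filter": [
#     {"term": {"performer_id": ...}},
#     {"term": {"repository_id": ...}},
#     {"term": {"account_id": ...}},
#     {"bool": {"must_not": [{"terms": {"kind_id": [...]}}]}}
#   ]}}}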
def _base_query_date_range(self, start_datetime, end_datetime, performer_id, repository_id,
account_id, filter_kinds, index=None):
skip_datetime_check = False
if _date_range_in_single_index(start_datetime, end_datetime):
index = self._es_client.index_name(start_datetime)
skip_datetime_check = self._es_client.index_exists(index)
if index and (skip_datetime_check or self._es_client.index_exists(index)):
search = self._base_query(performer_id, repository_id, account_id, filter_kinds,
index=index)
else:
search = self._base_query(performer_id, repository_id, account_id, filter_kinds)
if not skip_datetime_check:
search = search.query('range', datetime={'gte': start_datetime, 'lt': end_datetime})
return search
def _load_logs_for_day(self, logs_date, performer_id, repository_id, account_id, filter_kinds,
after_datetime=None, after_random_id=None, size=PAGE_SIZE):
index = self._es_client.index_name(logs_date)
if not self._es_client.index_exists(index):
return []
search = self._base_query(performer_id, repository_id, account_id, filter_kinds,
index=index)
search = search.sort({'datetime': 'desc'}, {'random_id.keyword': 'desc'})
search = search.extra(size=size)
if after_datetime is not None and after_random_id is not None:
after_datetime_epoch_ms = epoch_ms(after_datetime)
search = search.extra(search_after=[after_datetime_epoch_ms, after_random_id])
return search.execute()
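# The request issued above is roughly the following body, on top of the bool filter from
# _base_query (shape approximated from the elasticsearch_dsl calls):
#   {"size": 20,
#    "sort": [{"datetime": "desc"}, {"random_id.keyword": "desc"}],
#    "search_after": [<epoch_ms of after_datetime>, <after_random_id>]}
# where "search_after" is only present when paginating past a previous result.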
def _load_latest_logs(self, performer_id, repository_id, account_id, filter_kinds, size):
""" Return the latest logs from Elasticsearch.
Look at indices up to the set logrotateworker threshold, or up to 30 days if not defined.
"""
# Set the last index to check to be the logrotateworker threshold, or 30 days
end_datetime = datetime.now()
start_datetime = end_datetime - timedelta(days=DATE_RANGE_LIMIT)
latest_logs = []
for day in _date_range_descending(start_datetime, end_datetime, includes_end_datetime=True):
try:
logs = self._load_logs_for_day(day, performer_id, repository_id, account_id, filter_kinds,
size=size)
latest_logs.extend(logs)
except NotFoundError:
continue
if len(latest_logs) >= size:
break
return _for_elasticsearch_logs(latest_logs[:size], repository_id, account_id)
def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
assert start_datetime is not None and end_datetime is not None
# Check for a valid combined model token when migrating online from a combined model
if page_token is not None and page_token.get('readwrite_page_token') is not None:
page_token = page_token.get('readwrite_page_token')
if page_token is not None and max_page_count is not None:
page_number = page_token.get('page_number')
if page_number is not None and page_number + 1 > max_page_count:
return LogEntriesPage([], None)
repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
repository_name, namespace_name, performer_name)
after_datetime = None
after_random_id = None
if page_token is not None:
after_datetime = parse_datetime(page_token['datetime'])
after_random_id = page_token['random_id']
if after_datetime is not None:
end_datetime = min(end_datetime, after_datetime)
all_logs = []
with CloseForLongOperation(config.app_config):
for current_date in _date_range_descending(start_datetime, end_datetime):
try:
logs = self._load_logs_for_day(current_date, performer_id, repository_id, account_id,
filter_kinds, after_datetime, after_random_id,
size=PAGE_SIZE+1)
all_logs.extend(logs)
except NotFoundError:
continue
if len(all_logs) > PAGE_SIZE:
break
next_page_token = None
all_logs = all_logs[0:PAGE_SIZE+1]
if len(all_logs) == PAGE_SIZE + 1:
# The last element in the response is only used to check whether there are more elements.
# The second-to-last element is used as the pagination token because search_after does
# not include the exact match, so the next page will start with the current last element.
# This keeps the behavior exactly the same as table_logs_model, so that
# the caller can expect that when a pagination token is non-empty, there is
# at least 1 more log to be retrieved.
next_page_token = {
'datetime': all_logs[-2].datetime.isoformat(),
'random_id': all_logs[-2].random_id,
'page_number': page_token['page_number'] + 1 if page_token else 1,
}
return LogEntriesPage(_for_elasticsearch_logs(all_logs[:PAGE_SIZE], repository_id, account_id),
next_page_token)
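# A non-empty page token produced above has the shape (values shown are illustrative):
#   {"datetime": "2019-11-12T16:09:47.123456",   # isoformat datetime of the last returned log
#    "random_id": "<uuid4 string>",
#    "page_number": 2}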
def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
filter_kinds=None, size=20):
repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
repository_name, namespace_name, performer_name)
with CloseForLongOperation(config.app_config):
latest_logs = self._load_latest_logs(performer_id, repository_id, account_id, filter_kinds,
size)
return latest_logs
def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
repository_name=None, namespace_name=None, filter_kinds=None):
if end_datetime - start_datetime >= timedelta(days=DATE_RANGE_LIMIT):
raise Exception('Cannot lookup aggregated logs over a period longer than a month')
repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
repository_name, namespace_name, performer_name)
with CloseForLongOperation(config.app_config):
search = self._base_query_date_range(start_datetime, end_datetime, performer_id,
repository_id, account_id, filter_kinds)
search.aggs.bucket('by_id', 'terms', field='kind_id').bucket('by_date', 'date_histogram',
field='datetime', interval='day')
# With size=0, ES returns only the aggregation buckets and no search hits
search = search.extra(size=0)
resp = search.execute()
if not resp.aggregations:
return []
counts = []
by_id = resp.aggregations['by_id']
for id_bucket in by_id.buckets:
for date_bucket in id_bucket.by_date.buckets:
if date_bucket.doc_count > 0:
counts.append(AggregatedLogCount(id_bucket.key, date_bucket.doc_count, date_bucket.key))
return counts
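# The aggregation built above is roughly equivalent to the following request body
# (shape approximated from the elasticsearch_dsl calls):
#   {"size": 0,
#    "aggs": {"by_id": {"terms": {"field": "kind_id"},
#                       "aggs": {"by_date": {"date_histogram": {"field": "datetime",
#                                                               "interval": "day"}}}}}}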
def count_repository_actions(self, repository, day):
index = self._es_client.index_name(day)
search = self._base_query_date_range(day, day + timedelta(days=1),
None,
repository.id,
None,
None,
index=index)
search = search.params(request_timeout=COUNT_REPOSITORY_ACTION_TIMEOUT)
try:
return search.count()
except NotFoundError:
return 0
def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
if self._should_skip_logging and self._should_skip_logging(kind_name, namespace_name,
is_free_namespace):
return
if repository_name is not None:
assert repository is None
assert namespace_name is not None
repository = model.repository.get_repository(namespace_name, repository_name)
if timestamp is None:
timestamp = datetime.today()
account_id = None
performer_id = None
repository_id = None
if namespace_name is not None:
account_id = model.user.get_namespace_user(namespace_name).id
if performer is not None:
performer_id = performer.id
if repository is not None:
repository_id = repository.id
metadata_json = json.dumps(metadata or {}, default=_json_serialize)
kind_id = model.log._get_log_entry_kind(kind_name)
log = LogEntry(random_id=_random_id(), kind_id=kind_id, account_id=account_id,
performer_id=performer_id, ip=ip, metadata_json=metadata_json,
repository_id=repository_id, datetime=timestamp)
try:
self._logs_producer.send(log)
except LogSendException as lse:
strict_logging_disabled = config.app_config.get('ALLOW_PULLS_WITHOUT_STRICT_LOGGING')
logger.exception('log_action failed', extra=dict(log.to_dict(), exception=lse))
if not (strict_logging_disabled and kind_name in ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING):
raise
def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
namespace_id=None, max_query_time=None):
max_query_time = max_query_time.total_seconds() if max_query_time is not None else 300
search = self._base_query_date_range(start_datetime, end_datetime, None, repository_id,
namespace_id, None)
def raise_on_timeout(batch_generator):
start = time()
for batch in batch_generator:
elapsed = time() - start
if elapsed > max_query_time:
logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`', namespace_id,
repository_id, elapsed)
raise LogsIterationTimeout()
yield batch
start = time()
def read_batch(scroll):
batch = []
for log in scroll:
batch.append(log)
if len(batch) == DEFAULT_RESULT_WINDOW:
yield _for_elasticsearch_logs(batch, repository_id=repository_id,
namespace_id=namespace_id)
batch = []
if batch:
yield _for_elasticsearch_logs(batch, repository_id=repository_id, namespace_id=namespace_id)
search = search.params(size=DEFAULT_RESULT_WINDOW, request_timeout=max_query_time)
try:
with CloseForLongOperation(config.app_config):
for batch in raise_on_timeout(read_batch(search.scan())):
yield batch
except ConnectionTimeout:
raise LogsIterationTimeout()
def can_delete_index(self, index, cutoff_date):
return self._es_client.can_delete_index(index, cutoff_date)
def list_indices(self):
return self._es_client.list_indices()
def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
""" Yield a context manager for a group of outdated logs. """
all_indices = self.list_indices()
for index in all_indices:
if not self.can_delete_index(index, cutoff_date):
continue
context = ElasticsearchLogRotationContext(index, min_logs_per_rotation, self._es_client)
yield context
class ElasticsearchLogRotationContext(LogRotationContextInterface):
"""
ElasticsearchLogRotationContext yields batches of logs from an index.
When completed without exceptions, this context will delete its associated
Elasticsearch index.
"""
def __init__(self, index, min_logs_per_rotation, es_client):
self._es_client = es_client
self.min_logs_per_rotation = min_logs_per_rotation
self.index = index
self.start_pos = 0
self.end_pos = 0
self.scroll = None
def __enter__(self):
search = self._base_query()
self.scroll = search.scan()
return self
def __exit__(self, ex_type, ex_value, ex_traceback):
if ex_type is None and ex_value is None and ex_traceback is None:
logger.debug('Deleting index %s', self.index)
self._es_client.delete_index(self.index)
def yield_logs_batch(self):
def batched_logs(gen, size):
batch = []
for log in gen:
batch.append(log)
if len(batch) == size:
yield batch
batch = []
if batch:
yield batch
for batch in batched_logs(self.scroll, self.min_logs_per_rotation):
self.end_pos = self.start_pos + len(batch) - 1
yield batch, self._generate_filename()
self.start_pos = self.end_pos + 1
def _base_query(self):
search = LogEntry.search(index=self.index)
return search
def _generate_filename(self):
""" Generate the filenames used to archive the action logs. """
filename = '%s_%d-%d' % (self.index, self.start_pos, self.end_pos)
filename = '.'.join((filename, 'txt.gz'))
return filename
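# For example, with min_logs_per_rotation=1000 and an index named "logentry_2019-01-01"
# (the index name here is hypothetical), the first rotated batch would be archived as
# "logentry_2019-01-01_0-999.txt.gz".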