quay/data/logs_model/document_logs_model.py
# pylint: disable=protected-access

import json
import logging

from calendar import timegm
from datetime import timedelta, datetime
from random import getrandbits
from time import time

from elasticsearch.exceptions import ConnectionTimeout, ElasticsearchException

from data import model
from data.model import config
from data.model.log import _json_serialize, ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING, DataModelException
from data.logs_model.elastic_logs import LogEntry, AWSElasticsearchLogs
from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage
from data.logs_model.interface import ActionLogsDataInterface, LogsIterationTimeout
from data.logs_model.shared import SharedModel

logger = logging.getLogger(__name__)

PAGE_SIZE = 100
MAX_RESULT_WINDOW = 100000

# DATE_RANGE_LIMIT is to limit the query date time range to at most 1 month.
DATE_RANGE_LIMIT = 32


def _for_elasticsearch_logs(logs):
  namespace_ids = set()
  for log in logs:
    namespace_ids.add(log.account_id)
    namespace_ids.add(log.performer_id)

  id_user_map = model.user.get_user_map_by_ids(namespace_ids)
  return [Log.for_elasticsearch_log(log, id_user_map) for log in logs]


def _epoch_ms(dt):
  return (timegm(dt.utctimetuple()) * 1000) + (dt.microsecond / 1000)
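
# A quick sketch of the conversion above (a hypothetical value; under Python 3 the
# millisecond division yields a float):
#
#   _epoch_ms(datetime(2019, 1, 1))  # -> 1546300800000, midnight UTC in epoch milliseconds
#
# This is the representation stored in the pagination token built by lookup_logs
# below and passed to Elasticsearch's search_after.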

def _random_id():
  """ Generates a random 31-bit integer for the random_id field in LogEntry.

  It is used as a tie-breaker when sorting logs by datetime. We assume the chance
  of a collision on both fields is low; when they do collide, only the result of
  the lookup_logs function is affected, because a log may be missing or duplicated.
  https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-after.html
  """
  return getrandbits(31)


class DocumentLogsModel(SharedModel, ActionLogsDataInterface):
  """
  DocumentLogsModel implements the data model for the logs API backed by an
  Elasticsearch service.
  """

  def __init__(self, **kwargs):
    self._client = AWSElasticsearchLogs(**kwargs)

  @staticmethod
  def _get_ids_by_names(repository_name, namespace_name, performer_name):
    """ Resolve repository/namespace/performer names to their database IDs.

    Raises DataModelException when namespace_name does not match any user or
    organization in the database. Each returned ID is None when the corresponding
    name was not given or could not be found.
    """
    repository_id = None
    account_id = None
    performer_id = None

    if repository_name and namespace_name:
      repository = model.repository.get_repository(namespace_name, repository_name)
      if repository:
        repository_id = repository.id
        account_id = repository.namespace_user.id

    if namespace_name and account_id is None:
      account = model.user.get_user_or_org(namespace_name)
      if account is None:
        raise DataModelException('Invalid namespace requested')

      account_id = account.id

    if performer_name:
      performer = model.user.get_user(performer_name)
      if performer:
        performer_id = performer.id

    return repository_id, account_id, performer_id

  def _base_query(self, start_datetime, end_datetime, performer_id, repository_id, account_id,
                  filter_kinds):
    search = LogEntry.search()

    if performer_id is not None:
      search = search.filter('term', performer_id=performer_id)

    if repository_id is not None:
      search = search.filter('term', repository_id=repository_id)

    if account_id is not None:
      search = search.filter('term', account_id=account_id)

    if filter_kinds is not None:
      kind_map = model.log.get_log_entry_kinds()
      ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds]
      search = search.exclude('terms', kind_id=ignore_ids)

    search = search.query('range', datetime={'gte': start_datetime, 'lt': end_datetime})
    return search
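
  # Conceptually, the filters assembled above combine into a single bool query. A
  # hand-written sketch (hypothetical IDs and dates; not the exact JSON emitted by
  # elasticsearch_dsl) of what gets sent to Elasticsearch:
  #
  #   {"query": {"bool": {
  #       "filter":   [{"term": {"account_id": 42}}],
  #       "must_not": [{"terms": {"kind_id": [7, 9]}}],
  #       "must":     [{"range": {"datetime": {"gte": "2019-01-01", "lt": "2019-02-01"}}}]}}}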

  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    assert start_datetime is not None and end_datetime is not None

    if (page_token is not None and max_page_count is not None and
        page_token['page_number'] + 1 > max_page_count):
      return LogEntriesPage([], None)

    repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
      repository_name, namespace_name, performer_name)
    search = self._base_query(start_datetime, end_datetime, performer_id, repository_id,
                              account_id, filter_kinds)
    search = search.sort({'datetime': 'desc'}, {'random_id': 'desc'})
    search = search.extra(size=PAGE_SIZE + 1)

    if page_token is not None:
      search = search.extra(search_after=[page_token['datetime'], page_token['random_id']])

    response = search.execute()
    next_page_token = None

    if len(response) == PAGE_SIZE + 1:
      # The extra (last) element is fetched only to check whether more results exist.
      # The pagination token is built from the second-to-last element -- the last log
      # actually returned -- because search_after excludes the exact match, so the next
      # page starts with the element that was fetched but not returned. This keeps the
      # behavior identical to table_logs_model: whenever the pagination token is
      # non-empty, there is at least one more log to retrieve.
      next_page_token = {
        'datetime': _epoch_ms(response[-2].datetime),
        'random_id': response[-2].random_id,
        'page_number': page_token['page_number'] + 1 if page_token else 1,
      }

    return LogEntriesPage(_for_elasticsearch_logs(response[:PAGE_SIZE]), next_page_token)
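
  # Usage sketch for lookup_logs above (hypothetical names; assumes the LogEntriesPage
  # namedtuple exposes `logs` and `next_page_token` fields, which are fed back in until
  # the token comes back as None):
  #
  #   token = None
  #   while True:
  #     page = logs_model.lookup_logs(start, end, namespace_name='acme',
  #                                   page_token=token, max_page_count=10)
  #     handle(page.logs)
  #     token = page.next_page_token
  #     if token is None:
  #       break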

  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    if end_datetime - start_datetime >= timedelta(days=DATE_RANGE_LIMIT):
      raise Exception('Cannot lookup aggregated logs over a period longer than a month')

    repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
      repository_name, namespace_name, performer_name)
    search = self._base_query(start_datetime, end_datetime, performer_id, repository_id,
                              account_id, filter_kinds)
    search.aggs.bucket('by_id', 'terms', field='kind_id').bucket('by_date', 'date_histogram',
                                                                 field='datetime', interval='day')
    # With size=0, ES returns no hits but still returns all aggregation buckets.
    search = search.extra(size=0)
    resp = search.execute()

    if not resp.aggregations:
      return []

    counts = []
    by_id = resp.aggregations['by_id']
    for id_bucket in by_id.buckets:
      for date_bucket in id_bucket.by_date.buckets:
        if date_bucket.doc_count > 0:
          counts.append(AggregatedLogCount(id_bucket.key, date_bucket.doc_count, date_bucket.key))

    return counts
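
  # The method above flattens the two-level aggregation into a list of
  # AggregatedLogCount entries, one per (kind, day) bucket with a non-zero count.
  # Roughly (hypothetical values; the day key is whatever the date_histogram bucket
  # reports, typically epoch milliseconds):
  #
  #   [AggregatedLogCount(1, 12, 1546300800000),
  #    AggregatedLogCount(1, 3, 1546387200000),
  #    AggregatedLogCount(4, 7, 1546300800000)]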

  def count_repository_actions(self, repository, day):
    return self._base_query(day, day + timedelta(days=1), None, repository.id, None, None).count()

  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None):
    if repository_name is not None:
      assert repository is None
      assert namespace_name is not None
      repository = model.repository.get_repository(namespace_name, repository_name)

    if timestamp is None:
      timestamp = datetime.today()

    account_id = None
    performer_id = None
    repository_id = None

    if namespace_name is not None:
      account_id = model.user.get_namespace_user(namespace_name).id

    if performer is not None:
      performer_id = performer.id

    if repository is not None:
      repository_id = repository.id

    metadata_json = json.dumps(metadata or {}, default=_json_serialize)
    kind_id = model.log._get_log_entry_kind(kind_name)
    log = LogEntry(random_id=_random_id(), kind_id=kind_id, account_id=account_id,
                   performer_id=performer_id, ip=ip, metadata_json=metadata_json,
                   repository_id=repository_id, datetime=timestamp)

    try:
      log.save()
    except ElasticsearchException as ex:
      strict_logging_disabled = config.app_config.get('ALLOW_PULLS_WITHOUT_STRICT_LOGGING')
      # Include the attempted log document in the error record for debugging.
      logger.error('log_action failed', extra=dict(log.to_dict(), exception=ex))
      if not (strict_logging_disabled and kind_name in ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING):
        raise
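
  # Usage sketch for log_action above (hypothetical arguments; 'push_repo' stands in
  # for any valid log kind name known to the database):
  #
  #   logs_model.log_action('push_repo', namespace_name='acme', performer=user,
  #                         ip='10.0.0.1', metadata={'tag': 'latest'},
  #                         repository_name='web-app')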

  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    max_query_time = max_query_time.total_seconds()
    search = self._base_query(start_datetime, end_datetime, None, repository_id, namespace_id,
                              None)

    def raise_on_timeout(batch_generator):
      start = time()
      for batch in batch_generator:
        elapsed = time() - start
        if elapsed > max_query_time:
          logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`', namespace_id,
                       repository_id, elapsed)
          raise LogsIterationTimeout()

        yield batch
        start = time()

    def read_batch(scroll):
      batch = []
      for log in scroll:
        batch.append(log)
        if len(batch) == MAX_RESULT_WINDOW:
          yield _for_elasticsearch_logs(batch)
          batch = []

      if batch:
        yield _for_elasticsearch_logs(batch)

    search = search.params(size=MAX_RESULT_WINDOW, request_timeout=max_query_time)

    try:
      for batch in raise_on_timeout(read_batch(search.scan())):
        yield batch
    except ConnectionTimeout:
      raise LogsIterationTimeout()
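
# Usage sketch for the export path above (hypothetical connection settings and helper;
# max_query_time must be a timedelta because it is converted with .total_seconds()):
#
#   logs_model = DocumentLogsModel(host='es.example.internal', port=9200)
#   for batch in logs_model.yield_logs_for_export(start, end, repository_id=1234,
#                                                 max_query_time=timedelta(minutes=5)):
#     for log in batch:
#       write_to_export_file(log)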