# pylint: disable=protected-access

import json
import logging

from calendar import timegm
from datetime import timedelta, datetime
from random import getrandbits
from time import time

from elasticsearch.exceptions import ConnectionTimeout, ElasticsearchException

from data import model
from data.model import config
from data.model.log import _json_serialize, ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING, DataModelException
from data.logs_model.elastic_logs import LogEntry, AWSElasticsearchLogs
from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage
from data.logs_model.interface import ActionLogsDataInterface, LogsIterationTimeout
from data.logs_model.shared import SharedModel

logger = logging.getLogger(__name__)

PAGE_SIZE = 100
MAX_RESULT_WINDOW = 100000
# DATE_RANGE_LIMIT limits the query datetime range to at most 1 month.
DATE_RANGE_LIMIT = 32


def _for_elasticsearch_logs(logs):
  namespace_ids = set()
  for log in logs:
    namespace_ids.add(log.account_id)
    namespace_ids.add(log.performer_id)

  id_user_map = model.user.get_user_map_by_ids(namespace_ids)
  return [Log.for_elasticsearch_log(log, id_user_map) for log in logs]


def _epoch_ms(dt):
  return (timegm(dt.utctimetuple()) * 1000) + (dt.microsecond / 1000)
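

# Worked example (illustrative only, not part of the original module): for
# dt = datetime(2019, 1, 1, 0, 0, 0, 500000), timegm(dt.utctimetuple()) is 1546300800, so
# _epoch_ms(dt) evaluates to 1546300800 * 1000 + 500 = 1546300800500. This millisecond-epoch
# value is what lookup_logs stores in the `datetime` field of its search_after page token.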


def _random_id():
  """ Generates 31 random bits as an integer for the random_id field in LogEntry.

  It is used as a tie-breaker for sorting logs based on datetime.

  We assume the chance of collision on both fields is low, but when they do
  collide, only the result of the lookup_logs function will be affected, because
  some logs may be missing or duplicated.

  https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-after.html
  """
  return getrandbits(31)


class DocumentLogsModel(SharedModel, ActionLogsDataInterface):
  """
  DocumentLogsModel implements the data model for the logs API backed by an
  Elasticsearch service.
  """

  def __init__(self, **kwargs):
    self._client = AWSElasticsearchLogs(**kwargs)

  @staticmethod
  def _get_ids_by_names(repository_name, namespace_name, performer_name):
    """ Retrieves the repository/namespace/performer ids based on their names.

    Throws DataModelException when the namespace_name does not match any
    user in the database.

    Returns the database IDs, or None for any name that does not exist.
    """
    repository_id = None
    account_id = None
    performer_id = None

    if repository_name and namespace_name:
      repository = model.repository.get_repository(namespace_name, repository_name)
      if repository:
        repository_id = repository.id
        account_id = repository.namespace_user.id

    if namespace_name and account_id is None:
      account = model.user.get_user_or_org(namespace_name)
      if account is None:
        raise DataModelException('Invalid namespace requested')
      account_id = account.id

    if performer_name:
      performer = model.user.get_user(performer_name)
      if performer:
        performer_id = performer.id

    return repository_id, account_id, performer_id

  def _base_query(self, start_datetime, end_datetime, performer_id, repository_id, account_id,
                  filter_kinds):
    search = LogEntry.search()
    if performer_id is not None:
      search = search.filter('term', performer_id=performer_id)

    if repository_id is not None:
      search = search.filter('term', repository_id=repository_id)

    if account_id is not None:
      search = search.filter('term', account_id=account_id)

    if filter_kinds is not None:
      kind_map = model.log.get_log_entry_kinds()
      ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds]
      search = search.exclude('terms', kind_id=ignore_ids)

    search = search.query('range', datetime={'gte': start_datetime, 'lt': end_datetime})
    return search
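
  # Illustration of the query _base_query composes (an assumption about how elasticsearch_dsl
  # serializes the chained filter()/exclude()/query() calls, not output captured from this
  # module). With all three ids set and no filter_kinds, the request body is roughly:
  #
  #   {"query": {"bool": {
  #     "filter": [{"term": {"performer_id": ...}},
  #                {"term": {"repository_id": ...}},
  #                {"term": {"account_id": ...}}],
  #     "must": [{"range": {"datetime": {"gte": ..., "lt": ...}}}]}}}
  #
  # Term filters are not scored, which suits the exact id matches; the datetime range bounds
  # every query built on top of this helper.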

  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    assert start_datetime is not None and end_datetime is not None

    if (page_token is not None) and (max_page_count is not None) and (page_token['page_number'] + 1) > max_page_count:
      return LogEntriesPage([], None)

    repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
      repository_name, namespace_name, performer_name)
    search = self._base_query(start_datetime, end_datetime, performer_id, repository_id,
                              account_id, filter_kinds)
    search = search.sort({'datetime': 'desc'}, {'random_id': 'desc'})
    search = search.extra(size=PAGE_SIZE + 1)

    if page_token is not None:
      search = search.extra(search_after=[page_token['datetime'], page_token['random_id']])

    response = search.execute()
    next_page_token = None

    if len(response) == PAGE_SIZE + 1:
      # The last element in the response is only used to check whether there are more elements.
      # The second-to-last element is used for the pagination token, because search_after does
      # not include the exact match, and so the next page will start with the last element
      # returned to the caller. This keeps the behavior exactly the same as table_logs_model,
      # so that the caller can expect that when a pagination token is non-empty, there is at
      # least one log left to be retrieved.
      next_page_token = {
        'datetime': _epoch_ms(response[-2].datetime),
        'random_id': response[-2].random_id,
        'page_number': page_token['page_number'] + 1 if page_token else 1,
      }

    return LogEntriesPage(_for_elasticsearch_logs(response[:PAGE_SIZE]), next_page_token)
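
  # Usage sketch for paging through results (illustrative only; `logs_model`, `start`, `end`
  # and `process` are placeholders, and it assumes LogEntriesPage exposes `logs` and
  # `next_page_token` attributes):
  #
  #   page_token = None
  #   while True:
  #     page = logs_model.lookup_logs(start, end, namespace_name='devtable',
  #                                   page_token=page_token)
  #     process(page.logs)
  #     if not page.next_page_token:
  #       break
  #     page_token = page.next_page_token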

  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    if end_datetime - start_datetime >= timedelta(days=DATE_RANGE_LIMIT):
      raise Exception('Cannot lookup aggregated logs over a period longer than a month')

    repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
      repository_name, namespace_name, performer_name)
    search = self._base_query(start_datetime, end_datetime, performer_id, repository_id,
                              account_id, filter_kinds)
    search.aggs.bucket('by_id', 'terms', field='kind_id').bucket(
      'by_date', 'date_histogram', field='datetime', interval='day')
    # Set size=0 so Elasticsearch returns only the aggregation buckets, without search hits.
    search = search.extra(size=0)
    resp = search.execute()

    if not resp.aggregations:
      return []

    counts = []
    by_id = resp.aggregations['by_id']

    for id_bucket in by_id.buckets:
      for date_bucket in id_bucket.by_date.buckets:
        if date_bucket.doc_count > 0:
          counts.append(AggregatedLogCount(id_bucket.key, date_bucket.doc_count, date_bucket.key))

    return counts
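
  # Sketch of the aggregation response walked above (an illustration of the expected
  # Elasticsearch shape, not captured output):
  #
  #   "aggregations": {
  #     "by_id": {"buckets": [
  #       {"key": <kind_id>, "doc_count": ...,
  #        "by_date": {"buckets": [{"key": <day>, "doc_count": ...}, ...]}},
  #       ...]}}
  #
  # Each (kind, day) bucket with a non-zero doc_count becomes one AggregatedLogCount.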

  def count_repository_actions(self, repository, day):
    return self._base_query(day, day + timedelta(days=1), None, repository.id, None, None).count()

  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None):
    if repository_name is not None:
      assert repository is None
      assert namespace_name is not None
      repository = model.repository.get_repository(namespace_name, repository_name)

    if timestamp is None:
      timestamp = datetime.today()

    account_id = None
    performer_id = None
    repository_id = None

    if namespace_name is not None:
      account_id = model.user.get_namespace_user(namespace_name).id

    if performer is not None:
      performer_id = performer.id

    if repository is not None:
      repository_id = repository.id

    metadata_json = json.dumps(metadata or {}, default=_json_serialize)
    kind_id = model.log._get_log_entry_kind(kind_name)
    log = LogEntry(random_id=_random_id(), kind_id=kind_id, account_id=account_id,
                   performer_id=performer_id, ip=ip, metadata_json=metadata_json,
                   repository_id=repository_id, datetime=timestamp)

    try:
      log.save()
    except ElasticsearchException as ex:
      strict_logging_disabled = config.app_config.get('ALLOW_PULLS_WITHOUT_STRICT_LOGGING')
      # Build the `extra` payload for the logger from the exception and the log document.
      log_extra = {'exception': ex}
      log_extra.update(log.to_dict())
      logger.error('log_action failed', extra=log_extra)
      if not (strict_logging_disabled and kind_name in ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING):
        raise
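
  # Usage sketch (illustrative only; the kind name, names and metadata are placeholders,
  # not taken from this module):
  #
  #   logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
  #                         performer=user, ip='127.0.0.1', metadata={'tag': 'latest'})
  #
  # This indexes a single LogEntry document, generating a fresh random_id and defaulting the
  # timestamp to the current time when none is given.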

  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    max_query_time = max_query_time.total_seconds()
    search = self._base_query(start_datetime, end_datetime, None, repository_id, namespace_id,
                              None)

    def raise_on_timeout(batch_generator):
      start = time()
      for batch in batch_generator:
        elapsed = time() - start
        if elapsed > max_query_time:
          logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`', namespace_id,
                       repository_id, elapsed)
          raise LogsIterationTimeout()

        yield batch
        start = time()

    def read_batch(scroll):
      batch = []
      for log in scroll:
        batch.append(log)
        if len(batch) == MAX_RESULT_WINDOW:
          yield _for_elasticsearch_logs(batch)
          batch = []

      if batch:
        yield _for_elasticsearch_logs(batch)

    search = search.params(size=MAX_RESULT_WINDOW, request_timeout=max_query_time)

    try:
      for batch in raise_on_timeout(read_batch(search.scan())):
        yield batch
    except ConnectionTimeout:
      raise LogsIterationTimeout()
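

# Export usage sketch (illustrative only; `logs_model`, `write_batch_to_archive` and the ids
# are placeholders, not taken from this module):
#
#   for batch in logs_model.yield_logs_for_export(start, end, namespace_id=account_id,
#                                                 max_query_time=timedelta(minutes=5)):
#     write_batch_to_archive(batch)
#
# Each yielded batch is a list of Log datatypes holding at most MAX_RESULT_WINDOW entries;
# LogsIterationTimeout is raised if a batch takes longer than max_query_time to retrieve or
# the Elasticsearch connection times out.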