initial import for Open Source 🎉
parent 1898c361f3
commit 9c0dd3b722
2048 changed files with 218743 additions and 0 deletions
64  data/logs_model/__init__.py  Normal file
@@ -0,0 +1,64 @@
import logging

from data.logs_model.table_logs_model import TableLogsModel
from data.logs_model.document_logs_model import DocumentLogsModel
from data.logs_model.combined_model import CombinedLogsModel

logger = logging.getLogger(__name__)


def _transition_model(*args, **kwargs):
  return CombinedLogsModel(
    DocumentLogsModel(*args, **kwargs),
    TableLogsModel(*args, **kwargs),
  )


_LOG_MODELS = {
  'database': TableLogsModel,
  'transition_reads_both_writes_es': _transition_model,
  'elasticsearch': DocumentLogsModel,
}

_PULL_LOG_KINDS = {'pull_repo', 'repo_verb'}


class LogsModelProxy(object):
  def __init__(self):
    self._model = None

  def initialize(self, model):
    self._model = model
    logger.info('===============================')
    logger.info('Using logs model `%s`', self._model)
    logger.info('===============================')

  def __getattr__(self, attr):
    if not self._model:
      raise AttributeError("LogsModelProxy is not initialized")
    return getattr(self._model, attr)


logs_model = LogsModelProxy()


def configure(app_config):
  logger.debug('Configuring log model')
  model_name = app_config.get('LOGS_MODEL', 'database')
  model_config = app_config.get('LOGS_MODEL_CONFIG', {})

  def should_skip_logging(kind_name, namespace_name, is_free_namespace):
    if namespace_name and namespace_name in app_config.get('DISABLED_FOR_AUDIT_LOGS', {}):
      return True

    if kind_name in _PULL_LOG_KINDS:
      if namespace_name and namespace_name in app_config.get('DISABLED_FOR_PULL_LOGS', {}):
        return True

      if app_config.get('FEATURE_DISABLE_PULL_LOGS_FOR_FREE_NAMESPACES'):
        if is_free_namespace:
          return True

    return False

  model_config['should_skip_logging'] = should_skip_logging
  logs_model.initialize(_LOG_MODELS[model_name](**model_config))
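Model selection is driven entirely by app config: `configure()` picks an entry from `_LOG_MODELS` and hands the rest of `LOGS_MODEL_CONFIG` to that model's constructor. A minimal sketch of wiring it up, with made-up config values (the nested keys mirror DocumentLogsModel's constructor arguments, not a documented schema):

# Hypothetical deployment config; host/port values are placeholders.
app_config = {
  'LOGS_MODEL': 'elasticsearch',
  'LOGS_MODEL_CONFIG': {
    'producer': 'elasticsearch',
    'elasticsearch_config': {'host': 'es.example.com', 'port': 9243},
  },
  'DISABLED_FOR_AUDIT_LOGS': {},
  'DISABLED_FOR_PULL_LOGS': {},
}

configure(app_config)
# All calls proxy through LogsModelProxy.__getattr__ to the selected model.
logs_model.log_action('push_repo', namespace_name='acme', repository_name='web', ip='10.0.0.1')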
132  data/logs_model/combined_model.py  Normal file
@@ -0,0 +1,132 @@
import logging
import itertools

from data.logs_model.datatypes import AggregatedLogCount, LogEntriesPage
from data.logs_model.interface import ActionLogsDataInterface
from data.logs_model.shared import SharedModel

logger = logging.getLogger(__name__)


def _merge_aggregated_log_counts(*args):
  """ Merge two lists of AggregatedLogCount based on the value of their kind_id and datetime.
  """
  matching_keys = {}
  aggregated_log_counts_list = itertools.chain.from_iterable(args)

  def canonical_key_from_kind_date_tuple(kind_id, dt):
    """ Return a comma separated key from an AggregatedLogCount's kind_id and datetime. """
    return str(kind_id) + ',' + str(dt)

  for kind_id, count, dt in aggregated_log_counts_list:
    kind_date_key = canonical_key_from_kind_date_tuple(kind_id, dt)
    if kind_date_key in matching_keys:
      existing_count = matching_keys[kind_date_key][2]
      matching_keys[kind_date_key] = (kind_id, dt, existing_count + count)
    else:
      matching_keys[kind_date_key] = (kind_id, dt, count)

  return [AggregatedLogCount(kind_id, count, dt) for (kind_id, dt, count) in matching_keys.values()]


class CombinedLogsModel(SharedModel, ActionLogsDataInterface):
  """
  CombinedLogsModel implements the data model that logs to the first logs model and reads from
  both.
  """

  def __init__(self, read_write_logs_model, read_only_logs_model):
    self.read_write_logs_model = read_write_logs_model
    self.read_only_logs_model = read_only_logs_model

  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
    return self.read_write_logs_model.log_action(kind_name, namespace_name, performer, ip, metadata,
                                                 repository, repository_name, timestamp,
                                                 is_free_namespace)

  def count_repository_actions(self, repository, day):
    rw_count = self.read_write_logs_model.count_repository_actions(repository, day)
    ro_count = self.read_only_logs_model.count_repository_actions(repository, day)
    return rw_count + ro_count

  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    rw_model = self.read_write_logs_model
    ro_model = self.read_only_logs_model
    rw_count = rw_model.get_aggregated_log_counts(start_datetime, end_datetime,
                                                  performer_name=performer_name,
                                                  repository_name=repository_name,
                                                  namespace_name=namespace_name,
                                                  filter_kinds=filter_kinds)
    ro_count = ro_model.get_aggregated_log_counts(start_datetime, end_datetime,
                                                  performer_name=performer_name,
                                                  repository_name=repository_name,
                                                  namespace_name=namespace_name,
                                                  filter_kinds=filter_kinds)
    return _merge_aggregated_log_counts(rw_count, ro_count)

  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    rw_model = self.read_write_logs_model
    ro_model = self.read_only_logs_model
    rw_logs = rw_model.yield_logs_for_export(start_datetime, end_datetime, repository_id,
                                             namespace_id, max_query_time)
    ro_logs = ro_model.yield_logs_for_export(start_datetime, end_datetime, repository_id,
                                             namespace_id, max_query_time)
    for batch in itertools.chain(rw_logs, ro_logs):
      yield batch

  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    rw_model = self.read_write_logs_model
    ro_model = self.read_only_logs_model

    page_token = page_token or {}

    new_page_token = {}
    if page_token is None or not page_token.get('under_readonly_model', False):
      rw_page_token = page_token.get('readwrite_page_token')
      rw_logs = rw_model.lookup_logs(start_datetime, end_datetime, performer_name,
                                     repository_name, namespace_name, filter_kinds,
                                     rw_page_token, max_page_count)
      logs, next_page_token = rw_logs
      new_page_token['under_readonly_model'] = next_page_token is None
      new_page_token['readwrite_page_token'] = next_page_token
      return LogEntriesPage(logs, new_page_token)
    else:
      readonly_page_token = page_token.get('readonly_page_token')
      ro_logs = ro_model.lookup_logs(start_datetime, end_datetime, performer_name,
                                     repository_name, namespace_name, filter_kinds,
                                     readonly_page_token, max_page_count)
      logs, next_page_token = ro_logs
      if next_page_token is None:
        return LogEntriesPage(logs, None)

      new_page_token['under_readonly_model'] = True
      new_page_token['readonly_page_token'] = next_page_token
      return LogEntriesPage(logs, new_page_token)

  def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
                         filter_kinds=None, size=20):
    latest_logs = []
    rw_model = self.read_write_logs_model
    ro_model = self.read_only_logs_model

    rw_logs = rw_model.lookup_latest_logs(performer_name, repository_name, namespace_name,
                                          filter_kinds, size)
    latest_logs.extend(rw_logs)
    if len(latest_logs) < size:
      ro_logs = ro_model.lookup_latest_logs(performer_name, repository_name, namespace_name,
                                            filter_kinds, size - len(latest_logs))
      latest_logs.extend(ro_logs)

    return latest_logs

  def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
    ro_model = self.read_only_logs_model
    rw_model = self.read_write_logs_model
    ro_ctx = ro_model.yield_log_rotation_context(cutoff_date, min_logs_per_rotation)
    rw_ctx = rw_model.yield_log_rotation_context(cutoff_date, min_logs_per_rotation)
    for ctx in itertools.chain(ro_ctx, rw_ctx):
      yield ctx
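The merge helper above keys entries by (kind_id, datetime) and sums counts that come back from both backing models. A small worked example, assuming only the datatypes added in this commit:

from datetime import datetime
from data.logs_model.datatypes import AggregatedLogCount
from data.logs_model.combined_model import _merge_aggregated_log_counts

day = datetime(2019, 1, 1)
table_counts = [AggregatedLogCount(kind_id=1, count=3, datetime=day)]
es_counts = [AggregatedLogCount(kind_id=1, count=2, datetime=day),
             AggregatedLogCount(kind_id=2, count=5, datetime=day)]

merged = _merge_aggregated_log_counts(table_counts, es_counts)
# Entries sharing (kind_id, datetime) are summed: kind 1 -> count 5, kind 2 -> count 5.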
155  data/logs_model/datatypes.py  Normal file
@@ -0,0 +1,155 @@
import json

from calendar import timegm
from collections import namedtuple
from email.utils import formatdate

from cachetools.func import lru_cache

from data import model
from util.morecollections import AttrDict


def _format_date(date):
  """ Output an RFC822 date format. """
  if date is None:
    return None

  return formatdate(timegm(date.utctimetuple()))


@lru_cache(maxsize=1)
def _kinds():
  return model.log.get_log_entry_kinds()


class LogEntriesPage(namedtuple('LogEntriesPage', ['logs', 'next_page_token'])):
  """ Represents a page returned by the lookup_logs call. The `logs` contains the logs
      found for the page and `next_page_token`, if not None, contains the token to be
      encoded and returned for the followup call.
  """


class Log(namedtuple('Log', [
    'metadata_json', 'ip', 'datetime', 'performer_email', 'performer_username', 'performer_robot',
    'account_organization', 'account_username', 'account_email', 'account_robot', 'kind_id'])):
  """ Represents a single log entry returned by the logs model. """

  @classmethod
  def for_logentry(cls, log):
    account_organization = None
    account_username = None
    account_email = None
    account_robot = None

    try:
      account_organization = log.account.organization
      account_username = log.account.username
      account_email = log.account.email
      account_robot = log.account.robot
    except AttributeError:
      pass

    performer_robot = None
    performer_username = None
    performer_email = None

    try:
      performer_robot = log.performer.robot
      performer_username = log.performer.username
      performer_email = log.performer.email
    except AttributeError:
      pass

    return Log(log.metadata_json, log.ip, log.datetime, performer_email, performer_username,
               performer_robot, account_organization, account_username, account_email,
               account_robot, log.kind_id)

  @classmethod
  def for_elasticsearch_log(cls, log, id_user_map):
    account_organization = None
    account_username = None
    account_email = None
    account_robot = None

    try:
      if log.account_id:
        account = id_user_map[log.account_id]
        account_organization = account.organization
        account_username = account.username
        account_email = account.email
        account_robot = account.robot
    except AttributeError:
      pass

    performer_robot = None
    performer_username = None
    performer_email = None

    try:
      if log.performer_id:
        performer = id_user_map[log.performer_id]
        performer_robot = performer.robot
        performer_username = performer.username
        performer_email = performer.email
    except AttributeError:
      pass

    return Log(log.metadata_json, str(log.ip), log.datetime, performer_email, performer_username,
               performer_robot, account_organization, account_username, account_email,
               account_robot, log.kind_id)

  def to_dict(self, avatar, include_namespace=False):
    view = {
      'kind': _kinds()[self.kind_id],
      'metadata': json.loads(self.metadata_json),
      'ip': self.ip,
      'datetime': _format_date(self.datetime),
    }

    if self.performer_username:
      performer = AttrDict({'username': self.performer_username, 'email': self.performer_email})
      performer.robot = None
      if self.performer_robot:
        performer.robot = self.performer_robot

      view['performer'] = {
        'kind': 'user',
        'name': self.performer_username,
        'is_robot': self.performer_robot,
        'avatar': avatar.get_data_for_user(performer),
      }

    if include_namespace:
      if self.account_username:
        account = AttrDict({'username': self.account_username, 'email': self.account_email})
        if self.account_organization:
          view['namespace'] = {
            'kind': 'org',
            'name': self.account_username,
            'avatar': avatar.get_data_for_org(account),
          }
        else:
          account.robot = None
          if self.account_robot:
            account.robot = self.account_robot
          view['namespace'] = {
            'kind': 'user',
            'name': self.account_username,
            'avatar': avatar.get_data_for_user(account),
          }

    return view


class AggregatedLogCount(namedtuple('AggregatedLogCount', ['kind_id', 'count', 'datetime'])):
  """ Represents the aggregated count of the number of logs, of a particular kind, on a day. """
  def to_dict(self):
    view = {
      'kind': _kinds()[self.kind_id],
      'count': self.count,
      'datetime': _format_date(self.datetime),
    }

    return view
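The `to_dict` methods need database access (the `_kinds()` lookup), but `_format_date` is pure and shows the RFC822 form the API views emit. A minimal sketch:

from datetime import datetime
from data.logs_model.datatypes import _format_date

_format_date(datetime(2019, 1, 1, 12, 30))  # e.g. 'Tue, 01 Jan 2019 12:30:00 -0000'
_format_date(None)                           # None; missing dates pass through unchanged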
532  data/logs_model/document_logs_model.py  Normal file
@@ -0,0 +1,532 @@
# pylint: disable=protected-access

import json
import logging
import uuid

from time import time
from datetime import timedelta, datetime, date
from dateutil.parser import parse as parse_datetime

from abc import ABCMeta, abstractmethod
from six import add_metaclass

from elasticsearch.exceptions import ConnectionTimeout, NotFoundError

from data import model
from data.database import CloseForLongOperation
from data.model import config
from data.model.log import (_json_serialize, ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING,
                            DataModelException)
from data.logs_model.elastic_logs import LogEntry, configure_es
from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage
from data.logs_model.interface import (ActionLogsDataInterface, LogRotationContextInterface,
                                       LogsIterationTimeout)
from data.logs_model.shared import SharedModel, epoch_ms

from data.logs_model.logs_producer import LogProducerProxy, LogSendException
from data.logs_model.logs_producer.kafka_logs_producer import KafkaLogsProducer
from data.logs_model.logs_producer.elasticsearch_logs_producer import ElasticsearchLogsProducer
from data.logs_model.logs_producer.kinesis_stream_logs_producer import KinesisStreamLogsProducer


logger = logging.getLogger(__name__)

PAGE_SIZE = 20
DEFAULT_RESULT_WINDOW = 5000
MAX_RESULT_WINDOW = 10000

# DATE_RANGE_LIMIT is to limit the query date time range to at most 1 month.
DATE_RANGE_LIMIT = 32

# Timeout for count_repository_actions
COUNT_REPOSITORY_ACTION_TIMEOUT = 30


def _date_range_descending(start_datetime, end_datetime, includes_end_datetime=False):
  """ Generate the dates between `end_datetime` and `start_datetime`.

      If `includes_end_datetime` is set, the generator starts at `end_datetime`,
      otherwise, starts the generator at `end_datetime` minus 1 second.
  """
  assert end_datetime >= start_datetime
  start_date = start_datetime.date()

  if includes_end_datetime:
    current_date = end_datetime.date()
  else:
    current_date = (end_datetime - timedelta(seconds=1)).date()

  while current_date >= start_date:
    yield current_date
    current_date = current_date - timedelta(days=1)


def _date_range_in_single_index(dt1, dt2):
  """ Determine whether a single index can be searched given a range
      of dates or datetimes. If date instances are given, difference should be 1 day.

      NOTE: dt2 is exclusive to the search result set.
      i.e. The date range is larger or equal to dt1 and strictly smaller than dt2
  """
  assert isinstance(dt1, date) and isinstance(dt2, date)

  dt = dt2 - dt1

  # Check if date or datetime
  if not isinstance(dt1, datetime) and not isinstance(dt2, datetime):
    return dt == timedelta(days=1)

  if dt < timedelta(days=1) and dt >= timedelta(days=0):
    return dt2.day == dt1.day

  # Check if datetime can be interpreted as a date: hour, minutes, seconds or microseconds set to 0
  if dt == timedelta(days=1):
    return dt1.hour == 0 and dt1.minute == 0 and dt1.second == 0 and dt1.microsecond == 0

  return False


def _for_elasticsearch_logs(logs, repository_id=None, namespace_id=None):
  namespace_ids = set()
  for log in logs:
    namespace_ids.add(log.account_id)
    namespace_ids.add(log.performer_id)
    assert namespace_id is None or log.account_id == namespace_id
    assert repository_id is None or log.repository_id == repository_id

  id_user_map = model.user.get_user_map_by_ids(namespace_ids)
  return [Log.for_elasticsearch_log(log, id_user_map) for log in logs]


def _random_id():
  """ Generates a unique uuid4 string for the random_id field in LogEntry.
      It is used as tie-breaker for sorting logs based on datetime:
      https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-after.html
  """
  return str(uuid.uuid4())


@add_metaclass(ABCMeta)
class ElasticsearchLogsModelInterface(object):
  """
  Interface for Elasticsearch specific operations with the logs model.
  These operations are usually index based.
  """

  @abstractmethod
  def can_delete_index(self, index, cutoff_date):
    """ Return whether the given index is older than the given cutoff date. """

  @abstractmethod
  def list_indices(self):
    """ List the logs model's indices. """


class DocumentLogsModel(SharedModel, ActionLogsDataInterface, ElasticsearchLogsModelInterface):
  """
  DocumentLogsModel implements the data model for the logs API backed by an
  elasticsearch service.
  """
  def __init__(self, should_skip_logging=None, elasticsearch_config=None, producer=None, **kwargs):
    self._should_skip_logging = should_skip_logging
    self._logs_producer = LogProducerProxy()
    self._es_client = configure_es(**elasticsearch_config)

    if producer == 'kafka':
      kafka_config = kwargs['kafka_config']
      self._logs_producer.initialize(KafkaLogsProducer(**kafka_config))
    elif producer == 'elasticsearch':
      self._logs_producer.initialize(ElasticsearchLogsProducer())
    elif producer == 'kinesis_stream':
      kinesis_stream_config = kwargs['kinesis_stream_config']
      self._logs_producer.initialize(KinesisStreamLogsProducer(**kinesis_stream_config))
    else:
      raise Exception('Invalid log producer: %s' % producer)

  @staticmethod
  def _get_ids_by_names(repository_name, namespace_name, performer_name):
    """ Retrieve repository/namespace/performer ids based on their names.
        Throws DataModelException when the namespace_name does not match any
        user in the database.
        Returns the database IDs, or None for any name that does not exist.
    """
    repository_id = None
    account_id = None
    performer_id = None

    if repository_name and namespace_name:
      repository = model.repository.get_repository(namespace_name, repository_name)
      if repository:
        repository_id = repository.id
        account_id = repository.namespace_user.id

    if namespace_name and account_id is None:
      account = model.user.get_user_or_org(namespace_name)
      if account is None:
        raise DataModelException('Invalid namespace requested')

      account_id = account.id

    if performer_name:
      performer = model.user.get_user(performer_name)
      if performer:
        performer_id = performer.id

    return repository_id, account_id, performer_id

  def _base_query(self, performer_id=None, repository_id=None, account_id=None, filter_kinds=None,
                  index=None):
    if filter_kinds is not None:
      assert all(isinstance(kind_name, str) for kind_name in filter_kinds)

    if index is not None:
      search = LogEntry.search(index=index)
    else:
      search = LogEntry.search()

    if performer_id is not None:
      assert isinstance(performer_id, int)
      search = search.filter('term', performer_id=performer_id)

    if repository_id is not None:
      assert isinstance(repository_id, int)
      search = search.filter('term', repository_id=repository_id)

    if account_id is not None and repository_id is None:
      assert isinstance(account_id, int)
      search = search.filter('term', account_id=account_id)

    if filter_kinds is not None:
      kind_map = model.log.get_log_entry_kinds()
      ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds]
      search = search.exclude('terms', kind_id=ignore_ids)

    return search

  def _base_query_date_range(self, start_datetime, end_datetime, performer_id, repository_id,
                             account_id, filter_kinds, index=None):
    skip_datetime_check = False
    if _date_range_in_single_index(start_datetime, end_datetime):
      index = self._es_client.index_name(start_datetime)
      skip_datetime_check = self._es_client.index_exists(index)

    if index and (skip_datetime_check or self._es_client.index_exists(index)):
      search = self._base_query(performer_id, repository_id, account_id, filter_kinds,
                                index=index)
    else:
      search = self._base_query(performer_id, repository_id, account_id, filter_kinds)

    if not skip_datetime_check:
      search = search.query('range', datetime={'gte': start_datetime, 'lt': end_datetime})

    return search

  def _load_logs_for_day(self, logs_date, performer_id, repository_id, account_id, filter_kinds,
                         after_datetime=None, after_random_id=None, size=PAGE_SIZE):
    index = self._es_client.index_name(logs_date)
    if not self._es_client.index_exists(index):
      return []

    search = self._base_query(performer_id, repository_id, account_id, filter_kinds,
                              index=index)
    search = search.sort({'datetime': 'desc'}, {'random_id.keyword': 'desc'})
    search = search.extra(size=size)

    if after_datetime is not None and after_random_id is not None:
      after_datetime_epoch_ms = epoch_ms(after_datetime)
      search = search.extra(search_after=[after_datetime_epoch_ms, after_random_id])

    return search.execute()

  def _load_latest_logs(self, performer_id, repository_id, account_id, filter_kinds, size):
    """ Return the latest logs from Elasticsearch.

        Look at indices up to the configured logrotateworker threshold, or up to 30 days if not
        defined.
    """
    # Set the last index to check to be the logrotateworker threshold, or 30 days
    end_datetime = datetime.now()
    start_datetime = end_datetime - timedelta(days=DATE_RANGE_LIMIT)

    latest_logs = []
    for day in _date_range_descending(start_datetime, end_datetime, includes_end_datetime=True):
      try:
        logs = self._load_logs_for_day(day, performer_id, repository_id, account_id, filter_kinds,
                                       size=size)
        latest_logs.extend(logs)
      except NotFoundError:
        continue

      if len(latest_logs) >= size:
        break

    return _for_elasticsearch_logs(latest_logs[:size], repository_id, account_id)

  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    assert start_datetime is not None and end_datetime is not None

    # Check for a valid combined model token when migrating online from a combined model
    if page_token is not None and page_token.get('readwrite_page_token') is not None:
      page_token = page_token.get('readwrite_page_token')

    if page_token is not None and max_page_count is not None:
      page_number = page_token.get('page_number')
      if page_number is not None and page_number + 1 > max_page_count:
        return LogEntriesPage([], None)

    repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
      repository_name, namespace_name, performer_name)

    after_datetime = None
    after_random_id = None
    if page_token is not None:
      after_datetime = parse_datetime(page_token['datetime'])
      after_random_id = page_token['random_id']

    if after_datetime is not None:
      end_datetime = min(end_datetime, after_datetime)

    all_logs = []

    with CloseForLongOperation(config.app_config):
      for current_date in _date_range_descending(start_datetime, end_datetime):
        try:
          logs = self._load_logs_for_day(current_date, performer_id, repository_id, account_id,
                                         filter_kinds, after_datetime, after_random_id,
                                         size=PAGE_SIZE+1)

          all_logs.extend(logs)
        except NotFoundError:
          continue

        if len(all_logs) > PAGE_SIZE:
          break

    next_page_token = None
    all_logs = all_logs[0:PAGE_SIZE+1]

    if len(all_logs) == PAGE_SIZE + 1:
      # The last element in the response is used to check if there's more elements.
      # The second element in the response is used as the pagination token because search_after does
      # not include the exact match, and so the next page will start with the last element.
      # This keeps the behavior exactly the same as table_logs_model, so that
      # the caller can expect when a pagination token is non-empty, there must be
      # at least 1 log to be retrieved.
      next_page_token = {
        'datetime': all_logs[-2].datetime.isoformat(),
        'random_id': all_logs[-2].random_id,
        'page_number': page_token['page_number'] + 1 if page_token else 1,
      }

    return LogEntriesPage(_for_elasticsearch_logs(all_logs[:PAGE_SIZE], repository_id, account_id),
                          next_page_token)

  def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
                         filter_kinds=None, size=20):
    repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
      repository_name, namespace_name, performer_name)

    with CloseForLongOperation(config.app_config):
      latest_logs = self._load_latest_logs(performer_id, repository_id, account_id, filter_kinds,
                                           size)

    return latest_logs

  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    if end_datetime - start_datetime >= timedelta(days=DATE_RANGE_LIMIT):
      raise Exception('Cannot lookup aggregated logs over a period longer than a month')

    repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
      repository_name, namespace_name, performer_name)

    with CloseForLongOperation(config.app_config):
      search = self._base_query_date_range(start_datetime, end_datetime, performer_id,
                                           repository_id, account_id, filter_kinds)
      search.aggs.bucket('by_id', 'terms', field='kind_id').bucket('by_date', 'date_histogram',
                                                                   field='datetime', interval='day')
      # es returns all buckets when size=0
      search = search.extra(size=0)
      resp = search.execute()

    if not resp.aggregations:
      return []

    counts = []
    by_id = resp.aggregations['by_id']

    for id_bucket in by_id.buckets:
      for date_bucket in id_bucket.by_date.buckets:
        if date_bucket.doc_count > 0:
          counts.append(AggregatedLogCount(id_bucket.key, date_bucket.doc_count, date_bucket.key))

    return counts

  def count_repository_actions(self, repository, day):
    index = self._es_client.index_name(day)
    search = self._base_query_date_range(day, day + timedelta(days=1),
                                         None,
                                         repository.id,
                                         None,
                                         None,
                                         index=index)
    search = search.params(request_timeout=COUNT_REPOSITORY_ACTION_TIMEOUT)

    try:
      return search.count()
    except NotFoundError:
      return 0

  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
    if self._should_skip_logging and self._should_skip_logging(kind_name, namespace_name,
                                                               is_free_namespace):
      return

    if repository_name is not None:
      assert repository is None
      assert namespace_name is not None
      repository = model.repository.get_repository(namespace_name, repository_name)

    if timestamp is None:
      timestamp = datetime.today()

    account_id = None
    performer_id = None
    repository_id = None

    if namespace_name is not None:
      account_id = model.user.get_namespace_user(namespace_name).id

    if performer is not None:
      performer_id = performer.id

    if repository is not None:
      repository_id = repository.id

    metadata_json = json.dumps(metadata or {}, default=_json_serialize)
    kind_id = model.log._get_log_entry_kind(kind_name)
    log = LogEntry(random_id=_random_id(), kind_id=kind_id, account_id=account_id,
                   performer_id=performer_id, ip=ip, metadata_json=metadata_json,
                   repository_id=repository_id, datetime=timestamp)

    try:
      self._logs_producer.send(log)
    except LogSendException as lse:
      strict_logging_disabled = config.app_config.get('ALLOW_PULLS_WITHOUT_STRICT_LOGGING')
      logger.exception('log_action failed', extra=({'exception': lse}).update(log.to_dict()))
      if not (strict_logging_disabled and kind_name in ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING):
        raise

  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    max_query_time = max_query_time.total_seconds() if max_query_time is not None else 300
    search = self._base_query_date_range(start_datetime, end_datetime, None, repository_id,
                                         namespace_id, None)

    def raise_on_timeout(batch_generator):
      start = time()
      for batch in batch_generator:
        elapsed = time() - start
        if elapsed > max_query_time:
          logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`', namespace_id,
                       repository_id, elapsed)
          raise LogsIterationTimeout()

        yield batch
        start = time()

    def read_batch(scroll):
      batch = []
      for log in scroll:
        batch.append(log)
        if len(batch) == DEFAULT_RESULT_WINDOW:
          yield _for_elasticsearch_logs(batch, repository_id=repository_id,
                                        namespace_id=namespace_id)
          batch = []

      if batch:
        yield _for_elasticsearch_logs(batch, repository_id=repository_id, namespace_id=namespace_id)

    search = search.params(size=DEFAULT_RESULT_WINDOW, request_timeout=max_query_time)

    try:
      with CloseForLongOperation(config.app_config):
        for batch in raise_on_timeout(read_batch(search.scan())):
          yield batch
    except ConnectionTimeout:
      raise LogsIterationTimeout()

  def can_delete_index(self, index, cutoff_date):
    return self._es_client.can_delete_index(index, cutoff_date)

  def list_indices(self):
    return self._es_client.list_indices()

  def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
    """ Yield a context manager for a group of outdated logs. """
    all_indices = self.list_indices()
    for index in all_indices:
      if not self.can_delete_index(index, cutoff_date):
        continue

      context = ElasticsearchLogRotationContext(index, min_logs_per_rotation, self._es_client)
      yield context


class ElasticsearchLogRotationContext(LogRotationContextInterface):
  """
  ElasticsearchLogRotationContext yields batches of logs from an index.

  When completed without exceptions, this context will delete its associated
  Elasticsearch index.
  """
  def __init__(self, index, min_logs_per_rotation, es_client):
    self._es_client = es_client
    self.min_logs_per_rotation = min_logs_per_rotation
    self.index = index

    self.start_pos = 0
    self.end_pos = 0

    self.scroll = None

  def __enter__(self):
    search = self._base_query()
    self.scroll = search.scan()
    return self

  def __exit__(self, ex_type, ex_value, ex_traceback):
    if ex_type is None and ex_value is None and ex_traceback is None:
      logger.debug('Deleting index %s', self.index)
      self._es_client.delete_index(self.index)

  def yield_logs_batch(self):
    def batched_logs(gen, size):
      batch = []
      for log in gen:
        batch.append(log)
        if len(batch) == size:
          yield batch
          batch = []

      if batch:
        yield batch

    for batch in batched_logs(self.scroll, self.min_logs_per_rotation):
      self.end_pos = self.start_pos + len(batch) - 1
      yield batch, self._generate_filename()
      self.start_pos = self.end_pos + 1

  def _base_query(self):
    search = LogEntry.search(index=self.index)
    return search

  def _generate_filename(self):
    """ Generate the filenames used to archive the action logs. """
    filename = '%s_%d-%d' % (self.index, self.start_pos, self.end_pos)
    filename = '.'.join((filename, 'txt.gz'))
    return filename
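Pagination in this model walks one daily index at a time, newest first, and carries the `datetime`/`random_id` of the last returned document in the page token for search_after. The day iteration itself is pure and easy to see in isolation; a small sketch using only `_date_range_descending` as defined above:

from datetime import datetime
from data.logs_model.document_logs_model import _date_range_descending

start = datetime(2019, 1, 1)
end = datetime(2019, 1, 4)

# Walks day indices newest-first; the end date is exclusive unless includes_end_datetime=True.
list(_date_range_descending(start, end))
# -> [date(2019, 1, 3), date(2019, 1, 2), date(2019, 1, 1)]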
255  data/logs_model/elastic_logs.py  Normal file
@@ -0,0 +1,255 @@
import os
import logging
import re
from datetime import datetime, timedelta

from requests_aws4auth import AWS4Auth

from elasticsearch import RequestsHttpConnection
from elasticsearch.exceptions import NotFoundError, AuthorizationException
from elasticsearch_dsl import Index, Document, Integer, Date, Text, Ip, Keyword
from elasticsearch_dsl.connections import connections


logger = logging.getLogger(__name__)

# Name of the connection used for Elasticsearch's template API
ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS = 'logentry_template'

# Prefix of autogenerated indices
INDEX_NAME_PREFIX = 'logentry_'

# Time-based index date format
INDEX_DATE_FORMAT = '%Y-%m-%d'

# Timeout for default connection
ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT = 15

# Timeout for template api Connection
ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT = 60

# Force an index template update
ELASTICSEARCH_FORCE_INDEX_TEMPLATE_UPDATE = os.environ.get('FORCE_INDEX_TEMPLATE_UPDATE', '')

# Valid index prefix pattern
VALID_INDEX_PATTERN = r'^((?!\.$|\.\.$|[-_+])([^A-Z:\/*?\"<>|,# ]){1,255})$'


class LogEntry(Document):
  # random_id is the tie-breaker for sorting in pagination.
  # random_id is also used for deduplication of records when using an "at-least-once" delivery stream.
  # Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-after.html
  #
  # We don't use the _id of a document since `doc_values` is not built for this field:
  # an on-disk data structure that stores the same data in a columnar format
  # for optimized sorting and aggregations.
  # Reference: https://github.com/elastic/elasticsearch/issues/35369
  random_id = Text(fields={'keyword': Keyword()})
  kind_id = Integer()
  account_id = Integer()
  performer_id = Integer()
  repository_id = Integer()
  ip = Ip()
  metadata_json = Text()
  datetime = Date()

  _initialized = False

  @classmethod
  def init(cls, index_prefix, index_settings=None, skip_template_init=False):
    """
    Create the index template, and populate LogEntry's mapping and index settings.
    """
    wildcard_index = Index(name=index_prefix + '*')
    wildcard_index.settings(**(index_settings or {}))
    wildcard_index.document(cls)
    cls._index = wildcard_index
    cls._index_prefix = index_prefix

    if not skip_template_init:
      cls.create_or_update_template()

    # Since the elasticsearch-dsl API requires the document's index being defined as an inner class
    # at the class level, this function needs to be called first before being able to call `save`.
    cls._initialized = True

  @classmethod
  def create_or_update_template(cls):
    assert cls._index and cls._index_prefix
    index_template = cls._index.as_template(cls._index_prefix)
    index_template.save(using=ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS)

  def save(self, **kwargs):
    # We group the logs based on year, month and day as different indexes, so that
    # dropping those indexes based on retention range is easy.
    #
    # NOTE: This is only used if logging directly to Elasticsearch
    # When using Kinesis or Kafka, the consumer of these streams
    # will be responsible for the management of the indices' lifecycle.
    assert LogEntry._initialized
    kwargs['index'] = self.datetime.strftime(self._index_prefix + INDEX_DATE_FORMAT)
    return super(LogEntry, self).save(**kwargs)


class ElasticsearchLogs(object):
  """
  Model for logs operations stored in an Elasticsearch cluster.
  """

  def __init__(self, host=None, port=None, access_key=None, secret_key=None, aws_region=None,
               index_settings=None, use_ssl=True, index_prefix=INDEX_NAME_PREFIX):
    # For options in index_settings, refer to:
    # https://www.elastic.co/guide/en/elasticsearch/guide/master/_index_settings.html
    # some index settings are set at index creation time, and therefore, you should NOT
    # change those settings once the index is set.
    self._host = host
    self._port = port
    self._access_key = access_key
    self._secret_key = secret_key
    self._aws_region = aws_region
    self._index_prefix = index_prefix
    self._index_settings = index_settings
    self._use_ssl = use_ssl

    self._client = None
    self._initialized = False

  def _initialize(self):
    """
    Initialize a connection to an ES cluster and
    create an index template if it does not exist.
    """
    if not self._initialized:
      http_auth = None
      if self._access_key and self._secret_key and self._aws_region:
        http_auth = AWS4Auth(self._access_key, self._secret_key, self._aws_region, 'es')
      elif self._access_key and self._secret_key:
        http_auth = (self._access_key, self._secret_key)
      else:
        logger.warn("Connecting to Elasticsearch without HTTP auth")

      self._client = connections.create_connection(
        hosts=[{
          'host': self._host,
          'port': self._port
        }],
        http_auth=http_auth,
        use_ssl=self._use_ssl,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        timeout=ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT,
      )

      # Create a second connection with a timeout of 60s vs 10s.
      # For some reason the PUT template API can take anywhere between
      # 10s and 30s on the test cluster.
      # This only needs to be done once to initialize the index template
      connections.create_connection(
        alias=ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS,
        hosts=[{
          'host': self._host,
          'port': self._port
        }],
        http_auth=http_auth,
        use_ssl=self._use_ssl,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        timeout=ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT,
      )

      try:
        force_template_update = ELASTICSEARCH_FORCE_INDEX_TEMPLATE_UPDATE.lower() == 'true'
        self._client.indices.get_template(self._index_prefix)
        LogEntry.init(self._index_prefix, self._index_settings,
                      skip_template_init=not force_template_update)
      except NotFoundError:
        LogEntry.init(self._index_prefix, self._index_settings, skip_template_init=False)
      finally:
        try:
          connections.remove_connection(ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS)
        except KeyError as ke:
          logger.exception('Elasticsearch connection not found to remove %s: %s',
                           ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS, ke)

      self._initialized = True

  def index_name(self, day):
    """ Return an index name for the given day. """
    return self._index_prefix + day.strftime(INDEX_DATE_FORMAT)

  def index_exists(self, index):
    try:
      return index in self._client.indices.get(index)
    except NotFoundError:
      return False

  @staticmethod
  def _valid_index_prefix(prefix):
    """ Check that the given index prefix is valid with the set of
        indices used by this class.
    """
    return re.match(VALID_INDEX_PATTERN, prefix) is not None

  def _valid_index_name(self, index):
    """ Check that the given index name is valid and follows the format:
        <index_prefix>YYYY-MM-DD
    """
    if not ElasticsearchLogs._valid_index_prefix(index):
      return False

    if not index.startswith(self._index_prefix) or len(index) > 255:
      return False

    index_dt_str = index.split(self._index_prefix, 1)[-1]
    try:
      datetime.strptime(index_dt_str, INDEX_DATE_FORMAT)
      return True
    except ValueError:
      logger.exception('Invalid date format (YYYY-MM-DD) for index: %s', index)
      return False

  def can_delete_index(self, index, cutoff_date):
    """ Check if the given index can be deleted based on the given index's date and cutoff date. """
    assert self._valid_index_name(index)
    index_dt = datetime.strptime(index[len(self._index_prefix):], INDEX_DATE_FORMAT)
    return index_dt < cutoff_date and cutoff_date - index_dt >= timedelta(days=1)

  def list_indices(self):
    self._initialize()
    try:
      return self._client.indices.get(self._index_prefix + '*').keys()
    except NotFoundError as nfe:
      logger.exception('`%s` indices not found: %s', self._index_prefix, nfe.info)
      return []
    except AuthorizationException as ae:
      logger.exception('Unauthorized for indices `%s`: %s', self._index_prefix, ae.info)
      return None

  def delete_index(self, index):
    self._initialize()
    assert self._valid_index_name(index)

    try:
      self._client.indices.delete(index)
      return index
    except NotFoundError as nfe:
      logger.exception('`%s` indices not found: %s', index, nfe.info)
      return None
    except AuthorizationException as ae:
      logger.exception('Unauthorized to delete index `%s`: %s', index, ae.info)
      return None


def configure_es(host, port, access_key=None, secret_key=None, aws_region=None,
                 index_prefix=None, use_ssl=True, index_settings=None):
  """
  For options in index_settings, refer to:
  https://www.elastic.co/guide/en/elasticsearch/guide/master/_index_settings.html
  some index settings are set at index creation time, and therefore, you should NOT
  change those settings once the index is set.
  """
  es_client = ElasticsearchLogs(host=host, port=port, access_key=access_key, secret_key=secret_key,
                                aws_region=aws_region, index_prefix=index_prefix or INDEX_NAME_PREFIX,
                                use_ssl=use_ssl, index_settings=index_settings)
  es_client._initialize()
  return es_client
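Index naming and the retention check are pure string/date logic and do not touch the cluster, so they can be exercised without a live connection. A minimal sketch, with placeholder host/port values:

from datetime import date, datetime
from data.logs_model.elastic_logs import ElasticsearchLogs

es = ElasticsearchLogs(host='localhost', port=9200)

index = es.index_name(date(2019, 1, 1))
# 'logentry_2019-01-01' -- one index per day, so retention is just dropping old indices.

es.can_delete_index(index, datetime(2019, 1, 10))
# True: the index date is at least one full day older than the cutoff.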
244  data/logs_model/inmemory_model.py  Normal file
@@ -0,0 +1,244 @@
import logging
import json

from collections import namedtuple
from datetime import datetime
from tzlocal import get_localzone
from dateutil.relativedelta import relativedelta

from data import model
from data.logs_model.datatypes import AggregatedLogCount, LogEntriesPage, Log
from data.logs_model.interface import (ActionLogsDataInterface, LogRotationContextInterface,
                                       LogsIterationTimeout)

logger = logging.getLogger(__name__)

LogAndRepository = namedtuple('LogAndRepository', ['log', 'stored_log', 'repository'])

StoredLog = namedtuple('StoredLog', ['kind_id',
                                     'account_id',
                                     'performer_id',
                                     'ip',
                                     'metadata_json',
                                     'repository_id',
                                     'datetime'])


class InMemoryModel(ActionLogsDataInterface):
  """
  InMemoryModel implements the data model for logs in-memory. FOR TESTING ONLY.
  """
  def __init__(self):
    self.logs = []

  def _filter_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                   namespace_name=None, filter_kinds=None):
    if filter_kinds is not None:
      assert all(isinstance(kind_name, str) for kind_name in filter_kinds)

    for log_and_repo in self.logs:
      if log_and_repo.log.datetime < start_datetime or log_and_repo.log.datetime > end_datetime:
        continue

      if performer_name and log_and_repo.log.performer_username != performer_name:
        continue

      if (repository_name and
          (not log_and_repo.repository or log_and_repo.repository.name != repository_name)):
        continue

      if namespace_name and log_and_repo.log.account_username != namespace_name:
        continue

      if filter_kinds:
        kind_map = model.log.get_log_entry_kinds()
        ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds]
        if log_and_repo.log.kind_id in ignore_ids:
          continue

      yield log_and_repo

  def _filter_latest_logs(self, performer_name=None, repository_name=None,
                          namespace_name=None, filter_kinds=None):
    if filter_kinds is not None:
      assert all(isinstance(kind_name, str) for kind_name in filter_kinds)

    for log_and_repo in sorted(self.logs, key=lambda t: t.log.datetime, reverse=True):
      if performer_name and log_and_repo.log.performer_username != performer_name:
        continue

      if (repository_name and
          (not log_and_repo.repository or log_and_repo.repository.name != repository_name)):
        continue

      if namespace_name and log_and_repo.log.account_username != namespace_name:
        continue

      if filter_kinds:
        kind_map = model.log.get_log_entry_kinds()
        ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds]
        if log_and_repo.log.kind_id in ignore_ids:
          continue

      yield log_and_repo

  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    logs = []
    for log_and_repo in self._filter_logs(start_datetime, end_datetime, performer_name,
                                          repository_name, namespace_name, filter_kinds):
      logs.append(log_and_repo.log)
    return LogEntriesPage(logs, None)

  def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
                         filter_kinds=None, size=20):
    latest_logs = []
    for log_and_repo in self._filter_latest_logs(performer_name, repository_name, namespace_name,
                                                 filter_kinds):
      if size is not None and len(latest_logs) == size:
        break

      latest_logs.append(log_and_repo.log)

    return latest_logs

  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    entries = {}
    for log_and_repo in self._filter_logs(start_datetime, end_datetime, performer_name,
                                          repository_name, namespace_name, filter_kinds):
      entry = log_and_repo.log
      synthetic_date = datetime(start_datetime.year, start_datetime.month, int(entry.datetime.day),
                                tzinfo=get_localzone())
      if synthetic_date.day < start_datetime.day:
        synthetic_date = synthetic_date + relativedelta(months=1)

      key = '%s-%s' % (entry.kind_id, entry.datetime.day)

      if key in entries:
        entries[key] = AggregatedLogCount(entry.kind_id, entries[key].count + 1,
                                          synthetic_date)
      else:
        entries[key] = AggregatedLogCount(entry.kind_id, 1, synthetic_date)

    return entries.values()

  def count_repository_actions(self, repository, day):
    count = 0
    for log_and_repo in self.logs:
      if log_and_repo.repository != repository:
        continue

      if log_and_repo.log.datetime.day != day.day:
        continue

      count += 1

    return count

  def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
                        namespace_name=None, repository_name=None, callback_url=None,
                        callback_email=None, filter_kinds=None):
    raise NotImplementedError

  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
    timestamp = timestamp or datetime.today()

    if not repository and repository_name and namespace_name:
      repository = model.repository.get_repository(namespace_name, repository_name)

    account = None
    account_id = None
    performer_id = None
    repository_id = None

    if namespace_name is not None:
      account = model.user.get_namespace_user(namespace_name)
      account_id = account.id

    if performer is not None:
      performer_id = performer.id

    if repository is not None:
      repository_id = repository.id

    metadata_json = json.dumps(metadata or {})
    kind_id = model.log.get_log_entry_kinds()[kind_name]

    stored_log = StoredLog(
      kind_id,
      account_id,
      performer_id,
      ip,
      metadata_json,
      repository_id,
      timestamp
    )

    log = Log(metadata_json=metadata,
              ip=ip,
              datetime=timestamp,
              performer_email=performer.email if performer else None,
              performer_username=performer.username if performer else None,
              performer_robot=performer.robot if performer else None,
              account_organization=account.organization if account else None,
              account_username=account.username if account else None,
              account_email=account.email if account else None,
              account_robot=account.robot if account else None,
              kind_id=kind_id)

    self.logs.append(LogAndRepository(log, stored_log, repository))

  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    # Just for testing.
    if max_query_time is not None:
      raise LogsIterationTimeout()

    logs = []
    for log_and_repo in self._filter_logs(start_datetime, end_datetime):
      if (repository_id and
          (not log_and_repo.repository or log_and_repo.repository.id != repository_id)):
        continue

      if namespace_id:
        if log_and_repo.log.account_username is None:
          continue

        namespace = model.user.get_namespace_user(log_and_repo.log.account_username)
        if namespace.id != namespace_id:
          continue

      logs.append(log_and_repo.log)

    yield logs

  def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
    expired_logs = [log_and_repo for log_and_repo in self.logs
                    if log_and_repo.log.datetime <= cutoff_date]
    while True:
      if not expired_logs:
        break
      context = InMemoryLogRotationContext(expired_logs[:min_logs_per_rotation], self.logs)
      expired_logs = expired_logs[min_logs_per_rotation:]
      yield context


class InMemoryLogRotationContext(LogRotationContextInterface):
  def __init__(self, expired_logs, all_logs):
    self.expired_logs = expired_logs
    self.all_logs = all_logs

  def __enter__(self):
    return self

  def __exit__(self, ex_type, ex_value, ex_traceback):
    if ex_type is None and ex_value is None and ex_traceback is None:
      for log in self.expired_logs:
        self.all_logs.remove(log)

  def yield_logs_batch(self):
    """ Yield a batch of logs and a filename for that batch. """
    filename = 'inmemory_model_filename_placeholder'
    filename = '.'.join((filename, 'txt.gz'))
    yield [log_and_repo.stored_log for log_and_repo in self.expired_logs], filename
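Because `log_action()` here still resolves users and kinds through the real data model, a test can instead append fixture entries to `self.logs` directly; the lookup methods only read from that list. A minimal sketch with made-up fixture values:

from datetime import datetime
from data.logs_model.datatypes import Log
from data.logs_model.inmemory_model import InMemoryModel, LogAndRepository

test_model = InMemoryModel()

# Hand-built Log fixture; stored_log and repository are left as None since lookups don't need them.
log = Log(metadata_json={}, ip='127.0.0.1', datetime=datetime(2019, 1, 1),
          performer_email=None, performer_username=None, performer_robot=None,
          account_organization=None, account_username='acme', account_email=None,
          account_robot=None, kind_id=1)
test_model.logs.append(LogAndRepository(log, None, None))

page = test_model.lookup_logs(datetime(2018, 12, 31), datetime(2019, 1, 2), namespace_name='acme')
assert len(page.logs) == 1 and page.next_page_token is None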
95
data/logs_model/interface.py
Normal file
95
data/logs_model/interface.py
Normal file
@@ -0,0 +1,95 @@
from abc import ABCMeta, abstractmethod
from six import add_metaclass


class LogsIterationTimeout(Exception):
  """ Exception raised if logs iteration times out. """


@add_metaclass(ABCMeta)
class ActionLogsDataInterface(object):
  """ Interface for code to work with the logs data model. The logs data model consists
      of all access for reading and writing action logs.
  """
  @abstractmethod
  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    """ Looks up all logs between the start_datetime and end_datetime, filtered
        by performer (a user), repository or namespace. Note that one (and only one) of the three
        can be specified. Returns a LogEntriesPage. `filter_kinds`, if specified, is a set/list
        of the kinds of logs to filter out.
    """

  @abstractmethod
  def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
                         filter_kinds=None, size=20):
    """ Looks up the latest logs of a specific kind, filtered by performer (a user),
        repository or namespace. Note that one (and only one) of the three can be specified.
        Returns a list of `Log`.
    """

  @abstractmethod
  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    """ Returns the aggregated count of logs, by kind, between the start_datetime and end_datetime,
        filtered by performer (a user), repository or namespace. Note that one (and only one) of
        the three can be specified. Returns a list of AggregatedLogCount.
    """

  @abstractmethod
  def count_repository_actions(self, repository, day):
    """ Returns the total number of repository actions over the given day, in the given repository,
        or None on error.
    """

  @abstractmethod
  def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
                        namespace_name=None, repository_name=None, callback_url=None,
                        callback_email=None, filter_kinds=None):
    """ Queues logs between the start_datetime and end_datetime, filtered by a repository or
        namespace, for export to the specified URL and/or email address. Returns the ID of the
        queued export job, or None on error.
    """

  @abstractmethod
  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
    """ Logs a single action as having taken place. """

  @abstractmethod
  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    """ Returns an iterator that yields bundles of all logs found between the start_datetime and
        end_datetime, optionally filtered by the repository or namespace. This function should be
        used for any bulk lookup operations, and implementations should put minimal strain on the
        backing storage for large operations. If there was an error in setting up, returns None.

        If max_query_time is specified, each iteration that yields a log bundle will have its
        queries run with a maximum timeout of that specified, and, if any exceed that threshold,
        LogsIterationTimeout will be raised instead of returning the logs bundle.
    """

  @abstractmethod
  def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
    """
    A generator that yields contexts implementing the LogRotationContextInterface.
    Each context represents a set of logs to be archived and deleted once
    the context completes without exceptions.

    For database logs, the LogRotationContext abstracts over a set of rows. When the context
    finishes, its associated rows get deleted.

    For Elasticsearch logs, the LogRotationContext abstracts over indices. When the context
    finishes, its associated index gets deleted.
    """


@add_metaclass(ABCMeta)
class LogRotationContextInterface(object):
  """ Interface for iterating over a set of logs to be archived. """
  @abstractmethod
  def yield_logs_batch(self):
    """
    Generator yielding a batch of logs and a filename for that batch.
    A batch is a subset of the logs that are part of the context.
    """
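For orientation only (not part of the diff), a rough sketch of how callers are expected to page through `lookup_logs` using the returned LogEntriesPage; `logs_model`, `start` and `end` are assumed to exist in the caller.

# Hypothetical consumer: drain every page of logs between two datetimes.
all_logs = []
page_token = None
while True:
  page = logs_model.lookup_logs(start, end, namespace_name='devtable', page_token=page_token)
  all_logs.extend(page.logs)
  page_token = page.next_page_token
  if page_token is None:
    break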
27
data/logs_model/logs_producer/__init__.py
Normal file
@@ -0,0 +1,27 @@
import logging


logger = logging.getLogger(__name__)


class LogSendException(Exception):
  """ A generic error raised when sending logs to their destination,
      e.g. Kinesis, Kafka, Elasticsearch, ...
  """
  pass


class LogProducerProxy(object):
  def __init__(self):
    self._model = None

  def initialize(self, model):
    self._model = model
    logger.info('===============================')
    logger.info('Using producer `%s`', self._model)
    logger.info('===============================')

  def __getattr__(self, attr):
    if not self._model:
      raise AttributeError("LogProducerProxy is not initialized")
    return getattr(self._model, attr)
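A small, hypothetical wiring sketch for the proxy above: attribute access is forwarded to whatever producer was passed to `initialize`, and using the proxy before initialization raises the AttributeError shown. The producer instance and `logentry` value are illustrative.

# Hypothetical wiring (illustrative only).
log_producer = LogProducerProxy()
log_producer.initialize(ElasticsearchLogsProducer())  # any LogProducerInterface implementation
log_producer.send(logentry)  # forwarded to ElasticsearchLogsProducer.send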
25
data/logs_model/logs_producer/elasticsearch_logs_producer.py
Normal file
@@ -0,0 +1,25 @@
import logging

from elasticsearch.exceptions import ElasticsearchException

from data.logs_model.logs_producer.interface import LogProducerInterface
from data.logs_model.logs_producer import LogSendException


logger = logging.getLogger(__name__)


class ElasticsearchLogsProducer(LogProducerInterface):
  """ Log producer writing log entries to Elasticsearch.

      This implementation writes directly to Elasticsearch without a streaming/queueing service.
  """
  def send(self, logentry):
    try:
      logentry.save()
    except ElasticsearchException as ex:
      logger.exception('ElasticsearchLogsProducer error sending log to Elasticsearch: %s', ex)
      raise LogSendException('ElasticsearchLogsProducer error sending log to Elasticsearch: %s' % ex)
    except Exception as e:
      logger.exception('ElasticsearchLogsProducer exception sending log to Elasticsearch: %s', e)
      raise LogSendException('ElasticsearchLogsProducer exception sending log to Elasticsearch: %s' % e)
8
data/logs_model/logs_producer/interface.py
Normal file
@@ -0,0 +1,8 @@
from abc import ABCMeta, abstractmethod
from six import add_metaclass


@add_metaclass(ABCMeta)
class LogProducerInterface(object):
  @abstractmethod
  def send(self, logentry):
    """ Send a log entry to the configured log infrastructure. """
45
data/logs_model/logs_producer/kafka_logs_producer.py
Normal file
@@ -0,0 +1,45 @@
import logging

from kafka.errors import KafkaError, KafkaTimeoutError
from kafka import KafkaProducer

from data.logs_model.shared import epoch_ms
from data.logs_model.logs_producer.interface import LogProducerInterface
from data.logs_model.logs_producer.util import logs_json_serializer
from data.logs_model.logs_producer import LogSendException


logger = logging.getLogger(__name__)

DEFAULT_MAX_BLOCK_SECONDS = 5


class KafkaLogsProducer(LogProducerInterface):
  """ Log producer writing log entries to a Kafka stream. """
  def __init__(self, bootstrap_servers=None, topic=None, client_id=None, max_block_seconds=None):
    self.bootstrap_servers = bootstrap_servers
    self.topic = topic
    self.client_id = client_id
    self.max_block_ms = (max_block_seconds or DEFAULT_MAX_BLOCK_SECONDS) * 1000

    self._producer = KafkaProducer(bootstrap_servers=self.bootstrap_servers,
                                   client_id=self.client_id,
                                   max_block_ms=self.max_block_ms,
                                   value_serializer=logs_json_serializer)

  def send(self, logentry):
    try:
      # Both send() and get() block for up to max_block_ms, so a single delivery
      # can take up to 2x max_block_ms before it is confirmed.
      future = self._producer.send(self.topic, logentry.to_dict(), timestamp_ms=epoch_ms(logentry.datetime))
      record_metadata = future.get(timeout=self.max_block_ms)
      assert future.succeeded()
    except KafkaTimeoutError as kte:
      logger.exception('KafkaLogsProducer timeout sending log to Kafka: %s', kte)
      raise LogSendException('KafkaLogsProducer timeout sending log to Kafka: %s' % kte)
    except KafkaError as ke:
      logger.exception('KafkaLogsProducer error sending log to Kafka: %s', ke)
      raise LogSendException('KafkaLogsProducer error sending log to Kafka: %s' % ke)
    except Exception as e:
      logger.exception('KafkaLogsProducer exception sending log to Kafka: %s', e)
      raise LogSendException('KafkaLogsProducer exception sending log to Kafka: %s' % e)
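For context, a hedged sketch of constructing and using this producer; the broker address, topic and client id are placeholders, and `logentry` is assumed to be a LogEntry document like the ones built elsewhere in this change.

# Hypothetical usage (illustrative values only).
producer = KafkaLogsProducer(bootstrap_servers=['kafka:9092'], topic='logentry',
                             client_id='quay-logs-producer', max_block_seconds=5)
producer.send(logentry)  # raises LogSendException on timeout or broker error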
@@ -0,0 +1,75 @@
import logging
import hashlib
import random

import boto3
from botocore.exceptions import ClientError
from botocore.client import Config

from data.logs_model.logs_producer.interface import LogProducerInterface
from data.logs_model.logs_producer.util import logs_json_serializer
from data.logs_model.logs_producer import LogSendException


logger = logging.getLogger(__name__)

KINESIS_PARTITION_KEY_PREFIX = 'logentry_partition_key_'
DEFAULT_CONNECT_TIMEOUT = 5
DEFAULT_READ_TIMEOUT = 5
MAX_RETRY_ATTEMPTS = 5
DEFAULT_MAX_POOL_CONNECTIONS = 10


def _partition_key(number_of_shards=None):
  """ Generate a partition key for AWS Kinesis stream.
      If the number of shards is specified, generate keys where the size of the key space is
      the number of shards.
  """
  key = None
  if number_of_shards is not None:
    shard_number = random.randrange(0, number_of_shards)
    key = hashlib.sha1(KINESIS_PARTITION_KEY_PREFIX + str(shard_number)).hexdigest()
  else:
    key = hashlib.sha1(KINESIS_PARTITION_KEY_PREFIX + str(random.getrandbits(256))).hexdigest()

  return key


class KinesisStreamLogsProducer(LogProducerInterface):
  """ Log producer writing log entries to an Amazon Kinesis Data Stream. """
  def __init__(self, stream_name, aws_region, aws_access_key=None, aws_secret_key=None,
               connect_timeout=None, read_timeout=None, max_retries=None,
               max_pool_connections=None):
    self._stream_name = stream_name
    self._aws_region = aws_region
    self._aws_access_key = aws_access_key
    self._aws_secret_key = aws_secret_key
    self._connect_timeout = connect_timeout or DEFAULT_CONNECT_TIMEOUT
    self._read_timeout = read_timeout or DEFAULT_READ_TIMEOUT
    self._max_retries = max_retries or MAX_RETRY_ATTEMPTS
    self._max_pool_connections = max_pool_connections or DEFAULT_MAX_POOL_CONNECTIONS

    client_config = Config(connect_timeout=self._connect_timeout,
                           read_timeout=self._read_timeout,
                           retries={'max_attempts': self._max_retries},
                           max_pool_connections=self._max_pool_connections)
    self._producer = boto3.client('kinesis', use_ssl=True,
                                  region_name=self._aws_region,
                                  aws_access_key_id=self._aws_access_key,
                                  aws_secret_access_key=self._aws_secret_key,
                                  config=client_config)

  def send(self, logentry):
    try:
      data = logs_json_serializer(logentry)
      self._producer.put_record(
        StreamName=self._stream_name,
        Data=data,
        PartitionKey=_partition_key()
      )
    except ClientError as ce:
      logger.exception('KinesisStreamLogsProducer client error sending log to Kinesis: %s', ce)
      raise LogSendException('KinesisStreamLogsProducer client error sending log to Kinesis: %s' % ce)
    except Exception as e:
      logger.exception('KinesisStreamLogsProducer exception sending log to Kinesis: %s', e)
      raise LogSendException('KinesisStreamLogsProducer exception sending log to Kinesis: %s' % e)
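A short note on the design choice in `_partition_key` above: hashing a bounded shard number confines keys to a key space the size of the shard count, while the default path hashes 256 random bits so records spread roughly uniformly across shards. The usage below is a hypothetical sketch; the stream name and region are placeholders.

# Hypothetical usage (illustrative values only).
producer = KinesisStreamLogsProducer('quay-logentry-stream', 'us-east-1',
                                     connect_timeout=5, read_timeout=5)
producer.send(logentry)  # serialized via logs_json_serializer, keyed by _partition_key()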
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-

import logging
import json
from datetime import datetime
import pytest

from data.logs_model.logs_producer.util import logs_json_serializer
from data.logs_model.elastic_logs import LogEntry


logger = logging.getLogger(__name__)


TEST_DATETIME = datetime.utcnow()

TEST_JSON_STRING = '{"a": "b", "c": "d"}'
TEST_JSON_STRING_WITH_UNICODE = u'{"éëê": "îôû"}'

VALID_LOGENTRY = LogEntry(random_id='123-45', ip='0.0.0.0', metadata_json=TEST_JSON_STRING, datetime=TEST_DATETIME)
VALID_LOGENTRY_WITH_UNICODE = LogEntry(random_id='123-45', ip='0.0.0.0', metadata_json=TEST_JSON_STRING_WITH_UNICODE, datetime=TEST_DATETIME)

VALID_LOGENTRY_EXPECTED_OUTPUT = '{"datetime": "%s", "ip": "0.0.0.0", "metadata_json": "{\\"a\\": \\"b\\", \\"c\\": \\"d\\"}", "random_id": "123-45"}' % TEST_DATETIME.isoformat()
VALID_LOGENTRY_WITH_UNICODE_EXPECTED_OUTPUT = '{"datetime": "%s", "ip": "0.0.0.0", "metadata_json": "{\\"\\u00e9\\u00eb\\u00ea\\": \\"\\u00ee\\u00f4\\u00fb\\"}", "random_id": "123-45"}' % TEST_DATETIME.isoformat()


@pytest.mark.parametrize(
  'is_valid, given_input, expected_output',
  [
    # Valid inputs
    pytest.param(True, VALID_LOGENTRY, VALID_LOGENTRY_EXPECTED_OUTPUT),
    # With unicode
    pytest.param(True, VALID_LOGENTRY_WITH_UNICODE, VALID_LOGENTRY_WITH_UNICODE_EXPECTED_OUTPUT),
  ])
def test_logs_json_serializer(is_valid, given_input, expected_output):
  if not is_valid:
    with pytest.raises(ValueError) as ve:
      data = logs_json_serializer(given_input)
  else:
    data = logs_json_serializer(given_input, sort_keys=True)
    assert data == expected_output

    # Make sure the datetime was serialized in the correct ISO8601
    datetime_str = json.loads(data)['datetime']
    assert datetime_str == TEST_DATETIME.isoformat()
15
data/logs_model/logs_producer/util.py
Normal file
@@ -0,0 +1,15 @@
import json
from datetime import datetime


class LogEntryJSONEncoder(json.JSONEncoder):
  """ JSON encoder to encode datetimes to ISO8601 format. """
  def default(self, obj):
    if isinstance(obj, datetime):
      return obj.isoformat()

    return super(LogEntryJSONEncoder, self).default(obj)


def logs_json_serializer(logentry, sort_keys=False):
  """ Serializes a LogEntry to json bytes. """
  return json.dumps(logentry.to_dict(), cls=LogEntryJSONEncoder,
                    ensure_ascii=True, sort_keys=sort_keys).encode('ascii')
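A brief worked example of the serializer, mirroring the test module above; the LogEntry construction follows that test and is otherwise an assumption.

# Illustrative round trip: datetimes come back as ISO8601 strings, output is ascii-encoded bytes.
import json
from datetime import datetime

from data.logs_model.elastic_logs import LogEntry
from data.logs_model.logs_producer.util import logs_json_serializer

entry = LogEntry(random_id='123-45', ip='0.0.0.0', metadata_json='{"a": "b"}',
                 datetime=datetime.utcnow())
payload = logs_json_serializer(entry, sort_keys=True)
assert json.loads(payload)['ip'] == '0.0.0.0'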
53
data/logs_model/shared.py
Normal file
@@ -0,0 +1,53 @@
import uuid
import json

from calendar import timegm

from data import model


class SharedModel:
  def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
                        namespace_name=None, repository_name=None, callback_url=None,
                        callback_email=None, filter_kinds=None):
    """ Queues logs between the start_datetime and end_datetime, filtered by a repository or
        namespace, for export to the specified URL and/or email address. Returns the ID of the
        queued export job, or None on error.
    """
    export_id = str(uuid.uuid4())
    namespace = model.user.get_namespace_user(namespace_name)
    if namespace is None:
      return None

    repository = None
    if repository_name is not None:
      repository = model.repository.get_repository(namespace_name, repository_name)
      if repository is None:
        return None

    export_action_logs_queue.put([namespace_name],
                                 json.dumps({
                                   'export_id': export_id,
                                   'repository_id': repository.id if repository else None,
                                   'namespace_id': namespace.id,
                                   'namespace_name': namespace.username,
                                   'repository_name': repository.name if repository else None,
                                   'start_time': start_datetime.strftime('%m/%d/%Y'),
                                   'end_time': end_datetime.strftime('%m/%d/%Y'),
                                   'callback_url': callback_url,
                                   'callback_email': callback_email,
                                 }), retries_remaining=3)

    return export_id


def epoch_ms(dt):
  return (timegm(dt.timetuple()) * 1000) + (dt.microsecond / 1000)


def get_kinds_filter(kinds):
  """ Given a list of kinds, return the set of kinds that are not part of that list,
      i.e. the list of kinds to be filtered out. """
  kind_map = model.log.get_log_entry_kinds()
  kind_map = {key: kind_map[key] for key in kind_map if not isinstance(key, int)}
  return [kind_name for kind_name in kind_map if kind_name not in kinds]
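A quick worked check of `epoch_ms` above: `timegm` converts the datetime's time tuple to whole epoch seconds, and the microsecond field contributes the sub-second milliseconds.

# Illustrative check (values follow directly from the formula above).
from datetime import datetime

assert epoch_ms(datetime(2019, 1, 1, 0, 0, 0, 500000)) == 1546300800500  # 1546300800 s + 500 ms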
291
data/logs_model/table_logs_model.py
Normal file
|
@ -0,0 +1,291 @@
|
|||
# pylint: disable=protected-access
|
||||
|
||||
import logging
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from tzlocal import get_localzone
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
from data import model
|
||||
from data.model import config
|
||||
from data.database import LogEntry, LogEntry2, LogEntry3, UseThenDisconnect
|
||||
from data.logs_model.interface import ActionLogsDataInterface, LogsIterationTimeout, \
|
||||
LogRotationContextInterface
|
||||
from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage
|
||||
from data.logs_model.shared import SharedModel
|
||||
from data.model.log import get_stale_logs, get_stale_logs_start_id, delete_stale_logs
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MINIMUM_RANGE_SIZE = 1 # second
|
||||
MAXIMUM_RANGE_SIZE = 60 * 60 * 24 * 30 # seconds ~= 1 month
|
||||
EXPECTED_ITERATION_LOG_COUNT = 1000
|
||||
|
||||
|
||||
LOG_MODELS = [LogEntry3, LogEntry2, LogEntry]
|
||||
|
||||
|
||||
class TableLogsModel(SharedModel, ActionLogsDataInterface):
|
||||
"""
|
||||
TableLogsModel implements the data model for the logs API backed by a single table
|
||||
in the database.
|
||||
"""
|
||||
def __init__(self, should_skip_logging=None, **kwargs):
|
||||
self._should_skip_logging = should_skip_logging
|
||||
|
||||
def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
|
||||
namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
|
||||
if filter_kinds is not None:
|
||||
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
|
||||
|
||||
assert start_datetime is not None
|
||||
assert end_datetime is not None
|
||||
|
||||
repository = None
|
||||
if repository_name and namespace_name:
|
||||
repository = model.repository.get_repository(namespace_name, repository_name)
|
||||
assert repository
|
||||
|
||||
performer = None
|
||||
if performer_name:
|
||||
performer = model.user.get_user(performer_name)
|
||||
assert performer
|
||||
|
||||
def get_logs(m, page_token):
|
||||
logs_query = model.log.get_logs_query(start_datetime, end_datetime, performer=performer,
|
||||
repository=repository, namespace=namespace_name,
|
||||
ignore=filter_kinds, model=m)
|
||||
|
||||
logs, next_page_token = model.modelutil.paginate(logs_query, m,
|
||||
descending=True,
|
||||
page_token=page_token,
|
||||
limit=20,
|
||||
max_page=max_page_count,
|
||||
sort_field_name='datetime')
|
||||
|
||||
return logs, next_page_token
|
||||
|
||||
TOKEN_TABLE_ID = 'tti'
|
||||
table_index = 0
|
||||
logs = []
|
||||
next_page_token = page_token or None
|
||||
|
||||
# Skip empty pages (empty table)
|
||||
while len(logs) == 0 and table_index < len(LOG_MODELS) - 1:
|
||||
table_specified = next_page_token is not None and next_page_token.get(TOKEN_TABLE_ID) is not None
|
||||
if table_specified:
|
||||
table_index = next_page_token.get(TOKEN_TABLE_ID)
|
||||
|
||||
logs_result, next_page_token = get_logs(LOG_MODELS[table_index], next_page_token)
|
||||
logs.extend(logs_result)
|
||||
|
||||
if next_page_token is None and table_index < len(LOG_MODELS) - 1:
|
||||
next_page_token = {TOKEN_TABLE_ID: table_index + 1}
|
||||
|
||||
return LogEntriesPage([Log.for_logentry(log) for log in logs], next_page_token)
|
||||
|
||||
def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
|
||||
filter_kinds=None, size=20):
|
||||
if filter_kinds is not None:
|
||||
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
|
||||
|
||||
repository = None
|
||||
if repository_name and namespace_name:
|
||||
repository = model.repository.get_repository(namespace_name, repository_name)
|
||||
assert repository
|
||||
|
||||
performer = None
|
||||
if performer_name:
|
||||
performer = model.user.get_user(performer_name)
|
||||
assert performer
|
||||
|
||||
def get_latest_logs(m):
|
||||
logs_query = model.log.get_latest_logs_query(performer=performer, repository=repository,
|
||||
namespace=namespace_name, ignore=filter_kinds,
|
||||
model=m, size=size)
|
||||
|
||||
logs = list(logs_query)
|
||||
return [Log.for_logentry(log) for log in logs]
|
||||
|
||||
return get_latest_logs(LOG_MODELS[0])
|
||||
|
||||
def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
|
||||
repository_name=None, namespace_name=None, filter_kinds=None):
|
||||
if filter_kinds is not None:
|
||||
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
|
||||
|
||||
if end_datetime - start_datetime >= timedelta(weeks=4):
|
||||
raise Exception('Cannot lookup aggregated logs over a period longer than a month')
|
||||
|
||||
repository = None
|
||||
if repository_name and namespace_name:
|
||||
repository = model.repository.get_repository(namespace_name, repository_name)
|
||||
|
||||
performer = None
|
||||
if performer_name:
|
||||
performer = model.user.get_user(performer_name)
|
||||
|
||||
entries = {}
|
||||
for log_model in LOG_MODELS:
|
||||
aggregated = model.log.get_aggregated_logs(start_datetime, end_datetime,
|
||||
performer=performer,
|
||||
repository=repository,
|
||||
namespace=namespace_name,
|
||||
ignore=filter_kinds,
|
||||
model=log_model)
|
||||
|
||||
for entry in aggregated:
|
||||
synthetic_date = datetime(start_datetime.year, start_datetime.month, int(entry.day),
|
||||
tzinfo=get_localzone())
|
||||
if synthetic_date.day < start_datetime.day:
|
||||
synthetic_date = synthetic_date + relativedelta(months=1)
|
||||
|
||||
key = '%s-%s' % (entry.kind_id, entry.day)
|
||||
|
||||
if key in entries:
|
||||
entries[key] = AggregatedLogCount(entry.kind_id, entry.count + entries[key].count,
|
||||
synthetic_date)
|
||||
else:
|
||||
entries[key] = AggregatedLogCount(entry.kind_id, entry.count, synthetic_date)
|
||||
|
||||
return entries.values()
|
||||
|
||||
def count_repository_actions(self, repository, day):
|
||||
return model.repositoryactioncount.count_repository_actions(repository, day)
|
||||
|
||||
def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
|
||||
repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
|
||||
if self._should_skip_logging and self._should_skip_logging(kind_name, namespace_name,
|
||||
is_free_namespace):
|
||||
return
|
||||
|
||||
if repository_name is not None:
|
||||
assert repository is None
|
||||
assert namespace_name is not None
|
||||
repository = model.repository.get_repository(namespace_name, repository_name)
|
||||
|
||||
model.log.log_action(kind_name, namespace_name, performer=performer, repository=repository,
|
||||
ip=ip, metadata=metadata or {}, timestamp=timestamp)
|
||||
|
||||
def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
|
||||
namespace_id=None, max_query_time=None):
|
||||
# Using an adjusting scale, start downloading log rows in batches, starting at
|
||||
# MINIMUM_RANGE_SIZE and doubling until we've reached EXPECTED_ITERATION_LOG_COUNT or
|
||||
# the lookup range has reached MAXIMUM_RANGE_SIZE. If at any point this operation takes
|
||||
# longer than the MAXIMUM_WORK_PERIOD_SECONDS, terminate the batch operation as timed out.
|
||||
batch_start_time = datetime.utcnow()
|
||||
|
||||
current_start_datetime = start_datetime
|
||||
current_batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE)
|
||||
|
||||
while current_start_datetime < end_datetime:
|
||||
# Verify we haven't been working for too long.
|
||||
work_elapsed = datetime.utcnow() - batch_start_time
|
||||
if max_query_time is not None and work_elapsed > max_query_time:
|
||||
logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`',
|
||||
namespace_id, repository_id, work_elapsed)
|
||||
raise LogsIterationTimeout()
|
||||
|
||||
current_end_datetime = current_start_datetime + current_batch_size
|
||||
current_end_datetime = min(current_end_datetime, end_datetime)
|
||||
|
||||
# Load the next set of logs.
|
||||
def load_logs():
|
||||
logger.debug('Retrieving logs over range %s -> %s with namespace %s and repository %s',
|
||||
current_start_datetime, current_end_datetime, namespace_id, repository_id)
|
||||
|
||||
logs_query = model.log.get_logs_query(namespace=namespace_id,
|
||||
repository=repository_id,
|
||||
start_time=current_start_datetime,
|
||||
end_time=current_end_datetime)
|
||||
logs = list(logs_query)
|
||||
for log in logs:
|
||||
if namespace_id is not None:
|
||||
assert log.account_id == namespace_id
|
||||
|
||||
if repository_id is not None:
|
||||
assert log.repository_id == repository_id
|
||||
|
||||
logs = [Log.for_logentry(log) for log in logs]
|
||||
return logs
|
||||
|
||||
logs, elapsed = _run_and_time(load_logs)
|
||||
if max_query_time is not None and elapsed > max_query_time:
|
||||
logger.error('Retrieval of logs for export `%s/%s` with range `%s-%s` timed out at `%s`',
|
||||
namespace_id, repository_id, current_start_datetime, current_end_datetime,
|
||||
elapsed)
|
||||
raise LogsIterationTimeout()
|
||||
|
||||
yield logs
|
||||
|
||||
# Move forward.
|
||||
current_start_datetime = current_end_datetime
|
||||
|
||||
# Increase the batch size if necessary.
|
||||
if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
|
||||
seconds = min(MAXIMUM_RANGE_SIZE, current_batch_size.total_seconds() * 2)
|
||||
current_batch_size = timedelta(seconds=seconds)
|
||||
|
||||
def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
|
||||
""" Yield a context manager for a group of outdated logs. """
|
||||
for log_model in LOG_MODELS:
|
||||
while True:
|
||||
with UseThenDisconnect(config.app_config):
|
||||
start_id = get_stale_logs_start_id(log_model)
|
||||
|
||||
if start_id is None:
|
||||
logger.warning('Failed to find start id')
|
||||
break
|
||||
|
||||
logger.debug('Found starting ID %s', start_id)
|
||||
lookup_end_id = start_id + min_logs_per_rotation
|
||||
logs = [log for log in get_stale_logs(start_id, lookup_end_id,
|
||||
log_model, cutoff_date)]
|
||||
|
||||
if not logs:
|
||||
logger.debug('No further logs found')
|
||||
break
|
||||
|
||||
end_id = max([log.id for log in logs])
|
||||
context = DatabaseLogRotationContext(logs, log_model, start_id, end_id)
|
||||
yield context
|
||||
|
||||
|
||||
def _run_and_time(fn):
|
||||
start_time = datetime.utcnow()
|
||||
result = fn()
|
||||
return result, datetime.utcnow() - start_time
|
||||
|
||||
|
||||
table_logs_model = TableLogsModel()
|
||||
|
||||
|
||||
class DatabaseLogRotationContext(LogRotationContextInterface):
|
||||
"""
|
||||
DatabaseLogRotationContext represents a batch of logs to be archived together.
|
||||
i.e A set of logs to be archived in the same file (based on the number of logs per rotation).
|
||||
|
||||
When completed without exceptions, this context will delete the stale logs
|
||||
from rows `start_id` to `end_id`.
|
||||
"""
|
||||
def __init__(self, logs, log_model, start_id, end_id):
|
||||
self.logs = logs
|
||||
self.log_model = log_model
|
||||
self.start_id = start_id
|
||||
self.end_id = end_id
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, ex_type, ex_value, ex_traceback):
|
||||
if ex_type is None and ex_value is None and ex_traceback is None:
|
||||
with UseThenDisconnect(config.app_config):
|
||||
logger.debug('Deleting logs from IDs %s to %s', self.start_id, self.end_id)
|
||||
delete_stale_logs(self.start_id, self.end_id, self.log_model)
|
||||
|
||||
def yield_logs_batch(self):
|
||||
""" Yield a batch of logs and a filename for that batch. """
|
||||
filename = '%d-%d-%s.txt.gz' % (self.start_id, self.end_id,
|
||||
self.log_model.__name__.lower())
|
||||
yield self.logs, filename
|
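For intuition on `yield_logs_for_export` in the table model above: the lookup window starts at MINIMUM_RANGE_SIZE seconds and doubles whenever a batch returns fewer than EXPECTED_ITERATION_LOG_COUNT rows, capped at MAXIMUM_RANGE_SIZE. A toy sketch of that schedule (not part of the diff):

# Toy illustration of the window-doubling schedule used by yield_logs_for_export.
window = MINIMUM_RANGE_SIZE  # 1 second
schedule = []
for _ in range(6):
  schedule.append(window)
  window = min(MAXIMUM_RANGE_SIZE, window * 2)
# schedule == [1, 2, 4, 8, 16, 32]; growth stops once MAXIMUM_RANGE_SIZE (~1 month) is hit.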
0
data/logs_model/test/__init__.py
Normal file
390
data/logs_model/test/fake_elasticsearch.py
Normal file
|
@ -0,0 +1,390 @@
|
|||
import json
|
||||
import uuid
|
||||
import fnmatch
|
||||
|
||||
from collections import defaultdict
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime
|
||||
|
||||
import dateutil.parser
|
||||
|
||||
from httmock import urlmatch, HTTMock
|
||||
|
||||
FAKE_ES_HOST = 'fakees'
|
||||
|
||||
EMPTY_RESULT = {
|
||||
'hits': {'hits': [], 'total': 0},
|
||||
'_shards': {'successful': 1, 'total': 1},
|
||||
}
|
||||
|
||||
def parse_query(query):
|
||||
if not query:
|
||||
return {}
|
||||
|
||||
return {s.split('=')[0]: s.split('=')[1] for s in query.split("&")}
|
||||
|
||||
|
||||
@contextmanager
|
||||
def fake_elasticsearch(allow_wildcard=True):
|
||||
templates = {}
|
||||
docs = defaultdict(list)
|
||||
scrolls = {}
|
||||
id_counter = [1]
|
||||
|
||||
def transform(value, field_name):
|
||||
# TODO: implement this using a real index template if we ever need more than a few
|
||||
# fields here.
|
||||
if field_name == 'datetime':
|
||||
if isinstance(value, int):
|
||||
return datetime.utcfromtimestamp(value / 1000)
|
||||
|
||||
parsed = dateutil.parser.parse(value)
|
||||
return parsed
|
||||
|
||||
return value
|
||||
|
||||
@urlmatch(netloc=FAKE_ES_HOST, path=r'/_template/(.+)', method='GET')
|
||||
def get_template(url, request):
|
||||
template_name = url[len('/_template/'):]
|
||||
if template_name in templates:
|
||||
return {'status_code': 200}
|
||||
|
||||
return {'status_code': 404}
|
||||
|
||||
@urlmatch(netloc=FAKE_ES_HOST, path=r'/_template/(.+)', method='PUT')
|
||||
def put_template(url, request):
|
||||
template_name = url[len('/_template/'):]
|
||||
templates[template_name] = True
|
||||
return {'status_code': 201}
|
||||
|
||||
@urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)/_doc', method='POST')
|
||||
def post_doc(url, request):
|
||||
index_name, _ = url.path[1:].split('/')
|
||||
item = json.loads(request.body)
|
||||
item['_id'] = item['random_id']
|
||||
id_counter[0] += 1
|
||||
docs[index_name].append(item)
|
||||
return {
|
||||
'status_code': 204,
|
||||
'headers': {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
'content': json.dumps({
|
||||
"result": "created",
|
||||
}),
|
||||
}
|
||||
|
||||
@urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)$', method='DELETE')
|
||||
def index_delete(url, request):
|
||||
index_name_or_pattern = url.path[1:]
|
||||
to_delete = []
|
||||
for index_name in docs.keys():
|
||||
if not fnmatch.fnmatch(index_name, index_name_or_pattern):
|
||||
continue
|
||||
|
||||
to_delete.append(index_name)
|
||||
|
||||
for index in to_delete:
|
||||
docs.pop(index)
|
||||
|
||||
return {
|
||||
'status_code': 200,
|
||||
'headers': {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
'content': {'acknowledged': True}
|
||||
}
|
||||
|
||||
@urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)$', method='GET')
|
||||
def index_lookup(url, request):
|
||||
index_name_or_pattern = url.path[1:]
|
||||
found = {}
|
||||
for index_name in docs.keys():
|
||||
if not fnmatch.fnmatch(index_name, index_name_or_pattern):
|
||||
continue
|
||||
|
||||
found[index_name] = {}
|
||||
|
||||
if not found:
|
||||
return {
|
||||
'status_code': 404,
|
||||
}
|
||||
|
||||
return {
|
||||
'status_code': 200,
|
||||
'headers': {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
'content': json.dumps(found),
|
||||
}
|
||||
|
||||
def _match_query(index_name_or_pattern, query):
|
||||
found = []
|
||||
found_index = False
|
||||
|
||||
for index_name in docs.keys():
|
||||
if not allow_wildcard and index_name_or_pattern.find('*') >= 0:
|
||||
break
|
||||
|
||||
if not fnmatch.fnmatch(index_name, index_name_or_pattern):
|
||||
continue
|
||||
|
||||
found_index = True
|
||||
|
||||
def _is_match(doc, current_query):
|
||||
if current_query is None:
|
||||
return True
|
||||
|
||||
for filter_type, filter_params in current_query.iteritems():
|
||||
for field_name, filter_props in filter_params.iteritems():
|
||||
if filter_type == 'range':
|
||||
lt = transform(filter_props['lt'], field_name)
|
||||
gte = transform(filter_props['gte'], field_name)
|
||||
doc_value = transform(doc[field_name], field_name)
|
||||
if not (doc_value < lt and doc_value >= gte):
|
||||
return False
|
||||
elif filter_type == 'term':
|
||||
doc_value = transform(doc[field_name], field_name)
|
||||
return doc_value == filter_props
|
||||
elif filter_type == 'terms':
|
||||
doc_value = transform(doc[field_name], field_name)
|
||||
return doc_value in filter_props
|
||||
elif filter_type == 'bool':
|
||||
assert not 'should' in filter_params, 'should is unsupported'
|
||||
|
||||
must = filter_params.get('must')
|
||||
must_not = filter_params.get('must_not')
|
||||
filter_bool = filter_params.get('filter')
|
||||
|
||||
if must:
|
||||
for check in must:
|
||||
if not _is_match(doc, check):
|
||||
return False
|
||||
|
||||
if must_not:
|
||||
for check in must_not:
|
||||
if _is_match(doc, check):
|
||||
return False
|
||||
|
||||
if filter_bool:
|
||||
for check in filter_bool:
|
||||
if not _is_match(doc, check):
|
||||
return False
|
||||
else:
|
||||
raise Exception('Unimplemented query %s: %s' % (filter_type, query))
|
||||
|
||||
return True
|
||||
|
||||
for doc in docs[index_name]:
|
||||
if not _is_match(doc, query):
|
||||
continue
|
||||
|
||||
found.append({'_source': doc, '_index': index_name})
|
||||
|
||||
return found, found_index or (index_name_or_pattern.find('*') >= 0)
|
||||
|
||||
@urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)/_count$', method='GET')
|
||||
def count_docs(url, request):
|
||||
request = json.loads(request.body)
|
||||
index_name_or_pattern, _ = url.path[1:].split('/')
|
||||
|
||||
found, found_index = _match_query(index_name_or_pattern, request['query'])
|
||||
if not found_index:
|
||||
return {
|
||||
'status_code': 404,
|
||||
}
|
||||
|
||||
return {
|
||||
'status_code': 200,
|
||||
'headers': {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
'content': json.dumps({'count': len(found)}),
|
||||
}
|
||||
|
||||
@urlmatch(netloc=FAKE_ES_HOST, path=r'/_search/scroll$', method='GET')
|
||||
def lookup_scroll(url, request):
|
||||
request_obj = json.loads(request.body)
|
||||
scroll_id = request_obj['scroll_id']
|
||||
if scroll_id in scrolls:
|
||||
return {
|
||||
'status_code': 200,
|
||||
'headers': {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
'content': json.dumps(scrolls[scroll_id]),
|
||||
}
|
||||
|
||||
return {
|
||||
'status_code': 404,
|
||||
}
|
||||
|
||||
@urlmatch(netloc=FAKE_ES_HOST, path=r'/_search/scroll$', method='DELETE')
|
||||
def delete_scroll(url, request):
|
||||
request = json.loads(request.body)
|
||||
for scroll_id in request['scroll_id']:
|
||||
scrolls.pop(scroll_id, None)
|
||||
|
||||
return {
|
||||
'status_code': 404,
|
||||
}
|
||||
|
||||
@urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)/_search$', method='GET')
|
||||
def lookup_docs(url, request):
|
||||
query_params = parse_query(url.query)
|
||||
|
||||
request = json.loads(request.body)
|
||||
index_name_or_pattern, _ = url.path[1:].split('/')
|
||||
|
||||
# Find matching docs.
|
||||
query = request.get('query')
|
||||
found, found_index = _match_query(index_name_or_pattern, query)
|
||||
if not found_index:
|
||||
return {
|
||||
'status_code': 404,
|
||||
}
|
||||
|
||||
# Sort.
|
||||
sort = request.get('sort')
|
||||
if sort:
|
||||
if sort == ['_doc'] or sort == '_doc':
|
||||
found.sort(key=lambda x: x['_source']['_id'])
|
||||
else:
|
||||
def get_sort_key(item):
|
||||
source = item['_source']
|
||||
key = ''
|
||||
for sort_config in sort:
|
||||
for sort_key, direction in sort_config.iteritems():
|
||||
assert direction == 'desc'
|
||||
sort_key = sort_key.replace('.keyword', '')
|
||||
key += str(transform(source[sort_key], sort_key))
|
||||
key += '|'
|
||||
return key
|
||||
|
||||
found.sort(key=get_sort_key, reverse=True)
|
||||
|
||||
# Search after.
|
||||
search_after = request.get('search_after')
|
||||
if search_after:
|
||||
sort_fields = []
|
||||
for sort_config in sort:
|
||||
if isinstance(sort_config, unicode):
|
||||
sort_fields.append(sort_config)
|
||||
continue
|
||||
|
||||
for sort_key, _ in sort_config.iteritems():
|
||||
sort_key = sort_key.replace('.keyword', '')
|
||||
sort_fields.append(sort_key)
|
||||
|
||||
for index, search_after_value in enumerate(search_after):
|
||||
field_name = sort_fields[index]
|
||||
value = transform(search_after_value, field_name)
|
||||
if field_name == '_doc':
|
||||
found = [f for f in found if transform(f['_source']['_id'], field_name) > value]
|
||||
else:
|
||||
found = [f for f in found if transform(f['_source'][field_name], field_name) < value]
|
||||
if len(found) < 2:
|
||||
break
|
||||
|
||||
if field_name == '_doc':
|
||||
if found[0]['_source']['_id'] != found[1]['_source']:
|
||||
break
|
||||
else:
|
||||
if found[0]['_source'][field_name] != found[1]['_source']:
|
||||
break
|
||||
|
||||
# Size.
|
||||
size = request.get('size')
|
||||
if size:
|
||||
found = found[0:size]
|
||||
|
||||
# Aggregation.
|
||||
# {u'query':
|
||||
# {u'range':
|
||||
# {u'datetime': {u'lt': u'2019-06-27T15:45:09.768085',
|
||||
# u'gte': u'2019-06-27T15:35:09.768085'}}},
|
||||
# u'aggs': {
|
||||
# u'by_id': {
|
||||
# u'terms': {u'field': u'kind_id'},
|
||||
# u'aggs': {
|
||||
# u'by_date': {u'date_histogram': {u'field': u'datetime', u'interval': u'day'}}}}},
|
||||
# u'size': 0}
|
||||
def _by_field(agg_field_params, results):
|
||||
aggregated_by_field = defaultdict(list)
|
||||
|
||||
for agg_means, agg_means_params in agg_field_params.iteritems():
|
||||
if agg_means == 'terms':
|
||||
field_name = agg_means_params['field']
|
||||
for result in results:
|
||||
value = result['_source'][field_name]
|
||||
aggregated_by_field[value].append(result)
|
||||
elif agg_means == 'date_histogram':
|
||||
field_name = agg_means_params['field']
|
||||
interval = agg_means_params['interval']
|
||||
for result in results:
|
||||
value = transform(result['_source'][field_name], field_name)
|
||||
aggregated_by_field[getattr(value, interval)].append(result)
|
||||
elif agg_means == 'aggs':
|
||||
# Skip. Handled below.
|
||||
continue
|
||||
else:
|
||||
raise Exception('Unsupported aggregation method: %s' % agg_means)
|
||||
|
||||
# Invoke the aggregation recursively.
|
||||
buckets = []
|
||||
for field_value, field_results in aggregated_by_field.iteritems():
|
||||
aggregated = _aggregate(agg_field_params, field_results)
|
||||
if isinstance(aggregated, list):
|
||||
aggregated = {'doc_count': len(aggregated)}
|
||||
|
||||
aggregated['key'] = field_value
|
||||
buckets.append(aggregated)
|
||||
|
||||
return {'buckets': buckets}
|
||||
|
||||
def _aggregate(query_config, results):
|
||||
agg_params = query_config.get(u'aggs')
|
||||
if not agg_params:
|
||||
return results
|
||||
|
||||
by_field_name = {}
|
||||
for agg_field_name, agg_field_params in agg_params.iteritems():
|
||||
by_field_name[agg_field_name] = _by_field(agg_field_params, results)
|
||||
|
||||
return by_field_name
|
||||
|
||||
final_result = {
|
||||
'hits': {
|
||||
'hits': found,
|
||||
'total': len(found),
|
||||
},
|
||||
'_shards': {
|
||||
'successful': 1,
|
||||
'total': 1,
|
||||
},
|
||||
'aggregations': _aggregate(request, found),
|
||||
}
|
||||
|
||||
if query_params.get('scroll'):
|
||||
scroll_id = str(uuid.uuid4())
|
||||
scrolls[scroll_id] = EMPTY_RESULT
|
||||
final_result['_scroll_id'] = scroll_id
|
||||
|
||||
return {
|
||||
'status_code': 200,
|
||||
'headers': {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
'content': json.dumps(final_result),
|
||||
}
|
||||
|
||||
@urlmatch(netloc=FAKE_ES_HOST)
|
||||
def catchall_handler(url, request):
|
||||
print "Unsupported URL: %s %s" % (request.method, url, )
|
||||
return {'status_code': 501}
|
||||
|
||||
handlers = [get_template, put_template, index_delete, index_lookup, post_doc, count_docs,
|
||||
lookup_docs, lookup_scroll, delete_scroll, catchall_handler]
|
||||
|
||||
with HTTMock(*handlers):
|
||||
yield
|
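A minimal, hypothetical sketch of using this fixture: inside the `with` block, HTTP calls made through the `requests` library to FAKE_ES_HOST are answered by the in-memory handlers above (templates, document posts, searches, scrolls) instead of a real cluster.

# Hypothetical test usage; only requests-based HTTP calls to FAKE_ES_HOST are intercepted.
import requests

with fake_elasticsearch():
  resp = requests.put('http://%s/_template/logentry' % FAKE_ES_HOST, data='{}')
  assert resp.status_code == 201  # matches the put_template handler above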
400
data/logs_model/test/mock_elasticsearch.py
Normal file
|
@ -0,0 +1,400 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
|
||||
from datetime import datetime
|
||||
from dateutil.parser import parse
|
||||
|
||||
from data.logs_model.datatypes import LogEntriesPage, Log, AggregatedLogCount
|
||||
|
||||
|
||||
def _status(d, code=200):
|
||||
return {"status_code": code, "content": json.dumps(d)}
|
||||
|
||||
|
||||
def _shards(d, total=5, failed=0, successful=5):
|
||||
d.update({"_shards": {"total": total, "failed": failed, "successful": successful}})
|
||||
return d
|
||||
|
||||
|
||||
def _hits(hits):
|
||||
return {"hits": {"total": len(hits), "max_score": None, "hits": hits}}
|
||||
|
||||
|
||||
INDEX_LIST_RESPONSE_HIT1_HIT2 = _status({
|
||||
"logentry_2018-03-08": {},
|
||||
"logentry_2018-04-02": {}
|
||||
})
|
||||
|
||||
|
||||
INDEX_LIST_RESPONSE_HIT2 = _status({
|
||||
"logentry_2018-04-02": {}
|
||||
})
|
||||
|
||||
|
||||
INDEX_LIST_RESPONSE = _status({
|
||||
"logentry_2019-01-01": {},
|
||||
"logentry_2017-03-08": {},
|
||||
"logentry_2018-03-08": {},
|
||||
"logentry_2018-04-02": {}
|
||||
})
|
||||
|
||||
|
||||
DEFAULT_TEMPLATE_RESPONSE = _status({"acknowledged": True})
|
||||
INDEX_RESPONSE_2019_01_01 = _status(
|
||||
_shards({
|
||||
"_index": "logentry_2019-01-01",
|
||||
"_type": "_doc",
|
||||
"_id": "1",
|
||||
"_version": 1,
|
||||
"_seq_no": 0,
|
||||
"_primary_term": 1,
|
||||
"result": "created"
|
||||
}))
|
||||
|
||||
INDEX_RESPONSE_2017_03_08 = _status(
|
||||
_shards({
|
||||
"_index": "logentry_2017-03-08",
|
||||
"_type": "_doc",
|
||||
"_id": "1",
|
||||
"_version": 1,
|
||||
"_seq_no": 0,
|
||||
"_primary_term": 1,
|
||||
"result": "created"
|
||||
}))
|
||||
|
||||
FAILURE_400 = _status({}, 400)
|
||||
|
||||
INDEX_REQUEST_2019_01_01 = [
|
||||
"logentry_2019-01-01", {
|
||||
"account_id":
|
||||
1,
|
||||
"repository_id":
|
||||
1,
|
||||
"ip":
|
||||
"192.168.1.1",
|
||||
"random_id":
|
||||
233,
|
||||
"datetime":
|
||||
"2019-01-01T03:30:00",
|
||||
"metadata_json": json.loads("{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}"),
|
||||
"performer_id":
|
||||
1,
|
||||
"kind_id":
|
||||
1
|
||||
}
|
||||
]
|
||||
|
||||
INDEX_REQUEST_2017_03_08 = [
|
||||
"logentry_2017-03-08", {
|
||||
"repository_id":
|
||||
1,
|
||||
"account_id":
|
||||
1,
|
||||
"ip":
|
||||
"192.168.1.1",
|
||||
"random_id":
|
||||
233,
|
||||
"datetime":
|
||||
"2017-03-08T03:30:00",
|
||||
"metadata_json": json.loads("{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}"),
|
||||
"performer_id":
|
||||
1,
|
||||
"kind_id":
|
||||
2
|
||||
}
|
||||
]
|
||||
|
||||
_hit1 = {
|
||||
"_index": "logentry_2018-03-08",
|
||||
"_type": "doc",
|
||||
"_id": "1",
|
||||
"_score": None,
|
||||
"_source": {
|
||||
"random_id":
|
||||
233,
|
||||
"kind_id":
|
||||
1,
|
||||
"account_id":
|
||||
1,
|
||||
"performer_id":
|
||||
1,
|
||||
"repository_id":
|
||||
1,
|
||||
"ip":
|
||||
"192.168.1.1",
|
||||
"metadata_json":
|
||||
"{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}",
|
||||
"datetime":
|
||||
"2018-03-08T03:30",
|
||||
},
|
||||
"sort": [1520479800000, 233]
|
||||
}
|
||||
|
||||
_hit2 = {
|
||||
"_index": "logentry_2018-04-02",
|
||||
"_type": "doc",
|
||||
"_id": "2",
|
||||
"_score": None,
|
||||
"_source": {
|
||||
"random_id":
|
||||
233,
|
||||
"kind_id":
|
||||
2,
|
||||
"account_id":
|
||||
1,
|
||||
"performer_id":
|
||||
1,
|
||||
"repository_id":
|
||||
1,
|
||||
"ip":
|
||||
"192.168.1.2",
|
||||
"metadata_json":
|
||||
"{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1522639800}",
|
||||
"datetime":
|
||||
"2018-04-02T03:30",
|
||||
},
|
||||
"sort": [1522639800000, 233]
|
||||
}
|
||||
|
||||
_log1 = Log(
|
||||
"{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}",
|
||||
"192.168.1.1", parse("2018-03-08T03:30"), "user1.email", "user1.username", "user1.robot",
|
||||
"user1.organization", "user1.username", "user1.email", "user1.robot", 1)
|
||||
_log2 = Log(
|
||||
"{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1522639800}",
|
||||
"192.168.1.2", parse("2018-04-02T03:30"), "user1.email", "user1.username", "user1.robot",
|
||||
"user1.organization", "user1.username", "user1.email", "user1.robot", 2)
|
||||
|
||||
SEARCH_RESPONSE_START = _status(_shards(_hits([_hit1, _hit2])))
|
||||
SEARCH_RESPONSE_END = _status(_shards(_hits([_hit2])))
|
||||
SEARCH_REQUEST_START = {
|
||||
"sort": [{
|
||||
"datetime": "desc"
|
||||
}, {
|
||||
"random_id.keyword": "desc"
|
||||
}],
|
||||
"query": {
|
||||
"bool": {
|
||||
"filter": [{
|
||||
"term": {
|
||||
"performer_id": 1
|
||||
}
|
||||
}, {
|
||||
"term": {
|
||||
"repository_id": 1
|
||||
}
|
||||
}]
|
||||
}
|
||||
},
|
||||
"size": 2
|
||||
}
|
||||
SEARCH_REQUEST_END = {
|
||||
"sort": [{
|
||||
"datetime": "desc"
|
||||
}, {
|
||||
"random_id.keyword": "desc"
|
||||
}],
|
||||
"query": {
|
||||
"bool": {
|
||||
"filter": [{
|
||||
"term": {
|
||||
"performer_id": 1
|
||||
}
|
||||
}, {
|
||||
"term": {
|
||||
"repository_id": 1
|
||||
}
|
||||
}]
|
||||
}
|
||||
},
|
||||
"search_after": [1520479800000, 233],
|
||||
"size": 2
|
||||
}
|
||||
SEARCH_REQUEST_FILTER = {
|
||||
"sort": [{
|
||||
"datetime": "desc"
|
||||
}, {
|
||||
"random_id.keyword": "desc"
|
||||
}],
|
||||
"query": {
|
||||
"bool": {
|
||||
"filter": [{
|
||||
"term": {
|
||||
"performer_id": 1
|
||||
}
|
||||
}, {
|
||||
"term": {
|
||||
"repository_id": 1
|
||||
}
|
||||
}, {
|
||||
"bool": {
|
||||
"must_not": [{
|
||||
"terms": {
|
||||
"kind_id": [1]
|
||||
}
|
||||
}]
|
||||
}
|
||||
}]
|
||||
}
|
||||
},
|
||||
"size": 2
|
||||
}
|
||||
SEARCH_PAGE_TOKEN = {
|
||||
"datetime": datetime(2018, 3, 8, 3, 30).isoformat(),
|
||||
"random_id": 233,
|
||||
"page_number": 1
|
||||
}
|
||||
SEARCH_PAGE_START = LogEntriesPage(logs=[_log1], next_page_token=SEARCH_PAGE_TOKEN)
|
||||
SEARCH_PAGE_END = LogEntriesPage(logs=[_log2], next_page_token=None)
|
||||
SEARCH_PAGE_EMPTY = LogEntriesPage([], None)
|
||||
|
||||
AGGS_RESPONSE = _status(
|
||||
_shards({
|
||||
"hits": {
|
||||
"total": 4,
|
||||
"max_score": None,
|
||||
"hits": []
|
||||
},
|
||||
"aggregations": {
|
||||
"by_id": {
|
||||
"doc_count_error_upper_bound":
|
||||
0,
|
||||
"sum_other_doc_count":
|
||||
0,
|
||||
"buckets": [{
|
||||
"key": 2,
|
||||
"doc_count": 3,
|
||||
"by_date": {
|
||||
"buckets": [{
|
||||
"key_as_string": "2009-11-12T00:00:00.000Z",
|
||||
"key": 1257984000000,
|
||||
"doc_count": 1
|
||||
}, {
|
||||
"key_as_string": "2009-11-13T00:00:00.000Z",
|
||||
"key": 1258070400000,
|
||||
"doc_count": 0
|
||||
}, {
|
||||
"key_as_string": "2009-11-14T00:00:00.000Z",
|
||||
"key": 1258156800000,
|
||||
"doc_count": 2
|
||||
}]
|
||||
}
|
||||
}, {
|
||||
"key": 1,
|
||||
"doc_count": 1,
|
||||
"by_date": {
|
||||
"buckets": [{
|
||||
"key_as_string": "2009-11-15T00:00:00.000Z",
|
||||
"key": 1258243200000,
|
||||
"doc_count": 1
|
||||
}]
|
||||
}
|
||||
}]
|
||||
}
|
||||
}
|
||||
}))
|
||||
|
||||
AGGS_REQUEST = {
|
||||
"query": {
|
||||
"bool": {
|
||||
"filter": [{
|
||||
"term": {
|
||||
"performer_id": 1
|
||||
}
|
||||
}, {
|
||||
"term": {
|
||||
"repository_id": 1
|
||||
}
|
||||
}, {
|
||||
"bool": {
|
||||
"must_not": [{
|
||||
"terms": {
|
||||
"kind_id": [2]
|
||||
}
|
||||
}]
|
||||
}
|
||||
}],
|
||||
"must": [{
|
||||
"range": {
|
||||
"datetime": {
|
||||
"lt": "2018-04-08T03:30:00",
|
||||
"gte": "2018-03-08T03:30:00"
|
||||
}
|
||||
}
|
||||
}]
|
||||
}
|
||||
},
|
||||
"aggs": {
|
||||
"by_id": {
|
||||
"terms": {
|
||||
"field": "kind_id"
|
||||
},
|
||||
"aggs": {
|
||||
"by_date": {
|
||||
"date_histogram": {
|
||||
"field": "datetime",
|
||||
"interval": "day"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"size": 0
|
||||
}
|
||||
|
||||
AGGS_COUNT = [
|
||||
AggregatedLogCount(1, 1, parse("2009-11-15T00:00:00.000")),
|
||||
AggregatedLogCount(2, 1, parse("2009-11-12T00:00:00.000")),
|
||||
AggregatedLogCount(2, 2, parse("2009-11-14T00:00:00.000"))
|
||||
]
|
||||
|
||||
COUNT_REQUEST = {
|
||||
"query": {
|
||||
"bool": {
|
||||
"filter": [{
|
||||
"term": {
|
||||
"repository_id": 1
|
||||
}
|
||||
}]
|
||||
}
|
||||
}
|
||||
}
|
||||
COUNT_RESPONSE = _status(_shards({
|
||||
"count": 1,
|
||||
}))
|
||||
|
||||
# assume there are 2 pages
|
||||
_scroll_id = "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAACEmFkk1aGlTRzdSUWllejZmYTlEYTN3SVEAAAAAAAAhJRZJNWhpU0c3UlFpZXo2ZmE5RGEzd0lRAAAAAAAAHtAWLWZpaFZXVzVSTy1OTXA5V3MwcHZrZwAAAAAAAB7RFi1maWhWV1c1Uk8tTk1wOVdzMHB2a2cAAAAAAAAhJxZJNWhpU0c3UlFpZXo2ZmE5RGEzd0lR"
|
||||
|
||||
|
||||
def _scroll(d):
|
||||
d["_scroll_id"] = _scroll_id
|
||||
return d
|
||||
|
||||
|
||||
SCROLL_CREATE = _status(_shards(_scroll(_hits([_hit1]))))
|
||||
SCROLL_GET = _status(_shards(_scroll(_hits([_hit2]))))
|
||||
SCROLL_GET_2 = _status(_shards(_scroll(_hits([]))))
|
||||
SCROLL_DELETE = _status({"succeeded": True, "num_freed": 5})
|
||||
SCROLL_LOGS = [[_log1], [_log2]]
|
||||
|
||||
SCROLL_REQUESTS = [
|
||||
[
|
||||
"5m", 1, {
|
||||
"sort": "_doc",
|
||||
"query": {
|
||||
"range": {
|
||||
"datetime": {
|
||||
"lt": "2018-04-02T00:00:00",
|
||||
"gte": "2018-03-08T00:00:00"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
[{"scroll": "5m", "scroll_id": _scroll_id}],
|
||||
[{"scroll":"5m", "scroll_id": _scroll_id}],
|
||||
[{"scroll_id": [_scroll_id]}],
|
||||
]
|
||||
|
||||
SCROLL_RESPONSES = [SCROLL_CREATE, SCROLL_GET, SCROLL_GET_2, SCROLL_DELETE]
|
130
data/logs_model/test/test_combined_model.py
Normal file
|
@ -0,0 +1,130 @@
|
|||
from datetime import date, datetime, timedelta
|
||||
|
||||
from freezegun import freeze_time
|
||||
|
||||
from data.logs_model.inmemory_model import InMemoryModel
|
||||
from data.logs_model.combined_model import CombinedLogsModel
|
||||
|
||||
from test.fixtures import *
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def first_model():
|
||||
return InMemoryModel()
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def second_model():
|
||||
return InMemoryModel()
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def combined_model(first_model, second_model, initialized_db):
|
||||
return CombinedLogsModel(first_model, second_model)
|
||||
|
||||
|
||||
def test_log_action(first_model, second_model, combined_model, initialized_db):
|
||||
day = date(2019, 1, 1)
|
||||
|
||||
# Write to the combined model.
|
||||
with freeze_time(day):
|
||||
combined_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
|
||||
simple_repo = model.repository.get_repository('devtable', 'simple')
|
||||
|
||||
# Make sure it is found in the first model but not the second.
|
||||
assert combined_model.count_repository_actions(simple_repo, day) == 1
|
||||
assert first_model.count_repository_actions(simple_repo, day) == 1
|
||||
assert second_model.count_repository_actions(simple_repo, day) == 0
|
||||
|
||||
|
||||
def test_count_repository_actions(first_model, second_model, combined_model, initialized_db):
|
||||
# Write to each model.
|
||||
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
|
||||
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
|
||||
# Ensure the counts match as expected.
|
||||
day = datetime.today() - timedelta(minutes=60)
|
||||
simple_repo = model.repository.get_repository('devtable', 'simple')
|
||||
|
||||
assert first_model.count_repository_actions(simple_repo, day) == 3
|
||||
assert second_model.count_repository_actions(simple_repo, day) == 2
|
||||
assert combined_model.count_repository_actions(simple_repo, day) == 5
|
||||
|
||||
|
||||
def test_yield_logs_for_export(first_model, second_model, combined_model, initialized_db):
|
||||
now = datetime.now()
|
||||
|
||||
# Write to each model.
|
||||
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
|
||||
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
|
||||
later = datetime.now()
|
||||
|
||||
# Ensure the full set of logs is yielded.
|
||||
first_logs = list(first_model.yield_logs_for_export(now, later))[0]
|
||||
second_logs = list(second_model.yield_logs_for_export(now, later))[0]
|
||||
|
||||
combined = list(combined_model.yield_logs_for_export(now, later))
|
||||
full_combined = []
|
||||
for subset in combined:
|
||||
full_combined.extend(subset)
|
||||
|
||||
assert len(full_combined) == len(first_logs) + len(second_logs)
|
||||
assert full_combined == (first_logs + second_logs)
|
||||
|
||||
|
||||
def test_lookup_logs(first_model, second_model, combined_model, initialized_db):
|
||||
now = datetime.now()
|
||||
|
||||
# Write to each model.
|
||||
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
|
||||
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
|
||||
ip='1.2.3.4')
|
||||
|
||||
later = datetime.now()
|
||||
|
||||
def _collect_logs(model):
|
||||
page_token = None
|
||||
all_logs = []
|
||||
while True:
|
||||
paginated_logs = model.lookup_logs(now, later, page_token=page_token)
|
||||
page_token = paginated_logs.next_page_token
|
||||
all_logs.extend(paginated_logs.logs)
|
||||
if page_token is None:
|
||||
break
|
||||
return all_logs
|
||||
|
||||
first_logs = _collect_logs(first_model)
|
||||
second_logs = _collect_logs(second_model)
|
||||
combined = _collect_logs(combined_model)
|
||||
|
||||
assert len(combined) == len(first_logs) + len(second_logs)
|
||||
assert combined == (first_logs + second_logs)
|
529
data/logs_model/test/test_elasticsearch.py
Normal file
|
@ -0,0 +1,529 @@
|
|||
# -*- coding: utf-8 -*-

# pylint: disable=redefined-outer-name, wildcard-import

import json
from datetime import datetime, timedelta

import pytest
from mock import patch, Mock
from dateutil.parser import parse

from httmock import urlmatch, HTTMock

from data.model.log import _json_serialize
from data.logs_model.elastic_logs import ElasticsearchLogs, INDEX_NAME_PREFIX, INDEX_DATE_FORMAT
from data.logs_model import configure, LogsModelProxy
from mock_elasticsearch import *

FAKE_ES_HOST = 'fakees'
FAKE_ES_HOST_PATTERN = r'fakees.*'
FAKE_ES_PORT = 443
FAKE_AWS_ACCESS_KEY = None
FAKE_AWS_SECRET_KEY = None
FAKE_AWS_REGION = None


@pytest.fixture()
def logs_model_config():
  conf = {
    'LOGS_MODEL': 'elasticsearch',
    'LOGS_MODEL_CONFIG': {
      'producer': 'elasticsearch',
      'elasticsearch_config': {
        'host': FAKE_ES_HOST,
        'port': FAKE_ES_PORT,
        'access_key': FAKE_AWS_ACCESS_KEY,
        'secret_key': FAKE_AWS_SECRET_KEY,
        'aws_region': FAKE_AWS_REGION
      }
    }
  }
  return conf


FAKE_LOG_ENTRY_KINDS = {'push_repo': 1, 'pull_repo': 2}
FAKE_NAMESPACES = {
  'user1':
    Mock(id=1, organization="user1.organization", username="user1.username", email="user1.email",
         robot="user1.robot"),
  'user2':
    Mock(id=2, organization="user2.organization", username="user2.username", email="user2.email",
         robot="user2.robot")
}
FAKE_REPOSITORIES = {
  'user1/repo1': Mock(id=1, namespace_user=FAKE_NAMESPACES['user1']),
  'user2/repo2': Mock(id=2, namespace_user=FAKE_NAMESPACES['user2']),
}


@pytest.fixture()
def logs_model():
  # prevent logs model from changing
  logs_model = LogsModelProxy()
  with patch('data.logs_model.logs_model', logs_model):
    yield logs_model


@pytest.fixture(scope='function')
def app_config(logs_model_config):
  fake_config = {}
  fake_config.update(logs_model_config)
  with patch("data.logs_model.document_logs_model.config.app_config", fake_config):
    yield fake_config


@pytest.fixture()
def mock_page_size():
  with patch('data.logs_model.document_logs_model.PAGE_SIZE', 1):
    yield


@pytest.fixture()
def mock_max_result_window():
  with patch('data.logs_model.document_logs_model.DEFAULT_RESULT_WINDOW', 1):
    yield


@pytest.fixture
def mock_random_id():
  mock_random = Mock(return_value=233)
  with patch('data.logs_model.document_logs_model._random_id', mock_random):
    yield


@pytest.fixture()
def mock_db_model():
  def get_user_map_by_ids(namespace_ids):
    mapping = {}
    for i in namespace_ids:
      for name in FAKE_NAMESPACES:
        if FAKE_NAMESPACES[name].id == i:
          mapping[i] = FAKE_NAMESPACES[name]
    return mapping

  model = Mock(
    user=Mock(
      get_namespace_user=FAKE_NAMESPACES.get,
      get_user_or_org=FAKE_NAMESPACES.get,
      get_user=FAKE_NAMESPACES.get,
      get_user_map_by_ids=get_user_map_by_ids,
    ),
    repository=Mock(get_repository=lambda user_name, repo_name: FAKE_REPOSITORIES.get(
      user_name + '/' + repo_name),
    ),
    log=Mock(
      _get_log_entry_kind=lambda name: FAKE_LOG_ENTRY_KINDS[name],
      _json_serialize=_json_serialize,
      get_log_entry_kinds=Mock(return_value=FAKE_LOG_ENTRY_KINDS),
    ),
  )

  with patch('data.logs_model.document_logs_model.model', model), patch(
      'data.logs_model.datatypes.model', model):
    yield


def parse_query(query):
  return {s.split('=')[0]: s.split('=')[1] for s in query.split("&") if s != ""}


@pytest.fixture()
def mock_elasticsearch():
  mock = Mock()
  mock.template.side_effect = NotImplementedError
  mock.index.side_effect = NotImplementedError
  mock.count.side_effect = NotImplementedError
  mock.scroll_get.side_effect = NotImplementedError
  mock.scroll_delete.side_effect = NotImplementedError
  mock.search_scroll_create.side_effect = NotImplementedError
  mock.search_aggs.side_effect = NotImplementedError
  mock.search_after.side_effect = NotImplementedError
  mock.list_indices.side_effect = NotImplementedError

  @urlmatch(netloc=r'.*', path=r'.*')
  def default(url, req):
    raise Exception('\nurl={}\nmethod={}\nreq.url={}\nheaders={}\nbody={}'.format(
      url, req.method, req.url, req.headers, req.body))

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/_template/.*')
  def template(url, req):
    return mock.template(url.query.split('/')[-1], req.body)

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_(\*|[0-9\-]+)')
  def list_indices(url, req):
    return mock.list_indices()

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_[0-9\-]*/_doc')
  def index(url, req):
    index = url.path.split('/')[1]
    body = json.loads(req.body)
    body['metadata_json'] = json.loads(body['metadata_json'])
    return mock.index(index, body)

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_([0-9\-]*|\*)/_count')
  def count(_, req):
    return mock.count(json.loads(req.body))

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/_search/scroll')
  def scroll(url, req):
    if req.method == 'DELETE':
      return mock.scroll_delete(json.loads(req.body))
    elif req.method == 'GET':
      request_obj = json.loads(req.body)
      return mock.scroll_get(request_obj)
    raise NotImplementedError()

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_(\*|[0-9\-]*)/_search')
  def search(url, req):
    if "scroll" in url.query:
      query = parse_query(url.query)
      window_size = query['scroll']
      maximum_result_size = int(query['size'])
      return mock.search_scroll_create(window_size, maximum_result_size, json.loads(req.body))
    elif "aggs" in req.body:
      return mock.search_aggs(json.loads(req.body))
    else:
      return mock.search_after(json.loads(req.body))

  with HTTMock(scroll, count, search, index, template, list_indices, default):
    yield mock


@pytest.mark.parametrize(
  """
  unlogged_pulls_ok, kind_name, namespace_name, repository, repository_name,
  timestamp,
  index_response, expected_request, throws
  """,
  [
    # Invalid inputs
    pytest.param(
      False, 'non-existing', None, None, None,
      None,
      None, None, True,
      id="Invalid Kind"
    ),
    pytest.param(
      False, 'pull_repo', 'user1', Mock(id=1), 'repo1',
      None,
      None, None, True,
      id="Invalid Parameters"
    ),

    # Remote exceptions
    pytest.param(
      False, 'pull_repo', 'user1', Mock(id=1), None,
      None,
      FAILURE_400, None, True,
      id="Throw on pull log failure"
    ),
    pytest.param(
      True, 'pull_repo', 'user1', Mock(id=1), None,
      parse("2017-03-08T03:30"),
      FAILURE_400, INDEX_REQUEST_2017_03_08, False,
      id="Ok on pull log failure"
    ),

    # Success executions
    pytest.param(
      False, 'pull_repo', 'user1', Mock(id=1), None,
      parse("2017-03-08T03:30"),
      INDEX_RESPONSE_2017_03_08, INDEX_REQUEST_2017_03_08, False,
      id="Log with namespace name and repository"
    ),
    pytest.param(
      False, 'push_repo', 'user1', None, 'repo1',
      parse("2019-01-01T03:30"),
      INDEX_RESPONSE_2019_01_01, INDEX_REQUEST_2019_01_01, False,
      id="Log with namespace name and repository name"
    ),
  ])
def test_log_action(unlogged_pulls_ok, kind_name, namespace_name, repository, repository_name,
                    timestamp,
                    index_response, expected_request, throws,
                    app_config, logs_model, mock_elasticsearch, mock_db_model, mock_random_id):
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
  mock_elasticsearch.index = Mock(return_value=index_response)
  app_config['ALLOW_PULLS_WITHOUT_STRICT_LOGGING'] = unlogged_pulls_ok
  configure(app_config)

  performer = Mock(id=1)
  ip = "192.168.1.1"
  metadata = {'key': 'value', 'time': parse("2018-03-08T03:30"), '😂': '😂👌👌👌👌'}
  if throws:
    with pytest.raises(Exception):
      logs_model.log_action(kind_name, namespace_name, performer, ip, metadata, repository,
                            repository_name, timestamp)
  else:
    logs_model.log_action(kind_name, namespace_name, performer, ip, metadata, repository,
                          repository_name, timestamp)
    mock_elasticsearch.index.assert_called_with(*expected_request)


@pytest.mark.parametrize(
  """
  start_datetime, end_datetime,
  performer_name, repository_name, namespace_name,
  filter_kinds,
  page_token,
  max_page_count,
  search_response,
  list_indices_response,
  expected_request,
  expected_page,
  throws
  """,
  [
    # 1st page
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
      'user1', 'repo1', 'user1',
      None,
      None,
      None,
      SEARCH_RESPONSE_START,
      INDEX_LIST_RESPONSE_HIT1_HIT2,
      SEARCH_REQUEST_START,
      SEARCH_PAGE_START,
      False,
      id="1st page"
    ),

    # Last page
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
      'user1', 'repo1', 'user1',
      None,
      SEARCH_PAGE_TOKEN,
      None,
      SEARCH_RESPONSE_END,
      INDEX_LIST_RESPONSE_HIT1_HIT2,
      SEARCH_REQUEST_END,
      SEARCH_PAGE_END,
      False,
      id="Search using pagination token"
    ),

    # Filter
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
      'user1', 'repo1', 'user1',
      ['push_repo'],
      None,
      None,
      SEARCH_RESPONSE_END,
      INDEX_LIST_RESPONSE_HIT2,
      SEARCH_REQUEST_FILTER,
      SEARCH_PAGE_END,
      False,
      id="Filtered search"
    ),

    # Max page count
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
      'user1', 'repo1', 'user1',
      None,
      SEARCH_PAGE_TOKEN,
      1,
      AssertionError,  # Assert that it should not reach the ES server
      None,
      None,
      SEARCH_PAGE_EMPTY,
      False,
      id="Page token reaches maximum page count",
    ),
  ])
def test_lookup_logs(start_datetime, end_datetime,
                     performer_name, repository_name, namespace_name,
                     filter_kinds,
                     page_token,
                     max_page_count,
                     search_response,
                     list_indices_response,
                     expected_request,
                     expected_page,
                     throws,
                     logs_model, mock_elasticsearch, mock_db_model, mock_page_size, app_config):
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
  mock_elasticsearch.search_after = Mock(return_value=search_response)
  mock_elasticsearch.list_indices = Mock(return_value=list_indices_response)

  configure(app_config)
  if throws:
    with pytest.raises(Exception):
      logs_model.lookup_logs(start_datetime, end_datetime, performer_name, repository_name,
                             namespace_name, filter_kinds, page_token, max_page_count)
  else:
    page = logs_model.lookup_logs(start_datetime, end_datetime, performer_name, repository_name,
                                  namespace_name, filter_kinds, page_token, max_page_count)
    assert page == expected_page
    if expected_request:
      mock_elasticsearch.search_after.assert_called_with(expected_request)


@pytest.mark.parametrize(
  """
  start_datetime, end_datetime,
  performer_name, repository_name, namespace_name,
  filter_kinds, search_response, expected_request, expected_counts, throws
  """,
  [
    # Valid
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
      'user1', 'repo1', 'user1',
      ['pull_repo'], AGGS_RESPONSE, AGGS_REQUEST, AGGS_COUNT, False,
      id="Valid Counts"
    ),

    # Invalid case: date range too big
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-09T03:30'),
      'user1', 'repo1', 'user1',
      [], None, None, None, True,
      id="Throw on date range too big"
    )
  ])
def test_get_aggregated_log_counts(start_datetime, end_datetime,
                                   performer_name, repository_name, namespace_name,
                                   filter_kinds, search_response, expected_request, expected_counts, throws,
                                   logs_model, mock_elasticsearch, mock_db_model, app_config):
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
  mock_elasticsearch.search_aggs = Mock(return_value=search_response)

  configure(app_config)
  if throws:
    with pytest.raises(Exception):
      logs_model.get_aggregated_log_counts(start_datetime, end_datetime, performer_name,
                                           repository_name, namespace_name, filter_kinds)
  else:
    counts = logs_model.get_aggregated_log_counts(start_datetime, end_datetime, performer_name,
                                                  repository_name, namespace_name, filter_kinds)
    assert set(counts) == set(expected_counts)
    if expected_request:
      mock_elasticsearch.search_aggs.assert_called_with(expected_request)


@pytest.mark.parametrize(
  """
  repository,
  day,
  count_response, expected_request, expected_count, throws
  """,
  [
    pytest.param(
      FAKE_REPOSITORIES['user1/repo1'],
      parse("2018-03-08").date(),
      COUNT_RESPONSE, COUNT_REQUEST, 1, False,
      id="Valid Count with 1 as result"),
  ])
def test_count_repository_actions(repository,
                                  day,
                                  count_response, expected_request, expected_count, throws,
                                  logs_model, mock_elasticsearch, mock_db_model, app_config):
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
  mock_elasticsearch.count = Mock(return_value=count_response)
  mock_elasticsearch.list_indices = Mock(return_value=INDEX_LIST_RESPONSE)

  configure(app_config)
  if throws:
    with pytest.raises(Exception):
      logs_model.count_repository_actions(repository, day)
  else:
    count = logs_model.count_repository_actions(repository, day)
    assert count == expected_count
    if expected_request:
      mock_elasticsearch.count.assert_called_with(expected_request)


@pytest.mark.parametrize(
  """
  start_datetime, end_datetime,
  repository_id, namespace_id,
  max_query_time, scroll_responses, expected_requests, expected_logs, throws
  """,
  [
    pytest.param(
      parse("2018-03-08"), parse("2018-04-02"),
      1, 1,
      timedelta(seconds=10), SCROLL_RESPONSES, SCROLL_REQUESTS, SCROLL_LOGS, False,
      id="Scroll 3 pages with page size = 1"
    ),
  ])
def test_yield_logs_for_export(start_datetime, end_datetime,
                               repository_id, namespace_id,
                               max_query_time, scroll_responses, expected_requests, expected_logs, throws,
                               logs_model, mock_elasticsearch, mock_db_model, mock_max_result_window, app_config):
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
  mock_elasticsearch.search_scroll_create = Mock(return_value=scroll_responses[0])
  mock_elasticsearch.scroll_get = Mock(side_effect=scroll_responses[1:-1])
  mock_elasticsearch.scroll_delete = Mock(return_value=scroll_responses[-1])

  configure(app_config)
  if throws:
    with pytest.raises(Exception):
      logs_model.yield_logs_for_export(start_datetime, end_datetime, max_query_time=max_query_time)
  else:
    log_generator = logs_model.yield_logs_for_export(start_datetime, end_datetime,
                                                     max_query_time=max_query_time)
    counter = 0
    for logs in log_generator:
      if counter == 0:
        mock_elasticsearch.search_scroll_create.assert_called_with(*expected_requests[counter])
      else:
        mock_elasticsearch.scroll_get.assert_called_with(*expected_requests[counter])
      assert expected_logs[counter] == logs
      counter += 1
    # the last two requests must be
    # 1. get with response scroll with 0 hits, which indicates the termination condition
    # 2. delete scroll request
    mock_elasticsearch.scroll_get.assert_called_with(*expected_requests[-2])
    mock_elasticsearch.scroll_delete.assert_called_with(*expected_requests[-1])


@pytest.mark.parametrize('prefix, is_valid', [
  pytest.param('..', False, id='Invalid `..`'),
  pytest.param('.', False, id='Invalid `.`'),
  pytest.param('-prefix', False, id='Invalid prefix start -'),
  pytest.param('_prefix', False, id='Invalid prefix start _'),
  pytest.param('+prefix', False, id='Invalid prefix start +'),
  pytest.param('prefix_with_UPPERCASES', False, id='Invalid uppercase'),
  pytest.param('valid_index', True, id='Valid prefix'),
  pytest.param('valid_index_with_numbers1234', True, id='Valid prefix with numbers'),
  pytest.param('a'*256, False, id='Prefix too long')
])
def test_valid_index_prefix(prefix, is_valid):
  assert ElasticsearchLogs._valid_index_prefix(prefix) == is_valid


@pytest.mark.parametrize('index, cutoff_date, expected_result', [
  pytest.param(
    INDEX_NAME_PREFIX+'2019-06-06',
    datetime(2019, 6, 8),
    True,
    id="Index older than cutoff"
  ),
  pytest.param(
    INDEX_NAME_PREFIX+'2019-06-06',
    datetime(2019, 6, 4),
    False,
    id="Index younger than cutoff"
  ),
  pytest.param(
    INDEX_NAME_PREFIX+'2019-06-06',
    datetime(2019, 6, 6, 23),
    False,
    id="Index older than cutoff but timedelta less than 1 day"
  ),
  pytest.param(
    INDEX_NAME_PREFIX+'2019-06-06',
    datetime(2019, 6, 7),
    True,
    id="Index older than cutoff by exactly one day"
  ),
])
def test_can_delete_index(index, cutoff_date, expected_result):
  es = ElasticsearchLogs(index_prefix=INDEX_NAME_PREFIX)
  assert datetime.strptime(index.split(es._index_prefix, 1)[-1], INDEX_DATE_FORMAT)
  assert es.can_delete_index(index, cutoff_date) == expected_result
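For orientation, a minimal sketch of the application-level configuration these tests mimic. The LOGS_MODEL and LOGS_MODEL_CONFIG keys mirror the logs_model_config fixture above; the concrete host value is a placeholder and not part of the commit.

from data.logs_model import configure, logs_model

app_config = {
  'LOGS_MODEL': 'elasticsearch',
  'LOGS_MODEL_CONFIG': {
    'producer': 'elasticsearch',
    'elasticsearch_config': {
      'host': 'es.example.internal',  # placeholder host
      'port': 443,
      'access_key': None,
      'secret_key': None,
      'aws_region': None,
    },
  },
}

# The tests call configure() before logging; after that, writes go through the
# document-backed model behind the logs_model proxy.
configure(app_config)
logs_model.log_action('pull_repo', 'user1', ip='192.168.1.1', metadata={'key': 'value'})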
473
data/logs_model/test/test_logs_interface.py
Normal file
@@ -0,0 +1,473 @@
from datetime import datetime, timedelta, date
from data.logs_model.datatypes import AggregatedLogCount
from data.logs_model.table_logs_model import TableLogsModel
from data.logs_model.combined_model import CombinedLogsModel
from data.logs_model.inmemory_model import InMemoryModel
from data.logs_model.combined_model import _merge_aggregated_log_counts
from data.logs_model.document_logs_model import _date_range_in_single_index, DocumentLogsModel
from data.logs_model.interface import LogsIterationTimeout
from data.logs_model.test.fake_elasticsearch import FAKE_ES_HOST, fake_elasticsearch

from data.database import LogEntry, LogEntry2, LogEntry3, LogEntryKind
from data import model

from test.fixtures import *


@pytest.fixture()
def mock_page_size():
  page_size = 2
  with patch('data.logs_model.document_logs_model.PAGE_SIZE', page_size):
    yield page_size


@pytest.fixture()
def clear_db_logs(initialized_db):
  LogEntry.delete().execute()
  LogEntry2.delete().execute()
  LogEntry3.delete().execute()


def combined_model():
  return CombinedLogsModel(TableLogsModel(), InMemoryModel())


def es_model():
  return DocumentLogsModel(producer='elasticsearch', elasticsearch_config={
    'host': FAKE_ES_HOST,
    'port': 12345,
  })

@pytest.fixture()
def fake_es():
  with fake_elasticsearch():
    yield


@pytest.fixture(params=[TableLogsModel, InMemoryModel, es_model, combined_model])
def logs_model(request, clear_db_logs, fake_es):
  return request.param()


def _lookup_logs(logs_model, start_time, end_time, **kwargs):
  logs_found = []
  page_token = None
  while True:
    found = logs_model.lookup_logs(start_time, end_time, page_token=page_token, **kwargs)
    logs_found.extend(found.logs)
    page_token = found.next_page_token
    if not found.logs or not page_token:
      break

  assert len(logs_found) == len(set(logs_found))
  return logs_found


@pytest.mark.skipif(os.environ.get('TEST_DATABASE_URI', '').find('mysql') >= 0,
                    reason='Flaky on MySQL')
@pytest.mark.parametrize('namespace_name, repo_name, performer_name, check_args, expect_results', [
  pytest.param('devtable', 'simple', 'devtable', {}, True, id='no filters'),
  pytest.param('devtable', 'simple', 'devtable', {
    'performer_name': 'devtable',
  }, True, id='matching performer'),

  pytest.param('devtable', 'simple', 'devtable', {
    'namespace_name': 'devtable',
  }, True, id='matching namespace'),

  pytest.param('devtable', 'simple', 'devtable', {
    'namespace_name': 'devtable',
    'repository_name': 'simple',
  }, True, id='matching repository'),

  pytest.param('devtable', 'simple', 'devtable', {
    'performer_name': 'public',
  }, False, id='different performer'),

  pytest.param('devtable', 'simple', 'devtable', {
    'namespace_name': 'public',
  }, False, id='different namespace'),

  pytest.param('devtable', 'simple', 'devtable', {
    'namespace_name': 'devtable',
    'repository_name': 'complex',
  }, False, id='different repository'),
])
def test_logs(namespace_name, repo_name, performer_name, check_args, expect_results, logs_model):
  # Add some logs.
  kinds = list(LogEntryKind.select())
  user = model.user.get_user(performer_name)

  start_timestamp = datetime.utcnow()
  timestamp = start_timestamp

  for kind in kinds:
    for index in range(0, 3):
      logs_model.log_action(kind.name, namespace_name=namespace_name, repository_name=repo_name,
                            performer=user, ip='1.2.3.4', timestamp=timestamp)
      timestamp = timestamp + timedelta(seconds=1)

  found = _lookup_logs(logs_model, start_timestamp, start_timestamp + timedelta(minutes=10),
                       **check_args)
  if expect_results:
    assert len(found) == len(kinds) * 3
  else:
    assert not found

  aggregated_counts = logs_model.get_aggregated_log_counts(start_timestamp,
                                                           start_timestamp + timedelta(minutes=10),
                                                           **check_args)
  if expect_results:
    assert len(aggregated_counts) == len(kinds)
    for ac in aggregated_counts:
      assert ac.count == 3
  else:
    assert not aggregated_counts


@pytest.mark.parametrize('filter_kinds, expect_results', [
  pytest.param(None, True),
  pytest.param(['push_repo'], True, id='push_repo filter'),
  pytest.param(['pull_repo'], True, id='pull_repo filter'),
  pytest.param(['push_repo', 'pull_repo'], False, id='push and pull filters')
])
def test_lookup_latest_logs(filter_kinds, expect_results, logs_model):
  kind_map = model.log.get_log_entry_kinds()
  if filter_kinds:
    ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds if filter_kinds]
  else:
    ignore_ids = []

  now = datetime.now()
  namespace_name = 'devtable'
  repo_name = 'simple'
  performer_name = 'devtable'

  user = model.user.get_user(performer_name)
  size = 3

  # Log some push actions
  logs_model.log_action('push_repo', namespace_name=namespace_name, repository_name=repo_name,
                        performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=1, seconds=11))
  logs_model.log_action('push_repo', namespace_name=namespace_name, repository_name=repo_name,
                        performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=7, seconds=33))

  # Log some pull actions
  logs_model.log_action('pull_repo', namespace_name=namespace_name, repository_name=repo_name,
                        performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=0, seconds=3))
  logs_model.log_action('pull_repo', namespace_name=namespace_name, repository_name=repo_name,
                        performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=3, seconds=55))
  logs_model.log_action('pull_repo', namespace_name=namespace_name, repository_name=repo_name,
                        performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=5, seconds=3))
  logs_model.log_action('pull_repo', namespace_name=namespace_name, repository_name=repo_name,
                        performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=11, seconds=11))

  # Get the latest logs
  latest_logs = logs_model.lookup_latest_logs(performer_name, repo_name, namespace_name,
                                              filter_kinds=filter_kinds, size=size)

  # Test max lookup size
  assert len(latest_logs) <= size

  # Make sure that the latest logs returned are in decreasing order
  assert all(x >= y for x, y in zip(latest_logs, latest_logs[1:]))

  if expect_results:
    assert latest_logs

    # Lookup all logs filtered by kinds and sort them in reverse chronological order
    all_logs = _lookup_logs(logs_model, now - timedelta(days=30), now + timedelta(days=30),
                            filter_kinds=filter_kinds, namespace_name=namespace_name,
                            repository_name=repo_name)
    all_logs = sorted(all_logs, key=lambda l: l.datetime, reverse=True)

    # Check that querying all logs does not return the filtered kinds
    assert all([log.kind_id not in ignore_ids for log in all_logs])

    # Check that the latest logs contain only the most recent ones
    assert latest_logs == all_logs[:len(latest_logs)]


def test_count_repository_actions(logs_model):
  # Log some actions.
  logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
                        ip='1.2.3.4')
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
                        ip='1.2.3.4')
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
                        ip='1.2.3.4')

  # Log some actions to a different repo.
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='complex',
                        ip='1.2.3.4')
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='complex',
                        ip='1.2.3.4')

  # Count the actions.
  day = date.today()
  simple_repo = model.repository.get_repository('devtable', 'simple')

  count = logs_model.count_repository_actions(simple_repo, day)
  assert count == 3

  complex_repo = model.repository.get_repository('devtable', 'complex')
  count = logs_model.count_repository_actions(complex_repo, day)
  assert count == 2

  # Try counting actions for a few days in the future to ensure it doesn't raise an error.
  count = logs_model.count_repository_actions(simple_repo, day + timedelta(days=5))
  assert count == 0


def test_yield_log_rotation_context(logs_model):
  cutoff_date = datetime.now()
  min_logs_per_rotation = 3

  # Log some actions to be archived
  # One day
  logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple1',
                        ip='1.2.3.4', timestamp=cutoff_date-timedelta(days=1, seconds=1))
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
                        ip='5.6.7.8', timestamp=cutoff_date-timedelta(days=1, seconds=2))
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple3',
                        ip='9.10.11.12', timestamp=cutoff_date-timedelta(days=1, seconds=3))
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple4',
                        ip='0.0.0.0', timestamp=cutoff_date-timedelta(days=1, seconds=4))
  # Another day
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple5',
                        ip='1.1.1.1', timestamp=cutoff_date-timedelta(days=2, seconds=1))
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple5',
                        ip='1.1.1.1', timestamp=cutoff_date-timedelta(days=2, seconds=2))
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple5',
                        ip='1.1.1.1', timestamp=cutoff_date-timedelta(days=2, seconds=3))

  found = _lookup_logs(logs_model, cutoff_date - timedelta(days=3), cutoff_date + timedelta(days=1))
  assert found is not None and len(found) == 7

  # Iterate the logs using the log rotation contexts
  all_logs = []
  for log_rotation_context in logs_model.yield_log_rotation_context(cutoff_date,
                                                                    min_logs_per_rotation):
    with log_rotation_context as context:
      for logs, _ in context.yield_logs_batch():
        all_logs.extend(logs)

  assert len(all_logs) == 7
  found = _lookup_logs(logs_model, cutoff_date - timedelta(days=3), cutoff_date + timedelta(days=1))
  assert not found

  # Make sure all datetimes are monotonically increasing (by datetime) after sorting the lookup
  # to make sure no duplicates were returned
  all_logs.sort(key=lambda d: d.datetime)
  assert all(x.datetime < y.datetime for x, y in zip(all_logs, all_logs[1:]))


def test_count_repository_actions_with_wildcard_disabled(initialized_db):
  with fake_elasticsearch(allow_wildcard=False):
    logs_model = es_model()

    # Log some actions.
    logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
                          ip='1.2.3.4')

    logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
                          ip='1.2.3.4')
    logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
                          ip='1.2.3.4')

    # Log some actions to a different repo.
    logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='complex',
                          ip='1.2.3.4')
    logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='complex',
                          ip='1.2.3.4')

    # Count the actions.
    day = date.today()
    simple_repo = model.repository.get_repository('devtable', 'simple')

    count = logs_model.count_repository_actions(simple_repo, day)
    assert count == 3

    complex_repo = model.repository.get_repository('devtable', 'complex')
    count = logs_model.count_repository_actions(complex_repo, day)
    assert count == 2

    # Try counting actions for a few days in the future to ensure it doesn't raise an error.
    count = logs_model.count_repository_actions(simple_repo, day + timedelta(days=5))
    assert count == 0


@pytest.mark.skipif(os.environ.get('TEST_DATABASE_URI', '').find('mysql') >= 0,
                    reason='Flaky on MySQL')
def test_yield_logs_for_export(logs_model):
  # Add some logs.
  kinds = list(LogEntryKind.select())
  user = model.user.get_user('devtable')

  start_timestamp = datetime.utcnow()
  timestamp = start_timestamp

  for kind in kinds:
    for index in range(0, 10):
      logs_model.log_action(kind.name, namespace_name='devtable', repository_name='simple',
                            performer=user, ip='1.2.3.4', timestamp=timestamp)
      timestamp = timestamp + timedelta(seconds=1)

  # Yield the logs.
  simple_repo = model.repository.get_repository('devtable', 'simple')
  logs_found = []
  for logs in logs_model.yield_logs_for_export(start_timestamp, timestamp + timedelta(minutes=10),
                                               repository_id=simple_repo.id):
    logs_found.extend(logs)

  # Ensure we found all added logs.
  assert len(logs_found) == len(kinds) * 10


def test_yield_logs_for_export_timeout(logs_model):
  # Add some logs.
  kinds = list(LogEntryKind.select())
  user = model.user.get_user('devtable')

  start_timestamp = datetime.utcnow()
  timestamp = start_timestamp

  for kind in kinds:
    for _ in range(0, 2):
      logs_model.log_action(kind.name, namespace_name='devtable', repository_name='simple',
                            performer=user, ip='1.2.3.4', timestamp=timestamp)
      timestamp = timestamp + timedelta(seconds=1)

  # Yield the logs. Since we set the timeout to nothing, it should immediately fail.
  simple_repo = model.repository.get_repository('devtable', 'simple')
  with pytest.raises(LogsIterationTimeout):
    list(logs_model.yield_logs_for_export(start_timestamp, timestamp + timedelta(minutes=1),
                                          repository_id=simple_repo.id,
                                          max_query_time=timedelta(seconds=0)))


def test_disabled_namespace(clear_db_logs):
  logs_model = TableLogsModel(lambda kind, namespace, is_free: namespace == 'devtable')

  # Log some actions.
  logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
                        ip='1.2.3.4')

  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
                        ip='1.2.3.4')
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
                        ip='1.2.3.4')

  # Log some actions to a different namespace.
  logs_model.log_action('push_repo', namespace_name='buynlarge', repository_name='orgrepo',
                        ip='1.2.3.4')

  logs_model.log_action('pull_repo', namespace_name='buynlarge', repository_name='orgrepo',
                        ip='1.2.3.4')
  logs_model.log_action('pull_repo', namespace_name='buynlarge', repository_name='orgrepo',
                        ip='1.2.3.4')

  # Count the actions.
  day = datetime.today() - timedelta(minutes=60)
  simple_repo = model.repository.get_repository('devtable', 'simple')
  count = logs_model.count_repository_actions(simple_repo, day)
  assert count == 0

  org_repo = model.repository.get_repository('buynlarge', 'orgrepo')
  count = logs_model.count_repository_actions(org_repo, day)
  assert count == 3


@pytest.mark.parametrize('aggregated_log_counts1, aggregated_log_counts2, expected_result', [
  pytest.param(
    [
      AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0)),   # 1
      AggregatedLogCount(1, 3, datetime(2019, 6, 7, 0, 0)),   # 2
    ],
    [
      AggregatedLogCount(1, 5, datetime(2019, 6, 6, 0, 0)),   # 1
      AggregatedLogCount(1, 7, datetime(2019, 6, 7, 0, 0)),   # 2
      AggregatedLogCount(3, 3, datetime(2019, 6, 1, 0, 0)),   # 3
    ],
    [
      AggregatedLogCount(1, 8, datetime(2019, 6, 6, 0, 0)),   # 1
      AggregatedLogCount(1, 10, datetime(2019, 6, 7, 0, 0)),  # 2
      AggregatedLogCount(3, 3, datetime(2019, 6, 1, 0, 0))    # 3
    ]
  ),
  pytest.param(
    [
      AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0)),   # 1
    ],
    [
      AggregatedLogCount(1, 7, datetime(2019, 6, 7, 0, 0)),   # 2
    ],
    [
      AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0)),   # 1
      AggregatedLogCount(1, 7, datetime(2019, 6, 7, 0, 0)),   # 2
    ]
  ),
  pytest.param(
    [],
    [AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0))],
    [AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0))]
  ),
])
def test_merge_aggregated_log_counts(aggregated_log_counts1, aggregated_log_counts2, expected_result):
  assert (sorted(_merge_aggregated_log_counts(aggregated_log_counts1, aggregated_log_counts2)) ==
          sorted(expected_result))


@pytest.mark.parametrize('dt1, dt2, expected_result', [
  # Valid dates
  pytest.param(date(2019, 6, 17), date(2019, 6, 18), True),

  # Invalid dates
  pytest.param(date(2019, 6, 17), date(2019, 6, 17), False),
  pytest.param(date(2019, 6, 17), date(2019, 6, 19), False),
  pytest.param(date(2019, 6, 18), date(2019, 6, 17), False),

  # Valid datetimes
  pytest.param(datetime(2019, 6, 17, 0, 1), datetime(2019, 6, 17, 0, 2), True),

  # Invalid datetimes
  pytest.param(datetime(2019, 6, 17, 0, 2), datetime(2019, 6, 17, 0, 1), False),
  pytest.param(datetime(2019, 6, 17, 11), datetime(2019, 6, 17, 11) + timedelta(hours=14), False),
])
def test_date_range_in_single_index(dt1, dt2, expected_result):
  assert _date_range_in_single_index(dt1, dt2) == expected_result


def test_pagination(logs_model, mock_page_size):
  """
  Make sure that pagination does not stop if searching through multiple indices by day,
  and the current log count matches the page size while there are still indices to be searched.
  """
  day1 = datetime.now()
  day2 = day1 + timedelta(days=1)
  day3 = day2 + timedelta(days=1)

  # Log some actions in day indices
  # One day
  logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple1',
                        ip='1.2.3.4', timestamp=day1)
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple1',
                        ip='5.6.7.8', timestamp=day1)

  found = _lookup_logs(logs_model, day1-timedelta(seconds=1), day3+timedelta(seconds=1))
  assert len(found) == mock_page_size

  # Another day
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
                        ip='1.1.1.1', timestamp=day2)
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
                        ip='0.0.0.0', timestamp=day2)

  # Yet another day
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
                        ip='1.1.1.1', timestamp=day3)
  logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
                        ip='0.0.0.0', timestamp=day3)

  found = _lookup_logs(logs_model, day1-timedelta(seconds=1), day3+timedelta(seconds=1))
  assert len(found) == 6
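A small worked example of the merge semantics the parametrized cases above encode: entries sharing the same kind_id and datetime are summed, everything else is passed through unchanged. The values are taken from the first test case; the helper and datatype names come from this diff.

from datetime import datetime
from data.logs_model.combined_model import _merge_aggregated_log_counts
from data.logs_model.datatypes import AggregatedLogCount

first = [AggregatedLogCount(1, 3, datetime(2019, 6, 6))]
second = [AggregatedLogCount(1, 5, datetime(2019, 6, 6)),
          AggregatedLogCount(3, 3, datetime(2019, 6, 1))]

# Same kind and timestamp are summed (3 + 5 = 8); the unmatched entry is kept as-is.
merged = sorted(_merge_aggregated_log_counts(first, second))
assert merged == sorted([AggregatedLogCount(1, 8, datetime(2019, 6, 6)),
                         AggregatedLogCount(3, 3, datetime(2019, 6, 1))])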
77
data/logs_model/test/test_logs_producer.py
Normal file
@@ -0,0 +1,77 @@
import logging
import pytest
from dateutil.parser import parse
from mock import patch, Mock

import botocore

from data.logs_model import configure

from test_elasticsearch import app_config, logs_model_config, logs_model, mock_elasticsearch, mock_db_model
from mock_elasticsearch import *


logger = logging.getLogger(__name__)

FAKE_KAFKA_BROKERS = ['fake_server1', 'fake_server2']
FAKE_KAFKA_TOPIC = 'sometopic'
FAKE_MAX_BLOCK_SECONDS = 1

@pytest.fixture()
def kafka_logs_producer_config(app_config):
  producer_config = {}
  producer_config.update(app_config)

  kafka_config = {
    'bootstrap_servers': FAKE_KAFKA_BROKERS,
    'topic': FAKE_KAFKA_TOPIC,
    'max_block_seconds': FAKE_MAX_BLOCK_SECONDS
  }

  producer_config['LOGS_MODEL_CONFIG']['producer'] = 'kafka'
  producer_config['LOGS_MODEL_CONFIG']['kafka_config'] = kafka_config
  return producer_config


@pytest.fixture()
def kinesis_logs_producer_config(app_config):
  producer_config = {}
  producer_config.update(app_config)

  kinesis_stream_config = {
    'stream_name': 'test-stream',
    'aws_region': 'fake_region',
    'aws_access_key': 'some_key',
    'aws_secret_key': 'some_secret'
  }

  producer_config['LOGS_MODEL_CONFIG']['producer'] = 'kinesis_stream'
  producer_config['LOGS_MODEL_CONFIG']['kinesis_stream_config'] = kinesis_stream_config
  return producer_config


def test_kafka_logs_producers(logs_model, mock_elasticsearch, mock_db_model, kafka_logs_producer_config):
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)

  producer_config = kafka_logs_producer_config
  with patch('kafka.client_async.KafkaClient.check_version'), patch('kafka.KafkaProducer.send') as mock_send:
    configure(producer_config)
    logs_model.log_action('pull_repo', 'user1', Mock(id=1), '192.168.1.1', {'key': 'value'},
                          None, 'repo1', parse("2019-01-01T03:30"))

    mock_send.assert_called_once()


def test_kinesis_logs_producers(logs_model, mock_elasticsearch, mock_db_model, kinesis_logs_producer_config):
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)

  producer_config = kinesis_logs_producer_config
  with patch('botocore.endpoint.EndpointCreator.create_endpoint'), \
       patch('botocore.client.BaseClient._make_api_call') as mock_send:
    configure(producer_config)
    logs_model.log_action('pull_repo', 'user1', Mock(id=1), '192.168.1.1', {'key': 'value'},
                          None, 'repo1', parse("2019-01-01T03:30"))

    # Check that a PutRecord api call is made.
    # NOTE: The second arg of _make_api_call uses a randomized PartitionKey
    mock_send.assert_called_once_with(u'PutRecord', mock_send.call_args_list[0][0][1])
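As a companion to the producer fixtures above, a minimal sketch of what a deployment-style LOGS_MODEL_CONFIG for the Kafka producer might look like. Only the key layout is taken from the fixtures; the broker, topic, and host names are placeholders and not part of the commit.

LOGS_MODEL_CONFIG = {
  'producer': 'kafka',
  'elasticsearch_config': {
    'host': 'es.example.internal',  # placeholder host
    'port': 443,
  },
  'kafka_config': {
    'bootstrap_servers': ['kafka-1.example.internal:9092'],  # placeholder brokers
    'topic': 'logentry',                                     # placeholder topic
    'max_block_seconds': 1,
  },
}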