initial import for Open Source 🎉

Jimmy Zelinskie 2019-11-12 11:09:47 -05:00
parent 1898c361f3
commit 9c0dd3b722
2048 changed files with 218743 additions and 0 deletions


@@ -0,0 +1,64 @@
import logging
from data.logs_model.table_logs_model import TableLogsModel
from data.logs_model.document_logs_model import DocumentLogsModel
from data.logs_model.combined_model import CombinedLogsModel
logger = logging.getLogger(__name__)
def _transition_model(*args, **kwargs):
return CombinedLogsModel(
DocumentLogsModel(*args, **kwargs),
TableLogsModel(*args, **kwargs),
)
_LOG_MODELS = {
'database': TableLogsModel,
'transition_reads_both_writes_es': _transition_model,
'elasticsearch': DocumentLogsModel,
}
_PULL_LOG_KINDS = {'pull_repo', 'repo_verb'}
class LogsModelProxy(object):
def __init__(self):
self._model = None
def initialize(self, model):
self._model = model
logger.info('===============================')
logger.info('Using logs model `%s`', self._model)
logger.info('===============================')
def __getattr__(self, attr):
if not self._model:
raise AttributeError("LogsModelProxy is not initialized")
return getattr(self._model, attr)
logs_model = LogsModelProxy()
def configure(app_config):
logger.debug('Configuring log model')
model_name = app_config.get('LOGS_MODEL', 'database')
model_config = app_config.get('LOGS_MODEL_CONFIG', {})
def should_skip_logging(kind_name, namespace_name, is_free_namespace):
if namespace_name and namespace_name in app_config.get('DISABLED_FOR_AUDIT_LOGS', {}):
return True
if kind_name in _PULL_LOG_KINDS:
if namespace_name and namespace_name in app_config.get('DISABLED_FOR_PULL_LOGS', {}):
return True
if app_config.get('FEATURE_DISABLE_PULL_LOGS_FOR_FREE_NAMESPACES'):
if is_free_namespace:
return True
return False
model_config['should_skip_logging'] = should_skip_logging
logs_model.initialize(_LOG_MODELS[model_name](**model_config))
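# A minimal configuration sketch (hypothetical host/port values): with an app config
# like the one below, `configure` picks the Elasticsearch-backed model and passes the
# remaining settings straight through to DocumentLogsModel:
#
#   app_config = {
#       'LOGS_MODEL': 'elasticsearch',
#       'LOGS_MODEL_CONFIG': {
#           'producer': 'elasticsearch',
#           'elasticsearch_config': {'host': 'es.example.com', 'port': 9200},
#       },
#   }
#   configure(app_config)
#   # all subsequent calls on `logs_model` are proxied to the configured model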


@@ -0,0 +1,132 @@
import logging
import itertools
from data.logs_model.datatypes import AggregatedLogCount, LogEntriesPage
from data.logs_model.interface import ActionLogsDataInterface
from data.logs_model.shared import SharedModel
logger = logging.getLogger(__name__)
def _merge_aggregated_log_counts(*args):
""" Merge two lists of AggregatedLogCount based on the value of their kind_id and datetime.
"""
matching_keys = {}
aggregated_log_counts_list = itertools.chain.from_iterable(args)
def canonical_key_from_kind_date_tuple(kind_id, dt):
""" Return a comma separated key from an AggregatedLogCount's kind_id and datetime. """
return str(kind_id) + ',' + str(dt)
for kind_id, count, dt in aggregated_log_counts_list:
kind_date_key = canonical_key_from_kind_date_tuple(kind_id, dt)
if kind_date_key in matching_keys:
existing_count = matching_keys[kind_date_key][2]
matching_keys[kind_date_key] = (kind_id, dt, existing_count + count)
else:
matching_keys[kind_date_key] = (kind_id, dt, count)
return [AggregatedLogCount(kind_id, count, dt) for (kind_id, dt, count) in matching_keys.values()]
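# Example: entries sharing a (kind_id, datetime) key are summed across the inputs.
# Merging [AggregatedLogCount(1, 3, day)] with [AggregatedLogCount(1, 2, day),
# AggregatedLogCount(2, 1, day)] yields a count of 5 for kind 1 and 1 for kind 2 on `day`.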
class CombinedLogsModel(SharedModel, ActionLogsDataInterface):
"""
CombinedLogsModel implements the data model that writes to the read-write logs model and reads
from both models.
"""
def __init__(self, read_write_logs_model, read_only_logs_model):
self.read_write_logs_model = read_write_logs_model
self.read_only_logs_model = read_only_logs_model
def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
return self.read_write_logs_model.log_action(kind_name, namespace_name, performer, ip, metadata,
repository, repository_name, timestamp,
is_free_namespace)
def count_repository_actions(self, repository, day):
rw_count = self.read_write_logs_model.count_repository_actions(repository, day)
ro_count = self.read_only_logs_model.count_repository_actions(repository, day)
return rw_count + ro_count
def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
repository_name=None, namespace_name=None, filter_kinds=None):
rw_model = self.read_write_logs_model
ro_model = self.read_only_logs_model
rw_count = rw_model.get_aggregated_log_counts(start_datetime, end_datetime,
performer_name=performer_name,
repository_name=repository_name,
namespace_name=namespace_name,
filter_kinds=filter_kinds)
ro_count = ro_model.get_aggregated_log_counts(start_datetime, end_datetime,
performer_name=performer_name,
repository_name=repository_name,
namespace_name=namespace_name,
filter_kinds=filter_kinds)
return _merge_aggregated_log_counts(rw_count, ro_count)
def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
namespace_id=None, max_query_time=None):
rw_model = self.read_write_logs_model
ro_model = self.read_only_logs_model
rw_logs = rw_model.yield_logs_for_export(start_datetime, end_datetime, repository_id,
namespace_id, max_query_time)
ro_logs = ro_model.yield_logs_for_export(start_datetime, end_datetime, repository_id,
namespace_id, max_query_time)
for batch in itertools.chain(rw_logs, ro_logs):
yield batch
def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
rw_model = self.read_write_logs_model
ro_model = self.read_only_logs_model
page_token = page_token or {}
new_page_token = {}
if page_token is None or not page_token.get('under_readonly_model', False):
rw_page_token = page_token.get('readwrite_page_token')
rw_logs = rw_model.lookup_logs(start_datetime, end_datetime, performer_name,
repository_name, namespace_name, filter_kinds,
rw_page_token, max_page_count)
logs, next_page_token = rw_logs
new_page_token['under_readonly_model'] = next_page_token is None
new_page_token['readwrite_page_token'] = next_page_token
return LogEntriesPage(logs, new_page_token)
else:
readonly_page_token = page_token.get('readonly_page_token')
ro_logs = ro_model.lookup_logs(start_datetime, end_datetime, performer_name,
repository_name, namespace_name, filter_kinds,
readonly_page_token, max_page_count)
logs, next_page_token = ro_logs
if next_page_token is None:
return LogEntriesPage(logs, None)
new_page_token['under_readonly_model'] = True
new_page_token['readonly_page_token'] = next_page_token
return LogEntriesPage(logs, new_page_token)
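# Sketch of the combined-model page token (values abbreviated): the first pages are
# served by the read-write model; once its token is exhausted, `under_readonly_model`
# flips and the read-only model's own token is carried instead:
#
#   {'under_readonly_model': False, 'readwrite_page_token': {...}}  # still reading the RW model
#   {'under_readonly_model': True, 'readonly_page_token': {...}}    # now reading the RO model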
def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
filter_kinds=None, size=20):
latest_logs = []
rw_model = self.read_write_logs_model
ro_model = self.read_only_logs_model
rw_logs = rw_model.lookup_latest_logs(performer_name, repository_name, namespace_name,
filter_kinds, size)
latest_logs.extend(rw_logs)
if len(latest_logs) < size:
ro_logs = ro_model.lookup_latest_logs(performer_name, repository_name, namespace_name,
filter_kinds, size - len(latest_logs))
latest_logs.extend(ro_logs)
return latest_logs
def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
ro_model = self.read_only_logs_model
rw_model = self.read_write_logs_model
ro_ctx = ro_model.yield_log_rotation_context(cutoff_date, min_logs_per_rotation)
rw_ctx = rw_model.yield_log_rotation_context(cutoff_date, min_logs_per_rotation)
for ctx in itertools.chain(ro_ctx, rw_ctx):
yield ctx


@@ -0,0 +1,155 @@
import json
from calendar import timegm
from collections import namedtuple
from email.utils import formatdate
from cachetools.func import lru_cache
from data import model
from util.morecollections import AttrDict
def _format_date(date):
""" Output an RFC822 date format. """
if date is None:
return None
return formatdate(timegm(date.utctimetuple()))
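# Example of the RFC 822 output produced above:
#   _format_date(datetime(2019, 11, 12))  ->  'Tue, 12 Nov 2019 00:00:00 -0000'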
@lru_cache(maxsize=1)
def _kinds():
return model.log.get_log_entry_kinds()
class LogEntriesPage(namedtuple('LogEntriesPage', ['logs', 'next_page_token'])):
""" Represents a page returned by the lookup_logs call. The `logs` contains the logs
found for the page and `next_page_token`, if not None, contains the token to be
encoded and returned for the followup call.
"""
class Log(namedtuple('Log', [
'metadata_json', 'ip', 'datetime', 'performer_email', 'performer_username', 'performer_robot',
'account_organization', 'account_username', 'account_email', 'account_robot', 'kind_id'])):
""" Represents a single log entry returned by the logs model. """
@classmethod
def for_logentry(cls, log):
account_organization = None
account_username = None
account_email = None
account_robot = None
try:
account_organization = log.account.organization
account_username = log.account.username
account_email = log.account.email
account_robot = log.account.robot
except AttributeError:
pass
performer_robot = None
performer_username = None
performer_email = None
try:
performer_robot = log.performer.robot
performer_username = log.performer.username
performer_email = log.performer.email
except AttributeError:
pass
return Log(log.metadata_json, log.ip, log.datetime, performer_email, performer_username,
performer_robot, account_organization, account_username, account_email,
account_robot, log.kind_id)
@classmethod
def for_elasticsearch_log(cls, log, id_user_map):
account_organization = None
account_username = None
account_email = None
account_robot = None
try:
if log.account_id:
account = id_user_map[log.account_id]
account_organization = account.organization
account_username = account.username
account_email = account.email
account_robot = account.robot
except AttributeError:
pass
performer_robot = None
performer_username = None
performer_email = None
try:
if log.performer_id:
performer = id_user_map[log.performer_id]
performer_robot = performer.robot
performer_username = performer.username
performer_email = performer.email
except AttributeError:
pass
return Log(log.metadata_json, str(log.ip), log.datetime, performer_email, performer_username,
performer_robot, account_organization, account_username, account_email,
account_robot, log.kind_id)
def to_dict(self, avatar, include_namespace=False):
view = {
'kind': _kinds()[self.kind_id],
'metadata': json.loads(self.metadata_json),
'ip': self.ip,
'datetime': _format_date(self.datetime),
}
if self.performer_username:
performer = AttrDict({'username': self.performer_username, 'email': self.performer_email})
performer.robot = None
if self.performer_robot:
performer.robot = self.performer_robot
view['performer'] = {
'kind': 'user',
'name': self.performer_username,
'is_robot': self.performer_robot,
'avatar': avatar.get_data_for_user(performer),
}
if include_namespace:
if self.account_username:
account = AttrDict({'username': self.account_username, 'email': self.account_email})
if self.account_organization:
view['namespace'] = {
'kind': 'org',
'name': self.account_username,
'avatar': avatar.get_data_for_org(account),
}
else:
account.robot = None
if self.account_robot:
account.robot = self.account_robot
view['namespace'] = {
'kind': 'user',
'name': self.account_username,
'avatar': avatar.get_data_for_user(account),
}
return view
class AggregatedLogCount(namedtuple('AggregatedLogCount', ['kind_id', 'count', 'datetime'])):
""" Represents the aggregated count of the number of logs, of a particular kind, on a day. """
def to_dict(self):
view = {
'kind': _kinds()[self.kind_id],
'count': self.count,
'datetime': _format_date(self.datetime),
}
return view


@@ -0,0 +1,532 @@
# pylint: disable=protected-access
import json
import logging
import uuid
from time import time
from datetime import timedelta, datetime, date
from dateutil.parser import parse as parse_datetime
from abc import ABCMeta, abstractmethod
from six import add_metaclass
from elasticsearch.exceptions import ConnectionTimeout, NotFoundError
from data import model
from data.database import CloseForLongOperation
from data.model import config
from data.model.log import (_json_serialize, ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING,
DataModelException)
from data.logs_model.elastic_logs import LogEntry, configure_es
from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage
from data.logs_model.interface import (ActionLogsDataInterface, LogRotationContextInterface,
LogsIterationTimeout)
from data.logs_model.shared import SharedModel, epoch_ms
from data.logs_model.logs_producer import LogProducerProxy, LogSendException
from data.logs_model.logs_producer.kafka_logs_producer import KafkaLogsProducer
from data.logs_model.logs_producer.elasticsearch_logs_producer import ElasticsearchLogsProducer
from data.logs_model.logs_producer.kinesis_stream_logs_producer import KinesisStreamLogsProducer
logger = logging.getLogger(__name__)
PAGE_SIZE = 20
DEFAULT_RESULT_WINDOW = 5000
MAX_RESULT_WINDOW = 10000
# DATE_RANGE_LIMIT is to limit the query date time range to at most 1 month.
DATE_RANGE_LIMIT = 32
# Timeout for count_repository_actions
COUNT_REPOSITORY_ACTION_TIMEOUT = 30
def _date_range_descending(start_datetime, end_datetime, includes_end_datetime=False):
""" Generate the dates between `end_datetime` and `start_datetime`.
If `includes_end_datetime` is set, the generator starts at `end_datetime`,
otherwise it starts at `end_datetime` minus 1 second.
"""
assert end_datetime >= start_datetime
start_date = start_datetime.date()
if includes_end_datetime:
current_date = end_datetime.date()
else:
current_date = (end_datetime - timedelta(seconds=1)).date()
while current_date >= start_date:
yield current_date
current_date = current_date - timedelta(days=1)
def _date_range_in_single_index(dt1, dt2):
""" Determine whether a single index can be searched given a range
of dates or datetimes. If date instances are given, the difference should be 1 day.
NOTE: dt2 is exclusive of the search result set,
i.e. the date range is greater than or equal to dt1 and strictly smaller than dt2.
"""
assert isinstance(dt1, date) and isinstance(dt2, date)
dt = dt2 - dt1
# Check if date or datetime
if not isinstance(dt1, datetime) and not isinstance(dt2, datetime):
return dt == timedelta(days=1)
if dt < timedelta(days=1) and dt >= timedelta(days=0):
return dt2.day == dt1.day
# Check if datetime can be interpreted as a date: hour, minutes, seconds or microseconds set to 0
if dt == timedelta(days=1):
return dt1.hour == 0 and dt1.minute == 0 and dt1.second == 0 and dt1.microsecond == 0
return False
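# Examples of the two helpers above:
#   list(_date_range_descending(datetime(2019, 11, 10), datetime(2019, 11, 12)))
#     -> [date(2019, 11, 11), date(2019, 11, 10)]   # exclusive end backed off by 1 second
#   _date_range_in_single_index(date(2019, 1, 1), date(2019, 1, 2))                 -> True
#   _date_range_in_single_index(datetime(2019, 1, 1, 6), datetime(2019, 1, 1, 18))  -> True
#   _date_range_in_single_index(date(2019, 1, 1), date(2019, 1, 3))                 -> False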
def _for_elasticsearch_logs(logs, repository_id=None, namespace_id=None):
namespace_ids = set()
for log in logs:
namespace_ids.add(log.account_id)
namespace_ids.add(log.performer_id)
assert namespace_id is None or log.account_id == namespace_id
assert repository_id is None or log.repository_id == repository_id
id_user_map = model.user.get_user_map_by_ids(namespace_ids)
return [Log.for_elasticsearch_log(log, id_user_map) for log in logs]
def _random_id():
""" Generates a unique uuid4 string for the random_id field in LogEntry.
It is used as tie-breaker for sorting logs based on datetime:
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-after.html
"""
return str(uuid.uuid4())
@add_metaclass(ABCMeta)
class ElasticsearchLogsModelInterface(object):
"""
Interface for Elasticsearch specific operations with the logs model.
These operations are usually index based.
"""
@abstractmethod
def can_delete_index(self, index, cutoff_date):
""" Return whether the given index is older than the given cutoff date. """
@abstractmethod
def list_indices(self):
""" List the logs model's indices. """
class DocumentLogsModel(SharedModel, ActionLogsDataInterface, ElasticsearchLogsModelInterface):
"""
DocumentLogsModel implements the data model for the logs API backed by an
elasticsearch service.
"""
def __init__(self, should_skip_logging=None, elasticsearch_config=None, producer=None, **kwargs):
self._should_skip_logging = should_skip_logging
self._logs_producer = LogProducerProxy()
self._es_client = configure_es(**elasticsearch_config)
if producer == 'kafka':
kafka_config = kwargs['kafka_config']
self._logs_producer.initialize(KafkaLogsProducer(**kafka_config))
elif producer == 'elasticsearch':
self._logs_producer.initialize(ElasticsearchLogsProducer())
elif producer == 'kinesis_stream':
kinesis_stream_config = kwargs['kinesis_stream_config']
self._logs_producer.initialize(KinesisStreamLogsProducer(**kinesis_stream_config))
else:
raise Exception('Invalid log producer: %s' % producer)
@staticmethod
def _get_ids_by_names(repository_name, namespace_name, performer_name):
""" Retrieve repository/namespace/performer ids based on their names.
throws DataModelException when the namespace_name does not match any
user in the database.
returns database ID or None if not exists.
"""
repository_id = None
account_id = None
performer_id = None
if repository_name and namespace_name:
repository = model.repository.get_repository(namespace_name, repository_name)
if repository:
repository_id = repository.id
account_id = repository.namespace_user.id
if namespace_name and account_id is None:
account = model.user.get_user_or_org(namespace_name)
if account is None:
raise DataModelException('Invalid namespace requested')
account_id = account.id
if performer_name:
performer = model.user.get_user(performer_name)
if performer:
performer_id = performer.id
return repository_id, account_id, performer_id
def _base_query(self, performer_id=None, repository_id=None, account_id=None, filter_kinds=None,
index=None):
if filter_kinds is not None:
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
if index is not None:
search = LogEntry.search(index=index)
else:
search = LogEntry.search()
if performer_id is not None:
assert isinstance(performer_id, int)
search = search.filter('term', performer_id=performer_id)
if repository_id is not None:
assert isinstance(repository_id, int)
search = search.filter('term', repository_id=repository_id)
if account_id is not None and repository_id is None:
assert isinstance(account_id, int)
search = search.filter('term', account_id=account_id)
if filter_kinds is not None:
kind_map = model.log.get_log_entry_kinds()
ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds]
search = search.exclude('terms', kind_id=ignore_ids)
return search
def _base_query_date_range(self, start_datetime, end_datetime, performer_id, repository_id,
account_id, filter_kinds, index=None):
skip_datetime_check = False
if _date_range_in_single_index(start_datetime, end_datetime):
index = self._es_client.index_name(start_datetime)
skip_datetime_check = self._es_client.index_exists(index)
if index and (skip_datetime_check or self._es_client.index_exists(index)):
search = self._base_query(performer_id, repository_id, account_id, filter_kinds,
index=index)
else:
search = self._base_query(performer_id, repository_id, account_id, filter_kinds)
if not skip_datetime_check:
search = search.query('range', datetime={'gte': start_datetime, 'lt': end_datetime})
return search
def _load_logs_for_day(self, logs_date, performer_id, repository_id, account_id, filter_kinds,
after_datetime=None, after_random_id=None, size=PAGE_SIZE):
index = self._es_client.index_name(logs_date)
if not self._es_client.index_exists(index):
return []
search = self._base_query(performer_id, repository_id, account_id, filter_kinds,
index=index)
search = search.sort({'datetime': 'desc'}, {'random_id.keyword': 'desc'})
search = search.extra(size=size)
if after_datetime is not None and after_random_id is not None:
after_datetime_epoch_ms = epoch_ms(after_datetime)
search = search.extra(search_after=[after_datetime_epoch_ms, after_random_id])
return search.execute()
def _load_latest_logs(self, performer_id, repository_id, account_id, filter_kinds, size):
""" Return the latest logs from Elasticsearch.
Look at indices up to the set logrotateworker threshold, or up to 30 days if not defined.
"""
# Set the last index to check to be the logrotateworker threshold, or 30 days
end_datetime = datetime.now()
start_datetime = end_datetime - timedelta(days=DATE_RANGE_LIMIT)
latest_logs = []
for day in _date_range_descending(start_datetime, end_datetime, includes_end_datetime=True):
try:
logs = self._load_logs_for_day(day, performer_id, repository_id, account_id, filter_kinds,
size=size)
latest_logs.extend(logs)
except NotFoundError:
continue
if len(latest_logs) >= size:
break
return _for_elasticsearch_logs(latest_logs[:size], repository_id, account_id)
def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
assert start_datetime is not None and end_datetime is not None
# Check for a valid combined model token when migrating online from a combined model
if page_token is not None and page_token.get('readwrite_page_token') is not None:
page_token = page_token.get('readwrite_page_token')
if page_token is not None and max_page_count is not None:
page_number = page_token.get('page_number')
if page_number is not None and page_number + 1 > max_page_count:
return LogEntriesPage([], None)
repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
repository_name, namespace_name, performer_name)
after_datetime = None
after_random_id = None
if page_token is not None:
after_datetime = parse_datetime(page_token['datetime'])
after_random_id = page_token['random_id']
if after_datetime is not None:
end_datetime = min(end_datetime, after_datetime)
all_logs = []
with CloseForLongOperation(config.app_config):
for current_date in _date_range_descending(start_datetime, end_datetime):
try:
logs = self._load_logs_for_day(current_date, performer_id, repository_id, account_id,
filter_kinds, after_datetime, after_random_id,
size=PAGE_SIZE+1)
all_logs.extend(logs)
except NotFoundError:
continue
if len(all_logs) > PAGE_SIZE:
break
next_page_token = None
all_logs = all_logs[0:PAGE_SIZE+1]
if len(all_logs) == PAGE_SIZE + 1:
# The last element in the response is only used to check whether there are more elements.
# The second-to-last element is used as the pagination token, because search_after does
# not include the exact match, and so the next page will start with the last element.
# This keeps the behavior exactly the same as table_logs_model, so that
# the caller can expect that, when a pagination token is non-empty, there is
# at least 1 log to be retrieved.
next_page_token = {
'datetime': all_logs[-2].datetime.isoformat(),
'random_id': all_logs[-2].random_id,
'page_number': page_token['page_number'] + 1 if page_token else 1,
}
return LogEntriesPage(_for_elasticsearch_logs(all_logs[:PAGE_SIZE], repository_id, account_id),
next_page_token)
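# Sketch of the Elasticsearch pagination token returned above (uuid abbreviated):
#   {'datetime': '2019-11-12T10:00:00', 'random_id': '<uuid4>', 'page_number': 1}
# The caller passes it back unchanged to fetch the next page via search_after.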
def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
filter_kinds=None, size=20):
repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
repository_name, namespace_name, performer_name)
with CloseForLongOperation(config.app_config):
latest_logs = self._load_latest_logs(performer_id, repository_id, account_id, filter_kinds,
size)
return latest_logs
def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
repository_name=None, namespace_name=None, filter_kinds=None):
if end_datetime - start_datetime >= timedelta(days=DATE_RANGE_LIMIT):
raise Exception('Cannot lookup aggregated logs over a period longer than a month')
repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names(
repository_name, namespace_name, performer_name)
with CloseForLongOperation(config.app_config):
search = self._base_query_date_range(start_datetime, end_datetime, performer_id,
repository_id, account_id, filter_kinds)
search.aggs.bucket('by_id', 'terms', field='kind_id').bucket('by_date', 'date_histogram',
field='datetime', interval='day')
# es returns all buckets when size=0
search = search.extra(size=0)
resp = search.execute()
if not resp.aggregations:
return []
counts = []
by_id = resp.aggregations['by_id']
for id_bucket in by_id.buckets:
for date_bucket in id_bucket.by_date.buckets:
if date_bucket.doc_count > 0:
counts.append(AggregatedLogCount(id_bucket.key, date_bucket.doc_count, date_bucket.key))
return counts
def count_repository_actions(self, repository, day):
index = self._es_client.index_name(day)
search = self._base_query_date_range(day, day + timedelta(days=1),
None,
repository.id,
None,
None,
index=index)
search = search.params(request_timeout=COUNT_REPOSITORY_ACTION_TIMEOUT)
try:
return search.count()
except NotFoundError:
return 0
def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
if self._should_skip_logging and self._should_skip_logging(kind_name, namespace_name,
is_free_namespace):
return
if repository_name is not None:
assert repository is None
assert namespace_name is not None
repository = model.repository.get_repository(namespace_name, repository_name)
if timestamp is None:
timestamp = datetime.today()
account_id = None
performer_id = None
repository_id = None
if namespace_name is not None:
account_id = model.user.get_namespace_user(namespace_name).id
if performer is not None:
performer_id = performer.id
if repository is not None:
repository_id = repository.id
metadata_json = json.dumps(metadata or {}, default=_json_serialize)
kind_id = model.log._get_log_entry_kind(kind_name)
log = LogEntry(random_id=_random_id(), kind_id=kind_id, account_id=account_id,
performer_id=performer_id, ip=ip, metadata_json=metadata_json,
repository_id=repository_id, datetime=timestamp)
try:
self._logs_producer.send(log)
except LogSendException as lse:
strict_logging_disabled = config.app_config.get('ALLOW_PULLS_WITHOUT_STRICT_LOGGING')
logger.exception('log_action failed', extra=dict(log.to_dict(), exception=lse))
if not (strict_logging_disabled and kind_name in ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING):
raise
def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
namespace_id=None, max_query_time=None):
max_query_time = max_query_time.total_seconds() if max_query_time is not None else 300
search = self._base_query_date_range(start_datetime, end_datetime, None, repository_id,
namespace_id, None)
def raise_on_timeout(batch_generator):
start = time()
for batch in batch_generator:
elapsed = time() - start
if elapsed > max_query_time:
logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`', namespace_id,
repository_id, elapsed)
raise LogsIterationTimeout()
yield batch
start = time()
def read_batch(scroll):
batch = []
for log in scroll:
batch.append(log)
if len(batch) == DEFAULT_RESULT_WINDOW:
yield _for_elasticsearch_logs(batch, repository_id=repository_id,
namespace_id=namespace_id)
batch = []
if batch:
yield _for_elasticsearch_logs(batch, repository_id=repository_id, namespace_id=namespace_id)
search = search.params(size=DEFAULT_RESULT_WINDOW, request_timeout=max_query_time)
try:
with CloseForLongOperation(config.app_config):
for batch in raise_on_timeout(read_batch(search.scan())):
yield batch
except ConnectionTimeout:
raise LogsIterationTimeout()
def can_delete_index(self, index, cutoff_date):
return self._es_client.can_delete_index(index, cutoff_date)
def list_indices(self):
return self._es_client.list_indices()
def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
""" Yield a context manager for a group of outdated logs. """
all_indices = self.list_indices()
for index in all_indices:
if not self.can_delete_index(index, cutoff_date):
continue
context = ElasticsearchLogRotationContext(index, min_logs_per_rotation, self._es_client)
yield context
class ElasticsearchLogRotationContext(LogRotationContextInterface):
"""
ElasticsearchLogRotationContext yields batches of logs from an index.
When completed without exceptions, this context will delete its associated
Elasticsearch index.
"""
def __init__(self, index, min_logs_per_rotation, es_client):
self._es_client = es_client
self.min_logs_per_rotation = min_logs_per_rotation
self.index = index
self.start_pos = 0
self.end_pos = 0
self.scroll = None
def __enter__(self):
search = self._base_query()
self.scroll = search.scan()
return self
def __exit__(self, ex_type, ex_value, ex_traceback):
if ex_type is None and ex_value is None and ex_traceback is None:
logger.debug('Deleting index %s', self.index)
self._es_client.delete_index(self.index)
def yield_logs_batch(self):
def batched_logs(gen, size):
batch = []
for log in gen:
batch.append(log)
if len(batch) == size:
yield batch
batch = []
if batch:
yield batch
for batch in batched_logs(self.scroll, self.min_logs_per_rotation):
self.end_pos = self.start_pos + len(batch) - 1
yield batch, self._generate_filename()
self.start_pos = self.end_pos + 1
def _base_query(self):
search = LogEntry.search(index=self.index)
return search
def _generate_filename(self):
""" Generate the filenames used to archive the action logs. """
filename = '%s_%d-%d' % (self.index, self.start_pos, self.end_pos)
filename = '.'.join((filename, 'txt.gz'))
return filename


@@ -0,0 +1,255 @@
import os
import logging
import re
from datetime import datetime, timedelta
from requests_aws4auth import AWS4Auth
from elasticsearch import RequestsHttpConnection
from elasticsearch.exceptions import NotFoundError, AuthorizationException
from elasticsearch_dsl import Index, Document, Integer, Date, Text, Ip, Keyword
from elasticsearch_dsl.connections import connections
logger = logging.getLogger(__name__)
# Name of the connection used for Elasticsearch's template API
ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS = 'logentry_template'
# Prefix of autogenerated indices
INDEX_NAME_PREFIX = 'logentry_'
# Time-based index date format
INDEX_DATE_FORMAT = '%Y-%m-%d'
# Timeout for default connection
ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT = 15
# Timeout for template api Connection
ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT = 60
# Force an index template update
ELASTICSEARCH_FORCE_INDEX_TEMPLATE_UPDATE = os.environ.get('FORCE_INDEX_TEMPLATE_UPDATE', '')
# Valid index prefix pattern
VALID_INDEX_PATTERN = r'^((?!\.$|\.\.$|[-_+])([^A-Z:\/*?\"<>|,# ]){1,255})$'
class LogEntry(Document):
# random_id is the tie-breaker for sorting in pagination.
# random_id is also used for deduplication of records when using an "at-least-once" delivery stream.
# Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-after.html
#
# We don't use the _id of a document since `doc_values` are not built for this field:
# An on-disk data structure that stores the same data in a columnar format
# for optimized sorting and aggregations.
# Reference: https://github.com/elastic/elasticsearch/issues/35369
random_id = Text(fields={'keyword': Keyword()})
kind_id = Integer()
account_id = Integer()
performer_id = Integer()
repository_id = Integer()
ip = Ip()
metadata_json = Text()
datetime = Date()
_initialized = False
@classmethod
def init(cls, index_prefix, index_settings=None, skip_template_init=False):
"""
Create the index template, and populate LogEntry's mapping and index settings.
"""
wildcard_index = Index(name=index_prefix + '*')
wildcard_index.settings(**(index_settings or {}))
wildcard_index.document(cls)
cls._index = wildcard_index
cls._index_prefix = index_prefix
if not skip_template_init:
cls.create_or_update_template()
# Since the elasticsearch-dsl API requires the document's index to be defined as an inner class at the class level,
# this function needs to be called before `save` can be used.
cls._initialized = True
@classmethod
def create_or_update_template(cls):
assert cls._index and cls._index_prefix
index_template = cls._index.as_template(cls._index_prefix)
index_template.save(using=ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS)
def save(self, **kwargs):
# We group the logs based on year, month and day as different indexes, so that
# dropping those indexes based on retention range is easy.
#
# NOTE: This is only used if logging directly to Elasticsearch
# When using Kinesis or Kafka, the consumer of these streams
# will be responsible for the management of the indices' lifecycle.
assert LogEntry._initialized
kwargs['index'] = self.datetime.strftime(self._index_prefix + INDEX_DATE_FORMAT)
return super(LogEntry, self).save(**kwargs)
class ElasticsearchLogs(object):
"""
Model for logs operations stored in an Elasticsearch cluster.
"""
def __init__(self, host=None, port=None, access_key=None, secret_key=None, aws_region=None,
index_settings=None, use_ssl=True, index_prefix=INDEX_NAME_PREFIX):
# For options in index_settings, refer to:
# https://www.elastic.co/guide/en/elasticsearch/guide/master/_index_settings.html
# some index settings are set at index creation time, and therefore, you should NOT
# change those settings once the index is set.
self._host = host
self._port = port
self._access_key = access_key
self._secret_key = secret_key
self._aws_region = aws_region
self._index_prefix = index_prefix
self._index_settings = index_settings
self._use_ssl = use_ssl
self._client = None
self._initialized = False
def _initialize(self):
"""
Initialize a connection to an ES cluster and
create an index template if it does not exist.
"""
if not self._initialized:
http_auth = None
if self._access_key and self._secret_key and self._aws_region:
http_auth = AWS4Auth(self._access_key, self._secret_key, self._aws_region, 'es')
elif self._access_key and self._secret_key:
http_auth = (self._access_key, self._secret_key)
else:
logger.warn("Connecting to Elasticsearch without HTTP auth")
self._client = connections.create_connection(
hosts=[{
'host': self._host,
'port': self._port
}],
http_auth=http_auth,
use_ssl=self._use_ssl,
verify_certs=True,
connection_class=RequestsHttpConnection,
timeout=ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT,
)
# Create a second connection with a timeout of 60s vs the default 15s.
# For some reason the PUT template API can take anywhere between
# 10s and 30s on the test cluster.
# This only needs to be done once to initialize the index template
connections.create_connection(
alias=ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS,
hosts=[{
'host': self._host,
'port': self._port
}],
http_auth=http_auth,
use_ssl=self._use_ssl,
verify_certs=True,
connection_class=RequestsHttpConnection,
timeout=ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT,
)
try:
force_template_update = ELASTICSEARCH_FORCE_INDEX_TEMPLATE_UPDATE.lower() == 'true'
self._client.indices.get_template(self._index_prefix)
LogEntry.init(self._index_prefix, self._index_settings,
skip_template_init=not force_template_update)
except NotFoundError:
LogEntry.init(self._index_prefix, self._index_settings, skip_template_init=False)
finally:
try:
connections.remove_connection(ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS)
except KeyError as ke:
logger.exception('Elasticsearch connection not found to remove %s: %s',
ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS, ke)
self._initialized = True
def index_name(self, day):
""" Return an index name for the given day. """
return self._index_prefix + day.strftime(INDEX_DATE_FORMAT)
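# Example with the default 'logentry_' prefix:
#   index_name(date(2019, 11, 12))  ->  'logentry_2019-11-12'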
def index_exists(self, index):
try:
return index in self._client.indices.get(index)
except NotFoundError:
return False
@staticmethod
def _valid_index_prefix(prefix):
""" Check that the given index prefix is valid with the set of
indices used by this class.
"""
return re.match(VALID_INDEX_PATTERN, prefix) is not None
def _valid_index_name(self, index):
""" Check that the given index name is valid and follows the format:
<index_prefix>YYYY-MM-DD
"""
if not ElasticsearchLogs._valid_index_prefix(index):
return False
if not index.startswith(self._index_prefix) or len(index) > 255:
return False
index_dt_str = index.split(self._index_prefix, 1)[-1]
try:
datetime.strptime(index_dt_str, INDEX_DATE_FORMAT)
return True
except ValueError:
logger.exception('Invalid date format (YYYY-MM-DD) for index: %s', index)
return False
def can_delete_index(self, index, cutoff_date):
""" Check if the given index can be deleted based on the given index's date and cutoff date. """
assert self._valid_index_name(index)
index_dt = datetime.strptime(index[len(self._index_prefix):], INDEX_DATE_FORMAT)
return index_dt < cutoff_date and cutoff_date - index_dt >= timedelta(days=1)
def list_indices(self):
self._initialize()
try:
return self._client.indices.get(self._index_prefix + '*').keys()
except NotFoundError as nfe:
logger.exception('`%s` indices not found: %s', self._index_prefix, nfe.info)
return []
except AuthorizationException as ae:
logger.exception('Unauthorized for indices `%s`: %s', self._index_prefix, ae.info)
return None
def delete_index(self, index):
self._initialize()
assert self._valid_index_name(index)
try:
self._client.indices.delete(index)
return index
except NotFoundError as nfe:
logger.exception('`%s` indices not found: %s', index, nfe.info)
return None
except AuthorizationException as ae:
logger.exception('Unauthorized to delete index `%s`: %s', index, ae.info)
return None
def configure_es(host, port, access_key=None, secret_key=None, aws_region=None,
index_prefix=None, use_ssl=True, index_settings=None):
"""
For options in index_settings, refer to:
https://www.elastic.co/guide/en/elasticsearch/guide/master/_index_settings.html
some index settings are set at index creation time, and therefore, you should NOT
change those settings once the index is set.
"""
es_client = ElasticsearchLogs(host=host, port=port, access_key=access_key, secret_key=secret_key,
aws_region=aws_region, index_prefix=index_prefix or INDEX_NAME_PREFIX,
use_ssl=use_ssl, index_settings=index_settings)
es_client._initialize()
return es_client
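# A minimal usage sketch (hypothetical endpoint): build a client against a TLS-enabled
# cluster with the default 'logentry_' index prefix, then resolve a daily index name:
#
#   es_client = configure_es('es.example.com', 9200)
#   es_client.index_name(date(2019, 11, 12))  # -> 'logentry_2019-11-12'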


@@ -0,0 +1,244 @@
import logging
import json
from collections import namedtuple
from datetime import datetime
from tzlocal import get_localzone
from dateutil.relativedelta import relativedelta
from data import model
from data.logs_model.datatypes import AggregatedLogCount, LogEntriesPage, Log
from data.logs_model.interface import (ActionLogsDataInterface, LogRotationContextInterface,
LogsIterationTimeout)
logger = logging.getLogger(__name__)
LogAndRepository = namedtuple('LogAndRepository', ['log', 'stored_log', 'repository'])
StoredLog = namedtuple('StoredLog', ['kind_id',
'account_id',
'performer_id',
'ip',
'metadata_json',
'repository_id',
'datetime'])
class InMemoryModel(ActionLogsDataInterface):
"""
InMemoryModel implements the data model for logs in-memory. FOR TESTING ONLY.
"""
def __init__(self):
self.logs = []
def _filter_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
namespace_name=None, filter_kinds=None):
if filter_kinds is not None:
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
for log_and_repo in self.logs:
if log_and_repo.log.datetime < start_datetime or log_and_repo.log.datetime > end_datetime:
continue
if performer_name and log_and_repo.log.performer_username != performer_name:
continue
if (repository_name and
(not log_and_repo.repository or log_and_repo.repository.name != repository_name)):
continue
if namespace_name and log_and_repo.log.account_username != namespace_name:
continue
if filter_kinds:
kind_map = model.log.get_log_entry_kinds()
ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds]
if log_and_repo.log.kind_id in ignore_ids:
continue
yield log_and_repo
def _filter_latest_logs(self, performer_name=None, repository_name=None,
namespace_name=None, filter_kinds=None):
if filter_kinds is not None:
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
for log_and_repo in sorted(self.logs, key=lambda t: t.log.datetime, reverse=True):
if performer_name and log_and_repo.log.performer_username != performer_name:
continue
if (repository_name and
(not log_and_repo.repository or log_and_repo.repository.name != repository_name)):
continue
if namespace_name and log_and_repo.log.account_username != namespace_name:
continue
if filter_kinds:
kind_map = model.log.get_log_entry_kinds()
ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds]
if log_and_repo.log.kind_id in ignore_ids:
continue
yield log_and_repo
def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
logs = []
for log_and_repo in self._filter_logs(start_datetime, end_datetime, performer_name,
repository_name, namespace_name, filter_kinds):
logs.append(log_and_repo.log)
return LogEntriesPage(logs, None)
def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
filter_kinds=None, size=20):
latest_logs = []
for log_and_repo in self._filter_latest_logs(performer_name, repository_name, namespace_name,
filter_kinds):
if size is not None and len(latest_logs) == size:
break
latest_logs.append(log_and_repo.log)
return latest_logs
def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
repository_name=None, namespace_name=None, filter_kinds=None):
entries = {}
for log_and_repo in self._filter_logs(start_datetime, end_datetime, performer_name,
repository_name, namespace_name, filter_kinds):
entry = log_and_repo.log
synthetic_date = datetime(start_datetime.year, start_datetime.month, int(entry.datetime.day),
tzinfo=get_localzone())
if synthetic_date.day < start_datetime.day:
synthetic_date = synthetic_date + relativedelta(months=1)
key = '%s-%s' % (entry.kind_id, entry.datetime.day)
if key in entries:
entries[key] = AggregatedLogCount(entry.kind_id, entries[key].count + 1,
synthetic_date)
else:
entries[key] = AggregatedLogCount(entry.kind_id, 1, synthetic_date)
return entries.values()
def count_repository_actions(self, repository, day):
count = 0
for log_and_repo in self.logs:
if log_and_repo.repository != repository:
continue
if log_and_repo.log.datetime.day != day.day:
continue
count += 1
return count
def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
namespace_name=None, repository_name=None, callback_url=None,
callback_email=None, filter_kinds=None):
raise NotImplementedError
def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
timestamp = timestamp or datetime.today()
if not repository and repository_name and namespace_name:
repository = model.repository.get_repository(namespace_name, repository_name)
account = None
account_id = None
performer_id = None
repository_id = None
if namespace_name is not None:
account = model.user.get_namespace_user(namespace_name)
account_id = account.id
if performer is not None:
performer_id = performer.id
if repository is not None:
repository_id = repository.id
metadata_json = json.dumps(metadata or {})
kind_id = model.log.get_log_entry_kinds()[kind_name]
stored_log = StoredLog(
kind_id,
account_id,
performer_id,
ip,
metadata_json,
repository_id,
timestamp
)
log = Log(metadata_json=metadata,
ip=ip,
datetime=timestamp,
performer_email=performer.email if performer else None,
performer_username=performer.username if performer else None,
performer_robot=performer.robot if performer else None,
account_organization=account.organization if account else None,
account_username=account.username if account else None,
account_email=account.email if account else None,
account_robot=account.robot if account else None,
kind_id=kind_id)
self.logs.append(LogAndRepository(log, stored_log, repository))
def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
namespace_id=None, max_query_time=None):
# Just for testing.
if max_query_time is not None:
raise LogsIterationTimeout()
logs = []
for log_and_repo in self._filter_logs(start_datetime, end_datetime):
if (repository_id and
(not log_and_repo.repository or log_and_repo.repository.id != repository_id)):
continue
if namespace_id:
if log_and_repo.log.account_username is None:
continue
namespace = model.user.get_namespace_user(log_and_repo.log.account_username)
if namespace.id != namespace_id:
continue
logs.append(log_and_repo.log)
yield logs
def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
expired_logs = [log_and_repo for log_and_repo in self.logs
if log_and_repo.log.datetime <= cutoff_date]
while True:
if not expired_logs:
break
context = InMemoryLogRotationContext(expired_logs[:min_logs_per_rotation], self.logs)
expired_logs = expired_logs[min_logs_per_rotation:]
yield context
class InMemoryLogRotationContext(LogRotationContextInterface):
def __init__(self, expired_logs, all_logs):
self.expired_logs = expired_logs
self.all_logs = all_logs
def __enter__(self):
return self
def __exit__(self, ex_type, ex_value, ex_traceback):
if ex_type is None and ex_value is None and ex_traceback is None:
for log in self.expired_logs:
self.all_logs.remove(log)
def yield_logs_batch(self):
""" Yield a batch of logs and a filename for that batch. """
filename = 'inmemory_model_filename_placeholder'
filename = '.'.join((filename, 'txt.gz'))
yield [log_and_repo.stored_log for log_and_repo in self.expired_logs], filename


@@ -0,0 +1,95 @@
from abc import ABCMeta, abstractmethod
from six import add_metaclass
class LogsIterationTimeout(Exception):
""" Exception raised if logs iteration times out. """
@add_metaclass(ABCMeta)
class ActionLogsDataInterface(object):
""" Interface for code to work with the logs data model. The logs data model consists
of all access for reading and writing action logs.
"""
@abstractmethod
def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
""" Looks up all logs between the start_datetime and end_datetime, filtered
by performer (a user), repository or namespace. Note that one (and only one) of the three
can be specified. Returns a LogEntriesPage. `filter_kinds`, if specified, is a set/list
of the kinds of logs to filter out.
"""
@abstractmethod
def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
filter_kinds=None, size=20):
""" Looks up latest logs of a specific kind, filtered by performer (a user),
repository or namespace. Note that one (and only one) of the three can be specified.
Returns a list of `Log`.
"""
@abstractmethod
def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
repository_name=None, namespace_name=None, filter_kinds=None):
""" Returns the aggregated count of logs, by kind, between the start_datetime and end_datetime,
filtered by performer (a user), repository or namespace. Note that one (and only one) of
the three can be specified. Returns a list of AggregatedLogCount.
"""
@abstractmethod
def count_repository_actions(self, repository, day):
""" Returns the total number of repository actions over the given day, in the given repository
or None on error.
"""
@abstractmethod
def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
namespace_name=None, repository_name=None, callback_url=None,
callback_email=None, filter_kinds=None):
""" Queues logs between the start_datetime and end_time, filtered by a repository or namespace,
for export to the specified URL and/or email address. Returns the ID of the export job
queued or None if error.
"""
@abstractmethod
def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
""" Logs a single action as having taken place. """
@abstractmethod
def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
namespace_id=None, max_query_time=None):
""" Returns an iterator that yields bundles of all logs found between the start_datetime and
end_datetime, optionally filtered by the repository or namespace. This function should be
used for any bulk lookup operations, and implementations should put
minimal strain on the backing storage for large operations. If there was an error in setting
up, returns None.
If max_query_time is specified, each iteration that yields a log bundle will have its
queries run with a maximum timeout of that specified, and, if any exceed that threshold,
LogsIterationTimeout will be raised instead of returning the logs bundle.
"""
@abstractmethod
def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
"""
A generator that yields contexts implementing the LogRotationContextInterface.
Each context represents a set of logs to be archived and deleted once
the context completes without exceptions.
For database logs, the LogRotationContext abstracts over a set of rows. When the context
finishes, its associated rows get deleted.
For Elasticsearch logs, the LogRotationContext abstracts over indices. When the context
finishes, its associated index gets deleted.
"""
@add_metaclass(ABCMeta)
class LogRotationContextInterface(object):
""" Interface for iterating over a set of logs to be archived. """
@abstractmethod
def yield_logs_batch(self):
"""
Generator yielding batch of logs and a filename for that batch.
A batch is a subset of the logs part of the context.
"""


@@ -0,0 +1,27 @@
import logging
logger = logging.getLogger(__name__)
class LogSendException(Exception):
""" A generic error when sending the logs to its destination.
e.g. Kinesis, Kafka, Elasticsearch, ...
"""
pass
class LogProducerProxy(object):
def __init__(self):
self._model = None
def initialize(self, model):
self._model = model
logger.info('===============================')
logger.info('Using producer `%s`', self._model)
logger.info('===============================')
def __getattr__(self, attr):
if not self._model:
raise AttributeError("LogsModelProxy is not initialized")
return getattr(self._model, attr)


@@ -0,0 +1,25 @@
import logging
from elasticsearch.exceptions import ElasticsearchException
from data.logs_model.logs_producer.interface import LogProducerInterface
from data.logs_model.logs_producer import LogSendException
logger = logging.getLogger(__name__)
class ElasticsearchLogsProducer(LogProducerInterface):
""" Log producer writing log entries to Elasticsearch.
This implementation writes directly to Elasticsearch without a streaming/queueing service.
"""
def send(self, logentry):
try:
logentry.save()
except ElasticsearchException as ex:
logger.exception('ElasticsearchLogsProducer error sending log to Elasticsearch: %s', ex)
raise LogSendException('ElasticsearchLogsProducer error sending log to Elasticsearch: %s' % ex)
except Exception as e:
logger.exception('ElasticsearchLogsProducer exception sending log to Elasticsearch: %s', e)
raise LogSendException('ElasticsearchLogsProducer exception sending log to Elasticsearch: %s' % e)


@@ -0,0 +1,8 @@
from abc import ABCMeta, abstractmethod
from six import add_metaclass
@add_metaclass(ABCMeta)
class LogProducerInterface(object):
@abstractmethod
def send(self, logentry):
""" Send a log entry to the configured log infrastructure. """


@@ -0,0 +1,45 @@
import logging
from kafka.errors import KafkaError, KafkaTimeoutError
from kafka import KafkaProducer
from data.logs_model.shared import epoch_ms
from data.logs_model.logs_producer.interface import LogProducerInterface
from data.logs_model.logs_producer.util import logs_json_serializer
from data.logs_model.logs_producer import LogSendException
logger = logging.getLogger(__name__)
DEFAULT_MAX_BLOCK_SECONDS = 5
class KafkaLogsProducer(LogProducerInterface):
""" Log producer writing log entries to a Kafka stream. """
def __init__(self, bootstrap_servers=None, topic=None, client_id=None, max_block_seconds=None):
self.bootstrap_servers = bootstrap_servers
self.topic = topic
self.client_id = client_id
self.max_block_ms = (max_block_seconds or DEFAULT_MAX_BLOCK_SECONDS) * 1000
self._producer = KafkaProducer(bootstrap_servers=self.bootstrap_servers,
client_id=self.client_id,
max_block_ms=self.max_block_ms,
value_serializer=logs_json_serializer)
def send(self, logentry):
try:
# send() has a (max_block_ms) timeout and get() has a (max_block_ms) timeout
# for an upper bound of 2x(max_block_ms) before guaranteed delivery
future = self._producer.send(self.topic, logentry.to_dict(), timestamp_ms=epoch_ms(logentry.datetime))
record_metadata = future.get(timeout=self.max_block_ms)
assert future.succeeded()
except KafkaTimeoutError as kte:
logger.exception('KafkaLogsProducer timeout sending log to Kafka: %s', kte)
raise LogSendException('KafkaLogsProducer timeout sending log to Kafka: %s' % kte)
except KafkaError as ke:
logger.exception('KafkaLogsProducer error sending log to Kafka: %s', ke)
raise LogSendException('KafkaLogsProducer error sending log to Kafka: %s' % ke)
except Exception as e:
logger.exception('KafkaLogsProducer exception sending log to Kafka: %s', e)
raise LogSendException('KafkaLogsProducer exception sending log to Kafka: %s' % e)
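# A minimal usage sketch (hypothetical brokers and topic): each send blocks for at most
# max_block_ms on the produce call and again on future.get(), as noted above:
#
#   producer = KafkaLogsProducer(bootstrap_servers=['kafka-1:9092', 'kafka-2:9092'],
#                                topic='logentry', client_id='quay')
#   producer.send(log_entry)  # `log_entry` is a LogEntry document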


@@ -0,0 +1,75 @@
import logging
import hashlib
import random
import boto3
from botocore.exceptions import ClientError
from botocore.client import Config
from data.logs_model.logs_producer.interface import LogProducerInterface
from data.logs_model.logs_producer.util import logs_json_serializer
from data.logs_model.logs_producer import LogSendException
logger = logging.getLogger(__name__)
KINESIS_PARTITION_KEY_PREFIX = 'logentry_partition_key_'
DEFAULT_CONNECT_TIMEOUT = 5
DEFAULT_READ_TIMEOUT = 5
MAX_RETRY_ATTEMPTS = 5
DEFAULT_MAX_POOL_CONNECTIONS = 10
def _partition_key(number_of_shards=None):
""" Generate a partition key for AWS Kinesis stream.
If the number of shards is specified, generate keys where the size of the key space is
the number of shards.
"""
key = None
if number_of_shards is not None:
shard_number = random.randrange(0, number_of_shards)
key = hashlib.sha1(KINESIS_PARTITION_KEY_PREFIX + str(shard_number)).hexdigest()
else:
key = hashlib.sha1(KINESIS_PARTITION_KEY_PREFIX + str(random.getrandbits(256))).hexdigest()
return key
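# Example: with number_of_shards=3 the partition key is one of exactly three SHA-1
# digests, pinning records to a bounded shard set; with no shard count given, a random
# 256-bit value makes the key effectively unique per record:
#   _partition_key(number_of_shards=3)  # one of 3 stable keys
#   _partition_key()                    # effectively unique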
class KinesisStreamLogsProducer(LogProducerInterface):
""" Log producer writing log entries to an Amazon Kinesis Data Stream. """
def __init__(self, stream_name, aws_region, aws_access_key=None, aws_secret_key=None,
connect_timeout=None, read_timeout=None, max_retries=None,
max_pool_connections=None):
self._stream_name = stream_name
self._aws_region = aws_region
self._aws_access_key = aws_access_key
self._aws_secret_key = aws_secret_key
self._connect_timeout = connect_timeout or DEFAULT_CONNECT_TIMEOUT
self._read_timeout = read_timeout or DEFAULT_READ_TIMEOUT
self._max_retries = max_retries or MAX_RETRY_ATTEMPTS
self._max_pool_connections = max_pool_connections or DEFAULT_MAX_POOL_CONNECTIONS
client_config = Config(connect_timeout=self._connect_timeout,
read_timeout=self._read_timeout,
retries={'max_attempts': self._max_retries},
max_pool_connections=self._max_pool_connections)
self._producer = boto3.client('kinesis', use_ssl=True,
region_name=self._aws_region,
aws_access_key_id=self._aws_access_key,
aws_secret_access_key=self._aws_secret_key,
config=client_config)
def send(self, logentry):
try:
data = logs_json_serializer(logentry)
self._producer.put_record(
StreamName=self._stream_name,
Data=data,
PartitionKey=_partition_key()
)
except ClientError as ce:
logger.exception('KinesisStreamLogsProducer client error sending log to Kinesis: %s', ce)
raise LogSendException('KinesisStreamLogsProducer client error sending log to Kinesis: %s' % ce)
except Exception as e:
logger.exception('KinesisStreamLogsProducer exception sending log to Kinesis: %s', e)
raise LogSendException('KinesisStreamLogsProducer exception sending log to Kinesis: %s' % e)


@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
import logging
import json
from datetime import datetime
import pytest
from data.logs_model.logs_producer.util import logs_json_serializer
from data.logs_model.elastic_logs import LogEntry
logger = logging.getLogger(__name__)
TEST_DATETIME = datetime.utcnow()
TEST_JSON_STRING = '{"a": "b", "c": "d"}'
TEST_JSON_STRING_WITH_UNICODE = u'{"éëê": "îôû"}'
VALID_LOGENTRY = LogEntry(random_id='123-45', ip='0.0.0.0', metadata_json=TEST_JSON_STRING, datetime=TEST_DATETIME)
VALID_LOGENTRY_WITH_UNICODE = LogEntry(random_id='123-45', ip='0.0.0.0', metadata_json=TEST_JSON_STRING_WITH_UNICODE, datetime=TEST_DATETIME)
VALID_LOGENTRY_EXPECTED_OUTPUT = '{"datetime": "%s", "ip": "0.0.0.0", "metadata_json": "{\\"a\\": \\"b\\", \\"c\\": \\"d\\"}", "random_id": "123-45"}' % TEST_DATETIME.isoformat()
VALID_LOGENTRY_WITH_UNICODE_EXPECTED_OUTPUT = '{"datetime": "%s", "ip": "0.0.0.0", "metadata_json": "{\\"\\u00e9\\u00eb\\u00ea\\": \\"\\u00ee\\u00f4\\u00fb\\"}", "random_id": "123-45"}' % TEST_DATETIME.isoformat()
@pytest.mark.parametrize(
'is_valid, given_input, expected_output',
[
# Valid inputs
pytest.param(True, VALID_LOGENTRY, VALID_LOGENTRY_EXPECTED_OUTPUT),
# With unicode
pytest.param(True, VALID_LOGENTRY_WITH_UNICODE, VALID_LOGENTRY_WITH_UNICODE_EXPECTED_OUTPUT),
])
def test_logs_json_serializer(is_valid, given_input, expected_output):
if not is_valid:
with pytest.raises(ValueError) as ve:
data = logs_json_serializer(given_input)
else:
data = logs_json_serializer(given_input, sort_keys=True)
assert data == expected_output
    # Make sure the datetime was serialized in the correct ISO8601 format.
datetime_str = json.loads(data)['datetime']
assert datetime_str == TEST_DATETIME.isoformat()

View file

@@ -0,0 +1,15 @@
import json
from datetime import datetime
class LogEntryJSONEncoder(json.JSONEncoder):
""" JSON encoder to encode datetimes to ISO8601 format. """
def default(self, obj):
if isinstance(obj, datetime):
return obj.isoformat()
return super(LogEntryJSONEncoder, self).default(obj)
def logs_json_serializer(logentry, sort_keys=False):
""" Serializes a LogEntry to json bytes. """
return json.dumps(logentry.to_dict(), cls=LogEntryJSONEncoder,
ensure_ascii=True, sort_keys=sort_keys).encode('ascii')
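As a small, self-contained illustration of the encoder above, any datetime in the payload is rendered as an ISO8601 string; the dictionary is purely illustrative, and the import assumes this file is the util module referenced by the serializer tests above.

# Minimal sketch of LogEntryJSONEncoder applied to a plain dict (illustrative values only).
import json
from datetime import datetime

from data.logs_model.logs_producer.util import LogEntryJSONEncoder  # assumed to be this module

payload = {'ip': '0.0.0.0', 'datetime': datetime(2019, 1, 1, 3, 30)}
encoded = json.dumps(payload, cls=LogEntryJSONEncoder, sort_keys=True)
# encoded == '{"datetime": "2019-01-01T03:30:00", "ip": "0.0.0.0"}'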

53
data/logs_model/shared.py Normal file
View file

@@ -0,0 +1,53 @@
import uuid
import json
from calendar import timegm
from data import model
class SharedModel:
def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
namespace_name=None, repository_name=None, callback_url=None,
callback_email=None, filter_kinds=None):
""" Queues logs between the start_datetime and end_time, filtered by a repository or namespace,
for export to the specified URL and/or email address. Returns the ID of the export job
queued or None if error.
"""
export_id = str(uuid.uuid4())
namespace = model.user.get_namespace_user(namespace_name)
if namespace is None:
return None
repository = None
if repository_name is not None:
repository = model.repository.get_repository(namespace_name, repository_name)
if repository is None:
return None
export_action_logs_queue.put([namespace_name],
json.dumps({
'export_id': export_id,
'repository_id': repository.id if repository else None,
'namespace_id': namespace.id,
'namespace_name': namespace.username,
'repository_name': repository.name if repository else None,
'start_time': start_datetime.strftime('%m/%d/%Y'),
'end_time': end_datetime.strftime('%m/%d/%Y'),
'callback_url': callback_url,
'callback_email': callback_email,
}), retries_remaining=3)
return export_id
def epoch_ms(dt):
return (timegm(dt.timetuple()) * 1000) + (dt.microsecond / 1000)
def get_kinds_filter(kinds):
""" Given a list of kinds, return the set of kinds not that are not part of that list.
i.e Returns the list of kinds to be filtered out. """
kind_map = model.log.get_log_entry_kinds()
kind_map = {key: kind_map[key] for key in kind_map if not isinstance(key, int)}
return [kind_name for kind_name in kind_map if kind_name not in kinds]
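A short sketch of the two module-level helpers above. get_kinds_filter reads the log-entry kinds from the database, so it needs an initialized app database; the kind names shown are the ones used elsewhere in this import.

# Sketch of the shared helpers (assumes an initialized database for get_kinds_filter).
from datetime import datetime

from data.logs_model.shared import epoch_ms, get_kinds_filter

# epoch_ms converts a datetime into milliseconds since the Unix epoch.
assert epoch_ms(datetime(1970, 1, 1, 0, 0, 1)) == 1000

# get_kinds_filter inverts the given list: it returns every known kind name that is NOT
# in the list, i.e. the kinds to pass as `filter_kinds`/`ignore` when querying logs.
ignored_kinds = get_kinds_filter(['push_repo', 'pull_repo'])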

View file

@@ -0,0 +1,291 @@
# pylint: disable=protected-access
import logging
from datetime import datetime, timedelta
from tzlocal import get_localzone
from dateutil.relativedelta import relativedelta
from data import model
from data.model import config
from data.database import LogEntry, LogEntry2, LogEntry3, UseThenDisconnect
from data.logs_model.interface import ActionLogsDataInterface, LogsIterationTimeout, \
LogRotationContextInterface
from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage
from data.logs_model.shared import SharedModel
from data.model.log import get_stale_logs, get_stale_logs_start_id, delete_stale_logs
logger = logging.getLogger(__name__)
MINIMUM_RANGE_SIZE = 1 # second
MAXIMUM_RANGE_SIZE = 60 * 60 * 24 * 30 # seconds ~= 1 month
EXPECTED_ITERATION_LOG_COUNT = 1000
LOG_MODELS = [LogEntry3, LogEntry2, LogEntry]
class TableLogsModel(SharedModel, ActionLogsDataInterface):
"""
TableLogsModel implements the data model for the logs API backed by a single table
in the database.
"""
def __init__(self, should_skip_logging=None, **kwargs):
self._should_skip_logging = should_skip_logging
def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
if filter_kinds is not None:
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
assert start_datetime is not None
assert end_datetime is not None
repository = None
if repository_name and namespace_name:
repository = model.repository.get_repository(namespace_name, repository_name)
assert repository
performer = None
if performer_name:
performer = model.user.get_user(performer_name)
assert performer
def get_logs(m, page_token):
logs_query = model.log.get_logs_query(start_datetime, end_datetime, performer=performer,
repository=repository, namespace=namespace_name,
ignore=filter_kinds, model=m)
logs, next_page_token = model.modelutil.paginate(logs_query, m,
descending=True,
page_token=page_token,
limit=20,
max_page=max_page_count,
sort_field_name='datetime')
return logs, next_page_token
TOKEN_TABLE_ID = 'tti'
table_index = 0
logs = []
next_page_token = page_token or None
# Skip empty pages (empty table)
while len(logs) == 0 and table_index < len(LOG_MODELS) - 1:
table_specified = next_page_token is not None and next_page_token.get(TOKEN_TABLE_ID) is not None
if table_specified:
table_index = next_page_token.get(TOKEN_TABLE_ID)
logs_result, next_page_token = get_logs(LOG_MODELS[table_index], next_page_token)
logs.extend(logs_result)
if next_page_token is None and table_index < len(LOG_MODELS) - 1:
next_page_token = {TOKEN_TABLE_ID: table_index + 1}
return LogEntriesPage([Log.for_logentry(log) for log in logs], next_page_token)
def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
filter_kinds=None, size=20):
if filter_kinds is not None:
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
repository = None
if repository_name and namespace_name:
repository = model.repository.get_repository(namespace_name, repository_name)
assert repository
performer = None
if performer_name:
performer = model.user.get_user(performer_name)
assert performer
def get_latest_logs(m):
logs_query = model.log.get_latest_logs_query(performer=performer, repository=repository,
namespace=namespace_name, ignore=filter_kinds,
model=m, size=size)
logs = list(logs_query)
return [Log.for_logentry(log) for log in logs]
return get_latest_logs(LOG_MODELS[0])
def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
repository_name=None, namespace_name=None, filter_kinds=None):
if filter_kinds is not None:
assert all(isinstance(kind_name, str) for kind_name in filter_kinds)
if end_datetime - start_datetime >= timedelta(weeks=4):
raise Exception('Cannot lookup aggregated logs over a period longer than a month')
repository = None
if repository_name and namespace_name:
repository = model.repository.get_repository(namespace_name, repository_name)
performer = None
if performer_name:
performer = model.user.get_user(performer_name)
entries = {}
for log_model in LOG_MODELS:
aggregated = model.log.get_aggregated_logs(start_datetime, end_datetime,
performer=performer,
repository=repository,
namespace=namespace_name,
ignore=filter_kinds,
model=log_model)
for entry in aggregated:
synthetic_date = datetime(start_datetime.year, start_datetime.month, int(entry.day),
tzinfo=get_localzone())
if synthetic_date.day < start_datetime.day:
synthetic_date = synthetic_date + relativedelta(months=1)
key = '%s-%s' % (entry.kind_id, entry.day)
if key in entries:
entries[key] = AggregatedLogCount(entry.kind_id, entry.count + entries[key].count,
synthetic_date)
else:
entries[key] = AggregatedLogCount(entry.kind_id, entry.count, synthetic_date)
return entries.values()
def count_repository_actions(self, repository, day):
return model.repositoryactioncount.count_repository_actions(repository, day)
def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
if self._should_skip_logging and self._should_skip_logging(kind_name, namespace_name,
is_free_namespace):
return
if repository_name is not None:
assert repository is None
assert namespace_name is not None
repository = model.repository.get_repository(namespace_name, repository_name)
model.log.log_action(kind_name, namespace_name, performer=performer, repository=repository,
ip=ip, metadata=metadata or {}, timestamp=timestamp)
def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
namespace_id=None, max_query_time=None):
    # Using an adjusting scale, start downloading log rows in batches, starting at
    # MINIMUM_RANGE_SIZE and doubling until we've reached EXPECTED_ITERATION_LOG_COUNT or
    # the lookup range has reached MAXIMUM_RANGE_SIZE. If at any point this operation takes
    # longer than max_query_time, terminate the batch operation as timed out.
batch_start_time = datetime.utcnow()
current_start_datetime = start_datetime
current_batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE)
while current_start_datetime < end_datetime:
# Verify we haven't been working for too long.
work_elapsed = datetime.utcnow() - batch_start_time
if max_query_time is not None and work_elapsed > max_query_time:
logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`',
namespace_id, repository_id, work_elapsed)
raise LogsIterationTimeout()
current_end_datetime = current_start_datetime + current_batch_size
current_end_datetime = min(current_end_datetime, end_datetime)
# Load the next set of logs.
def load_logs():
logger.debug('Retrieving logs over range %s -> %s with namespace %s and repository %s',
current_start_datetime, current_end_datetime, namespace_id, repository_id)
logs_query = model.log.get_logs_query(namespace=namespace_id,
repository=repository_id,
start_time=current_start_datetime,
end_time=current_end_datetime)
logs = list(logs_query)
for log in logs:
if namespace_id is not None:
assert log.account_id == namespace_id
if repository_id is not None:
assert log.repository_id == repository_id
logs = [Log.for_logentry(log) for log in logs]
return logs
logs, elapsed = _run_and_time(load_logs)
if max_query_time is not None and elapsed > max_query_time:
logger.error('Retrieval of logs for export `%s/%s` with range `%s-%s` timed out at `%s`',
namespace_id, repository_id, current_start_datetime, current_end_datetime,
elapsed)
raise LogsIterationTimeout()
yield logs
# Move forward.
current_start_datetime = current_end_datetime
# Increase the batch size if necessary.
if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
seconds = min(MAXIMUM_RANGE_SIZE, current_batch_size.total_seconds() * 2)
current_batch_size = timedelta(seconds=seconds)
def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
""" Yield a context manager for a group of outdated logs. """
for log_model in LOG_MODELS:
while True:
with UseThenDisconnect(config.app_config):
start_id = get_stale_logs_start_id(log_model)
if start_id is None:
logger.warning('Failed to find start id')
break
logger.debug('Found starting ID %s', start_id)
lookup_end_id = start_id + min_logs_per_rotation
logs = [log for log in get_stale_logs(start_id, lookup_end_id,
log_model, cutoff_date)]
if not logs:
logger.debug('No further logs found')
break
end_id = max([log.id for log in logs])
context = DatabaseLogRotationContext(logs, log_model, start_id, end_id)
yield context
def _run_and_time(fn):
start_time = datetime.utcnow()
result = fn()
return result, datetime.utcnow() - start_time
table_logs_model = TableLogsModel()
class DatabaseLogRotationContext(LogRotationContextInterface):
"""
DatabaseLogRotationContext represents a batch of logs to be archived together.
  i.e. a set of logs to be archived in the same file (based on the number of logs per rotation).
When completed without exceptions, this context will delete the stale logs
from rows `start_id` to `end_id`.
"""
def __init__(self, logs, log_model, start_id, end_id):
self.logs = logs
self.log_model = log_model
self.start_id = start_id
self.end_id = end_id
def __enter__(self):
return self
def __exit__(self, ex_type, ex_value, ex_traceback):
if ex_type is None and ex_value is None and ex_traceback is None:
with UseThenDisconnect(config.app_config):
logger.debug('Deleting logs from IDs %s to %s', self.start_id, self.end_id)
delete_stale_logs(self.start_id, self.end_id, self.log_model)
def yield_logs_batch(self):
""" Yield a batch of logs and a filename for that batch. """
filename = '%d-%d-%s.txt.gz' % (self.start_id, self.end_id,
self.log_model.__name__.lower())
yield self.logs, filename
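To show how the rotation pieces above fit together, here is a condensed, hypothetical sketch of a caller (an archiving worker, for example) driving yield_log_rotation_context; the cutoff, batch size, and archive step are illustrative.

# Hypothetical consumer of TableLogsModel.yield_log_rotation_context (values are illustrative).
from datetime import datetime, timedelta

from data.logs_model.table_logs_model import table_logs_model

cutoff_date = datetime.utcnow() - timedelta(days=30)
for rotation_context in table_logs_model.yield_log_rotation_context(cutoff_date,
                                                                     min_logs_per_rotation=10000):
  # Entering the context yields batches of stale logs plus a suggested archive filename;
  # leaving it without an exception deletes those rows from the backing table.
  with rotation_context as context:
    for logs, filename in context.yield_logs_batch():
      archive_batch(logs, filename)  # placeholder for the real archival step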

View file

View file

@@ -0,0 +1,390 @@
import json
import uuid
import fnmatch
from collections import defaultdict
from contextlib import contextmanager
from datetime import datetime
import dateutil.parser
from httmock import urlmatch, HTTMock
FAKE_ES_HOST = 'fakees'
EMPTY_RESULT = {
'hits': {'hits': [], 'total': 0},
'_shards': {'successful': 1, 'total': 1},
}
def parse_query(query):
if not query:
return {}
return {s.split('=')[0]: s.split('=')[1] for s in query.split("&")}
@contextmanager
def fake_elasticsearch(allow_wildcard=True):
templates = {}
docs = defaultdict(list)
scrolls = {}
id_counter = [1]
def transform(value, field_name):
# TODO: implement this using a real index template if we ever need more than a few
# fields here.
if field_name == 'datetime':
if isinstance(value, int):
return datetime.utcfromtimestamp(value / 1000)
parsed = dateutil.parser.parse(value)
return parsed
return value
@urlmatch(netloc=FAKE_ES_HOST, path=r'/_template/(.+)', method='GET')
def get_template(url, request):
template_name = url[len('/_template/'):]
if template_name in templates:
return {'status_code': 200}
return {'status_code': 404}
@urlmatch(netloc=FAKE_ES_HOST, path=r'/_template/(.+)', method='PUT')
def put_template(url, request):
template_name = url[len('/_template/'):]
templates[template_name] = True
return {'status_code': 201}
@urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)/_doc', method='POST')
def post_doc(url, request):
index_name, _ = url.path[1:].split('/')
item = json.loads(request.body)
item['_id'] = item['random_id']
id_counter[0] += 1
docs[index_name].append(item)
return {
'status_code': 204,
'headers': {
'Content-Type': 'application/json',
},
'content': json.dumps({
"result": "created",
}),
}
@urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)$', method='DELETE')
def index_delete(url, request):
index_name_or_pattern = url.path[1:]
to_delete = []
for index_name in docs.keys():
if not fnmatch.fnmatch(index_name, index_name_or_pattern):
continue
to_delete.append(index_name)
for index in to_delete:
docs.pop(index)
return {
'status_code': 200,
'headers': {
'Content-Type': 'application/json',
},
'content': {'acknowledged': True}
}
@urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)$', method='GET')
def index_lookup(url, request):
index_name_or_pattern = url.path[1:]
found = {}
for index_name in docs.keys():
if not fnmatch.fnmatch(index_name, index_name_or_pattern):
continue
found[index_name] = {}
if not found:
return {
'status_code': 404,
}
return {
'status_code': 200,
'headers': {
'Content-Type': 'application/json',
},
'content': json.dumps(found),
}
def _match_query(index_name_or_pattern, query):
found = []
found_index = False
for index_name in docs.keys():
if not allow_wildcard and index_name_or_pattern.find('*') >= 0:
break
if not fnmatch.fnmatch(index_name, index_name_or_pattern):
continue
found_index = True
def _is_match(doc, current_query):
if current_query is None:
return True
for filter_type, filter_params in current_query.iteritems():
for field_name, filter_props in filter_params.iteritems():
if filter_type == 'range':
lt = transform(filter_props['lt'], field_name)
gte = transform(filter_props['gte'], field_name)
doc_value = transform(doc[field_name], field_name)
if not (doc_value < lt and doc_value >= gte):
return False
elif filter_type == 'term':
doc_value = transform(doc[field_name], field_name)
return doc_value == filter_props
elif filter_type == 'terms':
doc_value = transform(doc[field_name], field_name)
return doc_value in filter_props
elif filter_type == 'bool':
assert not 'should' in filter_params, 'should is unsupported'
must = filter_params.get('must')
must_not = filter_params.get('must_not')
filter_bool = filter_params.get('filter')
if must:
for check in must:
if not _is_match(doc, check):
return False
if must_not:
for check in must_not:
if _is_match(doc, check):
return False
if filter_bool:
for check in filter_bool:
if not _is_match(doc, check):
return False
else:
raise Exception('Unimplemented query %s: %s' % (filter_type, query))
return True
for doc in docs[index_name]:
if not _is_match(doc, query):
continue
found.append({'_source': doc, '_index': index_name})
return found, found_index or (index_name_or_pattern.find('*') >= 0)
@urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)/_count$', method='GET')
def count_docs(url, request):
request = json.loads(request.body)
index_name_or_pattern, _ = url.path[1:].split('/')
found, found_index = _match_query(index_name_or_pattern, request['query'])
if not found_index:
return {
'status_code': 404,
}
return {
'status_code': 200,
'headers': {
'Content-Type': 'application/json',
},
'content': json.dumps({'count': len(found)}),
}
@urlmatch(netloc=FAKE_ES_HOST, path=r'/_search/scroll$', method='GET')
def lookup_scroll(url, request):
request_obj = json.loads(request.body)
scroll_id = request_obj['scroll_id']
if scroll_id in scrolls:
return {
'status_code': 200,
'headers': {
'Content-Type': 'application/json',
},
'content': json.dumps(scrolls[scroll_id]),
}
return {
'status_code': 404,
}
@urlmatch(netloc=FAKE_ES_HOST, path=r'/_search/scroll$', method='DELETE')
def delete_scroll(url, request):
request = json.loads(request.body)
for scroll_id in request['scroll_id']:
scrolls.pop(scroll_id, None)
return {
'status_code': 404,
}
@urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)/_search$', method='GET')
def lookup_docs(url, request):
query_params = parse_query(url.query)
request = json.loads(request.body)
index_name_or_pattern, _ = url.path[1:].split('/')
# Find matching docs.
query = request.get('query')
found, found_index = _match_query(index_name_or_pattern, query)
if not found_index:
return {
'status_code': 404,
}
# Sort.
sort = request.get('sort')
if sort:
if sort == ['_doc'] or sort == '_doc':
found.sort(key=lambda x: x['_source']['_id'])
else:
def get_sort_key(item):
source = item['_source']
key = ''
for sort_config in sort:
for sort_key, direction in sort_config.iteritems():
assert direction == 'desc'
sort_key = sort_key.replace('.keyword', '')
key += str(transform(source[sort_key], sort_key))
key += '|'
return key
found.sort(key=get_sort_key, reverse=True)
# Search after.
search_after = request.get('search_after')
if search_after:
sort_fields = []
for sort_config in sort:
if isinstance(sort_config, unicode):
sort_fields.append(sort_config)
continue
for sort_key, _ in sort_config.iteritems():
sort_key = sort_key.replace('.keyword', '')
sort_fields.append(sort_key)
for index, search_after_value in enumerate(search_after):
field_name = sort_fields[index]
value = transform(search_after_value, field_name)
if field_name == '_doc':
found = [f for f in found if transform(f['_source']['_id'], field_name) > value]
else:
found = [f for f in found if transform(f['_source'][field_name], field_name) < value]
if len(found) < 2:
break
        if field_name == '_doc':
          if found[0]['_source']['_id'] != found[1]['_source']['_id']:
            break
        else:
          if found[0]['_source'][field_name] != found[1]['_source'][field_name]:
            break
# Size.
size = request.get('size')
if size:
found = found[0:size]
# Aggregation.
# {u'query':
# {u'range':
# {u'datetime': {u'lt': u'2019-06-27T15:45:09.768085',
# u'gte': u'2019-06-27T15:35:09.768085'}}},
# u'aggs': {
# u'by_id': {
# u'terms': {u'field': u'kind_id'},
# u'aggs': {
# u'by_date': {u'date_histogram': {u'field': u'datetime', u'interval': u'day'}}}}},
# u'size': 0}
def _by_field(agg_field_params, results):
aggregated_by_field = defaultdict(list)
for agg_means, agg_means_params in agg_field_params.iteritems():
if agg_means == 'terms':
field_name = agg_means_params['field']
for result in results:
value = result['_source'][field_name]
aggregated_by_field[value].append(result)
elif agg_means == 'date_histogram':
field_name = agg_means_params['field']
interval = agg_means_params['interval']
for result in results:
value = transform(result['_source'][field_name], field_name)
aggregated_by_field[getattr(value, interval)].append(result)
elif agg_means == 'aggs':
# Skip. Handled below.
continue
else:
raise Exception('Unsupported aggregation method: %s' % agg_means)
# Invoke the aggregation recursively.
buckets = []
for field_value, field_results in aggregated_by_field.iteritems():
aggregated = _aggregate(agg_field_params, field_results)
if isinstance(aggregated, list):
aggregated = {'doc_count': len(aggregated)}
aggregated['key'] = field_value
buckets.append(aggregated)
return {'buckets': buckets}
def _aggregate(query_config, results):
agg_params = query_config.get(u'aggs')
if not agg_params:
return results
by_field_name = {}
for agg_field_name, agg_field_params in agg_params.iteritems():
by_field_name[agg_field_name] = _by_field(agg_field_params, results)
return by_field_name
final_result = {
'hits': {
'hits': found,
'total': len(found),
},
'_shards': {
'successful': 1,
'total': 1,
},
'aggregations': _aggregate(request, found),
}
if query_params.get('scroll'):
scroll_id = str(uuid.uuid4())
scrolls[scroll_id] = EMPTY_RESULT
final_result['_scroll_id'] = scroll_id
return {
'status_code': 200,
'headers': {
'Content-Type': 'application/json',
},
'content': json.dumps(final_result),
}
@urlmatch(netloc=FAKE_ES_HOST)
def catchall_handler(url, request):
print "Unsupported URL: %s %s" % (request.method, url, )
return {'status_code': 501}
handlers = [get_template, put_template, index_delete, index_lookup, post_doc, count_docs,
lookup_docs, lookup_scroll, delete_scroll, catchall_handler]
with HTTMock(*handlers):
yield
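For orientation, a hypothetical test that points a DocumentLogsModel at the fake above; it assumes the repository's initialized_db fixture and mirrors the wiring used by the fixtures later in this import.

# Hypothetical test wiring DocumentLogsModel to fake_elasticsearch() (sketch only).
from datetime import date

from data import model
from data.logs_model.document_logs_model import DocumentLogsModel
from data.logs_model.test.fake_elasticsearch import FAKE_ES_HOST, fake_elasticsearch

def test_push_is_counted_against_fake_es(initialized_db):
  with fake_elasticsearch():
    logs_model = DocumentLogsModel(producer='elasticsearch',
                                   elasticsearch_config={'host': FAKE_ES_HOST, 'port': 12345})
    logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
                          ip='1.2.3.4')
    simple_repo = model.repository.get_repository('devtable', 'simple')
    assert logs_model.count_repository_actions(simple_repo, date.today()) == 1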

View file

@@ -0,0 +1,400 @@
# -*- coding: utf-8 -*-
import json
from datetime import datetime
from dateutil.parser import parse
from data.logs_model.datatypes import LogEntriesPage, Log, AggregatedLogCount
def _status(d, code=200):
return {"status_code": code, "content": json.dumps(d)}
def _shards(d, total=5, failed=0, successful=5):
d.update({"_shards": {"total": total, "failed": failed, "successful": successful}})
return d
def _hits(hits):
return {"hits": {"total": len(hits), "max_score": None, "hits": hits}}
INDEX_LIST_RESPONSE_HIT1_HIT2 = _status({
"logentry_2018-03-08": {},
"logentry_2018-04-02": {}
})
INDEX_LIST_RESPONSE_HIT2 = _status({
"logentry_2018-04-02": {}
})
INDEX_LIST_RESPONSE = _status({
"logentry_2019-01-01": {},
"logentry_2017-03-08": {},
"logentry_2018-03-08": {},
"logentry_2018-04-02": {}
})
DEFAULT_TEMPLATE_RESPONSE = _status({"acknowledged": True})
INDEX_RESPONSE_2019_01_01 = _status(
_shards({
"_index": "logentry_2019-01-01",
"_type": "_doc",
"_id": "1",
"_version": 1,
"_seq_no": 0,
"_primary_term": 1,
"result": "created"
}))
INDEX_RESPONSE_2017_03_08 = _status(
_shards({
"_index": "logentry_2017-03-08",
"_type": "_doc",
"_id": "1",
"_version": 1,
"_seq_no": 0,
"_primary_term": 1,
"result": "created"
}))
FAILURE_400 = _status({}, 400)
INDEX_REQUEST_2019_01_01 = [
"logentry_2019-01-01", {
"account_id":
1,
"repository_id":
1,
"ip":
"192.168.1.1",
"random_id":
233,
"datetime":
"2019-01-01T03:30:00",
"metadata_json": json.loads("{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}"),
"performer_id":
1,
"kind_id":
1
}
]
INDEX_REQUEST_2017_03_08 = [
"logentry_2017-03-08", {
"repository_id":
1,
"account_id":
1,
"ip":
"192.168.1.1",
"random_id":
233,
"datetime":
"2017-03-08T03:30:00",
"metadata_json": json.loads("{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}"),
"performer_id":
1,
"kind_id":
2
}
]
_hit1 = {
"_index": "logentry_2018-03-08",
"_type": "doc",
"_id": "1",
"_score": None,
"_source": {
"random_id":
233,
"kind_id":
1,
"account_id":
1,
"performer_id":
1,
"repository_id":
1,
"ip":
"192.168.1.1",
"metadata_json":
"{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}",
"datetime":
"2018-03-08T03:30",
},
"sort": [1520479800000, 233]
}
_hit2 = {
"_index": "logentry_2018-04-02",
"_type": "doc",
"_id": "2",
"_score": None,
"_source": {
"random_id":
233,
"kind_id":
2,
"account_id":
1,
"performer_id":
1,
"repository_id":
1,
"ip":
"192.168.1.2",
"metadata_json":
"{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1522639800}",
"datetime":
"2018-04-02T03:30",
},
"sort": [1522639800000, 233]
}
_log1 = Log(
"{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}",
"192.168.1.1", parse("2018-03-08T03:30"), "user1.email", "user1.username", "user1.robot",
"user1.organization", "user1.username", "user1.email", "user1.robot", 1)
_log2 = Log(
"{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1522639800}",
"192.168.1.2", parse("2018-04-02T03:30"), "user1.email", "user1.username", "user1.robot",
"user1.organization", "user1.username", "user1.email", "user1.robot", 2)
SEARCH_RESPONSE_START = _status(_shards(_hits([_hit1, _hit2])))
SEARCH_RESPONSE_END = _status(_shards(_hits([_hit2])))
SEARCH_REQUEST_START = {
"sort": [{
"datetime": "desc"
}, {
"random_id.keyword": "desc"
}],
"query": {
"bool": {
"filter": [{
"term": {
"performer_id": 1
}
}, {
"term": {
"repository_id": 1
}
}]
}
},
"size": 2
}
SEARCH_REQUEST_END = {
"sort": [{
"datetime": "desc"
}, {
"random_id.keyword": "desc"
}],
"query": {
"bool": {
"filter": [{
"term": {
"performer_id": 1
}
}, {
"term": {
"repository_id": 1
}
}]
}
},
"search_after": [1520479800000, 233],
"size": 2
}
SEARCH_REQUEST_FILTER = {
"sort": [{
"datetime": "desc"
}, {
"random_id.keyword": "desc"
}],
"query": {
"bool": {
"filter": [{
"term": {
"performer_id": 1
}
}, {
"term": {
"repository_id": 1
}
}, {
"bool": {
"must_not": [{
"terms": {
"kind_id": [1]
}
}]
}
}]
}
},
"size": 2
}
SEARCH_PAGE_TOKEN = {
"datetime": datetime(2018, 3, 8, 3, 30).isoformat(),
"random_id": 233,
"page_number": 1
}
SEARCH_PAGE_START = LogEntriesPage(logs=[_log1], next_page_token=SEARCH_PAGE_TOKEN)
SEARCH_PAGE_END = LogEntriesPage(logs=[_log2], next_page_token=None)
SEARCH_PAGE_EMPTY = LogEntriesPage([], None)
AGGS_RESPONSE = _status(
_shards({
"hits": {
"total": 4,
"max_score": None,
"hits": []
},
"aggregations": {
"by_id": {
"doc_count_error_upper_bound":
0,
"sum_other_doc_count":
0,
"buckets": [{
"key": 2,
"doc_count": 3,
"by_date": {
"buckets": [{
"key_as_string": "2009-11-12T00:00:00.000Z",
"key": 1257984000000,
"doc_count": 1
}, {
"key_as_string": "2009-11-13T00:00:00.000Z",
"key": 1258070400000,
"doc_count": 0
}, {
"key_as_string": "2009-11-14T00:00:00.000Z",
"key": 1258156800000,
"doc_count": 2
}]
}
}, {
"key": 1,
"doc_count": 1,
"by_date": {
"buckets": [{
"key_as_string": "2009-11-15T00:00:00.000Z",
"key": 1258243200000,
"doc_count": 1
}]
}
}]
}
}
}))
AGGS_REQUEST = {
"query": {
"bool": {
"filter": [{
"term": {
"performer_id": 1
}
}, {
"term": {
"repository_id": 1
}
}, {
"bool": {
"must_not": [{
"terms": {
"kind_id": [2]
}
}]
}
}],
"must": [{
"range": {
"datetime": {
"lt": "2018-04-08T03:30:00",
"gte": "2018-03-08T03:30:00"
}
}
}]
}
},
"aggs": {
"by_id": {
"terms": {
"field": "kind_id"
},
"aggs": {
"by_date": {
"date_histogram": {
"field": "datetime",
"interval": "day"
}
}
}
}
},
"size": 0
}
AGGS_COUNT = [
AggregatedLogCount(1, 1, parse("2009-11-15T00:00:00.000")),
AggregatedLogCount(2, 1, parse("2009-11-12T00:00:00.000")),
AggregatedLogCount(2, 2, parse("2009-11-14T00:00:00.000"))
]
COUNT_REQUEST = {
"query": {
"bool": {
"filter": [{
"term": {
"repository_id": 1
}
}]
}
}
}
COUNT_RESPONSE = _status(_shards({
"count": 1,
}))
# assume there are 2 pages
_scroll_id = "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAACEmFkk1aGlTRzdSUWllejZmYTlEYTN3SVEAAAAAAAAhJRZJNWhpU0c3UlFpZXo2ZmE5RGEzd0lRAAAAAAAAHtAWLWZpaFZXVzVSTy1OTXA5V3MwcHZrZwAAAAAAAB7RFi1maWhWV1c1Uk8tTk1wOVdzMHB2a2cAAAAAAAAhJxZJNWhpU0c3UlFpZXo2ZmE5RGEzd0lR"
def _scroll(d):
d["_scroll_id"] = _scroll_id
return d
SCROLL_CREATE = _status(_shards(_scroll(_hits([_hit1]))))
SCROLL_GET = _status(_shards(_scroll(_hits([_hit2]))))
SCROLL_GET_2 = _status(_shards(_scroll(_hits([]))))
SCROLL_DELETE = _status({"succeeded": True, "num_freed": 5})
SCROLL_LOGS = [[_log1], [_log2]]
SCROLL_REQUESTS = [
[
"5m", 1, {
"sort": "_doc",
"query": {
"range": {
"datetime": {
"lt": "2018-04-02T00:00:00",
"gte": "2018-03-08T00:00:00"
}
}
}
}
],
[{"scroll": "5m", "scroll_id": _scroll_id}],
[{"scroll":"5m", "scroll_id": _scroll_id}],
[{"scroll_id": [_scroll_id]}],
]
SCROLL_RESPONSES = [SCROLL_CREATE, SCROLL_GET, SCROLL_GET_2, SCROLL_DELETE]

View file

@@ -0,0 +1,130 @@
from datetime import date, datetime, timedelta
from freezegun import freeze_time
from data.logs_model.inmemory_model import InMemoryModel
from data.logs_model.combined_model import CombinedLogsModel
from test.fixtures import *
@pytest.fixture()
def first_model():
return InMemoryModel()
@pytest.fixture()
def second_model():
return InMemoryModel()
@pytest.fixture()
def combined_model(first_model, second_model, initialized_db):
return CombinedLogsModel(first_model, second_model)
def test_log_action(first_model, second_model, combined_model, initialized_db):
day = date(2019, 1, 1)
# Write to the combined model.
with freeze_time(day):
combined_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
simple_repo = model.repository.get_repository('devtable', 'simple')
# Make sure it is found in the first model but not the second.
assert combined_model.count_repository_actions(simple_repo, day) == 1
assert first_model.count_repository_actions(simple_repo, day) == 1
assert second_model.count_repository_actions(simple_repo, day) == 0
def test_count_repository_actions(first_model, second_model, combined_model, initialized_db):
# Write to each model.
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
# Ensure the counts match as expected.
day = datetime.today() - timedelta(minutes=60)
simple_repo = model.repository.get_repository('devtable', 'simple')
assert first_model.count_repository_actions(simple_repo, day) == 3
assert second_model.count_repository_actions(simple_repo, day) == 2
assert combined_model.count_repository_actions(simple_repo, day) == 5
def test_yield_logs_for_export(first_model, second_model, combined_model, initialized_db):
now = datetime.now()
# Write to each model.
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
later = datetime.now()
# Ensure the full set of logs is yielded.
first_logs = list(first_model.yield_logs_for_export(now, later))[0]
second_logs = list(second_model.yield_logs_for_export(now, later))[0]
combined = list(combined_model.yield_logs_for_export(now, later))
full_combined = []
for subset in combined:
full_combined.extend(subset)
assert len(full_combined) == len(first_logs) + len(second_logs)
assert full_combined == (first_logs + second_logs)
def test_lookup_logs(first_model, second_model, combined_model, initialized_db):
now = datetime.now()
# Write to each model.
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
first_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
second_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
later = datetime.now()
def _collect_logs(model):
page_token = None
all_logs = []
while True:
paginated_logs = model.lookup_logs(now, later, page_token=page_token)
page_token = paginated_logs.next_page_token
all_logs.extend(paginated_logs.logs)
if page_token is None:
break
return all_logs
first_logs = _collect_logs(first_model)
second_logs = _collect_logs(second_model)
combined = _collect_logs(combined_model)
assert len(combined) == len(first_logs) + len(second_logs)
assert combined == (first_logs + second_logs)

View file

@@ -0,0 +1,529 @@
# -*- coding: utf-8 -*-
# pylint: disable=redefined-outer-name, wildcard-import
import json
from datetime import datetime, timedelta
import pytest
from mock import patch, Mock
from dateutil.parser import parse
from httmock import urlmatch, HTTMock
from data.model.log import _json_serialize
from data.logs_model.elastic_logs import ElasticsearchLogs, INDEX_NAME_PREFIX, INDEX_DATE_FORMAT
from data.logs_model import configure, LogsModelProxy
from mock_elasticsearch import *
FAKE_ES_HOST = 'fakees'
FAKE_ES_HOST_PATTERN = r'fakees.*'
FAKE_ES_PORT = 443
FAKE_AWS_ACCESS_KEY = None
FAKE_AWS_SECRET_KEY = None
FAKE_AWS_REGION = None
@pytest.fixture()
def logs_model_config():
conf = {
'LOGS_MODEL': 'elasticsearch',
'LOGS_MODEL_CONFIG': {
'producer': 'elasticsearch',
'elasticsearch_config': {
'host': FAKE_ES_HOST,
'port': FAKE_ES_PORT,
'access_key': FAKE_AWS_ACCESS_KEY,
'secret_key': FAKE_AWS_SECRET_KEY,
'aws_region': FAKE_AWS_REGION
}
}
}
return conf
FAKE_LOG_ENTRY_KINDS = {'push_repo': 1, 'pull_repo': 2}
FAKE_NAMESPACES = {
'user1':
Mock(id=1, organization="user1.organization", username="user1.username", email="user1.email",
robot="user1.robot"),
'user2':
Mock(id=2, organization="user2.organization", username="user2.username", email="user2.email",
robot="user2.robot")
}
FAKE_REPOSITORIES = {
'user1/repo1': Mock(id=1, namespace_user=FAKE_NAMESPACES['user1']),
'user2/repo2': Mock(id=2, namespace_user=FAKE_NAMESPACES['user2']),
}
@pytest.fixture()
def logs_model():
# prevent logs model from changing
logs_model = LogsModelProxy()
with patch('data.logs_model.logs_model', logs_model):
yield logs_model
@pytest.fixture(scope='function')
def app_config(logs_model_config):
fake_config = {}
fake_config.update(logs_model_config)
with patch("data.logs_model.document_logs_model.config.app_config", fake_config):
yield fake_config
@pytest.fixture()
def mock_page_size():
with patch('data.logs_model.document_logs_model.PAGE_SIZE', 1):
yield
@pytest.fixture()
def mock_max_result_window():
with patch('data.logs_model.document_logs_model.DEFAULT_RESULT_WINDOW', 1):
yield
@pytest.fixture
def mock_random_id():
mock_random = Mock(return_value=233)
with patch('data.logs_model.document_logs_model._random_id', mock_random):
yield
@pytest.fixture()
def mock_db_model():
def get_user_map_by_ids(namespace_ids):
mapping = {}
for i in namespace_ids:
for name in FAKE_NAMESPACES:
if FAKE_NAMESPACES[name].id == i:
mapping[i] = FAKE_NAMESPACES[name]
return mapping
model = Mock(
user=Mock(
get_namespace_user=FAKE_NAMESPACES.get,
get_user_or_org=FAKE_NAMESPACES.get,
get_user=FAKE_NAMESPACES.get,
get_user_map_by_ids=get_user_map_by_ids,
),
repository=Mock(get_repository=lambda user_name, repo_name: FAKE_REPOSITORIES.get(
user_name + '/' + repo_name),
),
log=Mock(
_get_log_entry_kind=lambda name: FAKE_LOG_ENTRY_KINDS[name],
_json_serialize=_json_serialize,
get_log_entry_kinds=Mock(return_value=FAKE_LOG_ENTRY_KINDS),
),
)
with patch('data.logs_model.document_logs_model.model', model), patch(
'data.logs_model.datatypes.model', model):
yield
def parse_query(query):
return {s.split('=')[0]: s.split('=')[1] for s in query.split("&") if s != ""}
@pytest.fixture()
def mock_elasticsearch():
mock = Mock()
mock.template.side_effect = NotImplementedError
mock.index.side_effect = NotImplementedError
mock.count.side_effect = NotImplementedError
mock.scroll_get.side_effect = NotImplementedError
mock.scroll_delete.side_effect = NotImplementedError
mock.search_scroll_create.side_effect = NotImplementedError
mock.search_aggs.side_effect = NotImplementedError
mock.search_after.side_effect = NotImplementedError
mock.list_indices.side_effect = NotImplementedError
@urlmatch(netloc=r'.*', path=r'.*')
def default(url, req):
raise Exception('\nurl={}\nmethod={}\nreq.url={}\nheaders={}\nbody={}'.format(
url, req.method, req.url, req.headers, req.body))
@urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/_template/.*')
def template(url, req):
return mock.template(url.query.split('/')[-1], req.body)
@urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_(\*|[0-9\-]+)')
def list_indices(url, req):
return mock.list_indices()
@urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_[0-9\-]*/_doc')
def index(url, req):
index = url.path.split('/')[1]
body = json.loads(req.body)
body['metadata_json'] = json.loads(body['metadata_json'])
return mock.index(index, body)
@urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_([0-9\-]*|\*)/_count')
def count(_, req):
return mock.count(json.loads(req.body))
@urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/_search/scroll')
def scroll(url, req):
if req.method == 'DELETE':
return mock.scroll_delete(json.loads(req.body))
elif req.method == 'GET':
request_obj = json.loads(req.body)
return mock.scroll_get(request_obj)
raise NotImplementedError()
@urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_(\*|[0-9\-]*)/_search')
def search(url, req):
if "scroll" in url.query:
query = parse_query(url.query)
window_size = query['scroll']
maximum_result_size = int(query['size'])
return mock.search_scroll_create(window_size, maximum_result_size, json.loads(req.body))
elif "aggs" in req.body:
return mock.search_aggs(json.loads(req.body))
else:
return mock.search_after(json.loads(req.body))
with HTTMock(scroll, count, search, index, template, list_indices, default):
yield mock
@pytest.mark.parametrize(
"""
unlogged_pulls_ok, kind_name, namespace_name, repository, repository_name,
timestamp,
index_response, expected_request, throws
""",
[
# Invalid inputs
pytest.param(
False, 'non-existing', None, None, None,
None,
None, None, True,
id="Invalid Kind"
),
pytest.param(
False, 'pull_repo', 'user1', Mock(id=1), 'repo1',
None,
None, None, True,
id="Invalid Parameters"
),
# Remote exceptions
pytest.param(
False, 'pull_repo', 'user1', Mock(id=1), None,
None,
FAILURE_400, None, True,
id="Throw on pull log failure"
),
pytest.param(
True, 'pull_repo', 'user1', Mock(id=1), None,
parse("2017-03-08T03:30"),
FAILURE_400, INDEX_REQUEST_2017_03_08, False,
id="Ok on pull log failure"
),
# Success executions
pytest.param(
False, 'pull_repo', 'user1', Mock(id=1), None,
parse("2017-03-08T03:30"),
INDEX_RESPONSE_2017_03_08, INDEX_REQUEST_2017_03_08, False,
id="Log with namespace name and repository"
),
pytest.param(
False, 'push_repo', 'user1', None, 'repo1',
parse("2019-01-01T03:30"),
INDEX_RESPONSE_2019_01_01, INDEX_REQUEST_2019_01_01, False,
id="Log with namespace name and repository name"
),
])
def test_log_action(unlogged_pulls_ok, kind_name, namespace_name, repository, repository_name,
timestamp,
index_response, expected_request, throws,
app_config, logs_model, mock_elasticsearch, mock_db_model, mock_random_id):
mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
mock_elasticsearch.index = Mock(return_value=index_response)
app_config['ALLOW_PULLS_WITHOUT_STRICT_LOGGING'] = unlogged_pulls_ok
configure(app_config)
performer = Mock(id=1)
ip = "192.168.1.1"
metadata = {'key': 'value', 'time': parse("2018-03-08T03:30"), '😂': '😂👌👌👌👌'}
if throws:
with pytest.raises(Exception):
logs_model.log_action(kind_name, namespace_name, performer, ip, metadata, repository,
repository_name, timestamp)
else:
logs_model.log_action(kind_name, namespace_name, performer, ip, metadata, repository,
repository_name, timestamp)
mock_elasticsearch.index.assert_called_with(*expected_request)
@pytest.mark.parametrize(
"""
start_datetime, end_datetime,
performer_name, repository_name, namespace_name,
filter_kinds,
page_token,
max_page_count,
search_response,
list_indices_response,
expected_request,
expected_page,
throws
""",
[
# 1st page
pytest.param(
parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
'user1', 'repo1', 'user1',
None,
None,
None,
SEARCH_RESPONSE_START,
INDEX_LIST_RESPONSE_HIT1_HIT2,
SEARCH_REQUEST_START,
SEARCH_PAGE_START,
False,
id="1st page"
),
# Last page
pytest.param(
parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
'user1', 'repo1', 'user1',
None,
SEARCH_PAGE_TOKEN,
None,
SEARCH_RESPONSE_END,
INDEX_LIST_RESPONSE_HIT1_HIT2,
SEARCH_REQUEST_END,
SEARCH_PAGE_END,
False,
id="Search using pagination token"
),
# Filter
pytest.param(
parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
'user1', 'repo1', 'user1',
['push_repo'],
None,
None,
SEARCH_RESPONSE_END,
INDEX_LIST_RESPONSE_HIT2,
SEARCH_REQUEST_FILTER,
SEARCH_PAGE_END,
False,
id="Filtered search"
),
# Max page count
pytest.param(
parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
'user1', 'repo1', 'user1',
None,
SEARCH_PAGE_TOKEN,
1,
AssertionError, # Assert that it should not reach the ES server
None,
None,
SEARCH_PAGE_EMPTY,
False,
id="Page token reaches maximum page count",
),
])
def test_lookup_logs(start_datetime, end_datetime,
performer_name, repository_name, namespace_name,
filter_kinds,
page_token,
max_page_count,
search_response,
list_indices_response,
expected_request,
expected_page,
throws,
logs_model, mock_elasticsearch, mock_db_model, mock_page_size, app_config):
mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
mock_elasticsearch.search_after = Mock(return_value=search_response)
mock_elasticsearch.list_indices = Mock(return_value=list_indices_response)
configure(app_config)
if throws:
with pytest.raises(Exception):
logs_model.lookup_logs(start_datetime, end_datetime, performer_name, repository_name,
namespace_name, filter_kinds, page_token, max_page_count)
else:
page = logs_model.lookup_logs(start_datetime, end_datetime, performer_name, repository_name,
namespace_name, filter_kinds, page_token, max_page_count)
assert page == expected_page
if expected_request:
mock_elasticsearch.search_after.assert_called_with(expected_request)
@pytest.mark.parametrize(
"""
start_datetime, end_datetime,
performer_name, repository_name, namespace_name,
filter_kinds, search_response, expected_request, expected_counts, throws
""",
[
# Valid
pytest.param(
parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
'user1', 'repo1', 'user1',
['pull_repo'], AGGS_RESPONSE, AGGS_REQUEST, AGGS_COUNT, False,
id="Valid Counts"
),
# Invalid case: date range too big
pytest.param(
parse('2018-03-08T03:30'), parse('2018-04-09T03:30'),
'user1', 'repo1', 'user1',
[], None, None, None, True,
id="Throw on date range too big"
)
])
def test_get_aggregated_log_counts(start_datetime, end_datetime,
performer_name, repository_name, namespace_name,
filter_kinds, search_response, expected_request, expected_counts, throws,
logs_model, mock_elasticsearch, mock_db_model, app_config):
mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
mock_elasticsearch.search_aggs = Mock(return_value=search_response)
configure(app_config)
if throws:
with pytest.raises(Exception):
logs_model.get_aggregated_log_counts(start_datetime, end_datetime, performer_name,
repository_name, namespace_name, filter_kinds)
else:
counts = logs_model.get_aggregated_log_counts(start_datetime, end_datetime, performer_name,
repository_name, namespace_name, filter_kinds)
assert set(counts) == set(expected_counts)
if expected_request:
mock_elasticsearch.search_aggs.assert_called_with(expected_request)
@pytest.mark.parametrize(
"""
repository,
day,
count_response, expected_request, expected_count, throws
""",
[
pytest.param(
FAKE_REPOSITORIES['user1/repo1'],
parse("2018-03-08").date(),
COUNT_RESPONSE, COUNT_REQUEST, 1, False,
id="Valid Count with 1 as result"),
])
def test_count_repository_actions(repository,
day,
count_response, expected_request, expected_count, throws,
logs_model, mock_elasticsearch, mock_db_model, app_config):
mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
mock_elasticsearch.count = Mock(return_value=count_response)
mock_elasticsearch.list_indices = Mock(return_value=INDEX_LIST_RESPONSE)
configure(app_config)
if throws:
with pytest.raises(Exception):
logs_model.count_repository_actions(repository, day)
else:
count = logs_model.count_repository_actions(repository, day)
assert count == expected_count
if expected_request:
mock_elasticsearch.count.assert_called_with(expected_request)
@pytest.mark.parametrize(
"""
start_datetime, end_datetime,
repository_id, namespace_id,
max_query_time, scroll_responses, expected_requests, expected_logs, throws
""",
[
pytest.param(
parse("2018-03-08"), parse("2018-04-02"),
1, 1,
timedelta(seconds=10), SCROLL_RESPONSES, SCROLL_REQUESTS, SCROLL_LOGS, False,
id="Scroll 3 pages with page size = 1"
),
])
def test_yield_logs_for_export(start_datetime, end_datetime,
repository_id, namespace_id,
max_query_time, scroll_responses, expected_requests, expected_logs, throws,
logs_model, mock_elasticsearch, mock_db_model, mock_max_result_window, app_config):
mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
mock_elasticsearch.search_scroll_create = Mock(return_value=scroll_responses[0])
mock_elasticsearch.scroll_get = Mock(side_effect=scroll_responses[1:-1])
mock_elasticsearch.scroll_delete = Mock(return_value=scroll_responses[-1])
configure(app_config)
if throws:
with pytest.raises(Exception):
logs_model.yield_logs_for_export(start_datetime, end_datetime, max_query_time=max_query_time)
else:
log_generator = logs_model.yield_logs_for_export(start_datetime, end_datetime,
max_query_time=max_query_time)
counter = 0
for logs in log_generator:
if counter == 0:
mock_elasticsearch.search_scroll_create.assert_called_with(*expected_requests[counter])
else:
mock_elasticsearch.scroll_get.assert_called_with(*expected_requests[counter])
assert expected_logs[counter] == logs
counter += 1
    # The last two requests must be:
    # 1. a scroll GET whose response contains 0 hits, which indicates the termination condition
    # 2. the scroll delete request
mock_elasticsearch.scroll_get.assert_called_with(*expected_requests[-2])
mock_elasticsearch.scroll_delete.assert_called_with(*expected_requests[-1])
@pytest.mark.parametrize('prefix, is_valid', [
pytest.param('..', False, id='Invalid `..`'),
pytest.param('.', False, id='Invalid `.`'),
pytest.param('-prefix', False, id='Invalid prefix start -'),
pytest.param('_prefix', False, id='Invalid prefix start _'),
pytest.param('+prefix', False, id='Invalid prefix start +'),
pytest.param('prefix_with_UPPERCASES', False, id='Invalid uppercase'),
pytest.param('valid_index', True, id='Valid prefix'),
pytest.param('valid_index_with_numbers1234', True, id='Valid prefix with numbers'),
pytest.param('a'*256, False, id='Prefix too long')
])
def test_valid_index_prefix(prefix, is_valid):
assert ElasticsearchLogs._valid_index_prefix(prefix) == is_valid
@pytest.mark.parametrize('index, cutoff_date, expected_result', [
pytest.param(
INDEX_NAME_PREFIX+'2019-06-06',
datetime(2019, 6, 8),
True,
id="Index older than cutoff"
),
pytest.param(
INDEX_NAME_PREFIX+'2019-06-06',
datetime(2019, 6, 4),
False,
id="Index younger than cutoff"
),
pytest.param(
INDEX_NAME_PREFIX+'2019-06-06',
datetime(2019, 6, 6, 23),
False,
id="Index older than cutoff but timedelta less than 1 day"
),
pytest.param(
INDEX_NAME_PREFIX+'2019-06-06',
datetime(2019, 6, 7),
True,
id="Index older than cutoff by exactly one day"
),
])
def test_can_delete_index(index, cutoff_date, expected_result):
es = ElasticsearchLogs(index_prefix=INDEX_NAME_PREFIX)
assert datetime.strptime(index.split(es._index_prefix, 1)[-1], INDEX_DATE_FORMAT)
assert es.can_delete_index(index, cutoff_date) == expected_result

View file

@@ -0,0 +1,473 @@
from datetime import datetime, timedelta, date
from data.logs_model.datatypes import AggregatedLogCount
from data.logs_model.table_logs_model import TableLogsModel
from data.logs_model.combined_model import CombinedLogsModel
from data.logs_model.inmemory_model import InMemoryModel
from data.logs_model.combined_model import _merge_aggregated_log_counts
from data.logs_model.document_logs_model import _date_range_in_single_index, DocumentLogsModel
from data.logs_model.interface import LogsIterationTimeout
from data.logs_model.test.fake_elasticsearch import FAKE_ES_HOST, fake_elasticsearch
from data.database import LogEntry, LogEntry2, LogEntry3, LogEntryKind
from data import model
from test.fixtures import *
@pytest.fixture()
def mock_page_size():
page_size = 2
with patch('data.logs_model.document_logs_model.PAGE_SIZE', page_size):
yield page_size
@pytest.fixture()
def clear_db_logs(initialized_db):
LogEntry.delete().execute()
LogEntry2.delete().execute()
LogEntry3.delete().execute()
def combined_model():
return CombinedLogsModel(TableLogsModel(), InMemoryModel())
def es_model():
return DocumentLogsModel(producer='elasticsearch', elasticsearch_config={
'host': FAKE_ES_HOST,
'port': 12345,
})
@pytest.fixture()
def fake_es():
with fake_elasticsearch():
yield
@pytest.fixture(params=[TableLogsModel, InMemoryModel, es_model, combined_model])
def logs_model(request, clear_db_logs, fake_es):
return request.param()
def _lookup_logs(logs_model, start_time, end_time, **kwargs):
logs_found = []
page_token = None
while True:
found = logs_model.lookup_logs(start_time, end_time, page_token=page_token, **kwargs)
logs_found.extend(found.logs)
page_token = found.next_page_token
if not found.logs or not page_token:
break
assert len(logs_found) == len(set(logs_found))
return logs_found
@pytest.mark.skipif(os.environ.get('TEST_DATABASE_URI', '').find('mysql') >= 0,
reason='Flaky on MySQL')
@pytest.mark.parametrize('namespace_name, repo_name, performer_name, check_args, expect_results', [
pytest.param('devtable', 'simple', 'devtable', {}, True, id='no filters'),
pytest.param('devtable', 'simple', 'devtable', {
'performer_name': 'devtable',
}, True, id='matching performer'),
pytest.param('devtable', 'simple', 'devtable', {
'namespace_name': 'devtable',
}, True, id='matching namespace'),
pytest.param('devtable', 'simple', 'devtable', {
'namespace_name': 'devtable',
'repository_name': 'simple',
}, True, id='matching repository'),
pytest.param('devtable', 'simple', 'devtable', {
'performer_name': 'public',
}, False, id='different performer'),
pytest.param('devtable', 'simple', 'devtable', {
'namespace_name': 'public',
}, False, id='different namespace'),
pytest.param('devtable', 'simple', 'devtable', {
'namespace_name': 'devtable',
'repository_name': 'complex',
}, False, id='different repository'),
])
def test_logs(namespace_name, repo_name, performer_name, check_args, expect_results, logs_model):
# Add some logs.
kinds = list(LogEntryKind.select())
user = model.user.get_user(performer_name)
start_timestamp = datetime.utcnow()
timestamp = start_timestamp
for kind in kinds:
for index in range(0, 3):
logs_model.log_action(kind.name, namespace_name=namespace_name, repository_name=repo_name,
performer=user, ip='1.2.3.4', timestamp=timestamp)
timestamp = timestamp + timedelta(seconds=1)
found = _lookup_logs(logs_model, start_timestamp, start_timestamp + timedelta(minutes=10),
**check_args)
if expect_results:
assert len(found) == len(kinds) * 3
else:
assert not found
aggregated_counts = logs_model.get_aggregated_log_counts(start_timestamp,
start_timestamp + timedelta(minutes=10),
**check_args)
if expect_results:
assert len(aggregated_counts) == len(kinds)
for ac in aggregated_counts:
assert ac.count == 3
else:
assert not aggregated_counts
@pytest.mark.parametrize('filter_kinds, expect_results', [
pytest.param(None, True),
pytest.param(['push_repo'], True, id='push_repo filter'),
pytest.param(['pull_repo'], True, id='pull_repo filter'),
pytest.param(['push_repo', 'pull_repo'], False, id='push and pull filters')
])
def test_lookup_latest_logs(filter_kinds, expect_results, logs_model):
kind_map = model.log.get_log_entry_kinds()
if filter_kinds:
ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds if filter_kinds]
else:
ignore_ids = []
now = datetime.now()
namespace_name = 'devtable'
repo_name = 'simple'
performer_name = 'devtable'
user = model.user.get_user(performer_name)
size = 3
# Log some push actions
logs_model.log_action('push_repo', namespace_name=namespace_name, repository_name=repo_name,
performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=1, seconds=11))
logs_model.log_action('push_repo', namespace_name=namespace_name, repository_name=repo_name,
performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=7, seconds=33))
# Log some pull actions
logs_model.log_action('pull_repo', namespace_name=namespace_name, repository_name=repo_name,
performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=0, seconds=3))
logs_model.log_action('pull_repo', namespace_name=namespace_name, repository_name=repo_name,
performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=3, seconds=55))
logs_model.log_action('pull_repo', namespace_name=namespace_name, repository_name=repo_name,
performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=5, seconds=3))
logs_model.log_action('pull_repo', namespace_name=namespace_name, repository_name=repo_name,
performer=user, ip='0.0.0.0', timestamp=now-timedelta(days=11, seconds=11))
# Get the latest logs
latest_logs = logs_model.lookup_latest_logs(performer_name, repo_name, namespace_name,
filter_kinds=filter_kinds, size=size)
# Test max lookup size
assert len(latest_logs) <= size
# Make sure that the latest logs returned are in decreasing order
assert all(x >= y for x, y in zip(latest_logs, latest_logs[1:]))
if expect_results:
assert latest_logs
# Lookup all logs filtered by kinds and sort them in reverse chronological order
all_logs = _lookup_logs(logs_model, now - timedelta(days=30), now + timedelta(days=30),
filter_kinds=filter_kinds, namespace_name=namespace_name,
repository_name=repo_name)
all_logs = sorted(all_logs, key=lambda l: l.datetime, reverse=True)
# Check that querying all logs does not return the filtered kinds
assert all([log.kind_id not in ignore_ids for log in all_logs])
    # Check that the latest logs contain only the most recent ones
assert latest_logs == all_logs[:len(latest_logs)]
def test_count_repository_actions(logs_model):
# Log some actions.
logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
# Log some actions to a different repo.
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='complex',
ip='1.2.3.4')
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='complex',
ip='1.2.3.4')
# Count the actions.
day = date.today()
simple_repo = model.repository.get_repository('devtable', 'simple')
count = logs_model.count_repository_actions(simple_repo, day)
assert count == 3
complex_repo = model.repository.get_repository('devtable', 'complex')
count = logs_model.count_repository_actions(complex_repo, day)
assert count == 2
# Try counting actions for a few days in the future to ensure it doesn't raise an error.
count = logs_model.count_repository_actions(simple_repo, day + timedelta(days=5))
assert count == 0
def test_yield_log_rotation_context(logs_model):
cutoff_date = datetime.now()
min_logs_per_rotation = 3
# Log some actions to be archived
# One day
logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple1',
ip='1.2.3.4', timestamp=cutoff_date-timedelta(days=1, seconds=1))
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
ip='5.6.7.8', timestamp=cutoff_date-timedelta(days=1, seconds=2))
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple3',
ip='9.10.11.12', timestamp=cutoff_date-timedelta(days=1, seconds=3))
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple4',
ip='0.0.0.0', timestamp=cutoff_date-timedelta(days=1, seconds=4))
# Another day
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple5',
ip='1.1.1.1', timestamp=cutoff_date-timedelta(days=2, seconds=1))
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple5',
ip='1.1.1.1', timestamp=cutoff_date-timedelta(days=2, seconds=2))
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple5',
ip='1.1.1.1', timestamp=cutoff_date-timedelta(days=2, seconds=3))
found = _lookup_logs(logs_model, cutoff_date - timedelta(days=3), cutoff_date + timedelta(days=1))
assert found is not None and len(found) == 7
# Iterate the logs using the log rotation contexts
all_logs = []
for log_rotation_context in logs_model.yield_log_rotation_context(cutoff_date,
min_logs_per_rotation):
with log_rotation_context as context:
for logs, _ in context.yield_logs_batch():
all_logs.extend(logs)
assert len(all_logs) == 7
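  # Once every rotation context has exited, the archived logs should have been removed from the model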
found = _lookup_logs(logs_model, cutoff_date - timedelta(days=3), cutoff_date + timedelta(days=1))
assert not found
  # After sorting by datetime, make sure the datetimes are strictly increasing,
  # which also verifies that no duplicate logs were returned
all_logs.sort(key=lambda d: d.datetime)
assert all(x.datetime < y.datetime for x, y in zip(all_logs, all_logs[1:]))


def test_count_repository_actions_with_wildcard_disabled(initialized_db):
with fake_elasticsearch(allow_wildcard=False):
logs_model = es_model()
# Log some actions.
logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
# Log some actions to a different repo.
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='complex',
ip='1.2.3.4')
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='complex',
ip='1.2.3.4')
# Count the actions.
day = date.today()
simple_repo = model.repository.get_repository('devtable', 'simple')
count = logs_model.count_repository_actions(simple_repo, day)
assert count == 3
complex_repo = model.repository.get_repository('devtable', 'complex')
count = logs_model.count_repository_actions(complex_repo, day)
assert count == 2
# Try counting actions for a few days in the future to ensure it doesn't raise an error.
count = logs_model.count_repository_actions(simple_repo, day + timedelta(days=5))
assert count == 0


@pytest.mark.skipif(os.environ.get('TEST_DATABASE_URI', '').find('mysql') >= 0,
reason='Flaky on MySQL')
def test_yield_logs_for_export(logs_model):
# Add some logs.
kinds = list(LogEntryKind.select())
user = model.user.get_user('devtable')
start_timestamp = datetime.utcnow()
timestamp = start_timestamp
for kind in kinds:
for index in range(0, 10):
logs_model.log_action(kind.name, namespace_name='devtable', repository_name='simple',
performer=user, ip='1.2.3.4', timestamp=timestamp)
timestamp = timestamp + timedelta(seconds=1)
# Yield the logs.
simple_repo = model.repository.get_repository('devtable', 'simple')
logs_found = []
for logs in logs_model.yield_logs_for_export(start_timestamp, timestamp + timedelta(minutes=10),
repository_id=simple_repo.id):
logs_found.extend(logs)
# Ensure we found all added logs.
assert len(logs_found) == len(kinds) * 10


def test_yield_logs_for_export_timeout(logs_model):
# Add some logs.
kinds = list(LogEntryKind.select())
user = model.user.get_user('devtable')
start_timestamp = datetime.utcnow()
timestamp = start_timestamp
for kind in kinds:
for _ in range(0, 2):
logs_model.log_action(kind.name, namespace_name='devtable', repository_name='simple',
performer=user, ip='1.2.3.4', timestamp=timestamp)
timestamp = timestamp + timedelta(seconds=1)
  # Yield the logs. Since the max query time is zero, the export should time out immediately.
simple_repo = model.repository.get_repository('devtable', 'simple')
with pytest.raises(LogsIterationTimeout):
list(logs_model.yield_logs_for_export(start_timestamp, timestamp + timedelta(minutes=1),
repository_id=simple_repo.id,
max_query_time=timedelta(seconds=0)))


def test_disabled_namespace(clear_db_logs):
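  # The callable passed here skips logging for the 'devtable' namespace, so its actions
  # should not be recorded while other namespaces are logged normally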
logs_model = TableLogsModel(lambda kind, namespace, is_free: namespace == 'devtable')
# Log some actions.
logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple',
ip='1.2.3.4')
# Log some actions to a different namespace.
logs_model.log_action('push_repo', namespace_name='buynlarge', repository_name='orgrepo',
ip='1.2.3.4')
logs_model.log_action('pull_repo', namespace_name='buynlarge', repository_name='orgrepo',
ip='1.2.3.4')
logs_model.log_action('pull_repo', namespace_name='buynlarge', repository_name='orgrepo',
ip='1.2.3.4')
# Count the actions.
day = datetime.today() - timedelta(minutes=60)
simple_repo = model.repository.get_repository('devtable', 'simple')
count = logs_model.count_repository_actions(simple_repo, day)
assert count == 0
org_repo = model.repository.get_repository('buynlarge', 'orgrepo')
count = logs_model.count_repository_actions(org_repo, day)
assert count == 3


@pytest.mark.parametrize('aggregated_log_counts1, aggregated_log_counts2, expected_result', [
pytest.param(
[
AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0)), # 1
AggregatedLogCount(1, 3, datetime(2019, 6, 7, 0, 0)), # 2
],
[
AggregatedLogCount(1, 5, datetime(2019, 6, 6, 0, 0)), # 1
AggregatedLogCount(1, 7, datetime(2019, 6, 7, 0, 0)), # 2
AggregatedLogCount(3, 3, datetime(2019, 6, 1, 0, 0)), # 3
],
[
AggregatedLogCount(1, 8, datetime(2019, 6, 6, 0, 0)), # 1
AggregatedLogCount(1, 10, datetime(2019, 6, 7, 0, 0)), # 2
AggregatedLogCount(3, 3, datetime(2019, 6, 1, 0, 0)) # 3
]
),
pytest.param(
[
AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0)), # 1
],
[
AggregatedLogCount(1, 7, datetime(2019, 6, 7, 0, 0)), # 2
],
[
AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0)), # 1
AggregatedLogCount(1, 7, datetime(2019, 6, 7, 0, 0)), # 2
]
),
pytest.param(
[],
[AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0))],
[AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0))]
),
])
def test_merge_aggregated_log_counts(aggregated_log_counts1, aggregated_log_counts2, expected_result):
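  # Sort both sides so the comparison does not depend on the merge order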
assert (sorted(_merge_aggregated_log_counts(aggregated_log_counts1, aggregated_log_counts2)) ==
sorted(expected_result))


@pytest.mark.parametrize('dt1, dt2, expected_result', [
# Valid dates
pytest.param(date(2019, 6, 17), date(2019, 6, 18), True),
# Invalid dates
pytest.param(date(2019, 6, 17), date(2019, 6, 17), False),
pytest.param(date(2019, 6, 17), date(2019, 6, 19), False),
pytest.param(date(2019, 6, 18), date(2019, 6, 17), False),
# Valid datetimes
pytest.param(datetime(2019, 6, 17, 0, 1), datetime(2019, 6, 17, 0, 2), True),
# Invalid datetimes
pytest.param(datetime(2019, 6, 17, 0, 2), datetime(2019, 6, 17, 0, 1), False),
pytest.param(datetime(2019, 6, 17, 11), datetime(2019, 6, 17, 11) + timedelta(hours=14), False),
])
def test_date_range_in_single_index(dt1, dt2, expected_result):
assert _date_range_in_single_index(dt1, dt2) == expected_result


def test_pagination(logs_model, mock_page_size):
"""
Make sure that pagination does not stop if searching through multiple indices by day,
and the current log count matches the page size while there are still indices to be searched.
"""
day1 = datetime.now()
day2 = day1 + timedelta(days=1)
day3 = day2 + timedelta(days=1)
# Log some actions in day indices
# One day
logs_model.log_action('push_repo', namespace_name='devtable', repository_name='simple1',
ip='1.2.3.4', timestamp=day1)
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple1',
ip='5.6.7.8', timestamp=day1)
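  # Only day1's logs exist so far, so the lookup should return exactly one full page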
found = _lookup_logs(logs_model, day1-timedelta(seconds=1), day3+timedelta(seconds=1))
assert len(found) == mock_page_size
# Another day
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
ip='1.1.1.1', timestamp=day2)
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
ip='0.0.0.0', timestamp=day2)
# Yet another day
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
ip='1.1.1.1', timestamp=day3)
logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
ip='0.0.0.0', timestamp=day3)
found = _lookup_logs(logs_model, day1-timedelta(seconds=1), day3+timedelta(seconds=1))
assert len(found) == 6

View file

@@ -0,0 +1,77 @@
import logging
import pytest
from dateutil.parser import parse
from mock import patch, Mock
import botocore
from data.logs_model import configure
from test_elasticsearch import app_config, logs_model_config, logs_model, mock_elasticsearch, mock_db_model
from mock_elasticsearch import *


logger = logging.getLogger(__name__)

FAKE_KAFKA_BROKERS = ['fake_server1', 'fake_server2']
FAKE_KAFKA_TOPIC = 'sometopic'
FAKE_MAX_BLOCK_SECONDS = 1


@pytest.fixture()
def kafka_logs_producer_config(app_config):
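  # Copy the base app config and switch the logs model's producer to Kafka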
producer_config = {}
producer_config.update(app_config)
kafka_config = {
'bootstrap_servers': FAKE_KAFKA_BROKERS,
'topic': FAKE_KAFKA_TOPIC,
'max_block_seconds': FAKE_MAX_BLOCK_SECONDS
}
producer_config['LOGS_MODEL_CONFIG']['producer'] = 'kafka'
producer_config['LOGS_MODEL_CONFIG']['kafka_config'] = kafka_config
return producer_config


@pytest.fixture()
def kinesis_logs_producer_config(app_config):
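  # Copy the base app config and switch the logs model's producer to a Kinesis stream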
producer_config = {}
producer_config.update(app_config)
kinesis_stream_config = {
'stream_name': 'test-stream',
'aws_region': 'fake_region',
'aws_access_key': 'some_key',
'aws_secret_key': 'some_secret'
}
producer_config['LOGS_MODEL_CONFIG']['producer'] = 'kinesis_stream'
producer_config['LOGS_MODEL_CONFIG']['kinesis_stream_config'] = kinesis_stream_config
return producer_config


def test_kafka_logs_producers(logs_model, mock_elasticsearch, mock_db_model, kafka_logs_producer_config):
mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
producer_config = kafka_logs_producer_config
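  # Patch the Kafka client's version check and the producer's send so no real broker is contacted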
with patch('kafka.client_async.KafkaClient.check_version'), patch('kafka.KafkaProducer.send') as mock_send:
configure(producer_config)
logs_model.log_action('pull_repo', 'user1', Mock(id=1), '192.168.1.1', {'key': 'value'},
None, 'repo1', parse("2019-01-01T03:30"))
mock_send.assert_called_once()


def test_kinesis_logs_producers(logs_model, mock_elasticsearch, mock_db_model, kinesis_logs_producer_config):
mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
producer_config = kinesis_logs_producer_config
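  # Patch botocore so no real AWS endpoint is created and the API call is captured instead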
with patch('botocore.endpoint.EndpointCreator.create_endpoint'), \
patch('botocore.client.BaseClient._make_api_call') as mock_send:
configure(producer_config)
logs_model.log_action('pull_repo', 'user1', Mock(id=1), '192.168.1.1', {'key': 'value'},
None, 'repo1', parse("2019-01-01T03:30"))
# Check that a PutRecord api call is made.
# NOTE: The second arg of _make_api_call uses a randomized PartitionKey
mock_send.assert_called_once_with(u'PutRecord', mock_send.call_args_list[0][0][1])