initial import for Open Source 🎉
This commit is contained in:
		
							parent
							
								
									1898c361f3
								
							
						
					
					
						commit
						9c0dd3b722
					
				
					 2048 changed files with 218743 additions and 0 deletions
				
			
		
							
								
								
									
										64
									
								
								data/logs_model/__init__.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										64
									
								
								data/logs_model/__init__.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,64 @@ | |||
| import logging | ||||
| 
 | ||||
| from data.logs_model.table_logs_model import TableLogsModel | ||||
| from data.logs_model.document_logs_model import DocumentLogsModel | ||||
| from data.logs_model.combined_model import CombinedLogsModel | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| 
 | ||||
def _transition_model(*args, **kwargs):
  """ Build the transitional logs model, which combines an Elasticsearch backed model (used as
      the read/write model) with a database table backed model (used as the read-only model).
      Both models are constructed with the same arguments.
  """
  read_write_model = DocumentLogsModel(*args, **kwargs)
  read_only_model = TableLogsModel(*args, **kwargs)
  return CombinedLogsModel(read_write_model, read_only_model)
| 
 | ||||
| 
 | ||||
# Maps the value of the `LOGS_MODEL` config option to the callable that builds the logs model.
_LOG_MODELS = {
  'database': TableLogsModel,
  'transition_reads_both_writes_es': _transition_model,
  'elasticsearch': DocumentLogsModel,
}

# Log kinds that are treated as pull logs when deciding whether logging should be skipped.
_PULL_LOG_KINDS = {'pull_repo', 'repo_verb'}
| 
 | ||||
class LogsModelProxy(object):
  """ Proxy that forwards attribute access to the logs model it was initialized with.

  Until `initialize` is called, looking up any attribute that is not defined on the proxy
  itself raises AttributeError.
  """
  def __init__(self):
    self._model = None

  def initialize(self, model):
    """ Set the logs model to which attribute lookups are forwarded. """
    self._model = model
    logger.info('===============================')
    logger.info('Using logs model `%s`', self._model)
    logger.info('===============================')

  def __getattr__(self, attr):
    # __getattr__ is only invoked when normal lookup fails. Read `_model` straight from the
    # instance dictionary so that an instance on which __init__ never ran (e.g. during copying
    # or unpickling) raises AttributeError instead of recursing forever.
    model = self.__dict__.get('_model')

    # Compare against None explicitly: a model that happens to be falsy is still a valid model.
    if model is None:
      raise AttributeError("LogsModelProxy is not initialized")
    return getattr(model, attr)
| 
 | ||||
| 
 | ||||
| logs_model = LogsModelProxy() | ||||
| 
 | ||||
| 
 | ||||
def configure(app_config):
  """ Build the logs model selected by `app_config` and install it on the `logs_model` proxy.

  The `LOGS_MODEL` option (defaulting to 'database') selects the model and the
  `LOGS_MODEL_CONFIG` option is passed to it as keyword arguments, together with a
  `should_skip_logging` callback.

  Raises a KeyError if `LOGS_MODEL` does not name a known model.
  """
  logger.debug('Configuring log model')
  model_name = app_config.get('LOGS_MODEL', 'database')

  # Work on a copy so that adding the callback below does not modify the dictionary that is
  # stored in the application config.
  model_config = dict(app_config.get('LOGS_MODEL_CONFIG') or {})

  def should_skip_logging(kind_name, namespace_name, is_free_namespace):
    """ Return whether the log entry of the given kind and namespace should not be recorded. """
    if namespace_name and namespace_name in app_config.get('DISABLED_FOR_AUDIT_LOGS', {}):
      return True

    if kind_name in _PULL_LOG_KINDS:
      if namespace_name and namespace_name in app_config.get('DISABLED_FOR_PULL_LOGS', {}):
        return True

      if app_config.get('FEATURE_DISABLE_PULL_LOGS_FOR_FREE_NAMESPACES'):
        if is_free_namespace:
          return True

    return False

  model_config['should_skip_logging'] = should_skip_logging
  logs_model.initialize(_LOG_MODELS[model_name](**model_config))
							
								
								
									
										132
									
								
								data/logs_model/combined_model.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										132
									
								
								data/logs_model/combined_model.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,132 @@ | |||
| import logging | ||||
| import itertools | ||||
| 
 | ||||
| from data.logs_model.datatypes import AggregatedLogCount, LogEntriesPage | ||||
| from data.logs_model.interface import ActionLogsDataInterface | ||||
| from data.logs_model.shared import SharedModel | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| 
 | ||||
| def _merge_aggregated_log_counts(*args): | ||||
|   """ Merge two lists of AggregatedLogCount based on the value of their kind_id and datetime. | ||||
|   """ | ||||
|   matching_keys = {} | ||||
|   aggregated_log_counts_list = itertools.chain.from_iterable(args) | ||||
| 
 | ||||
|   def canonical_key_from_kind_date_tuple(kind_id, dt): | ||||
|     """ Return a comma separated key from an AggregatedLogCount's kind_id and datetime. """ | ||||
|     return str(kind_id) + ',' + str(dt) | ||||
| 
 | ||||
|   for kind_id, count, dt in aggregated_log_counts_list: | ||||
|     kind_date_key = canonical_key_from_kind_date_tuple(kind_id, dt) | ||||
|     if kind_date_key in matching_keys: | ||||
|       existing_count = matching_keys[kind_date_key][2] | ||||
|       matching_keys[kind_date_key] = (kind_id, dt, existing_count + count) | ||||
|     else: | ||||
|       matching_keys[kind_date_key] = (kind_id, dt, count) | ||||
| 
 | ||||
|   return [AggregatedLogCount(kind_id, count, dt) for (kind_id, dt, count) in matching_keys.values()] | ||||
| 
 | ||||
| 
 | ||||
class CombinedLogsModel(SharedModel, ActionLogsDataInterface):
  """
  CombinedLogsModel wraps two logs models: actions are logged to the read/write model only,
  while lookups are answered from both the read/write and the read-only model.
  """

  def __init__(self, read_write_logs_model, read_only_logs_model):
    self.read_write_logs_model = read_write_logs_model
    self.read_only_logs_model = read_only_logs_model

  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
    """ Log the action to the read/write model. """
    writer = self.read_write_logs_model
    return writer.log_action(kind_name, namespace_name, performer, ip, metadata, repository,
                             repository_name, timestamp, is_free_namespace)

  def count_repository_actions(self, repository, day):
    """ Return the sum of the counts reported by the read/write and the read-only model. """
    from_read_write = self.read_write_logs_model.count_repository_actions(repository, day)
    from_read_only = self.read_only_logs_model.count_repository_actions(repository, day)
    return from_read_write + from_read_only

  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    """ Return the aggregated counts of both models, merged by kind_id and datetime. """
    filters = {
      'performer_name': performer_name,
      'repository_name': repository_name,
      'namespace_name': namespace_name,
      'filter_kinds': filter_kinds,
    }
    from_read_write = self.read_write_logs_model.get_aggregated_log_counts(start_datetime,
                                                                           end_datetime, **filters)
    from_read_only = self.read_only_logs_model.get_aggregated_log_counts(start_datetime,
                                                                         end_datetime, **filters)
    return _merge_aggregated_log_counts(from_read_write, from_read_only)

  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    """ Yield the export batches of the read/write model followed by those of the read-only
        model.
    """
    export_args = (start_datetime, end_datetime, repository_id, namespace_id, max_query_time)
    from_read_write = self.read_write_logs_model.yield_logs_for_export(*export_args)
    from_read_only = self.read_only_logs_model.yield_logs_for_export(*export_args)
    for batch in itertools.chain(from_read_write, from_read_only):
      yield batch

  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    """ Return a page of logs, paging through the read/write model first and, once it has no
        more pages, through the read-only model. The page token records which model is being
        read and that model's own page token.
    """
    token = page_token or {}

    if token.get('under_readonly_model', False):
      logs, next_token = self.read_only_logs_model.lookup_logs(
        start_datetime, end_datetime, performer_name, repository_name, namespace_name,
        filter_kinds, token.get('readonly_page_token'), max_page_count)

      # The read-only model is read last, so its final page ends the pagination.
      if next_token is None:
        return LogEntriesPage(logs, None)

      return LogEntriesPage(logs, {
        'under_readonly_model': True,
        'readonly_page_token': next_token,
      })

    logs, next_token = self.read_write_logs_model.lookup_logs(
      start_datetime, end_datetime, performer_name, repository_name, namespace_name,
      filter_kinds, token.get('readwrite_page_token'), max_page_count)

    # Switch to the read-only model on the next call when the read/write model has no more pages.
    return LogEntriesPage(logs, {
      'under_readonly_model': next_token is None,
      'readwrite_page_token': next_token,
    })

  def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
                         filter_kinds=None, size=20):
    """ Return the latest logs of the read/write model, followed by logs of the read-only model
        when the read/write model returned fewer than `size` logs.
    """
    found = list(self.read_write_logs_model.lookup_latest_logs(performer_name, repository_name,
                                                               namespace_name, filter_kinds,
                                                               size))
    missing = size - len(found)
    if missing > 0:
      found.extend(self.read_only_logs_model.lookup_latest_logs(performer_name, repository_name,
                                                                namespace_name, filter_kinds,
                                                                missing))

    return found

  def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
    """ Yield the rotation contexts of the read-only model followed by those of the read/write
        model.
    """
    from_read_only = self.read_only_logs_model.yield_log_rotation_context(cutoff_date,
                                                                          min_logs_per_rotation)
    from_read_write = self.read_write_logs_model.yield_log_rotation_context(cutoff_date,
                                                                            min_logs_per_rotation)
    for context in itertools.chain(from_read_only, from_read_write):
      yield context
							
								
								
									
										155
									
								
								data/logs_model/datatypes.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										155
									
								
								data/logs_model/datatypes.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,155 @@ | |||
| import json | ||||
| 
 | ||||
| from calendar import timegm | ||||
| from collections import namedtuple | ||||
| from email.utils import formatdate | ||||
| 
 | ||||
| from cachetools.func import lru_cache | ||||
| 
 | ||||
| from data import model | ||||
| from util.morecollections import AttrDict | ||||
| 
 | ||||
| 
 | ||||
| def _format_date(date): | ||||
|   """ Output an RFC822 date format. """ | ||||
|   if date is None: | ||||
|     return None | ||||
| 
 | ||||
|   return formatdate(timegm(date.utctimetuple())) | ||||
| 
 | ||||
| 
 | ||||
@lru_cache(maxsize=1)
def _kinds():
  """ Return the log entry kinds loaded from the data model. The result is cached. """
  log_entry_kinds = model.log.get_log_entry_kinds()
  return log_entry_kinds
| 
 | ||||
| 
 | ||||
class LogEntriesPage(namedtuple('LogEntriesPage', ['logs', 'next_page_token'])):
  """ Represents a page returned by the lookup_logs call. `logs` contains the logs found for
      the page. `next_page_token`, if not None, contains the token to be encoded and returned
      for the followup call; a value of None means there are no further pages.
  """
| 
 | ||||
| 
 | ||||
class Log(namedtuple('Log', [
    'metadata_json', 'ip', 'datetime', 'performer_email', 'performer_username', 'performer_robot',
    'account_organization', 'account_username', 'account_email', 'account_robot', 'kind_id'])):
  """ Represents a single log entry returned by the logs model. """

  @classmethod
  def for_logentry(cls, log):
    """ Build a Log from a log entry exposing `account` and `performer` objects.

    Account and performer fields that cannot be read (for example because the entry has no
    account or no performer) are left as None.
    """
    account_organization = None
    account_username = None
    account_email = None
    account_robot = None

    try:
      account_organization = log.account.organization
      account_username = log.account.username
      account_email = log.account.email
      account_robot = log.account.robot
    except AttributeError:
      pass

    performer_robot = None
    performer_username = None
    performer_email = None

    try:
      performer_robot = log.performer.robot
      performer_username = log.performer.username
      performer_email = log.performer.email
    except AttributeError:
      pass

    # Construct via `cls` so that subclasses get instances of their own type.
    return cls(log.metadata_json, log.ip, log.datetime, performer_email, performer_username,
               performer_robot, account_organization, account_username, account_email,
               account_robot, log.kind_id)

  @classmethod
  def for_elasticsearch_log(cls, log, id_user_map):
    """ Build a Log from an Elasticsearch log entry carrying `account_id` and `performer_id`.

    `id_user_map` maps user ids to user objects. Account and performer fields are left as None
    when the entry has no such id, when the id is missing from `id_user_map`, or when the
    user object lacks the attribute.
    """
    account_organization = None
    account_username = None
    account_email = None
    account_robot = None

    try:
      if log.account_id:
        account = id_user_map[log.account_id]
        account_organization = account.organization
        account_username = account.username
        account_email = account.email
        account_robot = account.robot
    except (AttributeError, KeyError):
      # A user missing from the map must not prevent the log entry from being returned.
      pass

    performer_robot = None
    performer_username = None
    performer_email = None

    try:
      if log.performer_id:
        performer = id_user_map[log.performer_id]
        performer_robot = performer.robot
        performer_username = performer.username
        performer_email = performer.email
    except (AttributeError, KeyError):
      pass

    return cls(log.metadata_json, str(log.ip), log.datetime, performer_email, performer_username,
               performer_robot, account_organization, account_username, account_email,
               account_robot, log.kind_id)

  def to_dict(self, avatar, include_namespace=False):
    """ Return the view dictionary for this log entry.

    `avatar` provides `get_data_for_user` and `get_data_for_org`, used to fill in the avatar of
    the performer and of the namespace. The namespace is only added when `include_namespace`
    is set and the entry has an account username.
    """
    view = {
      'kind': _kinds()[self.kind_id],
      'metadata': json.loads(self.metadata_json),
      'ip': self.ip,
      'datetime': _format_date(self.datetime),
    }

    if self.performer_username:
      performer = AttrDict({'username': self.performer_username, 'email': self.performer_email})
      performer.robot = None
      if self.performer_robot:
        performer.robot = self.performer_robot

      view['performer'] = {
        'kind': 'user',
        'name': self.performer_username,
        'is_robot': self.performer_robot,
        'avatar': avatar.get_data_for_user(performer),
      }

    if include_namespace:
      if self.account_username:
        account = AttrDict({'username': self.account_username, 'email': self.account_email})
        if self.account_organization:
          view['namespace'] = {
            'kind': 'org',
            'name': self.account_username,
            'avatar': avatar.get_data_for_org(account),
          }
        else:
          account.robot = None
          if self.account_robot:
            account.robot = self.account_robot
          view['namespace'] = {
            'kind': 'user',
            'name': self.account_username,
            'avatar': avatar.get_data_for_user(account),
          }

    return view
| 
 | ||||
| 
 | ||||
class AggregatedLogCount(namedtuple('AggregatedLogCount', ['kind_id', 'count', 'datetime'])):
  """ Represents the aggregated count of the number of logs, of a particular kind, on a day. """
  def to_dict(self):
    """ Return the view dictionary for this aggregated count. """
    return {
      'kind': _kinds()[self.kind_id],
      'count': self.count,
      'datetime': _format_date(self.datetime),
    }
							
								
								
									
										532
									
								
								data/logs_model/document_logs_model.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										532
									
								
								data/logs_model/document_logs_model.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,532 @@ | |||
| # pylint: disable=protected-access | ||||
| 
 | ||||
| import json | ||||
| import logging | ||||
| import uuid | ||||
| 
 | ||||
| from time import time | ||||
| from datetime import timedelta, datetime, date | ||||
| from dateutil.parser import parse as parse_datetime | ||||
| 
 | ||||
| from abc import ABCMeta, abstractmethod | ||||
| from six import add_metaclass | ||||
| 
 | ||||
| from elasticsearch.exceptions import ConnectionTimeout, NotFoundError | ||||
| 
 | ||||
| from data import model | ||||
| from data.database import CloseForLongOperation | ||||
| from data.model import config | ||||
| from data.model.log import (_json_serialize, ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING, | ||||
|                             DataModelException) | ||||
| from data.logs_model.elastic_logs import LogEntry, configure_es | ||||
| from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage | ||||
| from data.logs_model.interface import (ActionLogsDataInterface, LogRotationContextInterface, | ||||
|                                        LogsIterationTimeout) | ||||
| from data.logs_model.shared import SharedModel, epoch_ms | ||||
| 
 | ||||
| from data.logs_model.logs_producer import LogProducerProxy, LogSendException | ||||
| from data.logs_model.logs_producer.kafka_logs_producer import KafkaLogsProducer | ||||
| from data.logs_model.logs_producer.elasticsearch_logs_producer import ElasticsearchLogsProducer | ||||
| from data.logs_model.logs_producer.kinesis_stream_logs_producer import KinesisStreamLogsProducer | ||||
| 
 | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| PAGE_SIZE = 20 | ||||
| DEFAULT_RESULT_WINDOW = 5000 | ||||
| MAX_RESULT_WINDOW = 10000 | ||||
| 
 | ||||
| # DATE_RANGE_LIMIT is to limit the query date time range to at most 1 month. | ||||
| DATE_RANGE_LIMIT = 32 | ||||
| 
 | ||||
| # Timeout for count_repository_actions | ||||
| COUNT_REPOSITORY_ACTION_TIMEOUT = 30 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| def _date_range_descending(start_datetime, end_datetime, includes_end_datetime=False): | ||||
|   """ Generate the dates between `end_datetime` and `start_datetime`. | ||||
| 
 | ||||
|   If `includes_end_datetime` is set, the generator starts at `end_datetime`, | ||||
|   otherwise, starts the generator at `end_datetime` minus 1 second. | ||||
|   """ | ||||
|   assert end_datetime >= start_datetime | ||||
|   start_date = start_datetime.date() | ||||
| 
 | ||||
|   if includes_end_datetime: | ||||
|     current_date = end_datetime.date() | ||||
|   else: | ||||
|     current_date = (end_datetime - timedelta(seconds=1)).date() | ||||
| 
 | ||||
|   while current_date >= start_date: | ||||
|     yield current_date | ||||
|     current_date = current_date - timedelta(days=1) | ||||
| 
 | ||||
| 
 | ||||
| def _date_range_in_single_index(dt1, dt2): | ||||
|   """ Determine whether a single index can be searched given a range | ||||
|   of dates or datetimes. If date instances are given, difference should be 1 day. | ||||
| 
 | ||||
|   NOTE: dt2 is exclusive to the search result set. | ||||
|   i.e. The date range is larger or equal to dt1 and strictly smaller than dt2 | ||||
|   """ | ||||
|   assert isinstance(dt1, date) and isinstance(dt2, date) | ||||
| 
 | ||||
|   dt = dt2 - dt1 | ||||
| 
 | ||||
|   # Check if date or datetime | ||||
|   if not isinstance(dt1, datetime) and not isinstance(dt2, datetime): | ||||
|     return dt == timedelta(days=1) | ||||
| 
 | ||||
|   if dt < timedelta(days=1) and dt >= timedelta(days=0): | ||||
|     return dt2.day == dt1.day | ||||
| 
 | ||||
|   # Check if datetime can be interpreted as a date: hour, minutes, seconds or microseconds set to 0 | ||||
|   if dt == timedelta(days=1): | ||||
|     return dt1.hour == 0 and dt1.minute == 0 and dt1.second == 0 and dt1.microsecond == 0 | ||||
| 
 | ||||
|   return False | ||||
| 
 | ||||
| 
 | ||||
def _for_elasticsearch_logs(logs, repository_id=None, namespace_id=None):
  """ Convert the given Elasticsearch log entries into Log tuples, loading the users referenced
      as account or performer of any of the entries with a single lookup.

      When `namespace_id` or `repository_id` is given, every entry is asserted to belong to it.
  """
  user_ids = set()
  for entry in logs:
    user_ids.update((entry.account_id, entry.performer_id))
    assert namespace_id is None or entry.account_id == namespace_id
    assert repository_id is None or entry.repository_id == repository_id

  users_by_id = model.user.get_user_map_by_ids(user_ids)
  return [Log.for_elasticsearch_log(entry, users_by_id) for entry in logs]
| 
 | ||||
| 
 | ||||
| def _random_id(): | ||||
|   """ Generates a unique uuid4 string for the random_id field in LogEntry. | ||||
|   It is used as tie-breaker for sorting logs based on datetime: | ||||
|   https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-after.html | ||||
|   """ | ||||
|   return str(uuid.uuid4()) | ||||
| 
 | ||||
| 
 | ||||
@add_metaclass(ABCMeta)
class ElasticsearchLogsModelInterface(object):
  """
  Interface for Elasticsearch specific operations with the logs model.
  These operations are usually index based.

  Both methods are abstract and must be provided by the implementing class.
  """

  @abstractmethod
  def can_delete_index(self, index, cutoff_date):
    """ Return whether the given index is older than the given cutoff date. """

  @abstractmethod
  def list_indices(self):
    """ List the logs model's indices. """
| 
 | ||||
| 
 | ||||
| class DocumentLogsModel(SharedModel, ActionLogsDataInterface, ElasticsearchLogsModelInterface): | ||||
|   """ | ||||
|   DocumentLogsModel implements the data model for the logs API backed by an | ||||
|   elasticsearch service. | ||||
|   """ | ||||
|   def __init__(self, should_skip_logging=None, elasticsearch_config=None, producer=None, **kwargs): | ||||
|     self._should_skip_logging = should_skip_logging | ||||
|     self._logs_producer = LogProducerProxy() | ||||
|     self._es_client = configure_es(**elasticsearch_config) | ||||
| 
 | ||||
|     if producer == 'kafka': | ||||
|       kafka_config = kwargs['kafka_config'] | ||||
|       self._logs_producer.initialize(KafkaLogsProducer(**kafka_config)) | ||||
|     elif producer == 'elasticsearch': | ||||
|       self._logs_producer.initialize(ElasticsearchLogsProducer()) | ||||
|     elif producer == 'kinesis_stream': | ||||
|       kinesis_stream_config = kwargs['kinesis_stream_config'] | ||||
|       self._logs_producer.initialize(KinesisStreamLogsProducer(**kinesis_stream_config)) | ||||
|     else: | ||||
|       raise Exception('Invalid log producer: %s' % producer) | ||||
| 
 | ||||
|   @staticmethod | ||||
|   def _get_ids_by_names(repository_name, namespace_name, performer_name): | ||||
|     """ Retrieve repository/namespace/performer ids based on their names. | ||||
|         throws DataModelException when the namespace_name does not match any | ||||
|         user in the database. | ||||
|         returns database ID or None if not exists. | ||||
|     """ | ||||
|     repository_id = None | ||||
|     account_id = None | ||||
|     performer_id = None | ||||
| 
 | ||||
|     if repository_name and namespace_name: | ||||
|       repository = model.repository.get_repository(namespace_name, repository_name) | ||||
|       if repository: | ||||
|         repository_id = repository.id | ||||
|         account_id = repository.namespace_user.id | ||||
| 
 | ||||
|     if namespace_name and account_id is None: | ||||
|       account = model.user.get_user_or_org(namespace_name) | ||||
|       if account is None: | ||||
|         raise DataModelException('Invalid namespace requested') | ||||
| 
 | ||||
|       account_id = account.id | ||||
| 
 | ||||
|     if performer_name: | ||||
|       performer = model.user.get_user(performer_name) | ||||
|       if performer: | ||||
|         performer_id = performer.id | ||||
| 
 | ||||
|     return repository_id, account_id, performer_id | ||||
| 
 | ||||
|   def _base_query(self, performer_id=None, repository_id=None, account_id=None, filter_kinds=None, | ||||
|                   index=None): | ||||
|     if filter_kinds is not None: | ||||
|       assert all(isinstance(kind_name, str) for kind_name in filter_kinds) | ||||
| 
 | ||||
|     if index is not None: | ||||
|       search = LogEntry.search(index=index) | ||||
|     else: | ||||
|       search = LogEntry.search() | ||||
| 
 | ||||
|     if performer_id is not None: | ||||
|       assert isinstance(performer_id, int) | ||||
|       search = search.filter('term', performer_id=performer_id) | ||||
| 
 | ||||
|     if repository_id is not None: | ||||
|       assert isinstance(repository_id, int) | ||||
|       search = search.filter('term', repository_id=repository_id) | ||||
| 
 | ||||
|     if account_id is not None and repository_id is None: | ||||
|       assert isinstance(account_id, int) | ||||
|       search = search.filter('term', account_id=account_id) | ||||
| 
 | ||||
|     if filter_kinds is not None: | ||||
|       kind_map = model.log.get_log_entry_kinds() | ||||
|       ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds] | ||||
|       search = search.exclude('terms', kind_id=ignore_ids) | ||||
| 
 | ||||
|     return search | ||||
| 
 | ||||
|   def _base_query_date_range(self, start_datetime, end_datetime, performer_id, repository_id, | ||||
|                              account_id, filter_kinds, index=None): | ||||
|     skip_datetime_check = False | ||||
|     if _date_range_in_single_index(start_datetime, end_datetime): | ||||
|       index = self._es_client.index_name(start_datetime) | ||||
|       skip_datetime_check = self._es_client.index_exists(index) | ||||
| 
 | ||||
|     if index and (skip_datetime_check or self._es_client.index_exists(index)): | ||||
|       search = self._base_query(performer_id, repository_id, account_id, filter_kinds, | ||||
|                                 index=index) | ||||
|     else: | ||||
|       search = self._base_query(performer_id, repository_id, account_id, filter_kinds) | ||||
| 
 | ||||
|     if not skip_datetime_check: | ||||
|       search = search.query('range', datetime={'gte': start_datetime, 'lt': end_datetime}) | ||||
| 
 | ||||
|     return search | ||||
| 
 | ||||
|   def _load_logs_for_day(self, logs_date, performer_id, repository_id, account_id, filter_kinds, | ||||
|                          after_datetime=None, after_random_id=None, size=PAGE_SIZE): | ||||
|     index = self._es_client.index_name(logs_date) | ||||
|     if not self._es_client.index_exists(index): | ||||
|       return [] | ||||
| 
 | ||||
|     search = self._base_query(performer_id, repository_id, account_id, filter_kinds, | ||||
|                               index=index) | ||||
|     search = search.sort({'datetime': 'desc'}, {'random_id.keyword': 'desc'}) | ||||
|     search = search.extra(size=size) | ||||
| 
 | ||||
|     if after_datetime is not None and after_random_id is not None: | ||||
|       after_datetime_epoch_ms = epoch_ms(after_datetime) | ||||
|       search = search.extra(search_after=[after_datetime_epoch_ms, after_random_id]) | ||||
| 
 | ||||
|     return search.execute() | ||||
| 
 | ||||
|   def _load_latest_logs(self, performer_id, repository_id, account_id, filter_kinds, size): | ||||
|     """ Return the latest logs from Elasticsearch. | ||||
| 
 | ||||
|     Look at indices up to theset logrotateworker threshold, or up to 30 days if not defined. | ||||
|     """ | ||||
|     # Set the last index to check to be the logrotateworker threshold, or 30 days | ||||
|     end_datetime = datetime.now() | ||||
|     start_datetime = end_datetime - timedelta(days=DATE_RANGE_LIMIT) | ||||
| 
 | ||||
|     latest_logs = [] | ||||
|     for day in _date_range_descending(start_datetime, end_datetime, includes_end_datetime=True): | ||||
|       try: | ||||
|         logs = self._load_logs_for_day(day, performer_id, repository_id, account_id, filter_kinds, | ||||
|                                        size=size) | ||||
|         latest_logs.extend(logs) | ||||
|       except NotFoundError: | ||||
|         continue | ||||
| 
 | ||||
|       if len(latest_logs) >= size: | ||||
|         break | ||||
| 
 | ||||
|     return _for_elasticsearch_logs(latest_logs[:size], repository_id, account_id) | ||||
| 
 | ||||
|   def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None, | ||||
|                   namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None): | ||||
|     assert start_datetime is not None and end_datetime is not None | ||||
| 
 | ||||
|     # Check for a valid combined model token when migrating online from a combined model | ||||
|     if page_token is not None and page_token.get('readwrite_page_token') is not None: | ||||
|       page_token = page_token.get('readwrite_page_token') | ||||
| 
 | ||||
|     if page_token is not None and max_page_count is not None: | ||||
|       page_number = page_token.get('page_number') | ||||
|       if page_number is not None and page_number + 1 > max_page_count: | ||||
|         return LogEntriesPage([], None) | ||||
| 
 | ||||
|     repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names( | ||||
|       repository_name, namespace_name, performer_name) | ||||
| 
 | ||||
|     after_datetime = None | ||||
|     after_random_id = None | ||||
|     if page_token is not None: | ||||
|       after_datetime = parse_datetime(page_token['datetime']) | ||||
|       after_random_id = page_token['random_id'] | ||||
| 
 | ||||
|     if after_datetime is not None: | ||||
|       end_datetime = min(end_datetime, after_datetime) | ||||
| 
 | ||||
|     all_logs = [] | ||||
| 
 | ||||
|     with CloseForLongOperation(config.app_config): | ||||
|       for current_date in _date_range_descending(start_datetime, end_datetime): | ||||
|         try: | ||||
|           logs = self._load_logs_for_day(current_date, performer_id, repository_id, account_id, | ||||
|                                          filter_kinds, after_datetime, after_random_id, | ||||
|                                          size=PAGE_SIZE+1) | ||||
| 
 | ||||
|           all_logs.extend(logs) | ||||
|         except NotFoundError: | ||||
|           continue | ||||
| 
 | ||||
|         if len(all_logs) > PAGE_SIZE: | ||||
|           break | ||||
| 
 | ||||
|     next_page_token = None | ||||
|     all_logs = all_logs[0:PAGE_SIZE+1] | ||||
| 
 | ||||
|     if len(all_logs) == PAGE_SIZE + 1: | ||||
|       # The last element in the response is used to check if there's more elements. | ||||
|       # The second element in the response is used as the pagination token because search_after does | ||||
|       # not include the exact match, and so the next page will start with the last element. | ||||
|       # This keeps the behavior exactly the same as table_logs_model, so that | ||||
|       # the caller can expect when a pagination token is non-empty, there must be | ||||
|       # at least 1 log to be retrieved. | ||||
|       next_page_token = { | ||||
|         'datetime': all_logs[-2].datetime.isoformat(), | ||||
|         'random_id': all_logs[-2].random_id, | ||||
|         'page_number': page_token['page_number'] + 1 if page_token else 1, | ||||
|       } | ||||
| 
 | ||||
|     return LogEntriesPage(_for_elasticsearch_logs(all_logs[:PAGE_SIZE], repository_id, account_id), | ||||
|                           next_page_token) | ||||
| 
 | ||||
|   def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None, | ||||
|                          filter_kinds=None, size=20): | ||||
|     repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names( | ||||
|       repository_name, namespace_name, performer_name) | ||||
| 
 | ||||
|     with CloseForLongOperation(config.app_config): | ||||
|       latest_logs = self._load_latest_logs(performer_id, repository_id, account_id, filter_kinds, | ||||
|                                            size) | ||||
| 
 | ||||
|     return latest_logs | ||||
| 
 | ||||
| 
 | ||||
|   def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None, | ||||
|                                 repository_name=None, namespace_name=None, filter_kinds=None): | ||||
|     if end_datetime - start_datetime >= timedelta(days=DATE_RANGE_LIMIT): | ||||
|       raise Exception('Cannot lookup aggregated logs over a period longer than a month') | ||||
| 
 | ||||
|     repository_id, account_id, performer_id = DocumentLogsModel._get_ids_by_names( | ||||
|       repository_name, namespace_name, performer_name) | ||||
| 
 | ||||
|     with CloseForLongOperation(config.app_config): | ||||
|       search = self._base_query_date_range(start_datetime, end_datetime, performer_id, | ||||
|                                            repository_id, account_id, filter_kinds) | ||||
|       search.aggs.bucket('by_id', 'terms', field='kind_id').bucket('by_date', 'date_histogram', | ||||
|                                                                    field='datetime', interval='day') | ||||
|       # es returns all buckets when size=0 | ||||
|       search = search.extra(size=0) | ||||
|       resp = search.execute() | ||||
| 
 | ||||
|     if not resp.aggregations: | ||||
|       return [] | ||||
| 
 | ||||
|     counts = [] | ||||
|     by_id = resp.aggregations['by_id'] | ||||
| 
 | ||||
|     for id_bucket in by_id.buckets: | ||||
|       for date_bucket in id_bucket.by_date.buckets: | ||||
|         if date_bucket.doc_count > 0: | ||||
|           counts.append(AggregatedLogCount(id_bucket.key, date_bucket.doc_count, date_bucket.key)) | ||||
| 
 | ||||
|     return counts | ||||
| 
 | ||||
|   def count_repository_actions(self, repository, day): | ||||
|     index = self._es_client.index_name(day) | ||||
|     search = self._base_query_date_range(day, day + timedelta(days=1), | ||||
|                                          None, | ||||
|                                          repository.id, | ||||
|                                          None, | ||||
|                                          None, | ||||
|                                          index=index) | ||||
|     search = search.params(request_timeout=COUNT_REPOSITORY_ACTION_TIMEOUT) | ||||
| 
 | ||||
|     try: | ||||
|       return search.count() | ||||
|     except NotFoundError: | ||||
|       return 0 | ||||
| 
 | ||||
|   def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None, | ||||
|                  repository=None, repository_name=None, timestamp=None, is_free_namespace=False): | ||||
|     if self._should_skip_logging and self._should_skip_logging(kind_name, namespace_name, | ||||
|                                                                is_free_namespace): | ||||
|       return | ||||
| 
 | ||||
|     if repository_name is not None: | ||||
|       assert repository is None | ||||
|       assert namespace_name is not None | ||||
|       repository = model.repository.get_repository(namespace_name, repository_name) | ||||
| 
 | ||||
|     if timestamp is None: | ||||
|       timestamp = datetime.today() | ||||
| 
 | ||||
|     account_id = None | ||||
|     performer_id = None | ||||
|     repository_id = None | ||||
| 
 | ||||
|     if namespace_name is not None: | ||||
|       account_id = model.user.get_namespace_user(namespace_name).id | ||||
| 
 | ||||
|     if performer is not None: | ||||
|       performer_id = performer.id | ||||
| 
 | ||||
|     if repository is not None: | ||||
|       repository_id = repository.id | ||||
| 
 | ||||
|     metadata_json = json.dumps(metadata or {}, default=_json_serialize) | ||||
|     kind_id = model.log._get_log_entry_kind(kind_name) | ||||
|     log = LogEntry(random_id=_random_id(), kind_id=kind_id, account_id=account_id, | ||||
|                    performer_id=performer_id, ip=ip, metadata_json=metadata_json, | ||||
|                    repository_id=repository_id, datetime=timestamp) | ||||
| 
 | ||||
|     try: | ||||
|       self._logs_producer.send(log) | ||||
|     except LogSendException as lse: | ||||
|       strict_logging_disabled = config.app_config.get('ALLOW_PULLS_WITHOUT_STRICT_LOGGING') | ||||
|       logger.exception('log_action failed', extra=({'exception': lse}).update(log.to_dict())) | ||||
|       if not (strict_logging_disabled and kind_name in ACTIONS_ALLOWED_WITHOUT_AUDIT_LOGGING): | ||||
|         raise | ||||
| 
 | ||||
|   def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None, | ||||
|                             namespace_id=None, max_query_time=None): | ||||
|     max_query_time = max_query_time.total_seconds() if max_query_time is not None else 300 | ||||
|     search = self._base_query_date_range(start_datetime, end_datetime, None, repository_id, | ||||
|                                          namespace_id, None) | ||||
| 
 | ||||
|     def raise_on_timeout(batch_generator): | ||||
|       start = time() | ||||
|       for batch in batch_generator: | ||||
|         elapsed = time() - start | ||||
|         if elapsed > max_query_time: | ||||
|           logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`', namespace_id, | ||||
|                        repository_id, elapsed) | ||||
|           raise LogsIterationTimeout() | ||||
| 
 | ||||
|         yield batch | ||||
|         start = time() | ||||
| 
 | ||||
|     def read_batch(scroll): | ||||
|       batch = [] | ||||
|       for log in scroll: | ||||
|         batch.append(log) | ||||
|         if len(batch) == DEFAULT_RESULT_WINDOW: | ||||
|           yield _for_elasticsearch_logs(batch, repository_id=repository_id, | ||||
|                                         namespace_id=namespace_id) | ||||
|           batch = [] | ||||
| 
 | ||||
|       if batch: | ||||
|         yield _for_elasticsearch_logs(batch, repository_id=repository_id, namespace_id=namespace_id) | ||||
| 
 | ||||
|     search = search.params(size=DEFAULT_RESULT_WINDOW, request_timeout=max_query_time) | ||||
| 
 | ||||
|     try: | ||||
|       with CloseForLongOperation(config.app_config): | ||||
|         for batch in raise_on_timeout(read_batch(search.scan())): | ||||
|           yield batch | ||||
|     except ConnectionTimeout: | ||||
|       raise LogsIterationTimeout() | ||||
| 
 | ||||
|   def can_delete_index(self, index, cutoff_date): | ||||
|     return self._es_client.can_delete_index(index, cutoff_date) | ||||
| 
 | ||||
|   def list_indices(self): | ||||
|     return self._es_client.list_indices() | ||||
| 
 | ||||
|   def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation): | ||||
|     """ Yield a context manager for a group of outdated logs. """ | ||||
|     all_indices = self.list_indices() | ||||
|     for index in all_indices: | ||||
|       if not self.can_delete_index(index, cutoff_date): | ||||
|         continue | ||||
| 
 | ||||
|       context = ElasticsearchLogRotationContext(index, min_logs_per_rotation, self._es_client) | ||||
|       yield context | ||||
| 
 | ||||
| 
 | ||||
class ElasticsearchLogRotationContext(LogRotationContextInterface):
  """
  Context manager handing out the logs of a single Elasticsearch index in batches.

  Leaving the context without an exception deletes the index through the
  Elasticsearch client; leaving it with an exception keeps the index.
  """
  def __init__(self, index, min_logs_per_rotation, es_client):
    self.index = index
    self.min_logs_per_rotation = min_logs_per_rotation
    self._es_client = es_client

    # Inclusive positions of the batch currently being handed out; they end up in the
    # filename generated for that batch.
    self.start_pos = 0
    self.end_pos = 0

    # Set by __enter__ to the scan over the index.
    self.scroll = None

  def __enter__(self):
    self.scroll = self._base_query().scan()
    return self

  def __exit__(self, ex_type, ex_value, ex_traceback):
    raised = ex_type is not None or ex_value is not None or ex_traceback is not None
    if raised:
      return

    logger.debug('Deleting index %s', self.index)
    self._es_client.delete_index(self.index)

  def yield_logs_batch(self):
    """ Yield (batch, filename) tuples, each batch holding at most `min_logs_per_rotation`
    logs. """
    def in_chunks(hits, chunk_size):
      pending = []
      for hit in hits:
        pending.append(hit)
        if len(pending) == chunk_size:
          yield pending
          pending = []

      # Whatever is left over forms a final, smaller chunk.
      if pending:
        yield pending

    for chunk in in_chunks(self.scroll, self.min_logs_per_rotation):
      self.end_pos = self.start_pos + len(chunk) - 1
      yield chunk, self._generate_filename()
      self.start_pos = self.end_pos + 1

  def _base_query(self):
    """ Return a search over this context's index only. """
    return LogEntry.search(index=self.index)

  def _generate_filename(self):
    """ Generate the filenames used to archive the action logs. """
    stem = '%s_%d-%d' % (self.index, self.start_pos, self.end_pos)
    return '.'.join((stem, 'txt.gz'))
							
								
								
									
										255
									
								
								data/logs_model/elastic_logs.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										255
									
								
								data/logs_model/elastic_logs.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,255 @@ | |||
| import os | ||||
| import logging | ||||
| import re | ||||
| from datetime import datetime, timedelta | ||||
| 
 | ||||
| from requests_aws4auth import AWS4Auth | ||||
| 
 | ||||
| from elasticsearch import RequestsHttpConnection | ||||
| from elasticsearch.exceptions import NotFoundError, AuthorizationException | ||||
| from elasticsearch_dsl import Index, Document, Integer, Date, Text, Ip, Keyword | ||||
| from elasticsearch_dsl.connections import connections | ||||
| 
 | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| # Name of the connection used for Elasticsearch's template API | ||||
| ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS = 'logentry_template' | ||||
| 
 | ||||
| # Prefix of autogenerated indices | ||||
| INDEX_NAME_PREFIX = 'logentry_' | ||||
| 
 | ||||
| # Time-based index date format | ||||
| INDEX_DATE_FORMAT = '%Y-%m-%d' | ||||
| 
 | ||||
| # Timeout for default connection | ||||
| ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT = 15 | ||||
| 
 | ||||
| # Timeout for template api Connection | ||||
| ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT = 60 | ||||
| 
 | ||||
| # Force an index template update | ||||
| ELASTICSEARCH_FORCE_INDEX_TEMPLATE_UPDATE = os.environ.get('FORCE_INDEX_TEMPLATE_UPDATE', '') | ||||
| 
 | ||||
| # Valid index prefix pattern | ||||
| VALID_INDEX_PATTERN = r'^((?!\.$|\.\.$|[-_+])([^A-Z:\/*?\"<>|,# ]){1,255})$' | ||||
| 
 | ||||
| 
 | ||||
class LogEntry(Document):
  """
  Elasticsearch document describing a single action log entry.

  `init` must be called once before any entry can be saved.
  """
  # random_id is the tie-breaker for sorting in pagination.
  # random_id is also used for deduplication of records when using an "at-least-once" delivery
  # stream.
  # Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-after.html
  #
  # We don't use the _id of a document since `doc_values` are not built for that field.
  # `doc_values` are an on-disk data structure that stores the same data in a columnar format
  # for optimized sorting and aggregations.
  # Reference: https://github.com/elastic/elasticsearch/issues/35369
  random_id = Text(fields={'keyword': Keyword()})
  kind_id = Integer()
  account_id = Integer()
  performer_id = Integer()
  repository_id = Integer()
  ip = Ip()
  metadata_json = Text()
  datetime = Date()

  # Flipped to True by `init`; `save` refuses to run before that.
  _initialized = False

  @classmethod
  def init(cls, index_prefix, index_settings=None, skip_template_init=False):
    """
    Create the index template, and populate LogEntry's mapping and index settings.

    The template is only created or updated when `skip_template_init` is False.
    """
    wildcard_index = Index(name=index_prefix + '*')
    wildcard_index.settings(**(index_settings or {}))
    wildcard_index.document(cls)
    cls._index = wildcard_index
    cls._index_prefix = index_prefix

    if not skip_template_init:
      cls.create_or_update_template()

    # Since the elasticsearch-dsl API requires the document's index being defined as an inner
    # class at the class level, this function needs to be called first before being able to
    # call `save`.
    cls._initialized = True

  @classmethod
  def create_or_update_template(cls):
    """ Save the index template for the prefix over the dedicated template connection. """
    assert cls._index and cls._index_prefix
    index_template = cls._index.as_template(cls._index_prefix)
    index_template.save(using=ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS)

  def save(self, **kwargs):
    """ Save the entry into the index named after the prefix and the entry's day. """
    # We group the logs based on year, month and day as different indexes, so that
    # dropping those indexes based on retention range is easy.
    #
    # NOTE: This is only used if logging directly to Elasticsearch
    #       When using Kinesis or Kafka, the consumer of these streams
    #       will be responsible for the management of the indices' lifecycle.
    assert LogEntry._initialized

    # Only the date part goes through strftime. Formatting the prefix as well would expand any
    # `%` directive it contains and produce a name that differs from
    # ElasticsearchLogs.index_name for the same day.
    kwargs['index'] = self._index_prefix + self.datetime.strftime(INDEX_DATE_FORMAT)
    return super(LogEntry, self).save(**kwargs)
| 
 | ||||
| 
 | ||||
class ElasticsearchLogs(object):
  """
  Model for logs operations stored in an Elasticsearch cluster.

  The connection is created lazily by `_initialize`, which also makes sure that the index
  template for `index_prefix` exists.
  """

  def __init__(self, host=None, port=None, access_key=None, secret_key=None, aws_region=None,
               index_settings=None, use_ssl=True, index_prefix=INDEX_NAME_PREFIX):
    # For options in index_settings, refer to:
    # https://www.elastic.co/guide/en/elasticsearch/guide/master/_index_settings.html
    # some index settings are set at index creation time, and therefore, you should NOT
    # change those settings once the index is set.
    self._host = host
    self._port = port
    self._access_key = access_key
    self._secret_key = secret_key
    self._aws_region = aws_region
    self._index_prefix = index_prefix
    self._index_settings = index_settings
    self._use_ssl = use_ssl

    self._client = None
    self._initialized = False

  def _initialize(self):
    """
    Initialize a connection to an ES cluster and
    creates an index template if it does not exist.

    Does nothing once a previous call has completed successfully.
    """
    if not self._initialized:
      # AWS request signing is used when a region is configured, plain HTTP auth when only the
      # key pair is given, and no auth otherwise.
      http_auth = None
      if self._access_key and self._secret_key and self._aws_region:
        http_auth = AWS4Auth(self._access_key, self._secret_key, self._aws_region, 'es')
      elif self._access_key and self._secret_key:
        http_auth = (self._access_key, self._secret_key)
      else:
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning("Connecting to Elasticsearch without HTTP auth")

      self._client = connections.create_connection(
        hosts=[{
          'host': self._host,
          'port': self._port
        }],
        http_auth=http_auth,
        use_ssl=self._use_ssl,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        timeout=ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT,
      )

      # Create a second connection with a longer timeout
      # (ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT instead of
      # ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT).
      # For some reason the PUT template API can take anywhere between
      # 10s and 30s on the test cluster.
      # This only needs to be done once to initialize the index template
      connections.create_connection(
        alias=ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS,
        hosts=[{
          'host': self._host,
          'port': self._port
        }],
        http_auth=http_auth,
        use_ssl=self._use_ssl,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
        timeout=ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT,
      )

      try:
        force_template_update = ELASTICSEARCH_FORCE_INDEX_TEMPLATE_UPDATE.lower() == 'true'
        # Raises NotFoundError when no template exists yet for the prefix.
        self._client.indices.get_template(self._index_prefix)
        LogEntry.init(self._index_prefix, self._index_settings,
                      skip_template_init=not force_template_update)
      except NotFoundError:
        LogEntry.init(self._index_prefix, self._index_settings, skip_template_init=False)
      finally:
        # The template connection is only needed during initialization.
        try:
          connections.remove_connection(ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS)
        except KeyError as ke:
          logger.exception('Elasticsearch connection not found to remove %s: %s',
                           ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS, ke)

      self._initialized = True

  def index_name(self, day):
    """ Return an index name for the given day. """
    return self._index_prefix + day.strftime(INDEX_DATE_FORMAT)

  def index_exists(self, index):
    """ Return whether the cluster reports an index with the given name. """
    try:
      return index in self._client.indices.get(index)
    except NotFoundError:
      return False

  @staticmethod
  def _valid_index_prefix(prefix):
    """ Check that the given index prefix is valid with the set of
    indices used by this class.
    """
    return re.match(VALID_INDEX_PATTERN, prefix) is not None

  def _valid_index_name(self, index):
    """ Check that the given index name is valid and follows the format:
    <index_prefix>YYYY-MM-DD
    """
    if not ElasticsearchLogs._valid_index_prefix(index):
      return False

    if not index.startswith(self._index_prefix) or len(index) > 255:
      return False

    index_dt_str = index.split(self._index_prefix, 1)[-1]
    try:
      datetime.strptime(index_dt_str, INDEX_DATE_FORMAT)
      return True
    except ValueError:
      logger.exception('Invalid date format (YYYY-MM-DD) for index: %s', index)
      return False

  def can_delete_index(self, index, cutoff_date):
    """ Check if the given index can be deleted based on the given index's date and cutoff date. """
    assert self._valid_index_name(index)
    index_dt = datetime.strptime(index[len(self._index_prefix):], INDEX_DATE_FORMAT)
    # The index's day must lie at least one full day before the cutoff.
    return index_dt < cutoff_date and cutoff_date - index_dt >= timedelta(days=1)

  def list_indices(self):
    """ Return the names of the indices starting with the index prefix.

    Returns an empty list when none are found and None when the request is not authorized.
    """
    self._initialize()
    try:
      return self._client.indices.get(self._index_prefix + '*').keys()
    except NotFoundError as nfe:
      logger.exception('`%s` indices not found: %s', self._index_prefix, nfe.info)
      return []
    except AuthorizationException as ae:
      logger.exception('Unauthorized for indices `%s`: %s', self._index_prefix, ae.info)
      return None

  def delete_index(self, index):
    """ Delete the given index.

    Returns the index name on success, or None when the index does not exist or the request
    is not authorized.
    """
    self._initialize()
    assert self._valid_index_name(index)

    try:
      self._client.indices.delete(index)
      return index
    except NotFoundError as nfe:
      logger.exception('`%s` indices not found: %s', index, nfe.info)
      return None
    except AuthorizationException as ae:
      logger.exception('Unauthorized to delete index `%s`: %s', index, ae.info)
      return None
| 
 | ||||
| 
 | ||||
def configure_es(host, port, access_key=None, secret_key=None, aws_region=None,
                 index_prefix=None, use_ssl=True, index_settings=None):
  """
  Build an ElasticsearchLogs client from the given settings, initialize it and return it.

  INDEX_NAME_PREFIX is used when no `index_prefix` is given.

  For options in index_settings, refer to:
  https://www.elastic.co/guide/en/elasticsearch/guide/master/_index_settings.html
  some index settings are set at index creation time, and therefore, you should NOT
  change those settings once the index is set.
  """
  client_options = {
    'host': host,
    'port': port,
    'access_key': access_key,
    'secret_key': secret_key,
    'aws_region': aws_region,
    'index_prefix': index_prefix or INDEX_NAME_PREFIX,
    'use_ssl': use_ssl,
    'index_settings': index_settings,
  }

  es_client = ElasticsearchLogs(**client_options)
  es_client._initialize()
  return es_client
							
								
								
									
										244
									
								
								data/logs_model/inmemory_model.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										244
									
								
								data/logs_model/inmemory_model.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,244 @@ | |||
| import logging | ||||
| import json | ||||
| 
 | ||||
| from collections import namedtuple | ||||
| from datetime import datetime | ||||
| from tzlocal import get_localzone | ||||
| from dateutil.relativedelta import relativedelta | ||||
| 
 | ||||
| from data import model | ||||
| from data.logs_model.datatypes import AggregatedLogCount, LogEntriesPage, Log | ||||
| from data.logs_model.interface import (ActionLogsDataInterface, LogRotationContextInterface, | ||||
|                                        LogsIterationTimeout) | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| LogAndRepository = namedtuple('LogAndRepository', ['log', 'stored_log', 'repository']) | ||||
| 
 | ||||
| StoredLog = namedtuple('StoredLog', ['kind_id', | ||||
|                                      'account_id', | ||||
|                                      'performer_id', | ||||
|                                      'ip', | ||||
|                                      'metadata_json', | ||||
|                                      'repository_id', | ||||
|                                      'datetime']) | ||||
| 
 | ||||
| class InMemoryModel(ActionLogsDataInterface): | ||||
|   """ | ||||
|   InMemoryModel implements the data model for logs in-memory. FOR TESTING ONLY. | ||||
|   """ | ||||
|   def __init__(self): | ||||
|     self.logs = [] | ||||
| 
 | ||||
|   def _filter_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None, | ||||
|                    namespace_name=None, filter_kinds=None): | ||||
|     if filter_kinds is not None: | ||||
|       assert all(isinstance(kind_name, str) for kind_name in filter_kinds) | ||||
| 
 | ||||
|     for log_and_repo in self.logs: | ||||
|       if log_and_repo.log.datetime < start_datetime or log_and_repo.log.datetime > end_datetime: | ||||
|         continue | ||||
| 
 | ||||
|       if performer_name and log_and_repo.log.performer_username != performer_name: | ||||
|         continue | ||||
| 
 | ||||
|       if (repository_name and | ||||
|           (not log_and_repo.repository or log_and_repo.repository.name != repository_name)): | ||||
|         continue | ||||
| 
 | ||||
|       if namespace_name and log_and_repo.log.account_username != namespace_name: | ||||
|         continue | ||||
| 
 | ||||
|       if filter_kinds: | ||||
|         kind_map = model.log.get_log_entry_kinds() | ||||
|         ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds] | ||||
|         if log_and_repo.log.kind_id in ignore_ids: | ||||
|           continue | ||||
| 
 | ||||
|       yield log_and_repo | ||||
| 
 | ||||
|   def _filter_latest_logs(self, performer_name=None, repository_name=None, | ||||
|                           namespace_name=None, filter_kinds=None): | ||||
|     if filter_kinds is not None: | ||||
|       assert all(isinstance(kind_name, str) for kind_name in filter_kinds) | ||||
| 
 | ||||
|     for log_and_repo in sorted(self.logs, key=lambda t: t.log.datetime, reverse=True): | ||||
|       if performer_name and log_and_repo.log.performer_username != performer_name: | ||||
|         continue | ||||
| 
 | ||||
|       if (repository_name and | ||||
|           (not log_and_repo.repository or log_and_repo.repository.name != repository_name)): | ||||
|         continue | ||||
| 
 | ||||
|       if namespace_name and log_and_repo.log.account_username != namespace_name: | ||||
|         continue | ||||
| 
 | ||||
|       if filter_kinds: | ||||
|         kind_map = model.log.get_log_entry_kinds() | ||||
|         ignore_ids = [kind_map[kind_name] for kind_name in filter_kinds] | ||||
|         if log_and_repo.log.kind_id in ignore_ids: | ||||
|           continue | ||||
| 
 | ||||
|       yield log_and_repo | ||||
| 
 | ||||
|   def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None, | ||||
|                   namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None): | ||||
|     logs = [] | ||||
|     for log_and_repo in self._filter_logs(start_datetime, end_datetime, performer_name, | ||||
|                                           repository_name, namespace_name, filter_kinds): | ||||
|       logs.append(log_and_repo.log) | ||||
|     return LogEntriesPage(logs, None) | ||||
| 
 | ||||
|   def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None, | ||||
|                          filter_kinds=None, size=20): | ||||
|     latest_logs = [] | ||||
|     for log_and_repo in self._filter_latest_logs(performer_name, repository_name, namespace_name, | ||||
|                                                  filter_kinds): | ||||
|       if size is not None and len(latest_logs) == size: | ||||
|         break | ||||
| 
 | ||||
|       latest_logs.append(log_and_repo.log) | ||||
| 
 | ||||
|     return latest_logs | ||||
| 
 | ||||
|   def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None, | ||||
|                                 repository_name=None, namespace_name=None, filter_kinds=None): | ||||
|     entries = {} | ||||
|     for log_and_repo in self._filter_logs(start_datetime, end_datetime, performer_name, | ||||
|                                           repository_name, namespace_name, filter_kinds): | ||||
|       entry = log_and_repo.log | ||||
|       synthetic_date = datetime(start_datetime.year, start_datetime.month, int(entry.datetime.day), | ||||
|                                 tzinfo=get_localzone()) | ||||
|       if synthetic_date.day < start_datetime.day: | ||||
|         synthetic_date = synthetic_date + relativedelta(months=1) | ||||
| 
 | ||||
|       key = '%s-%s' % (entry.kind_id, entry.datetime.day) | ||||
| 
 | ||||
|       if key in entries: | ||||
|         entries[key] = AggregatedLogCount(entry.kind_id, entries[key].count + 1, | ||||
|                                           synthetic_date) | ||||
|       else: | ||||
|         entries[key] = AggregatedLogCount(entry.kind_id, 1, synthetic_date) | ||||
| 
 | ||||
|     return entries.values() | ||||
| 
 | ||||
|   def count_repository_actions(self, repository, day): | ||||
|     count = 0 | ||||
|     for log_and_repo in self.logs: | ||||
|       if log_and_repo.repository != repository: | ||||
|         continue | ||||
| 
 | ||||
|       if log_and_repo.log.datetime.day != day.day: | ||||
|         continue | ||||
| 
 | ||||
|       count += 1 | ||||
| 
 | ||||
|     return count | ||||
| 
 | ||||
|   def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue, | ||||
|                         namespace_name=None, repository_name=None, callback_url=None, | ||||
|                         callback_email=None, filter_kinds=None): | ||||
|     raise NotImplementedError | ||||
| 
 | ||||
|   def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None, | ||||
|                  repository=None, repository_name=None, timestamp=None, is_free_namespace=False): | ||||
|     timestamp = timestamp or datetime.today() | ||||
| 
 | ||||
|     if not repository and repository_name and namespace_name: | ||||
|       repository = model.repository.get_repository(namespace_name, repository_name) | ||||
| 
 | ||||
|     account = None | ||||
|     account_id = None | ||||
|     performer_id = None | ||||
|     repository_id = None | ||||
| 
 | ||||
|     if namespace_name is not None: | ||||
|       account = model.user.get_namespace_user(namespace_name) | ||||
|       account_id = account.id | ||||
| 
 | ||||
|     if performer is not None: | ||||
|       performer_id = performer.id | ||||
| 
 | ||||
|     if repository is not None: | ||||
|       repository_id = repository.id | ||||
| 
 | ||||
|     metadata_json = json.dumps(metadata or {}) | ||||
|     kind_id = model.log.get_log_entry_kinds()[kind_name] | ||||
| 
 | ||||
|     stored_log = StoredLog( | ||||
|       kind_id, | ||||
|       account_id, | ||||
|       performer_id, | ||||
|       ip, | ||||
|       metadata_json, | ||||
|       repository_id, | ||||
|       timestamp | ||||
|     ) | ||||
| 
 | ||||
|     log = Log(metadata_json=metadata, | ||||
|               ip=ip, | ||||
|               datetime=timestamp, | ||||
|               performer_email=performer.email if performer else None, | ||||
|               performer_username=performer.username if performer else None, | ||||
|               performer_robot=performer.robot if performer else None, | ||||
|               account_organization=account.organization if account else None, | ||||
|               account_username=account.username if account else None, | ||||
|               account_email=account.email if account else None, | ||||
|               account_robot=account.robot if account else None, | ||||
|               kind_id=kind_id) | ||||
| 
 | ||||
|     self.logs.append(LogAndRepository(log, stored_log, repository)) | ||||
| 
 | ||||
|   def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None, | ||||
|                             namespace_id=None, max_query_time=None): | ||||
|     # Just for testing. | ||||
|     if max_query_time is not None: | ||||
|       raise LogsIterationTimeout() | ||||
| 
 | ||||
|     logs = [] | ||||
|     for log_and_repo in self._filter_logs(start_datetime, end_datetime): | ||||
|       if (repository_id and | ||||
|           (not log_and_repo.repository or log_and_repo.repository.id != repository_id)): | ||||
|         continue | ||||
| 
 | ||||
|       if namespace_id: | ||||
|         if log_and_repo.log.account_username is None: | ||||
|           continue | ||||
| 
 | ||||
|         namespace = model.user.get_namespace_user(log_and_repo.log.account_username) | ||||
|         if namespace.id != namespace_id: | ||||
|           continue | ||||
| 
 | ||||
|       logs.append(log_and_repo.log) | ||||
| 
 | ||||
|     yield logs | ||||
| 
 | ||||
|   def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation): | ||||
|     expired_logs = [log_and_repo for log_and_repo in self.logs | ||||
|                     if log_and_repo.log.datetime <= cutoff_date] | ||||
|     while True: | ||||
|       if not expired_logs: | ||||
|         break | ||||
|       context = InMemoryLogRotationContext(expired_logs[:min_logs_per_rotation], self.logs) | ||||
|       expired_logs = expired_logs[min_logs_per_rotation:] | ||||
|       yield context | ||||
| 
 | ||||
| 
 | ||||
class InMemoryLogRotationContext(LogRotationContextInterface):
  """ Rotation context over a batch of expired in-memory logs. When the context exits without
      an exception, the batch is removed from the shared list of all logs.
  """
  def __init__(self, expired_logs, all_logs):
    self.expired_logs = expired_logs
    self.all_logs = all_logs

  def __enter__(self):
    return self

  def __exit__(self, ex_type, ex_value, ex_traceback):
    # Leave the logs in place if the `with` body raised.
    if any(part is not None for part in (ex_type, ex_value, ex_traceback)):
      return

    for expired in self.expired_logs:
      self.all_logs.remove(expired)

  def yield_logs_batch(self):
    """ Yield a batch of logs and a filename for that batch. """
    filename = '.'.join(('inmemory_model_filename_placeholder', 'txt.gz'))
    stored_logs = [entry.stored_log for entry in self.expired_logs]
    yield stored_logs, filename
							
								
								
									
										95
									
								
								data/logs_model/interface.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										95
									
								
								data/logs_model/interface.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,95 @@ | |||
| from abc import ABCMeta, abstractmethod | ||||
| from six import add_metaclass | ||||
| 
 | ||||
class LogsIterationTimeout(Exception):
  """ Exception raised if logs iteration times out, i.e. when a query made while yielding a
      log bundle exceeds the `max_query_time` given to `yield_logs_for_export`.
  """
| 
 | ||||
| 
 | ||||
@add_metaclass(ABCMeta)
class ActionLogsDataInterface(object):
  """ Interface for code to work with the logs data model. The logs data model consists
      of all access for reading and writing action logs.
  """
  @abstractmethod
  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    """ Looks up all logs between the start_datetime and end_datetime, filtered
        by performer (a user), repository or namespace. Note that one (and only one) of the three
        can be specified. Returns a LogEntriesPage.

        `filter_kinds`, if specified, is a set/list of the kinds of logs to filter out.
        `page_token`, if specified, is the token of a previously returned page from which the
        lookup continues.
    """

  @abstractmethod
  def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None,
                         filter_kinds=None, size=20):
    """ Looks up the latest logs, filtered by performer (a user), repository or namespace.
        Note that one (and only one) of the three can be specified. Returns a list of `Log`.

        `filter_kinds`, if specified, is a set/list of kind names.
        NOTE(review): `size` is presumably the maximum number of logs returned -- confirm
        against the implementations.
    """

  @abstractmethod
  def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None,
                                repository_name=None, namespace_name=None, filter_kinds=None):
    """ Returns the aggregated count of logs, by kind, between the start_datetime and end_datetime,
        filtered by performer (a user), repository or namespace. Note that one (and only one) of
        the three can be specified. Returns a list of AggregatedLogCount.
    """

  @abstractmethod
  def count_repository_actions(self, repository, day):
    """ Returns the total number of repository actions over the given day, in the given
        repository, or None on error.
    """

  @abstractmethod
  def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
                        namespace_name=None, repository_name=None, callback_url=None,
                        callback_email=None, filter_kinds=None):
    """ Queues logs between the start_datetime and end_datetime, filtered by a repository or
        namespace, for export to the specified URL and/or email address. Returns the ID of the
        export job queued or None if error.
    """

  @abstractmethod
  def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None,
                 repository=None, repository_name=None, timestamp=None, is_free_namespace=False):
    """ Logs a single action as having taken place. """

  @abstractmethod
  def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None,
                            namespace_id=None, max_query_time=None):
    """ Returns an iterator that yields bundles of all logs found between the start_datetime and
        end_datetime, optionally filtered by the repository or namespace. This function should be
        used for any bulk lookup operations, and should be implemented by implementors to put
        minimal strain on the backing storage for large operations. If there was an error in setting
        up, returns None.

        If max_query_time is specified, each iteration that yields a log bundle will have its
        queries run with a maximum timeout of that specified, and, if any exceed that threshold,
        LogsIterationTimeout will be raised instead of returning the logs bundle.
    """

  @abstractmethod
  def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation):
    """
    A generator that yields contexts implementing the LogRotationContextInterface.
    Each context represents a set of logs to be archived and deleted once
    the context completes without exceptions.

    For database logs, the LogRotationContext abstracts over a set of rows. When the context
    finishes, its associated rows get deleted.

    For Elasticsearch logs, the LogRotationContext abstracts over indices. When the context
    finishes, its associated index gets deleted.
    """
| 
 | ||||
| 
 | ||||
@add_metaclass(ABCMeta)
class LogRotationContextInterface(object):
  """ Interface for iterating over a set of logs to be archived. """
  @abstractmethod
  def yield_logs_batch(self):
    """
    Generator yielding a batch of logs together with a filename for that batch.
    A batch is a subset of the logs that are part of the context.
    """
							
								
								
									
										27
									
								
								data/logs_model/logs_producer/__init__.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								data/logs_model/logs_producer/__init__.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,27 @@ | |||
| import logging | ||||
| 
 | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| 
 | ||||
class LogSendException(Exception):
  """ Generic failure while sending logs to their destination
  (e.g. Kinesis, Kafka, Elasticsearch, ...).
  """
| 
 | ||||
| 
 | ||||
class LogProducerProxy(object):
  """ Proxy that forwards attribute access to a log producer configured after construction.

      Until `initialize` is called, any attribute lookup that falls through to `__getattr__`
      raises AttributeError.
  """
  def __init__(self):
    self._model = None

  def initialize(self, model):
    """ Sets the producer to which all subsequent attribute lookups are delegated. """
    self._model = model
    logger.info('===============================')
    logger.info('Using producer `%s`', self._model)
    logger.info('===============================')

  def __getattr__(self, attr):
    # __getattr__ only runs when normal lookup fails. Read `_model` from the instance dict
    # rather than via `self._model`, so an instance that never ran __init__ (e.g. one being
    # copied or unpickled) raises AttributeError instead of recursing without bound.
    model = self.__dict__.get('_model')
    if model is None:
      raise AttributeError("LogProducerProxy is not initialized")
    return getattr(model, attr)
							
								
								
									
										25
									
								
								data/logs_model/logs_producer/elasticsearch_logs_producer.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								data/logs_model/logs_producer/elasticsearch_logs_producer.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,25 @@ | |||
| import logging | ||||
| 
 | ||||
| from elasticsearch.exceptions import ElasticsearchException | ||||
| 
 | ||||
| from data.logs_model.logs_producer.interface import LogProducerInterface | ||||
| from data.logs_model.logs_producer import LogSendException | ||||
| 
 | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| 
 | ||||
class ElasticsearchLogsProducer(LogProducerInterface):
  """ Log producer that saves each log entry straight to Elasticsearch.

  No streaming or queueing service sits between this producer and Elasticsearch.
  """
  def send(self, logentry):
    """ Saves the log entry; any failure is logged and re-raised as LogSendException. """
    es_failure = 'ElasticsearchLogsProducer error sending log to Elasticsearch: %s'
    other_failure = 'ElasticsearchLogsProducer exception sending log to Elasticsearch: %s'

    try:
      logentry.save()
    except ElasticsearchException as ex:
      logger.exception(es_failure, ex)
      raise LogSendException(es_failure % ex)
    except Exception as e:
      logger.exception(other_failure, e)
      raise LogSendException(other_failure % e)
							
								
								
									
										8
									
								
								data/logs_model/logs_producer/interface.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								data/logs_model/logs_producer/interface.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,8 @@ | |||
| from abc import ABCMeta, abstractmethod | ||||
| from six import add_metaclass | ||||
| 
 | ||||
@add_metaclass(ABCMeta)
class LogProducerInterface(object):
  """ Interface for producers that deliver log entries to a logging backend. """
  @abstractmethod
  def send(self, logentry):
    """ Send a log entry to the configured log infrastructure. """
							
								
								
									
										45
									
								
								data/logs_model/logs_producer/kafka_logs_producer.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								data/logs_model/logs_producer/kafka_logs_producer.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,45 @@ | |||
| import logging | ||||
| 
 | ||||
| from kafka.errors import KafkaError, KafkaTimeoutError | ||||
| from kafka import KafkaProducer | ||||
| 
 | ||||
| from data.logs_model.shared import epoch_ms | ||||
| from data.logs_model.logs_producer.interface import LogProducerInterface | ||||
| from data.logs_model.logs_producer.util import logs_json_serializer | ||||
| from data.logs_model.logs_producer import LogSendException | ||||
| 
 | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| DEFAULT_MAX_BLOCK_SECONDS = 5 | ||||
| 
 | ||||
| 
 | ||||
class KafkaLogsProducer(LogProducerInterface):
  """ Log producer writing log entries to a Kafka stream. """
  def __init__(self, bootstrap_servers=None, topic=None, client_id=None, max_block_seconds=None):
    self.bootstrap_servers = bootstrap_servers
    self.topic = topic
    self.client_id = client_id
    # KafkaProducer takes its blocking limit in milliseconds.
    self.max_block_ms = (max_block_seconds or DEFAULT_MAX_BLOCK_SECONDS) * 1000

    self._producer = KafkaProducer(bootstrap_servers=self.bootstrap_servers,
                                   client_id=self.client_id,
                                   max_block_ms=self.max_block_ms,
                                   value_serializer=logs_json_serializer)

  def send(self, logentry):
    """ Sends the log entry to the configured topic and waits for the broker's acknowledgement.

        Raises LogSendException if the send times out or fails for any other reason.
    """
    try:
      # send() has a (max_block_ms) timeout and get() has a (max_block_ms) timeout
      # for an upper bound of 2x(max_block_ms) before guaranteed delivery
      future = self._producer.send(self.topic, logentry.to_dict(),
                                   timestamp_ms=epoch_ms(logentry.datetime))

      # The future's get() takes its timeout in seconds, unlike the producer's
      # millisecond-based max_block_ms setting.
      future.get(timeout=self.max_block_ms / 1000.0)

      # `succeeded` is a method and must be called; an unacknowledged record is reported
      # through the same KafkaError path as any other delivery failure.
      if not future.succeeded():
        raise KafkaError('Kafka did not acknowledge the log entry')
    except KafkaTimeoutError as kte:
      logger.exception('KafkaLogsProducer timeout sending log to Kafka: %s', kte)
      raise LogSendException('KafkaLogsProducer timeout sending log to Kafka: %s' % kte)
    except KafkaError as ke:
      logger.exception('KafkaLogsProducer error sending log to Kafka: %s', ke)
      raise LogSendException('KafkaLogsProducer error sending log to Kafka: %s' % ke)
    except Exception as e:
      logger.exception('KafkaLogsProducer exception sending log to Kafka: %s', e)
      raise LogSendException('KafkaLogsProducer exception sending log to Kafka: %s' % e)
|  | @ -0,0 +1,75 @@ | |||
| import logging | ||||
| import hashlib | ||||
| import random | ||||
| 
 | ||||
| import boto3 | ||||
| from botocore.exceptions import ClientError | ||||
| from botocore.client import Config | ||||
| 
 | ||||
| from data.logs_model.logs_producer.interface import LogProducerInterface | ||||
| from data.logs_model.logs_producer.util import logs_json_serializer | ||||
| from data.logs_model.logs_producer import LogSendException | ||||
| 
 | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
KINESIS_PARTITION_KEY_PREFIX = 'logentry_partition_key_'
DEFAULT_CONNECT_TIMEOUT = 5
DEFAULT_READ_TIMEOUT = 5
MAX_RETRY_ATTEMPTS = 5
DEFAULT_MAX_POOL_CONNECTIONS = 10


def _partition_key(number_of_shards=None):
  """ Generate a partition key for AWS Kinesis stream.
  If the number of shards is specified, generate keys where the size of the key space is
  the number of shards. Otherwise the key is derived from 256 random bits.

  Returns the key as a hex-encoded SHA-1 digest.
  """
  if number_of_shards is not None:
    suffix = str(random.randrange(0, number_of_shards))
  else:
    suffix = str(random.getrandbits(256))

  # hashlib digests operate on bytes, so encode the text explicitly; this keeps the function
  # working on Python 3 as well as Python 2.
  return hashlib.sha1((KINESIS_PARTITION_KEY_PREFIX + suffix).encode('utf-8')).hexdigest()
| 
 | ||||
| 
 | ||||
class KinesisStreamLogsProducer(LogProducerInterface):
  """ Log producer that publishes each log entry as a record on an Amazon Kinesis Data Stream. """
  def __init__(self, stream_name, aws_region, aws_access_key=None, aws_secret_key=None,
               connect_timeout=None, read_timeout=None, max_retries=None,
               max_pool_connections=None):
    self._stream_name = stream_name
    self._aws_region = aws_region
    self._aws_access_key = aws_access_key
    self._aws_secret_key = aws_secret_key

    # Any setting left unset (or otherwise falsy) falls back to its module-level default.
    self._connect_timeout = connect_timeout or DEFAULT_CONNECT_TIMEOUT
    self._read_timeout = read_timeout or DEFAULT_READ_TIMEOUT
    self._max_retries = max_retries or MAX_RETRY_ATTEMPTS
    self._max_pool_connections = max_pool_connections or DEFAULT_MAX_POOL_CONNECTIONS

    retry_policy = {'max_attempts': self._max_retries}
    client_config = Config(connect_timeout=self._connect_timeout,
                           read_timeout=self._read_timeout,
                           retries=retry_policy,
                           max_pool_connections=self._max_pool_connections)
    self._producer = boto3.client('kinesis',
                                  use_ssl=True,
                                  region_name=self._aws_region,
                                  aws_access_key_id=self._aws_access_key,
                                  aws_secret_access_key=self._aws_secret_key,
                                  config=client_config)

  def send(self, logentry):
    """ Serializes the log entry and puts it on the stream under a freshly generated
        partition key. Any failure is logged and re-raised as LogSendException.
    """
    client_failure = 'KinesisStreamLogsProducer client error sending log to Kinesis: %s'
    other_failure = 'KinesisStreamLogsProducer exception sending log to Kinesis: %s'

    try:
      payload = logs_json_serializer(logentry)
      self._producer.put_record(StreamName=self._stream_name, Data=payload,
                                PartitionKey=_partition_key())
    except ClientError as ce:
      logger.exception(client_failure, ce)
      raise LogSendException(client_failure % ce)
    except Exception as e:
      logger.exception(other_failure, e)
      raise LogSendException(other_failure % e)
|  | @ -0,0 +1,45 @@ | |||
| # -*- coding: utf-8 -*- | ||||
| 
 | ||||
| import logging | ||||
| import json | ||||
| from datetime import datetime | ||||
| import pytest | ||||
| 
 | ||||
| from data.logs_model.logs_producer.util import logs_json_serializer | ||||
| from data.logs_model.elastic_logs import LogEntry | ||||
| 
 | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| 
 | ||||
# Timestamp shared by every fixture below; the expected outputs embed its ISO8601 form.
TEST_DATETIME = datetime.utcnow()

# Metadata payloads (already JSON-encoded strings), without and with non-ASCII characters.
TEST_JSON_STRING = '{"a": "b", "c": "d"}'
TEST_JSON_STRING_WITH_UNICODE = u'{"éëê": "îôû"}'

# Log entries handed to the serializer under test.
VALID_LOGENTRY = LogEntry(random_id='123-45', ip='0.0.0.0', metadata_json=TEST_JSON_STRING, datetime=TEST_DATETIME)
VALID_LOGENTRY_WITH_UNICODE = LogEntry(random_id='123-45', ip='0.0.0.0', metadata_json=TEST_JSON_STRING_WITH_UNICODE, datetime=TEST_DATETIME)

# Expected serializations, with keys in sorted order and non-ASCII characters escaped.
VALID_LOGENTRY_EXPECTED_OUTPUT = '{"datetime": "%s", "ip": "0.0.0.0", "metadata_json": "{\\"a\\": \\"b\\", \\"c\\": \\"d\\"}", "random_id": "123-45"}' % TEST_DATETIME.isoformat()
VALID_LOGENTRY_WITH_UNICODE_EXPECTED_OUTPUT = '{"datetime": "%s", "ip": "0.0.0.0", "metadata_json": "{\\"\\u00e9\\u00eb\\u00ea\\": \\"\\u00ee\\u00f4\\u00fb\\"}", "random_id": "123-45"}' % TEST_DATETIME.isoformat()
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
  'is_valid, given_input, expected_output',
  [
    # Valid inputs
    pytest.param(True, VALID_LOGENTRY, VALID_LOGENTRY_EXPECTED_OUTPUT),
    # With unicode
    pytest.param(True, VALID_LOGENTRY_WITH_UNICODE, VALID_LOGENTRY_WITH_UNICODE_EXPECTED_OUTPUT),
  ])
def test_logs_json_serializer(is_valid, given_input, expected_output):
  """ Checks that valid log entries serialize to the expected JSON and that invalid ones
      raise ValueError.
  """
  if not is_valid:
    with pytest.raises(ValueError):
      logs_json_serializer(given_input)

    # Nothing was serialized, so there is no output left to inspect.
    return

  data = logs_json_serializer(given_input, sort_keys=True)
  assert data == expected_output

  # Make sure the datetime was serialized in the correct ISO8601
  datetime_str = json.loads(data)['datetime']
  assert datetime_str == TEST_DATETIME.isoformat()
							
								
								
									
										15
									
								
								data/logs_model/logs_producer/util.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								data/logs_model/logs_producer/util.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,15 @@ | |||
| import json | ||||
| from datetime import datetime | ||||
| 
 | ||||
class LogEntryJSONEncoder(json.JSONEncoder):
  """ JSON encoder that renders datetime values as ISO8601 strings. """
  def default(self, obj):
    # Anything that is not a datetime is left to the base encoder, which rejects it.
    if not isinstance(obj, datetime):
      return super(LogEntryJSONEncoder, self).default(obj)

    return obj.isoformat()
| 
 | ||||
def logs_json_serializer(logentry, sort_keys=False):
  """ Serializes the dict form of a LogEntry to ASCII-encoded JSON bytes. """
  document = logentry.to_dict()
  text = json.dumps(document, cls=LogEntryJSONEncoder, ensure_ascii=True, sort_keys=sort_keys)
  return text.encode('ascii')
							
								
								
									
										53
									
								
								data/logs_model/shared.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								data/logs_model/shared.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,53 @@ | |||
| import uuid | ||||
| import json | ||||
| 
 | ||||
| from calendar import timegm | ||||
| 
 | ||||
| from data import model | ||||
| 
 | ||||
| 
 | ||||
class SharedModel:
  """ Behaviour shared between the logs model implementations. """
  def queue_logs_export(self, start_datetime, end_datetime, export_action_logs_queue,
                        namespace_name=None, repository_name=None, callback_url=None,
                        callback_email=None, filter_kinds=None):
    """ Queues an export job for the logs between start_datetime and end_datetime, scoped to a
        namespace and optionally to one of its repositories, to be delivered to the given
        callback URL and/or email address.

        Returns the ID of the queued export job, or None if the namespace or the repository
        could not be found.
    """
    export_id = str(uuid.uuid4())

    namespace = model.user.get_namespace_user(namespace_name)
    if namespace is None:
      return None

    repository = None
    if repository_name is not None:
      repository = model.repository.get_repository(namespace_name, repository_name)
      if repository is None:
        return None

    # Dates are passed along at day granularity.
    job_details = {
      'export_id': export_id,
      'repository_id': repository.id if repository else None,
      'namespace_id': namespace.id,
      'namespace_name': namespace.username,
      'repository_name': repository.name if repository else None,
      'start_time': start_datetime.strftime('%m/%d/%Y'),
      'end_time': end_datetime.strftime('%m/%d/%Y'),
      'callback_url': callback_url,
      'callback_email': callback_email,
    }
    export_action_logs_queue.put([namespace_name], json.dumps(job_details), retries_remaining=3)

    return export_id
| 
 | ||||
| 
 | ||||
def epoch_ms(dt):
  """ Returns `dt` as a whole number of milliseconds since the Unix epoch.

      The datetime's fields are interpreted as UTC (via calendar.timegm) and any
      sub-millisecond precision is truncated. Floor division keeps the result an integer
      on both Python 2 and Python 3.
  """
  whole_seconds = timegm(dt.timetuple())
  return (whole_seconds * 1000) + (dt.microsecond // 1000)
| 
 | ||||
| 
 | ||||
def get_kinds_filter(kinds):
  """ Given a list of kinds, returns the names of all known kinds that are not part of that
      list, i.e. the list of kinds to be filtered out. """
  all_kinds = model.log.get_log_entry_kinds()

  # Keep only the entries whose key is not an integer.
  named_kinds = {}
  for key in all_kinds:
    if isinstance(key, int):
      continue
    named_kinds[key] = all_kinds[key]

  return [name for name in named_kinds if name not in kinds]
							
								
								
									
										291
									
								
								data/logs_model/table_logs_model.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										291
									
								
								data/logs_model/table_logs_model.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,291 @@ | |||
| # pylint: disable=protected-access | ||||
| 
 | ||||
| import logging | ||||
| 
 | ||||
| from datetime import datetime, timedelta | ||||
| 
 | ||||
| from tzlocal import get_localzone | ||||
| from dateutil.relativedelta import relativedelta | ||||
| 
 | ||||
| from data import model | ||||
| from data.model import config | ||||
| from data.database import LogEntry, LogEntry2, LogEntry3, UseThenDisconnect | ||||
| from data.logs_model.interface import ActionLogsDataInterface, LogsIterationTimeout, \ | ||||
|   LogRotationContextInterface | ||||
| from data.logs_model.datatypes import Log, AggregatedLogCount, LogEntriesPage | ||||
| from data.logs_model.shared import SharedModel | ||||
| from data.model.log import get_stale_logs, get_stale_logs_start_id, delete_stale_logs | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
# Bounds, in seconds, on the size of a time range.
# NOTE(review): the code consuming these three constants is outside this part of the file.
MINIMUM_RANGE_SIZE = 1 # second
MAXIMUM_RANGE_SIZE = 60 * 60 * 24 * 30 # seconds ~= 1 month
EXPECTED_ITERATION_LOG_COUNT = 1000


# Log tables consulted by TableLogsModel. `lookup_logs` walks this list in order, starting at
# the first entry, and `lookup_latest_logs` reads only the first entry.
LOG_MODELS = [LogEntry3, LogEntry2, LogEntry]
| 
 | ||||
| 
 | ||||
| class TableLogsModel(SharedModel, ActionLogsDataInterface): | ||||
|   """ | ||||
|   TableLogsModel implements the data model for the logs API backed by a single table | ||||
|   in the database. | ||||
|   """ | ||||
  def __init__(self, should_skip_logging=None, **kwargs):
    """ Stores `should_skip_logging` on the instance; any other keyword arguments are
        accepted and ignored.
    """
    self._should_skip_logging = should_skip_logging
| 
 | ||||
  def lookup_logs(self, start_datetime, end_datetime, performer_name=None, repository_name=None,
                  namespace_name=None, filter_kinds=None, page_token=None, max_page_count=None):
    """ Looks up one page of logs between start_datetime and end_datetime, moving through the
        tables in LOG_MODELS in order, and returns a LogEntriesPage built from the logs found
        and the token to pass back in for the next page.

        Asserts that both datetimes are given, that every entry of `filter_kinds` is a `str`,
        and that any named repository or performer exists.
    """
    if filter_kinds is not None:
      assert all(isinstance(kind_name, str) for kind_name in filter_kinds)

    assert start_datetime is not None
    assert end_datetime is not None

    repository = None
    if repository_name and namespace_name:
      repository = model.repository.get_repository(namespace_name, repository_name)
      assert repository

    performer = None
    if performer_name:
      performer = model.user.get_user(performer_name)
      assert performer

    def get_logs(m, page_token):
      # Runs the filtered query against a single log table `m` and returns one page of it
      # (limit of 20 entries, descending by `datetime`) plus that table's next-page token.
      logs_query = model.log.get_logs_query(start_datetime, end_datetime, performer=performer,
                                            repository=repository, namespace=namespace_name,
                                            ignore=filter_kinds, model=m)

      logs, next_page_token = model.modelutil.paginate(logs_query, m,
                                                       descending=True,
                                                       page_token=page_token,
                                                       limit=20,
                                                       max_page=max_page_count,
                                                       sort_field_name='datetime')

      return logs, next_page_token

    # Key under which a page token records the index (into LOG_MODELS) of the table to query.
    TOKEN_TABLE_ID = 'tti'
    table_index = 0
    logs = []
    next_page_token = page_token or None

    # Skip empty pages (empty table)
    while len(logs) == 0 and table_index < len(LOG_MODELS) - 1:
      # A token carrying a table index selects which table this iteration queries.
      table_specified = next_page_token is not None and next_page_token.get(TOKEN_TABLE_ID) is not None
      if table_specified:
        table_index = next_page_token.get(TOKEN_TABLE_ID)

      logs_result, next_page_token = get_logs(LOG_MODELS[table_index], next_page_token)
      logs.extend(logs_result)

      # The current table has no further page: if later tables remain, hand back a token that
      # points at the next table so the lookup continues there.
      # NOTE(review): tokens returned by paginate are passed through unchanged; whether they
      # retain the table index is not visible here -- confirm against modelutil.paginate.
      if next_page_token is None and table_index < len(LOG_MODELS) - 1:
        next_page_token = {TOKEN_TABLE_ID: table_index + 1}

    return LogEntriesPage([Log.for_logentry(log) for log in logs], next_page_token)
| 
 | ||||
|   def lookup_latest_logs(self, performer_name=None, repository_name=None, namespace_name=None, | ||||
|                          filter_kinds=None, size=20): | ||||
|     if filter_kinds is not None: | ||||
|       assert all(isinstance(kind_name, str) for kind_name in filter_kinds) | ||||
| 
 | ||||
|     repository = None | ||||
|     if repository_name and namespace_name: | ||||
|       repository = model.repository.get_repository(namespace_name, repository_name) | ||||
|       assert repository | ||||
| 
 | ||||
|     performer = None | ||||
|     if performer_name: | ||||
|       performer = model.user.get_user(performer_name) | ||||
|       assert performer | ||||
| 
 | ||||
|     def get_latest_logs(m): | ||||
|       logs_query = model.log.get_latest_logs_query(performer=performer, repository=repository, | ||||
|                                                    namespace=namespace_name, ignore=filter_kinds, | ||||
|                                                    model=m, size=size) | ||||
| 
 | ||||
|       logs = list(logs_query) | ||||
|       return [Log.for_logentry(log) for log in logs] | ||||
| 
 | ||||
|     return get_latest_logs(LOG_MODELS[0]) | ||||
| 
 | ||||
|   def get_aggregated_log_counts(self, start_datetime, end_datetime, performer_name=None, | ||||
|                                 repository_name=None, namespace_name=None, filter_kinds=None): | ||||
|     if filter_kinds is not None: | ||||
|       assert all(isinstance(kind_name, str) for kind_name in filter_kinds) | ||||
| 
 | ||||
|     if end_datetime - start_datetime >= timedelta(weeks=4): | ||||
|       raise Exception('Cannot lookup aggregated logs over a period longer than a month') | ||||
| 
 | ||||
|     repository = None | ||||
|     if repository_name and namespace_name: | ||||
|       repository = model.repository.get_repository(namespace_name, repository_name) | ||||
| 
 | ||||
|     performer = None | ||||
|     if performer_name: | ||||
|       performer = model.user.get_user(performer_name) | ||||
| 
 | ||||
|     entries = {} | ||||
|     for log_model in LOG_MODELS: | ||||
|       aggregated = model.log.get_aggregated_logs(start_datetime, end_datetime, | ||||
|                                                  performer=performer, | ||||
|                                                  repository=repository, | ||||
|                                                  namespace=namespace_name, | ||||
|                                                  ignore=filter_kinds, | ||||
|                                                  model=log_model) | ||||
| 
 | ||||
|       for entry in aggregated: | ||||
|         synthetic_date = datetime(start_datetime.year, start_datetime.month, int(entry.day), | ||||
|                                   tzinfo=get_localzone()) | ||||
|         if synthetic_date.day < start_datetime.day: | ||||
|           synthetic_date = synthetic_date + relativedelta(months=1) | ||||
| 
 | ||||
|         key = '%s-%s' % (entry.kind_id, entry.day) | ||||
| 
 | ||||
|         if key in entries: | ||||
|           entries[key] = AggregatedLogCount(entry.kind_id, entry.count + entries[key].count, | ||||
|                                             synthetic_date) | ||||
|         else: | ||||
|           entries[key] = AggregatedLogCount(entry.kind_id, entry.count, synthetic_date) | ||||
| 
 | ||||
|     return entries.values() | ||||
| 
 | ||||
|   def count_repository_actions(self, repository, day): | ||||
|     return model.repositoryactioncount.count_repository_actions(repository, day) | ||||
| 
 | ||||
|   def log_action(self, kind_name, namespace_name=None, performer=None, ip=None, metadata=None, | ||||
|                  repository=None, repository_name=None, timestamp=None, is_free_namespace=False): | ||||
|     if self._should_skip_logging and self._should_skip_logging(kind_name, namespace_name, | ||||
|                                                                is_free_namespace): | ||||
|       return | ||||
| 
 | ||||
|     if repository_name is not None: | ||||
|       assert repository is None | ||||
|       assert namespace_name is not None | ||||
|       repository = model.repository.get_repository(namespace_name, repository_name) | ||||
| 
 | ||||
|     model.log.log_action(kind_name, namespace_name, performer=performer, repository=repository, | ||||
|                          ip=ip, metadata=metadata or {}, timestamp=timestamp) | ||||
| 
 | ||||
|   def yield_logs_for_export(self, start_datetime, end_datetime, repository_id=None, | ||||
|                             namespace_id=None, max_query_time=None): | ||||
|     # Using an adjusting scale, start downloading log rows in batches, starting at | ||||
|     # MINIMUM_RANGE_SIZE and doubling until we've reached EXPECTED_ITERATION_LOG_COUNT or | ||||
|     # the lookup range has reached MAXIMUM_RANGE_SIZE. If at any point this operation takes | ||||
|     # longer than the MAXIMUM_WORK_PERIOD_SECONDS, terminate the batch operation as timed out. | ||||
|     batch_start_time = datetime.utcnow() | ||||
| 
 | ||||
|     current_start_datetime = start_datetime | ||||
|     current_batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE) | ||||
| 
 | ||||
|     while current_start_datetime < end_datetime: | ||||
|       # Verify we haven't been working for too long. | ||||
|       work_elapsed = datetime.utcnow() - batch_start_time | ||||
|       if max_query_time is not None and work_elapsed > max_query_time: | ||||
|         logger.error('Retrieval of logs `%s/%s` timed out with time of `%s`', | ||||
|                      namespace_id, repository_id, work_elapsed) | ||||
|         raise LogsIterationTimeout() | ||||
| 
 | ||||
|       current_end_datetime = current_start_datetime + current_batch_size | ||||
|       current_end_datetime = min(current_end_datetime, end_datetime) | ||||
| 
 | ||||
|       # Load the next set of logs. | ||||
|       def load_logs(): | ||||
|         logger.debug('Retrieving logs over range %s -> %s with namespace %s and repository %s', | ||||
|                      current_start_datetime, current_end_datetime, namespace_id, repository_id) | ||||
| 
 | ||||
|         logs_query = model.log.get_logs_query(namespace=namespace_id, | ||||
|                                               repository=repository_id, | ||||
|                                               start_time=current_start_datetime, | ||||
|                                               end_time=current_end_datetime) | ||||
|         logs = list(logs_query) | ||||
|         for log in logs: | ||||
|           if namespace_id is not None: | ||||
|             assert log.account_id == namespace_id | ||||
| 
 | ||||
|           if repository_id is not None: | ||||
|             assert log.repository_id == repository_id | ||||
| 
 | ||||
|         logs = [Log.for_logentry(log) for log in logs] | ||||
|         return logs | ||||
| 
 | ||||
|       logs, elapsed = _run_and_time(load_logs) | ||||
|       if max_query_time is not None and elapsed > max_query_time: | ||||
|         logger.error('Retrieval of logs for export `%s/%s` with range `%s-%s` timed out at `%s`', | ||||
|                      namespace_id, repository_id, current_start_datetime, current_end_datetime, | ||||
|                      elapsed) | ||||
|         raise LogsIterationTimeout() | ||||
| 
 | ||||
|       yield logs | ||||
| 
 | ||||
|       # Move forward. | ||||
|       current_start_datetime = current_end_datetime | ||||
| 
 | ||||
|       # Increase the batch size if necessary. | ||||
|       if len(logs) < EXPECTED_ITERATION_LOG_COUNT: | ||||
|         seconds = min(MAXIMUM_RANGE_SIZE, current_batch_size.total_seconds() * 2) | ||||
|         current_batch_size = timedelta(seconds=seconds) | ||||
| 
 | ||||
|   def yield_log_rotation_context(self, cutoff_date, min_logs_per_rotation): | ||||
|     """ Yield a context manager for a group of outdated logs. """ | ||||
|     for log_model in LOG_MODELS: | ||||
|       while True: | ||||
|         with UseThenDisconnect(config.app_config): | ||||
|           start_id = get_stale_logs_start_id(log_model) | ||||
| 
 | ||||
|           if start_id is None: | ||||
|             logger.warning('Failed to find start id') | ||||
|             break | ||||
| 
 | ||||
|           logger.debug('Found starting ID %s', start_id) | ||||
|           lookup_end_id = start_id + min_logs_per_rotation | ||||
|           logs = [log for log in get_stale_logs(start_id, lookup_end_id, | ||||
|                                                 log_model, cutoff_date)] | ||||
| 
 | ||||
|         if not logs: | ||||
|           logger.debug('No further logs found') | ||||
|           break | ||||
| 
 | ||||
|         end_id = max([log.id for log in logs]) | ||||
|         context = DatabaseLogRotationContext(logs, log_model, start_id, end_id) | ||||
|         yield context | ||||
| 
 | ||||
| 
 | ||||
def _run_and_time(fn):
  """ Calls `fn` with no arguments and returns a tuple of its result and the elapsed
      time, as a timedelta measured with `datetime.utcnow()`.
  """
  began = datetime.utcnow()
  outcome = fn()
  elapsed = datetime.utcnow() - began
  return outcome, elapsed
| 
 | ||||
| 
 | ||||
# Module-level TableLogsModel instance, created when this module is imported.
table_logs_model = TableLogsModel()
| 
 | ||||
| 
 | ||||
class DatabaseLogRotationContext(LogRotationContextInterface):
  """
  DatabaseLogRotationContext represents a batch of logs to be archived together,
  i.e. a set of logs to be archived in the same file (based on the number of logs per rotation).

  When the context exits without an exception, the stale logs in rows `start_id` to
  `end_id` are deleted.
  """
  def __init__(self, logs, log_model, start_id, end_id):
    self.logs = logs
    self.log_model = log_model
    self.start_id = start_id
    self.end_id = end_id

  def __enter__(self):
    return self

  def __exit__(self, ex_type, ex_value, ex_traceback):
    # Leave the rows in place if the body of the `with` block failed in any way.
    if ex_type is not None or ex_value is not None or ex_traceback is not None:
      return

    with UseThenDisconnect(config.app_config):
      logger.debug('Deleting logs from IDs %s to %s', self.start_id, self.end_id)
      delete_stale_logs(self.start_id, self.end_id, self.log_model)

  def yield_logs_batch(self):
    """ Yield a batch of logs and a filename for that batch. """
    model_name = self.log_model.__name__.lower()
    filename = '%d-%d-%s.txt.gz' % (self.start_id, self.end_id, model_name)
    yield self.logs, filename
							
								
								
									
										0
									
								
								data/logs_model/test/__init__.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								data/logs_model/test/__init__.py
									
										
									
									
									
										Normal file
									
								
							
							
								
								
									
										390
									
								
								data/logs_model/test/fake_elasticsearch.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										390
									
								
								data/logs_model/test/fake_elasticsearch.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,390 @@ | |||
| import json | ||||
| import uuid | ||||
| import fnmatch | ||||
| 
 | ||||
| from collections import defaultdict | ||||
| from contextlib import contextmanager | ||||
| from datetime import datetime | ||||
| 
 | ||||
| import dateutil.parser | ||||
| 
 | ||||
| from httmock import urlmatch, HTTMock | ||||
| 
 | ||||
# Hostname matched by every request handler registered in `fake_elasticsearch`.
FAKE_ES_HOST = 'fakees'

# Search response body without any hits; stored as the result of every scroll that
# `fake_elasticsearch` creates.
EMPTY_RESULT = {
  'hits': {'hits': [], 'total': 0},
  '_shards': {'successful': 1, 'total': 1},
}
| 
 | ||||
def parse_query(query):
  """ Parses a URL query string (`a=1&b=2`) into a dict of parameter name to value.

      Returns an empty dict for an empty or missing query. A parameter without `=` maps to
      the empty string, only the first `=` separates name from value, and empty segments
      (as in `a=1&&b=2`) are ignored. Names and values are NOT URL-decoded.
  """
  if not query:
    return {}

  params = {}
  for pair in query.split('&'):
    if not pair:
      continue

    # partition() never raises, unlike indexing the result of split('=').
    name, _, value = pair.partition('=')
    params[name] = value

  return params
| 
 | ||||
| 
 | ||||
@contextmanager
def fake_elasticsearch(allow_wildcard=True):
  """
  Context manager which intercepts HTTP requests to `FAKE_ES_HOST` and answers them from a
  small in-memory imitation of the Elasticsearch REST API, for use in tests.

  Handled endpoints: index template GET/PUT, document POST, index lookup and DELETE (index
  names are matched fnmatch-style), `_count`, `_search` (bool/range/term/terms queries,
  sorting, `search_after`, `size`, terms/date_histogram aggregations) and scroll lookup and
  DELETE. Any other request to the host receives a 501.

  If `allow_wildcard` is False, a query against an index pattern containing `*` matches no
  documents.
  """
  templates = {}
  docs = defaultdict(list)
  scrolls = {}

  # `unicode` on Python 2, `str` on Python 3: the type json.loads produces for strings.
  text_type = type(u'')

  def transform(value, field_name):
    # TODO: implement this using a real index template if we ever need more than a few
    # fields here.
    if field_name == 'datetime':
      if isinstance(value, int):
        # Integer datetimes are epoch milliseconds; floor to whole seconds.
        return datetime.utcfromtimestamp(value // 1000)

      return dateutil.parser.parse(value)

    return value

  @urlmatch(netloc=FAKE_ES_HOST, path=r'/_template/(.+)', method='GET')
  def get_template(url, request):
    # `url` is a split-URL tuple; the name must be sliced from its path, not from the tuple.
    template_name = url.path[len('/_template/'):]
    if template_name in templates:
      return {'status_code': 200}

    return {'status_code': 404}

  @urlmatch(netloc=FAKE_ES_HOST, path=r'/_template/(.+)', method='PUT')
  def put_template(url, request):
    template_name = url.path[len('/_template/'):]
    templates[template_name] = True
    return {'status_code': 201}

  @urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)/_doc', method='POST')
  def post_doc(url, request):
    index_name, _ = url.path[1:].split('/')
    item = json.loads(request.body)
    # The document ID is taken from the document's own `random_id` field.
    item['_id'] = item['random_id']
    docs[index_name].append(item)
    return {
      'status_code': 204,
      'headers': {
        'Content-Type': 'application/json',
      },
      'content': json.dumps({
        "result": "created",
      }),
    }

  @urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)$', method='DELETE')
  def index_delete(url, request):
    index_name_or_pattern = url.path[1:]

    # Collect the names first so the dict is not modified while it is being iterated.
    to_delete = [index_name for index_name in docs
                 if fnmatch.fnmatch(index_name, index_name_or_pattern)]
    for index_name in to_delete:
      docs.pop(index_name)

    return {
      'status_code': 200,
      'headers': {
        'Content-Type': 'application/json',
      },
      'content': {'acknowledged': True}
    }

  @urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)$', method='GET')
  def index_lookup(url, request):
    index_name_or_pattern = url.path[1:]
    found = {}
    for index_name in docs:
      if fnmatch.fnmatch(index_name, index_name_or_pattern):
        found[index_name] = {}

    if not found:
      return {
        'status_code': 404,
      }

    return {
      'status_code': 200,
      'headers': {
        'Content-Type': 'application/json',
      },
      'content': json.dumps(found),
    }

  def _match_query(index_name_or_pattern, query):
    """ Returns a tuple of (matching hits, whether the index lookup should be treated as
        successful). A wildcard pattern is always treated as successful.
    """
    def _is_match(doc, current_query):
      if current_query is None:
        return True

      for filter_type, filter_params in current_query.items():
        for field_name, filter_props in filter_params.items():
          if filter_type == 'range':
            lt = transform(filter_props['lt'], field_name)
            gte = transform(filter_props['gte'], field_name)
            doc_value = transform(doc[field_name], field_name)
            if not (doc_value < lt and doc_value >= gte):
              return False
          elif filter_type == 'term':
            doc_value = transform(doc[field_name], field_name)
            return doc_value == filter_props
          elif filter_type == 'terms':
            doc_value = transform(doc[field_name], field_name)
            return doc_value in filter_props
          elif filter_type == 'bool':
            # For `bool`, the inner loop walks the clause names (must/must_not/filter), so
            # the checks below are repeated once per clause; they are pure, so this only
            # costs time.
            assert not 'should' in filter_params, 'should is unsupported'

            must = filter_params.get('must')
            must_not = filter_params.get('must_not')
            filter_bool = filter_params.get('filter')

            if must:
              for check in must:
                if not _is_match(doc, check):
                  return False

            if must_not:
              for check in must_not:
                if _is_match(doc, check):
                  return False

            if filter_bool:
              for check in filter_bool:
                if not _is_match(doc, check):
                  return False
          else:
            raise Exception('Unimplemented query %s: %s' % (filter_type, query))

      return True

    is_wildcard = index_name_or_pattern.find('*') >= 0
    found = []
    found_index = False

    # With wildcards disallowed, a wildcard pattern matches no index at all.
    if allow_wildcard or not is_wildcard:
      for index_name in docs:
        if not fnmatch.fnmatch(index_name, index_name_or_pattern):
          continue

        found_index = True
        for doc in docs[index_name]:
          if _is_match(doc, query):
            found.append({'_source': doc, '_index': index_name})

    return found, found_index or is_wildcard

  @urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)/_count$', method='GET')
  def count_docs(url, request):
    body = json.loads(request.body)
    index_name_or_pattern, _ = url.path[1:].split('/')

    found, found_index = _match_query(index_name_or_pattern, body['query'])
    if not found_index:
      return {
        'status_code': 404,
      }

    return {
      'status_code': 200,
      'headers': {
        'Content-Type': 'application/json',
      },
      'content': json.dumps({'count': len(found)}),
    }

  @urlmatch(netloc=FAKE_ES_HOST, path=r'/_search/scroll$', method='GET')
  def lookup_scroll(url, request):
    request_obj = json.loads(request.body)
    scroll_id = request_obj['scroll_id']
    if scroll_id in scrolls:
      return {
        'status_code': 200,
        'headers': {
          'Content-Type': 'application/json',
        },
        'content': json.dumps(scrolls[scroll_id]),
      }

    return {
      'status_code': 404,
    }

  @urlmatch(netloc=FAKE_ES_HOST, path=r'/_search/scroll$', method='DELETE')
  def delete_scroll(url, request):
    body = json.loads(request.body)
    for scroll_id in body['scroll_id']:
      scrolls.pop(scroll_id, None)

    # NOTE(review): a 404 is returned even when scrolls were removed. Left unchanged in case
    # callers depend on it -- confirm whether a 200 was intended.
    return {
      'status_code': 404,
    }

  @urlmatch(netloc=FAKE_ES_HOST, path=r'/([^/]+)/_search$', method='GET')
  def lookup_docs(url, request):
    query_params = parse_query(url.query)

    body = json.loads(request.body)
    index_name_or_pattern, _ = url.path[1:].split('/')

    # Find matching docs.
    query = body.get('query')
    found, found_index = _match_query(index_name_or_pattern, query)
    if not found_index:
      return {
        'status_code': 404,
      }

    # Sort.
    sort = body.get('sort')
    if sort:
      if sort == ['_doc'] or sort == '_doc':
        found.sort(key=lambda x: x['_source']['_id'])
      else:
        def get_sort_key(item):
          # Builds one string out of all sort fields; only descending sorts are supported.
          source = item['_source']
          key = ''
          for sort_config in sort:
            for sort_key, direction in sort_config.items():
              assert direction == 'desc'
              sort_key = sort_key.replace('.keyword', '')
              key += str(transform(source[sort_key], sort_key))
              key += '|'
          return key

        found.sort(key=get_sort_key, reverse=True)

    # Search after.
    search_after = body.get('search_after')
    if search_after:
      sort_fields = []
      for sort_config in sort:
        if isinstance(sort_config, text_type):
          sort_fields.append(sort_config)
          continue

        for sort_key, _ in sort_config.items():
          sort_key = sort_key.replace('.keyword', '')
          sort_fields.append(sort_key)

      for index, search_after_value in enumerate(search_after):
        field_name = sort_fields[index]
        value = transform(search_after_value, field_name)
        if field_name == '_doc':
          found = [f for f in found if transform(f['_source']['_id'], field_name) > value]
        else:
          found = [f for f in found if transform(f['_source'][field_name], field_name) < value]
        if len(found) < 2:
          break

        # NOTE(review): both comparisons below put a single field value against the whole
        # `_source` dict of the second hit, so they are always unequal and the loop stops
        # after the first sort field. This looks like an intended tie check between the
        # first two hits; left unchanged because existing tests may rely on it -- confirm.
        if field_name == '_doc':
          if found[0]['_source']['_id'] != found[1]['_source']:
            break
        else:
          if found[0]['_source'][field_name] != found[1]['_source']:
            break

    # Size. A size of 0 is falsy, so it leaves the hits untouched for the aggregation below.
    size = body.get('size')
    if size:
      found = found[0:size]

    # Aggregation.
    # {u'query':
    #   {u'range':
    #     {u'datetime': {u'lt': u'2019-06-27T15:45:09.768085',
    #                    u'gte': u'2019-06-27T15:35:09.768085'}}},
    #      u'aggs': {
    #         u'by_id': {
    #           u'terms': {u'field': u'kind_id'},
    #           u'aggs': {
    #             u'by_date': {u'date_histogram': {u'field': u'datetime', u'interval': u'day'}}}}},
    #   u'size': 0}
    def _by_field(agg_field_params, results):
      aggregated_by_field = defaultdict(list)

      for agg_means, agg_means_params in agg_field_params.items():
        if agg_means == 'terms':
          field_name = agg_means_params['field']
          for result in results:
            value = result['_source'][field_name]
            aggregated_by_field[value].append(result)
        elif agg_means == 'date_histogram':
          field_name = agg_means_params['field']
          interval = agg_means_params['interval']
          for result in results:
            value = transform(result['_source'][field_name], field_name)
            # The interval name (e.g. `day`) is read as an attribute of the datetime.
            aggregated_by_field[getattr(value, interval)].append(result)
        elif agg_means == 'aggs':
          # Skip. Handled below.
          continue
        else:
          raise Exception('Unsupported aggregation method: %s' % agg_means)

      # Invoke the aggregation recursively.
      buckets = []
      for field_value, field_results in aggregated_by_field.items():
        aggregated = _aggregate(agg_field_params, field_results)
        if isinstance(aggregated, list):
          # No nested aggregation: the bucket only reports how many results it holds.
          aggregated = {'doc_count': len(aggregated)}

        aggregated['key'] = field_value
        buckets.append(aggregated)

      return {'buckets': buckets}

    def _aggregate(query_config, results):
      agg_params = query_config.get(u'aggs')
      if not agg_params:
        return results

      by_field_name = {}
      for agg_field_name, agg_field_params in agg_params.items():
        by_field_name[agg_field_name] = _by_field(agg_field_params, results)

      return by_field_name

    final_result = {
      'hits': {
        'hits': found,
        'total': len(found),
      },
      '_shards': {
        'successful': 1,
        'total': 1,
      },
      'aggregations': _aggregate(body, found),
    }

    if query_params.get('scroll'):
      # Every hit is returned up front; following the scroll yields an empty page.
      scroll_id = str(uuid.uuid4())
      scrolls[scroll_id] = EMPTY_RESULT
      final_result['_scroll_id'] = scroll_id

    return {
      'status_code': 200,
      'headers': {
        'Content-Type': 'application/json',
      },
      'content': json.dumps(final_result),
    }

  @urlmatch(netloc=FAKE_ES_HOST)
  def catchall_handler(url, request):
    print("Unsupported URL: %s %s" % (request.method, url, ))
    return {'status_code': 501}

  # The catch-all handler comes last so the specific handlers are listed ahead of it.
  handlers = [get_template, put_template, index_delete, index_lookup, post_doc, count_docs,
              lookup_docs, lookup_scroll, delete_scroll, catchall_handler]

  with HTTMock(*handlers):
    yield
							
								
								
									
										400
									
								
								data/logs_model/test/mock_elasticsearch.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										400
									
								
								data/logs_model/test/mock_elasticsearch.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,400 @@ | |||
| # -*- coding: utf-8 -*- | ||||
| import json | ||||
| 
 | ||||
| from datetime import datetime | ||||
| from dateutil.parser import parse | ||||
| 
 | ||||
| from data.logs_model.datatypes import LogEntriesPage, Log, AggregatedLogCount | ||||
| 
 | ||||
| 
 | ||||
def _status(d, code=200):
  """ Builds a mock HTTP response dict with status `code` and `d` serialized as JSON content. """
  serialized = json.dumps(d)
  return {"status_code": code, "content": serialized}
| 
 | ||||
| 
 | ||||
def _shards(d, total=5, failed=0, successful=5):
  """ Sets the `_shards` entry of `d` (in place) to the given shard counts and returns `d`. """
  d["_shards"] = {"total": total, "failed": failed, "successful": successful}
  return d
| 
 | ||||
| 
 | ||||
def _hits(hits):
  """ Wraps a list of hits in a search-response style `hits` envelope with its total count. """
  envelope = {"total": len(hits), "max_score": None, "hits": hits}
  return {"hits": envelope}
| 
 | ||||
| 
 | ||||
# Mock index-listing responses: the JSON content maps index names to empty objects. This
# one lists exactly the indices that `_hit1` and `_hit2` belong to.
INDEX_LIST_RESPONSE_HIT1_HIT2 = _status({
  "logentry_2018-03-08": {},
  "logentry_2018-04-02": {}
})


# Lists only the index of `_hit2`.
INDEX_LIST_RESPONSE_HIT2 = _status({
  "logentry_2018-04-02": {}
})


# Lists four indices, including the two targeted by the INDEX_REQUEST_* fixtures.
INDEX_LIST_RESPONSE = _status({
  "logentry_2019-01-01": {},
  "logentry_2017-03-08": {},
  "logentry_2018-03-08": {},
  "logentry_2018-04-02": {}
})


# Mock acknowledgement response.
DEFAULT_TEMPLATE_RESPONSE = _status({"acknowledged": True})
# Mock "created" responses for a document indexed into the named index.
INDEX_RESPONSE_2019_01_01 = _status(
  _shards({
    "_index": "logentry_2019-01-01",
    "_type": "_doc",
    "_id": "1",
    "_version": 1,
    "_seq_no": 0,
    "_primary_term": 1,
    "result": "created"
  }))

INDEX_RESPONSE_2017_03_08 = _status(
  _shards({
    "_index": "logentry_2017-03-08",
    "_type": "_doc",
    "_id": "1",
    "_version": 1,
    "_seq_no": 0,
    "_primary_term": 1,
    "result": "created"
  }))

# Mock response with an empty JSON object and status code 400.
FAILURE_400 = _status({}, 400)
| 
 | ||||
# Each INDEX_REQUEST_* fixture is a two-item list: an index name followed by a document.
# The index name carries the same date as the document's `datetime`. `metadata_json` is
# decoded here, so the document holds it as a dict rather than as a JSON string.
INDEX_REQUEST_2019_01_01 = [
  "logentry_2019-01-01", {
    "account_id":
      1,
    "repository_id":
      1,
    "ip":
      "192.168.1.1",
    "random_id":
      233,
    "datetime":
      "2019-01-01T03:30:00",
    "metadata_json": json.loads("{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}"),
    "performer_id":
      1,
    "kind_id":
      1
  }
]

INDEX_REQUEST_2017_03_08 = [
  "logentry_2017-03-08", {
    "repository_id":
      1,
    "account_id":
      1,
    "ip":
      "192.168.1.1",
    "random_id":
      233,
    "datetime":
      "2017-03-08T03:30:00",
    "metadata_json": json.loads("{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}"),
    "performer_id":
      1,
    "kind_id":
      2
  }
]
| 
 | ||||
# Raw search hits. Here `metadata_json` is kept as a JSON string. The `sort` values are the
# hit's `datetime` in epoch milliseconds (read as UTC) followed by its `random_id`.
_hit1 = {
  "_index": "logentry_2018-03-08",
  "_type": "doc",
  "_id": "1",
  "_score": None,
  "_source": {
    "random_id":
      233,
    "kind_id":
      1,
    "account_id":
      1,
    "performer_id":
      1,
    "repository_id":
      1,
    "ip":
      "192.168.1.1",
    "metadata_json":
      "{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}",
    "datetime":
      "2018-03-08T03:30",
  },
  "sort": [1520479800000, 233]
}

_hit2 = {
  "_index": "logentry_2018-04-02",
  "_type": "doc",
  "_id": "2",
  "_score": None,
  "_source": {
    "random_id":
      233,
    "kind_id":
      2,
    "account_id":
      1,
    "performer_id":
      1,
    "repository_id":
      1,
    "ip":
      "192.168.1.2",
    "metadata_json":
      "{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1522639800}",
    "datetime":
      "2018-04-02T03:30",
  },
  "sort": [1522639800000, 233]
}

# `Log` values sharing the metadata string, IP and datetime of `_hit1` and `_hit2`
# respectively. The remaining positional arguments are placeholder strings and a trailing
# integer -- presumably user/account fields and the kind ID; verify against the `Log`
# definition.
_log1 = Log(
  "{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1520479800}",
  "192.168.1.1", parse("2018-03-08T03:30"), "user1.email", "user1.username", "user1.robot",
  "user1.organization", "user1.username", "user1.email", "user1.robot", 1)
_log2 = Log(
  "{\"\\ud83d\\ude02\": \"\\ud83d\\ude02\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\\ud83d\\udc4c\", \"key\": \"value\", \"time\": 1522639800}",
  "192.168.1.2", parse("2018-04-02T03:30"), "user1.email", "user1.username", "user1.robot",
  "user1.organization", "user1.username", "user1.email", "user1.robot", 2)
| 
 | ||||
| SEARCH_RESPONSE_START = _status(_shards(_hits([_hit1, _hit2]))) | ||||
| SEARCH_RESPONSE_END = _status(_shards(_hits([_hit2]))) | ||||
# Request body expected for the first page: no `search_after` cursor.
SEARCH_REQUEST_START = {
  "sort": [{"datetime": "desc"}, {"random_id.keyword": "desc"}],
  "query": {
    "bool": {
      "filter": [
        {"term": {"performer_id": 1}},
        {"term": {"repository_id": 1}},
      ],
    },
  },
  "size": 2,
}

# Request body expected when resuming from a page token: same query plus a
# `search_after` cursor of [datetime in epoch milliseconds, random_id].
SEARCH_REQUEST_END = {
  "sort": [{"datetime": "desc"}, {"random_id.keyword": "desc"}],
  "query": {
    "bool": {
      "filter": [
        {"term": {"performer_id": 1}},
        {"term": {"repository_id": 1}},
      ],
    },
  },
  "search_after": [1520479800000, 233],
  "size": 2,
}

# Request body expected when a kind filter is given: kind id 1 is excluded via `must_not`.
SEARCH_REQUEST_FILTER = {
  "sort": [{"datetime": "desc"}, {"random_id.keyword": "desc"}],
  "query": {
    "bool": {
      "filter": [
        {"term": {"performer_id": 1}},
        {"term": {"repository_id": 1}},
        {"bool": {"must_not": [{"terms": {"kind_id": [1]}}]}},
      ],
    },
  },
  "size": 2,
}
# Page token pointing just past the first hit (its datetime and random id), page number 1.
SEARCH_PAGE_TOKEN = dict(
  datetime=datetime(2018, 3, 8, 3, 30).isoformat(),
  random_id=233,
  page_number=1,
)
# Pages the lookup is expected to return: the first page links to the next one through
# SEARCH_PAGE_TOKEN; the last page and the empty page carry no token.
SEARCH_PAGE_START = LogEntriesPage(logs=[_log1], next_page_token=SEARCH_PAGE_TOKEN)
SEARCH_PAGE_END = LogEntriesPage(logs=[_log2], next_page_token=None)
SEARCH_PAGE_EMPTY = LogEntriesPage([], None)
| 
 | ||||
# Mocked aggregation response: counts bucketed by kind id ("by_id") and, within each kind,
# by day ("by_date"). One daily bucket deliberately has a doc_count of 0.
AGGS_RESPONSE = _status(_shards({
  "hits": {"total": 4, "max_score": None, "hits": []},
  "aggregations": {
    "by_id": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        {
          "key": 2,
          "doc_count": 3,
          "by_date": {
            "buckets": [
              {"key_as_string": "2009-11-12T00:00:00.000Z", "key": 1257984000000, "doc_count": 1},
              {"key_as_string": "2009-11-13T00:00:00.000Z", "key": 1258070400000, "doc_count": 0},
              {"key_as_string": "2009-11-14T00:00:00.000Z", "key": 1258156800000, "doc_count": 2},
            ],
          },
        },
        {
          "key": 1,
          "doc_count": 1,
          "by_date": {
            "buckets": [
              {"key_as_string": "2009-11-15T00:00:00.000Z", "key": 1258243200000, "doc_count": 1},
            ],
          },
        },
      ],
    },
  },
}))
| 
 | ||||
# Aggregation request body expected for the "Valid Counts" case: performer/repository
# filters, kind id 2 excluded, a datetime range, and a per-kind daily date histogram.
AGGS_REQUEST = {
  "query": {
    "bool": {
      "filter": [
        {"term": {"performer_id": 1}},
        {"term": {"repository_id": 1}},
        {"bool": {"must_not": [{"terms": {"kind_id": [2]}}]}},
      ],
      "must": [
        {"range": {"datetime": {"lt": "2018-04-08T03:30:00", "gte": "2018-03-08T03:30:00"}}},
      ],
    },
  },
  "aggs": {
    "by_id": {
      "terms": {"field": "kind_id"},
      "aggs": {
        "by_date": {"date_histogram": {"field": "datetime", "interval": "day"}},
      },
    },
  },
  # No hits are requested; only the aggregations matter.
  "size": 0,
}
| 
 | ||||
# Counts expected from AGGS_RESPONSE: one entry per non-empty daily bucket (the
# 2009-11-13 bucket with doc_count 0 has no entry).
# NOTE(review): positional arguments appear to be (kind_id, count, datetime) -- confirm
# against the AggregatedLogCount definition.
AGGS_COUNT = [
  AggregatedLogCount(1, 1, parse("2009-11-15T00:00:00.000")),
  AggregatedLogCount(2, 1, parse("2009-11-12T00:00:00.000")),
  AggregatedLogCount(2, 2, parse("2009-11-14T00:00:00.000"))
]
| 
 | ||||
# Count request body expected for repository id 1.
COUNT_REQUEST = {
  "query": {
    "bool": {
      "filter": [{"term": {"repository_id": 1}}],
    },
  },
}
# Mocked count response reporting a single matching document.
COUNT_RESPONSE = _status(_shards({"count": 1}))
| 
 | ||||
# Assume the scroll spans 2 pages of hits. Every mocked scroll request and response
# below shares this one opaque scroll id.
_scroll_id = "DnF1ZXJ5VGhlbkZldGNoBQAAAAAAACEmFkk1aGlTRzdSUWllejZmYTlEYTN3SVEAAAAAAAAhJRZJNWhpU0c3UlFpZXo2ZmE5RGEzd0lRAAAAAAAAHtAWLWZpaFZXVzVSTy1OTXA5V3MwcHZrZwAAAAAAAB7RFi1maWhWV1c1Uk8tTk1wOVdzMHB2a2cAAAAAAAAhJxZJNWhpU0c3UlFpZXo2ZmE5RGEzd0lR"
| 
 | ||||
| 
 | ||||
def _scroll(d):
  """Stamp the response dict `d` in place with the shared scroll id and return it."""
  d.update({"_scroll_id": _scroll_id})
  return d
| 
 | ||||
| 
 | ||||
# Scroll lifecycle responses: creation returns the first hit, the next fetch returns the
# second hit, the following fetch returns no hits, and the delete call reports success.
SCROLL_CREATE = _status(_shards(_scroll(_hits([_hit1]))))
SCROLL_GET = _status(_shards(_scroll(_hits([_hit2]))))
SCROLL_GET_2 = _status(_shards(_scroll(_hits([]))))
SCROLL_DELETE = _status({"succeeded": True, "num_freed": 5})
# Logs expected from the export, one list per non-empty scroll page.
SCROLL_LOGS = [[_log1], [_log2]]
| 
 | ||||
# Positional arguments each mocked scroll endpoint is expected to receive, in call order:
# scroll creation (scroll window, size, search body), two scroll fetches, and the delete.
SCROLL_REQUESTS = [
  [
    "5m",
    1,
    {
      "sort": "_doc",
      "query": {
        "range": {
          "datetime": {"lt": "2018-04-02T00:00:00", "gte": "2018-03-08T00:00:00"},
        },
      },
    },
  ],
  [{"scroll": "5m", "scroll_id": _scroll_id}],
  [{"scroll": "5m", "scroll_id": _scroll_id}],
  [{"scroll_id": [_scroll_id]}],
]
| 
 | ||||
# Responses in call order: the first answers scroll creation, the middle ones answer the
# scroll fetches, and the last answers the scroll delete.
SCROLL_RESPONSES = [SCROLL_CREATE, SCROLL_GET, SCROLL_GET_2, SCROLL_DELETE]
							
								
								
									
										130
									
								
								data/logs_model/test/test_combined_model.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										130
									
								
								data/logs_model/test/test_combined_model.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,130 @@ | |||
| from datetime import date, datetime, timedelta | ||||
| 
 | ||||
| from freezegun import freeze_time | ||||
| 
 | ||||
| from data.logs_model.inmemory_model import InMemoryModel | ||||
| from data.logs_model.combined_model import CombinedLogsModel | ||||
| 
 | ||||
| from test.fixtures import * | ||||
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def first_model():
  """Provide a fresh in-memory logs model used as the combined model's first backend."""
  model_instance = InMemoryModel()
  return model_instance
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def second_model():
  """Provide a fresh in-memory logs model used as the combined model's second backend."""
  model_instance = InMemoryModel()
  return model_instance
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def combined_model(first_model, second_model, initialized_db):
  """Provide a CombinedLogsModel wrapping the two in-memory models (first, then second)."""
  backends = (first_model, second_model)
  return CombinedLogsModel(*backends)
| 
 | ||||
| 
 | ||||
def test_log_action(first_model, second_model, combined_model, initialized_db):
  """An action logged through the combined model lands in the first model only."""
  day = date(2019, 1, 1)

  # Log through the combined model with the clock frozen on `day`.
  with freeze_time(day):
    combined_model.log_action('push_repo', namespace_name='devtable', repository_name='simple',
                              ip='1.2.3.4')

  simple_repo = model.repository.get_repository('devtable', 'simple')

  # The entry must be visible via the combined and first models, but not the second.
  expectations = [(combined_model, 1), (first_model, 1), (second_model, 0)]
  for target, expected_count in expectations:
    assert target.count_repository_actions(simple_repo, day) == expected_count
| 
 | ||||
| 
 | ||||
def test_count_repository_actions(first_model, second_model, combined_model, initialized_db):
  """The combined count is the sum of the counts of both underlying models."""
  push = dict(namespace_name='devtable', repository_name='simple', ip='1.2.3.4')

  # Three entries go to the first model, two to the second.
  for _ in range(3):
    first_model.log_action('push_repo', **push)
  for _ in range(2):
    second_model.log_action('push_repo', **push)

  day = datetime.today() - timedelta(minutes=60)
  simple_repo = model.repository.get_repository('devtable', 'simple')

  # Each model reports its own entries; the combined model reports all of them.
  expectations = [(first_model, 3), (second_model, 2), (combined_model, 5)]
  for target, expected_count in expectations:
    assert target.count_repository_actions(simple_repo, day) == expected_count
| 
 | ||||
| 
 | ||||
def test_yield_logs_for_export(first_model, second_model, combined_model, initialized_db):
  """Exporting from the combined model yields the first model's logs, then the second's."""
  now = datetime.now()

  push = dict(namespace_name='devtable', repository_name='simple', ip='1.2.3.4')

  # Three entries go to the first model, two to the second.
  for _ in range(3):
    first_model.log_action('push_repo', **push)
  for _ in range(2):
    second_model.log_action('push_repo', **push)

  later = datetime.now()

  # Take the first batch each underlying model exports on its own.
  first_logs = list(first_model.yield_logs_for_export(now, later))[0]
  second_logs = list(second_model.yield_logs_for_export(now, later))[0]

  # Flatten every batch exported by the combined model into one list.
  full_combined = [entry
                   for subset in combined_model.yield_logs_for_export(now, later)
                   for entry in subset]

  assert len(full_combined) == len(first_logs) + len(second_logs)
  assert full_combined == (first_logs + second_logs)
| 
 | ||||
| 
 | ||||
def test_lookup_logs(first_model, second_model, combined_model, initialized_db):
  """Paging through the combined model returns the first model's logs, then the second's."""
  now = datetime.now()

  push = dict(namespace_name='devtable', repository_name='simple', ip='1.2.3.4')

  # Three entries go to the first model, two to the second.
  for _ in range(3):
    first_model.log_action('push_repo', **push)
  for _ in range(2):
    second_model.log_action('push_repo', **push)

  later = datetime.now()

  def _collect_logs(logs_source):
    # Follow next_page_token until it is None, gathering the logs of every page.
    collected = []
    token = None
    while True:
      page = logs_source.lookup_logs(now, later, page_token=token)
      collected.extend(page.logs)
      token = page.next_page_token
      if token is None:
        return collected

  first_logs = _collect_logs(first_model)
  second_logs = _collect_logs(second_model)
  combined = _collect_logs(combined_model)

  assert len(combined) == len(first_logs) + len(second_logs)
  assert combined == (first_logs + second_logs)
							
								
								
									
										529
									
								
								data/logs_model/test/test_elasticsearch.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										529
									
								
								data/logs_model/test/test_elasticsearch.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,529 @@ | |||
| # -*- coding: utf-8 -*- | ||||
| 
 | ||||
| # pylint: disable=redefined-outer-name, wildcard-import | ||||
| 
 | ||||
| import json | ||||
| from datetime import datetime, timedelta | ||||
| 
 | ||||
| import pytest | ||||
| from mock import patch, Mock | ||||
| from dateutil.parser import parse | ||||
| 
 | ||||
| from httmock import urlmatch, HTTMock | ||||
| 
 | ||||
| from data.model.log import _json_serialize | ||||
| from data.logs_model.elastic_logs import ElasticsearchLogs, INDEX_NAME_PREFIX, INDEX_DATE_FORMAT | ||||
| from data.logs_model import configure, LogsModelProxy | ||||
| from mock_elasticsearch import * | ||||
| 
 | ||||
# Fake Elasticsearch endpoint used in the logs model config; FAKE_ES_HOST_PATTERN is the
# netloc regex the HTTP mock handlers match against.
FAKE_ES_HOST = 'fakees'
FAKE_ES_HOST_PATTERN = r'fakees.*'
FAKE_ES_PORT = 443
# AWS credentials and region are left unset in the fake config.
FAKE_AWS_ACCESS_KEY = None
FAKE_AWS_SECRET_KEY = None
FAKE_AWS_REGION = None
| 
 | ||||
@pytest.fixture()
def logs_model_config():
  """Return app config selecting the Elasticsearch logs model with the fake endpoint."""
  elasticsearch_config = {
    'host': FAKE_ES_HOST,
    'port': FAKE_ES_PORT,
    'access_key': FAKE_AWS_ACCESS_KEY,
    'secret_key': FAKE_AWS_SECRET_KEY,
    'aws_region': FAKE_AWS_REGION
  }
  return {
    'LOGS_MODEL': 'elasticsearch',
    'LOGS_MODEL_CONFIG': {
      'producer': 'elasticsearch',
      'elasticsearch_config': elasticsearch_config,
    },
  }
| 
 | ||||
| 
 | ||||
# Log entry kind name -> kind id served by the mocked database model.
FAKE_LOG_ENTRY_KINDS = {'push_repo': 1, 'pull_repo': 2}

# Namespace name -> mocked namespace user.
FAKE_NAMESPACES = {
  'user1': Mock(
    id=1,
    organization="user1.organization",
    username="user1.username",
    email="user1.email",
    robot="user1.robot",
  ),
  'user2': Mock(
    id=2,
    organization="user2.organization",
    username="user2.username",
    email="user2.email",
    robot="user2.robot",
  ),
}

# "<namespace>/<repository>" -> mocked repository owned by that namespace.
FAKE_REPOSITORIES = {
  'user1/repo1': Mock(id=1, namespace_user=FAKE_NAMESPACES['user1']),
  'user2/repo2': Mock(id=2, namespace_user=FAKE_NAMESPACES['user2']),
}
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def logs_model():
  """Yield a fresh LogsModelProxy patched in as `data.logs_model.logs_model`.

  Each test configures this throwaway proxy, so the real module-level logs model is
  left unchanged.
  """
  proxy = LogsModelProxy()
  with patch('data.logs_model.logs_model', proxy):
    yield proxy
| 
 | ||||
| 
 | ||||
@pytest.fixture(scope='function')
def app_config(logs_model_config):
  """Yield a copy of the logs model config patched in as the document model's app config."""
  fake_config = dict(logs_model_config)
  with patch("data.logs_model.document_logs_model.config.app_config", fake_config):
    yield fake_config
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def mock_page_size():
  """Patch the document logs model's PAGE_SIZE to 1 for the duration of a test."""
  page_size_patch = patch('data.logs_model.document_logs_model.PAGE_SIZE', 1)
  with page_size_patch:
    yield
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def mock_max_result_window():
  """Patch the document logs model's DEFAULT_RESULT_WINDOW to 1 for the duration of a test."""
  result_window_patch = patch('data.logs_model.document_logs_model.DEFAULT_RESULT_WINDOW', 1)
  with result_window_patch:
    yield
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def mock_random_id():
  """Patch the document logs model's `_random_id` so it always returns 233."""
  with patch('data.logs_model.document_logs_model._random_id', Mock(return_value=233)):
    yield
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def mock_db_model():
  """Patch the database `model` seen by the document logs model and the log datatypes.

  The replacement is a Mock whose user, repository and log lookups are served from the
  FAKE_NAMESPACES, FAKE_REPOSITORIES and FAKE_LOG_ENTRY_KINDS tables.
  """
  def get_user_map_by_ids(namespace_ids):
    # Map each requested id to the fake namespace carrying it; unknown ids are left out.
    found = {}
    for namespace_id in namespace_ids:
      for namespace in FAKE_NAMESPACES.values():
        if namespace.id == namespace_id:
          found[namespace_id] = namespace
    return found

  def get_repository(user_name, repo_name):
    return FAKE_REPOSITORIES.get(user_name + '/' + repo_name)

  def get_log_entry_kind(name):
    return FAKE_LOG_ENTRY_KINDS[name]

  fake_user = Mock(
    get_namespace_user=FAKE_NAMESPACES.get,
    get_user_or_org=FAKE_NAMESPACES.get,
    get_user=FAKE_NAMESPACES.get,
    get_user_map_by_ids=get_user_map_by_ids,
  )
  fake_repository = Mock(get_repository=get_repository)
  fake_log = Mock(
    _get_log_entry_kind=get_log_entry_kind,
    _json_serialize=_json_serialize,
    get_log_entry_kinds=Mock(return_value=FAKE_LOG_ENTRY_KINDS),
  )
  fake_model = Mock(user=fake_user, repository=fake_repository, log=fake_log)

  with patch('data.logs_model.document_logs_model.model', fake_model):
    with patch('data.logs_model.datatypes.model', fake_model):
      yield
| 
 | ||||
| 
 | ||||
def parse_query(query):
  """Parse a raw URL query string into a dict of parameter name -> value.

  Empty segments are skipped. Each segment is split on its first '=' only, so a value
  that itself contains '=' is kept whole and a bare key (no '=') maps to ''. When a key
  repeats, the last occurrence wins. No URL-decoding is performed.
  """
  params = {}
  for segment in query.split("&"):
    if not segment:
      continue
    key, _, value = segment.partition("=")
    params[key] = value
  return params
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def mock_elasticsearch():
  """Intercept HTTP traffic to the fake Elasticsearch host and route it to a Mock.

  Every endpoint attribute on the yielded Mock raises NotImplementedError by default, so
  a test must replace the endpoints it expects to be hit (e.g.
  `mock_elasticsearch.index = Mock(return_value=...)`). Any request that matches no
  endpoint handler falls through to `default`, which raises with the request details.
  """
  mock = Mock()
  mock.template.side_effect = NotImplementedError
  mock.index.side_effect = NotImplementedError
  mock.count.side_effect = NotImplementedError
  mock.scroll_get.side_effect = NotImplementedError
  mock.scroll_delete.side_effect = NotImplementedError
  mock.search_scroll_create.side_effect = NotImplementedError
  mock.search_aggs.side_effect = NotImplementedError
  mock.search_after.side_effect = NotImplementedError
  mock.list_indices.side_effect = NotImplementedError

  @urlmatch(netloc=r'.*', path=r'.*')
  def default(url, req):
    # Catch-all: fail loudly on any request the handlers below did not claim.
    raise Exception('\nurl={}\nmethod={}\nreq.url={}\nheaders={}\nbody={}'.format(
      url, req.method, req.url, req.headers, req.body))

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/_template/.*')
  def template(url, req):
    # The template name is the last path segment (/_template/<name>), not part of the query.
    return mock.template(url.path.split('/')[-1], req.body)

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_(\*|[0-9\-]+)')
  def list_indices(url, req):
    return mock.list_indices()

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_[0-9\-]*/_doc')
  def index(url, req):
    # Pass the index name and the document, with its metadata JSON decoded, to the mock.
    index = url.path.split('/')[1]
    body = json.loads(req.body)
    body['metadata_json'] = json.loads(body['metadata_json'])
    return mock.index(index, body)

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_([0-9\-]*|\*)/_count')
  def count(_, req):
    return mock.count(json.loads(req.body))

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/_search/scroll')
  def scroll(url, req):
    # DELETE clears a scroll, GET fetches its next page; other methods are unsupported.
    if req.method == 'DELETE':
      return mock.scroll_delete(json.loads(req.body))
    elif req.method == 'GET':
      request_obj = json.loads(req.body)
      return mock.scroll_get(request_obj)
    raise NotImplementedError()

  @urlmatch(netloc=FAKE_ES_HOST_PATTERN, path=r'/logentry_(\*|[0-9\-]*)/_search')
  def search(url, req):
    # Dispatch on the request shape: scroll creation, aggregation, or a plain search.
    if "scroll" in url.query:
      query = parse_query(url.query)
      window_size = query['scroll']
      maximum_result_size = int(query['size'])
      return mock.search_scroll_create(window_size, maximum_result_size, json.loads(req.body))
    elif "aggs" in req.body:
      return mock.search_aggs(json.loads(req.body))
    else:
      return mock.search_after(json.loads(req.body))

  # The specific handlers are listed before the broader `list_indices` pattern, and the
  # catch-all `default` comes last.
  with HTTMock(scroll, count, search, index, template, list_indices, default):
    yield mock
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
  """
  unlogged_pulls_ok, kind_name, namespace_name, repository, repository_name,
  timestamp,
  index_response, expected_request, throws
  """,
  [
    # Invalid inputs
    pytest.param(
      False, 'non-existing', None, None, None,
      None,
      None, None, True,
      id="Invalid Kind"
    ),
    pytest.param(
      False, 'pull_repo', 'user1', Mock(id=1), 'repo1',
      None,
      None, None, True,
      id="Invalid Parameters"
    ),

    # Remote exceptions
    pytest.param(
      False, 'pull_repo', 'user1', Mock(id=1), None,
      None,
      FAILURE_400, None, True,
      id="Throw on pull log failure"
    ),
    pytest.param(
      True, 'pull_repo', 'user1', Mock(id=1), None,
      parse("2017-03-08T03:30"),
      FAILURE_400, INDEX_REQUEST_2017_03_08, False,
      id="Ok on pull log failure"
    ),

    # Success executions
    pytest.param(
      False, 'pull_repo', 'user1', Mock(id=1), None,
      parse("2017-03-08T03:30"),
      INDEX_RESPONSE_2017_03_08, INDEX_REQUEST_2017_03_08, False,
      id="Log with namespace name and repository"
    ),
    pytest.param(
      False, 'push_repo', 'user1', None, 'repo1',
      parse("2019-01-01T03:30"),
      INDEX_RESPONSE_2019_01_01, INDEX_REQUEST_2019_01_01, False,
      id="Log with namespace name and repository name"
    ),
  ])
def test_log_action(unlogged_pulls_ok, kind_name, namespace_name, repository, repository_name,
                    timestamp,
                    index_response, expected_request, throws,
                    app_config, logs_model, mock_elasticsearch, mock_db_model, mock_random_id):
  """Log an action against the mocked Elasticsearch and check the indexed document.

  Cases marked `throws` must raise; the others must issue exactly the expected index call.
  """
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
  mock_elasticsearch.index = Mock(return_value=index_response)
  app_config['ALLOW_PULLS_WITHOUT_STRICT_LOGGING'] = unlogged_pulls_ok
  configure(app_config)

  metadata = {'key': 'value', 'time': parse("2018-03-08T03:30"), '😂': '😂👌👌👌👌'}
  # Positional arguments: kind, namespace, performer, ip, metadata, repository,
  # repository name, timestamp.
  call_args = (kind_name, namespace_name, Mock(id=1), "192.168.1.1", metadata, repository,
               repository_name, timestamp)

  if throws:
    with pytest.raises(Exception):
      logs_model.log_action(*call_args)
    return

  logs_model.log_action(*call_args)
  mock_elasticsearch.index.assert_called_with(*expected_request)
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
  """
  start_datetime, end_datetime,
  performer_name, repository_name, namespace_name,
  filter_kinds,
  page_token,
  max_page_count,
  search_response,
  list_indices_response,
  expected_request,
  expected_page,
  throws
  """,
  [
    # 1st page
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
      'user1', 'repo1', 'user1',
      None,
      None,
      None,
      SEARCH_RESPONSE_START,
      INDEX_LIST_RESPONSE_HIT1_HIT2,
      SEARCH_REQUEST_START,
      SEARCH_PAGE_START,
      False,
      id="1st page"
    ),

    # Last page
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
      'user1', 'repo1', 'user1',
      None,
      SEARCH_PAGE_TOKEN,
      None,
      SEARCH_RESPONSE_END,
      INDEX_LIST_RESPONSE_HIT1_HIT2,
      SEARCH_REQUEST_END,
      SEARCH_PAGE_END,
      False,
      id="Search using pagination token"
    ),

    # Filter
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
      'user1', 'repo1', 'user1',
      ['push_repo'],
      None,
      None,
      SEARCH_RESPONSE_END,
      INDEX_LIST_RESPONSE_HIT2,
      SEARCH_REQUEST_FILTER,
      SEARCH_PAGE_END,
      False,
      id="Filtered search"
    ),

    # Max page count
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
      'user1', 'repo1', 'user1',
      None,
      SEARCH_PAGE_TOKEN,
      1,
      AssertionError, # Assert that it should not reach the ES server
      None,
      None,
      SEARCH_PAGE_EMPTY,
      False,
      id="Page token reaches maximum page count",
    ),
  ])
def test_lookup_logs(start_datetime, end_datetime,
                     performer_name, repository_name, namespace_name,
                     filter_kinds,
                     page_token,
                     max_page_count,
                     search_response,
                     list_indices_response,
                     expected_request,
                     expected_page,
                     throws,
                     logs_model, mock_elasticsearch, mock_db_model, mock_page_size, app_config):
  """Look up one page of logs against the mocked Elasticsearch.

  `search_response` is either the payload the search endpoint returns, or an exception
  class meaning "the search endpoint must not be reached". In the latter case the mock
  raises that exception if called, and the test asserts that it was never called.
  """
  # An exception class passed as `return_value` would merely be returned, never raised,
  # so wire it up as `side_effect` instead.
  must_not_search = (isinstance(search_response, type) and
                     issubclass(search_response, BaseException))
  if must_not_search:
    search_after = Mock(side_effect=search_response)
  else:
    search_after = Mock(return_value=search_response)

  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
  mock_elasticsearch.search_after = search_after
  mock_elasticsearch.list_indices = Mock(return_value=list_indices_response)

  configure(app_config)
  if throws:
    with pytest.raises(Exception):
      logs_model.lookup_logs(start_datetime, end_datetime, performer_name, repository_name,
                             namespace_name, filter_kinds, page_token, max_page_count)
  else:
    page = logs_model.lookup_logs(start_datetime, end_datetime, performer_name, repository_name,
                                  namespace_name, filter_kinds, page_token, max_page_count)
    assert page == expected_page
    if must_not_search:
      # Checked explicitly because an exception raised inside the HTTP mock could be
      # wrapped or swallowed before it reaches the test.
      assert not search_after.called
    if expected_request:
      mock_elasticsearch.search_after.assert_called_with(expected_request)
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
  """
  start_datetime, end_datetime,
  performer_name, repository_name, namespace_name,
  filter_kinds, search_response, expected_request, expected_counts, throws
  """,
  [
    # Valid
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-08T03:30'),
      'user1', 'repo1', 'user1',
      ['pull_repo'], AGGS_RESPONSE, AGGS_REQUEST, AGGS_COUNT, False,
      id="Valid Counts"
    ),

    # Invalid case: date range too big
    pytest.param(
      parse('2018-03-08T03:30'), parse('2018-04-09T03:30'),
      'user1', 'repo1', 'user1',
      [], None, None, None, True,
      id="Throw on date range too big"
    )
  ])
def test_get_aggregated_log_counts(start_datetime, end_datetime,
                                   performer_name, repository_name, namespace_name,
                                   filter_kinds, search_response, expected_request, expected_counts, throws,
                                   logs_model, mock_elasticsearch, mock_db_model, app_config):
  """Fetch aggregated log counts against the mocked Elasticsearch.

  Cases marked `throws` must raise; the others must return the expected counts (order
  is ignored) and issue the expected aggregation request.
  """
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
  mock_elasticsearch.search_aggs = Mock(return_value=search_response)

  configure(app_config)
  call_args = (start_datetime, end_datetime, performer_name, repository_name, namespace_name,
               filter_kinds)

  if throws:
    with pytest.raises(Exception):
      logs_model.get_aggregated_log_counts(*call_args)
    return

  counts = logs_model.get_aggregated_log_counts(*call_args)
  assert set(counts) == set(expected_counts)
  if expected_request:
    mock_elasticsearch.search_aggs.assert_called_with(expected_request)
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
  """
  repository,
  day,
  count_response, expected_request, expected_count, throws
  """,
  [
    pytest.param(
      FAKE_REPOSITORIES['user1/repo1'],
      parse("2018-03-08").date(),
      COUNT_RESPONSE, COUNT_REQUEST, 1, False,
      id="Valid Count with 1 as result"),
  ])
def test_count_repository_actions(repository,
                                  day,
                                  count_response, expected_request, expected_count, throws,
                                  logs_model, mock_elasticsearch, mock_db_model, app_config):
  """Count a repository's actions for one day against the mocked Elasticsearch.

  Cases marked `throws` must raise; the others must return the expected count and issue
  the expected count request.
  """
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
  mock_elasticsearch.count = Mock(return_value=count_response)
  mock_elasticsearch.list_indices = Mock(return_value=INDEX_LIST_RESPONSE)

  configure(app_config)

  if throws:
    with pytest.raises(Exception):
      logs_model.count_repository_actions(repository, day)
    return

  assert logs_model.count_repository_actions(repository, day) == expected_count
  if expected_request:
    mock_elasticsearch.count.assert_called_with(expected_request)
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize(
  """
  start_datetime, end_datetime,
  repository_id, namespace_id,
  max_query_time, scroll_responses, expected_requests, expected_logs, throws
  """,
  [
    pytest.param(
      parse("2018-03-08"), parse("2018-04-02"),
      1, 1,
      timedelta(seconds=10), SCROLL_RESPONSES, SCROLL_REQUESTS, SCROLL_LOGS, False,
      id="Scroll 3 pages with page size = 1"
    ),
  ])
def test_yield_logs_for_export(start_datetime, end_datetime,
                               repository_id, namespace_id,
                               max_query_time, scroll_responses, expected_requests, expected_logs, throws,
                               logs_model, mock_elasticsearch, mock_db_model, mock_max_result_window, app_config):
  """
  Check that `yield_logs_for_export` drives the Elasticsearch scroll API as expected.

  `scroll_responses` is split across the mocked client: the first entry answers the scroll
  creation, the last answers the scroll deletion, and everything in between is returned by
  successive `scroll_get` calls. Each yielded batch is compared with `expected_logs`, and
  the request that produced it with the matching entry of `expected_requests`.

  NOTE: `repository_id` and `namespace_id` are parametrized but are not forwarded to
  `yield_logs_for_export` by this test.
  """
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)
  mock_elasticsearch.search_scroll_create = Mock(return_value=scroll_responses[0])
  mock_elasticsearch.scroll_get = Mock(side_effect=scroll_responses[1:-1])
  mock_elasticsearch.scroll_delete = Mock(return_value=scroll_responses[-1])

  configure(app_config)
  if throws:
    with pytest.raises(Exception):
      # Consume the result: if the method is a generator, merely calling it executes none
      # of its body, so an error could never surface inside this block otherwise.
      list(logs_model.yield_logs_for_export(start_datetime, end_datetime,
                                            max_query_time=max_query_time))
    return

  log_generator = logs_model.yield_logs_for_export(start_datetime, end_datetime,
                                                   max_query_time=max_query_time)
  for counter, logs in enumerate(log_generator):
    # The first batch comes from creating the scroll; later ones from fetching the next page.
    if counter == 0:
      mock_elasticsearch.search_scroll_create.assert_called_with(*expected_requests[counter])
    else:
      mock_elasticsearch.scroll_get.assert_called_with(*expected_requests[counter])
    assert expected_logs[counter] == logs

  # the last two requests must be
  # 1. get with response scroll with 0 hits, which indicates the termination condition
  # 2. delete scroll request
  mock_elasticsearch.scroll_get.assert_called_with(*expected_requests[-2])
  mock_elasticsearch.scroll_delete.assert_called_with(*expected_requests[-1])
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize('prefix, is_valid', [
  pytest.param('..', False, id='Invalid `..`'),
  pytest.param('.', False, id='Invalid `.`'),
  pytest.param('-prefix', False, id='Invalid prefix start -'),
  pytest.param('_prefix', False, id='Invalid prefix start _'),
  pytest.param('+prefix', False, id='Invalid prefix start +'),
  pytest.param('prefix_with_UPPERCASES', False, id='Invalid uppercase'),
  pytest.param('valid_index', True, id='Valid prefix'),
  pytest.param('valid_index_with_numbers1234', True, id='Valid prefix with numbers'),
  pytest.param('a'*256, False, id='Prefix too long')
])
def test_valid_index_prefix(prefix, is_valid):
  """ Check the verdict of `ElasticsearchLogs._valid_index_prefix` for each candidate prefix. """
  verdict = ElasticsearchLogs._valid_index_prefix(prefix)
  assert verdict == is_valid
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize('index, cutoff_date, expected_result', [
  pytest.param(
    INDEX_NAME_PREFIX+'2019-06-06',
    datetime(2019, 6, 8),
    True,
    id="Index older than cutoff"
  ),
  pytest.param(
    INDEX_NAME_PREFIX+'2019-06-06',
    datetime(2019, 6, 4),
    False,
    id="Index younger than cutoff"
  ),
  pytest.param(
    INDEX_NAME_PREFIX+'2019-06-06',
    datetime(2019, 6, 6, 23),
    False,
    id="Index older than cutoff but timedelta less than 1 day"
  ),
  pytest.param(
    INDEX_NAME_PREFIX+'2019-06-06',
    datetime(2019, 6, 7),
    True,
    id="Index older than cutoff by exactly one day"
  ),
])
def test_can_delete_index(index, cutoff_date, expected_result):
  """
  Check `ElasticsearchLogs.can_delete_index` for indices around the cutoff date.

  As a sanity check on the test data, the part of the index name after the prefix must
  parse with `INDEX_DATE_FORMAT`.
  """
  es_logs = ElasticsearchLogs(index_prefix=INDEX_NAME_PREFIX)

  # Everything after the first occurrence of the prefix is the date suffix.
  date_suffix = index.split(es_logs._index_prefix, 1)[-1]
  assert datetime.strptime(date_suffix, INDEX_DATE_FORMAT)

  deletable = es_logs.can_delete_index(index, cutoff_date)
  assert deletable == expected_result
							
								
								
									
										473
									
								
								data/logs_model/test/test_logs_interface.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										473
									
								
								data/logs_model/test/test_logs_interface.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,473 @@ | |||
| from datetime import datetime, timedelta, date | ||||
| from data.logs_model.datatypes import AggregatedLogCount | ||||
| from data.logs_model.table_logs_model import TableLogsModel | ||||
| from data.logs_model.combined_model import CombinedLogsModel | ||||
| from data.logs_model.inmemory_model import InMemoryModel | ||||
| from data.logs_model.combined_model import _merge_aggregated_log_counts | ||||
| from data.logs_model.document_logs_model import _date_range_in_single_index, DocumentLogsModel | ||||
| from data.logs_model.interface import LogsIterationTimeout | ||||
| from data.logs_model.test.fake_elasticsearch import FAKE_ES_HOST, fake_elasticsearch | ||||
| 
 | ||||
| from data.database import LogEntry, LogEntry2, LogEntry3, LogEntryKind | ||||
| from data import model | ||||
| 
 | ||||
| from test.fixtures import * | ||||
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def mock_page_size():
  """ Patch the document logs model's `PAGE_SIZE` to 2 for the test and yield that value. """
  small_page = 2
  page_size_patch = patch('data.logs_model.document_logs_model.PAGE_SIZE', small_page)
  with page_size_patch:
    yield small_page
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def clear_db_logs(initialized_db):
  """ Delete every row from the three log entry tables before the test runs. """
  for log_table in (LogEntry, LogEntry2, LogEntry3):
    log_table.delete().execute()
| 
 | ||||
| 
 | ||||
def combined_model():
  """ Build a `CombinedLogsModel` wrapping a table-backed model and an in-memory model. """
  table_backed = TableLogsModel()
  in_memory = InMemoryModel()
  return CombinedLogsModel(table_backed, in_memory)
| 
 | ||||
| 
 | ||||
def es_model():
  """ Build a `DocumentLogsModel` that produces to Elasticsearch at the fake test host. """
  fake_es_config = {
    'host': FAKE_ES_HOST,
    'port': 12345,
  }
  return DocumentLogsModel(producer='elasticsearch', elasticsearch_config=fake_es_config)
| 
 | ||||
@pytest.fixture()
def fake_es():
  """ Keep the `fake_elasticsearch` context active for the duration of the test. """
  es_context = fake_elasticsearch()
  with es_context:
    yield
| 
 | ||||
| 
 | ||||
@pytest.fixture(params=[TableLogsModel, InMemoryModel, es_model, combined_model])
def logs_model(request, clear_db_logs, fake_es):
  """
  Provide one logs model per parametrized factory, so each test using this fixture runs
  once per implementation. Depends on `clear_db_logs` and `fake_es` being set up first.
  """
  model_factory = request.param
  return model_factory()
| 
 | ||||
| 
 | ||||
def _lookup_logs(logs_model, start_time, end_time, **kwargs):
  """
  Collect the logs of every page returned by `logs_model.lookup_logs` for the time range.

  Paging stops as soon as a page has no logs or carries no next page token. Extra keyword
  arguments are forwarded to every `lookup_logs` call. Asserts that no log was returned
  more than once before returning the accumulated list.
  """
  collected = []
  token = None
  more_pages = True
  while more_pages:
    page = logs_model.lookup_logs(start_time, end_time, page_token=token, **kwargs)
    collected += page.logs
    token = page.next_page_token
    more_pages = bool(page.logs and token)

  # Duplicates would mean the same entry was served on more than one page.
  assert len(set(collected)) == len(collected)
  return collected
| 
 | ||||
| 
 | ||||
@pytest.mark.skipif('mysql' in os.environ.get('TEST_DATABASE_URI', ''),
                    reason='Flaky on MySQL')
@pytest.mark.parametrize('namespace_name, repo_name, performer_name, check_args, expect_results', [
  pytest.param('devtable', 'simple', 'devtable', {}, True, id='no filters'),
  pytest.param('devtable', 'simple', 'devtable', {
    'performer_name': 'devtable',
  }, True, id='matching performer'),

  pytest.param('devtable', 'simple', 'devtable', {
    'namespace_name': 'devtable',
  }, True, id='matching namespace'),

  pytest.param('devtable', 'simple', 'devtable', {
    'namespace_name': 'devtable',
    'repository_name': 'simple',
  }, True, id='matching repository'),

  pytest.param('devtable', 'simple', 'devtable', {
    'performer_name': 'public',
  }, False, id='different performer'),

  pytest.param('devtable', 'simple', 'devtable', {
    'namespace_name': 'public',
  }, False, id='different namespace'),

  pytest.param('devtable', 'simple', 'devtable', {
    'namespace_name': 'devtable',
    'repository_name': 'complex',
  }, False, id='different repository'),
])
def test_logs(namespace_name, repo_name, performer_name, check_args, expect_results, logs_model):
  """
  Log three actions of every kind, one second apart, then query them back with the
  filters in `check_args` through both `lookup_logs` and `get_aggregated_log_counts`.
  Matching filters must return everything that was logged; non-matching ones nothing.
  """
  per_kind = 3
  kinds = list(LogEntryKind.select())
  performer = model.user.get_user(performer_name)

  # Add some logs.
  start_timestamp = datetime.utcnow()
  end_timestamp = start_timestamp + timedelta(minutes=10)
  next_timestamp = start_timestamp
  for kind in kinds:
    for _ in range(per_kind):
      logs_model.log_action(kind.name, namespace_name=namespace_name, repository_name=repo_name,
                            performer=performer, ip='1.2.3.4', timestamp=next_timestamp)
      next_timestamp += timedelta(seconds=1)

  # Individual log entries.
  found = _lookup_logs(logs_model, start_timestamp, end_timestamp, **check_args)
  if not expect_results:
    assert not found
  else:
    assert len(found) == len(kinds) * per_kind

  # Aggregated counts over the same window.
  aggregated_counts = logs_model.get_aggregated_log_counts(start_timestamp, end_timestamp,
                                                           **check_args)
  if not expect_results:
    assert not aggregated_counts
  else:
    assert len(aggregated_counts) == len(kinds)
    for aggregated in aggregated_counts:
      assert aggregated.count == per_kind
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize('filter_kinds, expect_results', [
  pytest.param(None, True),
  pytest.param(['push_repo'], True, id='push_repo filter'),
  pytest.param(['pull_repo'], True, id='pull_repo filter'),
  pytest.param(['push_repo', 'pull_repo'], False, id='push and pull filters')
])
def test_lookup_latest_logs(filter_kinds, expect_results, logs_model):
  """
  Check that `lookup_latest_logs` returns at most `size` entries, newest first, and that
  they match the head of a full lookup over the same filters.

  `filter_kinds` names the kinds that must NOT appear in the results.
  """
  kind_map = model.log.get_log_entry_kinds()
  ignore_ids = [kind_map[kind_name] for kind_name in (filter_kinds or [])]

  now = datetime.now()
  namespace_name = 'devtable'
  repo_name = 'simple'
  performer_name = 'devtable'

  user = model.user.get_user(performer_name)
  size = 3

  # (kind, age relative to `now`) of each action to log: some pushes, then some pulls.
  logged_actions = [
    ('push_repo', timedelta(days=1, seconds=11)),
    ('push_repo', timedelta(days=7, seconds=33)),
    ('pull_repo', timedelta(days=0, seconds=3)),
    ('pull_repo', timedelta(days=3, seconds=55)),
    ('pull_repo', timedelta(days=5, seconds=3)),
    ('pull_repo', timedelta(days=11, seconds=11)),
  ]
  for kind_name, age in logged_actions:
    logs_model.log_action(kind_name, namespace_name=namespace_name, repository_name=repo_name,
                          performer=user, ip='0.0.0.0', timestamp=now - age)

  # Get the latest logs
  latest_logs = logs_model.lookup_latest_logs(performer_name, repo_name, namespace_name,
                                              filter_kinds=filter_kinds, size=size)

  # Test max lookup size
  assert len(latest_logs) <= size

  # Make sure that the latest logs returned are in decreasing chronological order. Compare
  # the timestamps explicitly rather than whole log records, whose ordering depends on
  # every field and not just on when the action happened.
  assert all(newer.datetime >= older.datetime
             for newer, older in zip(latest_logs, latest_logs[1:]))

  if expect_results:
    assert latest_logs

    # Lookup all logs filtered by kinds and sort them in reverse chronological order
    all_logs = _lookup_logs(logs_model, now - timedelta(days=30), now + timedelta(days=30),
                            filter_kinds=filter_kinds, namespace_name=namespace_name,
                            repository_name=repo_name)
    all_logs = sorted(all_logs, key=lambda l: l.datetime, reverse=True)

    # Check that querying all logs does not return the filtered kinds
    assert all(log.kind_id not in ignore_ids for log in all_logs)

    # Check that the latest logs contain only the most recent ones
    assert latest_logs == all_logs[:len(latest_logs)]
| 
 | ||||
| 
 | ||||
def test_count_repository_actions(logs_model):
  """
  Log actions against two repositories and check that `count_repository_actions` counts
  them per repository for today, and reports zero for a day in the future.
  """
  # Three actions on `simple`, followed by two on a different repo (`complex`).
  logged_actions = (
    ('push_repo', 'simple'),
    ('pull_repo', 'simple'),
    ('pull_repo', 'simple'),
    ('pull_repo', 'complex'),
    ('pull_repo', 'complex'),
  )
  for kind_name, repo_name in logged_actions:
    logs_model.log_action(kind_name, namespace_name='devtable', repository_name=repo_name,
                          ip='1.2.3.4')

  # Count the actions.
  today = date.today()

  simple_repo = model.repository.get_repository('devtable', 'simple')
  assert logs_model.count_repository_actions(simple_repo, today) == 3

  complex_repo = model.repository.get_repository('devtable', 'complex')
  assert logs_model.count_repository_actions(complex_repo, today) == 2

  # Try counting actions for a few days in the future to ensure it doesn't raise an error.
  future_day = today + timedelta(days=5)
  assert logs_model.count_repository_actions(simple_repo, future_day) == 0
| 
 | ||||
| 
 | ||||
def test_yield_log_rotation_context(logs_model):
  """
  Log seven actions older than the cutoff, drain them through the rotation contexts of
  `yield_log_rotation_context`, and check that every log is yielded exactly once and is
  no longer returned by a lookup afterwards.
  """
  cutoff_date = datetime.now()
  min_logs_per_rotation = 3
  window_start = cutoff_date - timedelta(days=3)
  window_end = cutoff_date + timedelta(days=1)

  # Log some actions to be archived: (kind, repository, ip, age relative to the cutoff).
  archived_actions = [
    # One day
    ('push_repo', 'simple1', '1.2.3.4', timedelta(days=1, seconds=1)),
    ('pull_repo', 'simple2', '5.6.7.8', timedelta(days=1, seconds=2)),
    ('pull_repo', 'simple3', '9.10.11.12', timedelta(days=1, seconds=3)),
    ('pull_repo', 'simple4', '0.0.0.0', timedelta(days=1, seconds=4)),
    # Another day
    ('pull_repo', 'simple5', '1.1.1.1', timedelta(days=2, seconds=1)),
    ('pull_repo', 'simple5', '1.1.1.1', timedelta(days=2, seconds=2)),
    ('pull_repo', 'simple5', '1.1.1.1', timedelta(days=2, seconds=3)),
  ]
  for kind_name, repo_name, ip, age in archived_actions:
    logs_model.log_action(kind_name, namespace_name='devtable', repository_name=repo_name,
                          ip=ip, timestamp=cutoff_date - age)

  found = _lookup_logs(logs_model, window_start, window_end)
  assert found is not None
  assert len(found) == 7

  # Iterate the logs using the log rotation contexts
  rotated_logs = []
  rotation_contexts = logs_model.yield_log_rotation_context(cutoff_date, min_logs_per_rotation)
  for log_rotation_context in rotation_contexts:
    with log_rotation_context as context:
      for batch, _ in context.yield_logs_batch():
        rotated_logs.extend(batch)

  assert len(rotated_logs) == 7
  assert not _lookup_logs(logs_model, window_start, window_end)

  # Make sure all datetimes are monotonically increasing (by datetime) after sorting the lookup
  # to make sure no duplicates were returned
  rotated_logs.sort(key=lambda entry: entry.datetime)
  for earlier, later in zip(rotated_logs, rotated_logs[1:]):
    assert earlier.datetime < later.datetime
| 
 | ||||
| 
 | ||||
def test_count_repository_actions_with_wildcard_disabled(initialized_db):
  """
  Same per-repository counting checks as `test_count_repository_actions`, run against the
  Elasticsearch-backed model with the fake Elasticsearch started with `allow_wildcard=False`.
  """
  # Three actions on `simple`, followed by two on a different repo (`complex`).
  logged_actions = (
    ('push_repo', 'simple'),
    ('pull_repo', 'simple'),
    ('pull_repo', 'simple'),
    ('pull_repo', 'complex'),
    ('pull_repo', 'complex'),
  )

  with fake_elasticsearch(allow_wildcard=False):
    logs_model = es_model()

    for kind_name, repo_name in logged_actions:
      logs_model.log_action(kind_name, namespace_name='devtable', repository_name=repo_name,
                            ip='1.2.3.4')

    # Count the actions.
    today = date.today()

    simple_repo = model.repository.get_repository('devtable', 'simple')
    assert logs_model.count_repository_actions(simple_repo, today) == 3

    complex_repo = model.repository.get_repository('devtable', 'complex')
    assert logs_model.count_repository_actions(complex_repo, today) == 2

    # Try counting actions for a few days in the future to ensure it doesn't raise an error.
    future_day = today + timedelta(days=5)
    assert logs_model.count_repository_actions(simple_repo, future_day) == 0
| 
 | ||||
| 
 | ||||
@pytest.mark.skipif('mysql' in os.environ.get('TEST_DATABASE_URI', ''),
                    reason='Flaky on MySQL')
def test_yield_logs_for_export(logs_model):
  """
  Log ten actions of every kind for one repository, one second apart, and check that
  `yield_logs_for_export` yields all of them for that repository.
  """
  per_kind = 10
  kinds = list(LogEntryKind.select())
  performer = model.user.get_user('devtable')

  # Add some logs.
  start_timestamp = datetime.utcnow()
  timestamp = start_timestamp
  for kind in kinds:
    for _ in range(per_kind):
      logs_model.log_action(kind.name, namespace_name='devtable', repository_name='simple',
                            performer=performer, ip='1.2.3.4', timestamp=timestamp)
      timestamp += timedelta(seconds=1)

  # Yield the logs.
  simple_repo = model.repository.get_repository('devtable', 'simple')
  export_batches = logs_model.yield_logs_for_export(start_timestamp,
                                                    timestamp + timedelta(minutes=10),
                                                    repository_id=simple_repo.id)
  exported = [log for batch in export_batches for log in batch]

  # Ensure we found all added logs.
  assert len(exported) == len(kinds) * per_kind
| 
 | ||||
| 
 | ||||
def test_yield_logs_for_export_timeout(logs_model):
  """
  Log two actions of every kind and check that exporting them with a zero
  `max_query_time` raises `LogsIterationTimeout`.
  """
  kinds = list(LogEntryKind.select())
  performer = model.user.get_user('devtable')

  # Add some logs.
  start_timestamp = datetime.utcnow()
  timestamp = start_timestamp
  for kind in kinds:
    for _ in range(2):
      logs_model.log_action(kind.name, namespace_name='devtable', repository_name='simple',
                            performer=performer, ip='1.2.3.4', timestamp=timestamp)
      timestamp += timedelta(seconds=1)

  # Yield the logs. Since we set the timeout to nothing, it should immediately fail.
  simple_repo = model.repository.get_repository('devtable', 'simple')
  no_time_at_all = timedelta(seconds=0)
  with pytest.raises(LogsIterationTimeout):
    export_batches = logs_model.yield_logs_for_export(start_timestamp,
                                                      timestamp + timedelta(minutes=1),
                                                      repository_id=simple_repo.id,
                                                      max_query_time=no_time_at_all)
    # The export has to be consumed for the timeout to be observed.
    list(export_batches)
| 
 | ||||
| 
 | ||||
def test_disabled_namespace(clear_db_logs):
  """
  Build a `TableLogsModel` with a predicate that is true only for the `devtable` namespace,
  log the same actions for `devtable` and `buynlarge`, and check that only the
  `buynlarge` actions are counted.
  """
  def _is_devtable(kind, namespace, is_free):
    return namespace == 'devtable'

  logs_model = TableLogsModel(_is_devtable)

  # Log the same three actions for the matching namespace and then for a different one.
  for namespace_name, repo_name in (('devtable', 'simple'), ('buynlarge', 'orgrepo')):
    for kind_name in ('push_repo', 'pull_repo', 'pull_repo'):
      logs_model.log_action(kind_name, namespace_name=namespace_name,
                            repository_name=repo_name, ip='1.2.3.4')

  # Count the actions.
  day = datetime.today() - timedelta(minutes=60)

  simple_repo = model.repository.get_repository('devtable', 'simple')
  assert logs_model.count_repository_actions(simple_repo, day) == 0

  org_repo = model.repository.get_repository('buynlarge', 'orgrepo')
  assert logs_model.count_repository_actions(org_repo, day) == 3
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize('aggregated_log_counts1, aggregated_log_counts2, expected_result', [
  # Entries sharing kind and date have their counts summed; the unmatched entry is kept.
  pytest.param(
    [
      AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0)), # 1
      AggregatedLogCount(1, 3, datetime(2019, 6, 7, 0, 0)), # 2
    ],
    [
      AggregatedLogCount(1, 5, datetime(2019, 6, 6, 0, 0)), # 1
      AggregatedLogCount(1, 7, datetime(2019, 6, 7, 0, 0)), # 2
      AggregatedLogCount(3, 3, datetime(2019, 6, 1, 0, 0)), # 3
    ],
    [
      AggregatedLogCount(1, 8, datetime(2019, 6, 6, 0, 0)), # 1
      AggregatedLogCount(1, 10, datetime(2019, 6, 7, 0, 0)), # 2
      AggregatedLogCount(3, 3, datetime(2019, 6, 1, 0, 0)) # 3
    ]
  ),
  # Same kind on different dates: both entries are kept as they are.
  pytest.param(
    [
      AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0)), # 1
    ],
    [
      AggregatedLogCount(1, 7, datetime(2019, 6, 7, 0, 0)), # 2
    ],
    [
      AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0)), # 1
      AggregatedLogCount(1, 7, datetime(2019, 6, 7, 0, 0)), # 2
    ]
  ),
  # One side empty: the other side is returned unchanged.
  pytest.param(
    [],
    [AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0))],
    [AggregatedLogCount(1, 3, datetime(2019, 6, 6, 0, 0))]
  ),
])
def test_merge_aggregated_log_counts(aggregated_log_counts1, aggregated_log_counts2, expected_result):
  """ Check `_merge_aggregated_log_counts` against the expected merge, ignoring order. """
  merged = _merge_aggregated_log_counts(aggregated_log_counts1, aggregated_log_counts2)
  assert sorted(merged) == sorted(expected_result)
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize('dt1, dt2, expected_result', [
  # Valid dates
  pytest.param(date(2019, 6, 17), date(2019, 6, 18), True),

  # Invalid dates
  pytest.param(date(2019, 6, 17), date(2019, 6, 17), False),
  pytest.param(date(2019, 6, 17), date(2019, 6, 19), False),
  pytest.param(date(2019, 6, 18), date(2019, 6, 17), False),

  # Valid datetimes
  pytest.param(datetime(2019, 6, 17, 0, 1), datetime(2019, 6, 17, 0, 2), True),

  # Invalid datetimes
  pytest.param(datetime(2019, 6, 17, 0, 2), datetime(2019, 6, 17, 0, 1), False),
  pytest.param(datetime(2019, 6, 17, 11), datetime(2019, 6, 17, 11) + timedelta(hours=14), False),
])
def test_date_range_in_single_index(dt1, dt2, expected_result):
  """ Check the verdict of `_date_range_in_single_index` for each pair of range bounds. """
  in_single_index = _date_range_in_single_index(dt1, dt2)
  assert in_single_index == expected_result
| 
 | ||||
| 
 | ||||
def test_pagination(logs_model, mock_page_size):
  """
  Pagination must keep going when logs are spread over several per-day indices: a page
  that is exactly full must not end the lookup while later indices are still unsearched.
  """
  day1 = datetime.now()
  day2 = day1 + timedelta(days=1)
  day3 = day2 + timedelta(days=1)

  # Lookup window covering all three days, padded by a second on each side.
  window_start = day1 - timedelta(seconds=1)
  window_end = day3 + timedelta(seconds=1)

  # Log some actions in day indices
  # One day
  for kind_name, ip in (('push_repo', '1.2.3.4'), ('pull_repo', '5.6.7.8')):
    logs_model.log_action(kind_name, namespace_name='devtable', repository_name='simple1',
                          ip=ip, timestamp=day1)

  first_lookup = _lookup_logs(logs_model, window_start, window_end)
  assert len(first_lookup) == mock_page_size

  # Another day, and yet another day
  for day in (day2, day3):
    for ip in ('1.1.1.1', '0.0.0.0'):
      logs_model.log_action('pull_repo', namespace_name='devtable', repository_name='simple2',
                            ip=ip, timestamp=day)

  second_lookup = _lookup_logs(logs_model, window_start, window_end)
  assert len(second_lookup) == 6
							
								
								
									
										77
									
								
								data/logs_model/test/test_logs_producer.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										77
									
								
								data/logs_model/test/test_logs_producer.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,77 @@ | |||
| import logging | ||||
| import pytest | ||||
| from dateutil.parser import parse | ||||
| from mock import patch, Mock | ||||
| 
 | ||||
| import botocore | ||||
| 
 | ||||
| from data.logs_model import configure | ||||
| 
 | ||||
| from test_elasticsearch import app_config, logs_model_config, logs_model, mock_elasticsearch, mock_db_model | ||||
| from mock_elasticsearch import * | ||||
| 
 | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
# Fake Kafka settings that the `kafka_logs_producer_config` fixture places into the
# `kafka_config` section of the test configuration.
FAKE_KAFKA_BROKERS = ['fake_server1', 'fake_server2']
FAKE_KAFKA_TOPIC = 'sometopic'
FAKE_MAX_BLOCK_SECONDS = 1
| 
 | ||||
@pytest.fixture()
def kafka_logs_producer_config(app_config):
  """
  Return a copy of `app_config` whose logs model is configured to use the Kafka producer.

  The copy is shallow: the nested `LOGS_MODEL_CONFIG` dict is the same object as the one
  in `app_config`, so the producer settings written here are visible through both.
  """
  producer_config = dict(app_config)

  model_settings = producer_config['LOGS_MODEL_CONFIG']
  model_settings['producer'] = 'kafka'
  model_settings['kafka_config'] = {
    'bootstrap_servers': FAKE_KAFKA_BROKERS,
    'topic': FAKE_KAFKA_TOPIC,
    'max_block_seconds': FAKE_MAX_BLOCK_SECONDS
  }
  return producer_config
| 
 | ||||
| 
 | ||||
@pytest.fixture()
def kinesis_logs_producer_config(app_config):
  """
  Return a copy of `app_config` whose logs model is configured to use the Kinesis stream
  producer.

  The copy is shallow: the nested `LOGS_MODEL_CONFIG` dict is the same object as the one
  in `app_config`, so the producer settings written here are visible through both.
  """
  producer_config = dict(app_config)

  model_settings = producer_config['LOGS_MODEL_CONFIG']
  model_settings['producer'] = 'kinesis_stream'
  model_settings['kinesis_stream_config'] = {
    'stream_name': 'test-stream',
    'aws_region': 'fake_region',
    'aws_access_key': 'some_key',
    'aws_secret_key': 'some_secret'
  }
  return producer_config
| 
 | ||||
| 
 | ||||
def test_kafka_logs_producers(logs_model, mock_elasticsearch, mock_db_model, kafka_logs_producer_config):
  """
  Configure the logs model with the Kafka producer and check that logging one action
  calls `KafkaProducer.send` exactly once. The Kafka client's version check and the
  send itself are patched out.
  """
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)

  version_check_patch = patch('kafka.client_async.KafkaClient.check_version')
  send_patch = patch('kafka.KafkaProducer.send')
  with version_check_patch, send_patch as send_mock:
    configure(kafka_logs_producer_config)

    logged_at = parse("2019-01-01T03:30")
    logs_model.log_action('pull_repo', 'user1', Mock(id=1), '192.168.1.1', {'key': 'value'},
                          None, 'repo1', logged_at)

    send_mock.assert_called_once()
| 
 | ||||
| 
 | ||||
def test_kinesis_logs_producers(logs_model, mock_elasticsearch, mock_db_model, kinesis_logs_producer_config):
  """
  Configure the logs model with the Kinesis stream producer and check that logging one
  action results in exactly one `PutRecord` API call. The botocore endpoint creation and
  the API call itself are patched out.
  """
  mock_elasticsearch.template = Mock(return_value=DEFAULT_TEMPLATE_RESPONSE)

  endpoint_patch = patch('botocore.endpoint.EndpointCreator.create_endpoint')
  api_call_patch = patch('botocore.client.BaseClient._make_api_call')
  with endpoint_patch, api_call_patch as api_call_mock:
    configure(kinesis_logs_producer_config)

    logged_at = parse("2019-01-01T03:30")
    logs_model.log_action('pull_repo', 'user1', Mock(id=1), '192.168.1.1', {'key': 'value'},
                          None, 'repo1', logged_at)

    # Check that a PutRecord api call is made.
    # NOTE: The second arg of _make_api_call uses a randomized PartitionKey, so the payload
    # that was actually sent is reused as the expected value.
    sent_payload = api_call_mock.call_args_list[0][0][1]
    api_call_mock.assert_called_once_with(u'PutRecord', sent_payload)
		Reference in a new issue