initial import for Open Source 🎉
commit 9c0dd3b722
parent 1898c361f3
2048 changed files with 218743 additions and 0 deletions
data/logs_model/logs_producer/__init__.py (new file, 27 lines)
@@ -0,0 +1,27 @@
import logging


logger = logging.getLogger(__name__)


class LogSendException(Exception):
  """ A generic error when sending the logs to its destination.
  e.g. Kinesis, Kafka, Elasticsearch, ...
  """
  pass


class LogProducerProxy(object):
  def __init__(self):
    self._model = None

  def initialize(self, model):
    self._model = model
    logger.info('===============================')
    logger.info('Using producer `%s`', self._model)
    logger.info('===============================')

  def __getattr__(self, attr):
    if not self._model:
      raise AttributeError("LogsModelProxy is not initialized")
    return getattr(self._model, attr)
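A minimal sketch of how the proxy is meant to be wired up. The PrintLogsProducer class and the `logentry` value below are illustrative stand-ins, not part of this commit; the only assumption is that a producer exposes send() and that a log entry exposes to_dict(), as the producers in this commit do.

# Illustrative only: a trivial producer forwarded through the proxy.
import logging
from data.logs_model.logs_producer import LogProducerProxy, LogSendException

logger = logging.getLogger(__name__)

class PrintLogsProducer(object):
  """ Hypothetical producer used only for illustration. """
  def send(self, logentry):
    print(logentry.to_dict())

logs_producer = LogProducerProxy()
logs_producer.initialize(PrintLogsProducer())
# Attribute access is forwarded to the initialized producer via __getattr__:
try:
  logs_producer.send(logentry)  # `logentry` assumed to expose to_dict()
except LogSendException as lse:
  logger.error('Log entry could not be delivered: %s', lse)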
data/logs_model/logs_producer/elasticsearch_logs_producer.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import logging

from elasticsearch.exceptions import ElasticsearchException

from data.logs_model.logs_producer.interface import LogProducerInterface
from data.logs_model.logs_producer import LogSendException


logger = logging.getLogger(__name__)


class ElasticsearchLogsProducer(LogProducerInterface):
  """ Log producer writing log entries to Elasticsearch.

  This implementation writes directly to Elasticsearch without a streaming/queueing service.
  """
  def send(self, logentry):
    try:
      logentry.save()
    except ElasticsearchException as ex:
      logger.exception('ElasticsearchLogsProducer error sending log to Elasticsearch: %s', ex)
      raise LogSendException('ElasticsearchLogsProducer error sending log to Elasticsearch: %s' % ex)
    except Exception as e:
      logger.exception('ElasticsearchLogsProducer exception sending log to Elasticsearch: %s', e)
      raise LogSendException('ElasticsearchLogsProducer exception sending log to Elasticsearch: %s' % e)
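This producer only relies on the log entry exposing a save() method, which the LogEntry Elasticsearch document elsewhere in this commit provides. A short sketch of sending one entry, assuming an Elasticsearch connection is already configured for LogEntry; the field values mirror those used in the serializer test below.

# Sketch, assuming LogEntry and a configured Elasticsearch connection.
from datetime import datetime
from data.logs_model.elastic_logs import LogEntry
from data.logs_model.logs_producer import LogSendException
from data.logs_model.logs_producer.elasticsearch_logs_producer import ElasticsearchLogsProducer

producer = ElasticsearchLogsProducer()
entry = LogEntry(random_id='123-45', ip='0.0.0.0', metadata_json='{}', datetime=datetime.utcnow())
try:
  producer.send(entry)  # internally calls entry.save()
except LogSendException:
  pass  # the caller decides whether to retry or drop the entry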
data/logs_model/logs_producer/interface.py (new file, 8 lines)
@@ -0,0 +1,8 @@
from abc import ABCMeta, abstractmethod
from six import add_metaclass

@add_metaclass(ABCMeta)
class LogProducerInterface(object):
  @abstractmethod
  def send(self, logentry):
    """ Send a log entry to the configured log infrastructure. """
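A backend only has to implement send(). As a hedged illustration (the class below is hypothetical and not part of this commit), a file-backed producer following the same convention of wrapping failures in LogSendException might look like this:

# Hypothetical example of implementing the interface; not part of this commit.
import json
from data.logs_model.logs_producer.interface import LogProducerInterface
from data.logs_model.logs_producer import LogSendException

class FileLogsProducer(LogProducerInterface):
  """ Illustrative producer appending JSON log entries to a local file. """
  def __init__(self, path):
    self._path = path

  def send(self, logentry):
    try:
      with open(self._path, 'a') as f:
        f.write(json.dumps(logentry.to_dict()) + '\n')
    except IOError as e:
      raise LogSendException('FileLogsProducer error writing log entry: %s' % e)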
data/logs_model/logs_producer/kafka_logs_producer.py (new file, 45 lines)
@@ -0,0 +1,45 @@
import logging

from kafka.errors import KafkaError, KafkaTimeoutError
from kafka import KafkaProducer

from data.logs_model.shared import epoch_ms
from data.logs_model.logs_producer.interface import LogProducerInterface
from data.logs_model.logs_producer.util import logs_json_serializer
from data.logs_model.logs_producer import LogSendException


logger = logging.getLogger(__name__)

DEFAULT_MAX_BLOCK_SECONDS = 5


class KafkaLogsProducer(LogProducerInterface):
  """ Log producer writing log entries to a Kafka stream. """
  def __init__(self, bootstrap_servers=None, topic=None, client_id=None, max_block_seconds=None):
    self.bootstrap_servers = bootstrap_servers
    self.topic = topic
    self.client_id = client_id
    self.max_block_ms = (max_block_seconds or DEFAULT_MAX_BLOCK_SECONDS) * 1000

    self._producer = KafkaProducer(bootstrap_servers=self.bootstrap_servers,
                                   client_id=self.client_id,
                                   max_block_ms=self.max_block_ms,
                                   value_serializer=logs_json_serializer)

  def send(self, logentry):
    try:
      # send() has a (max_block_ms) timeout and get() has a (max_block_ms) timeout
      # for an upper bound of 2x(max_block_ms) before guaranteed delivery
      future = self._producer.send(self.topic, logentry.to_dict(), timestamp_ms=epoch_ms(logentry.datetime))
      record_metadata = future.get(timeout=self.max_block_ms)
      assert future.succeeded
    except KafkaTimeoutError as kte:
      logger.exception('KafkaLogsProducer timeout sending log to Kafka: %s', kte)
      raise LogSendException('KafkaLogsProducer timeout sending log to Kafka: %s' % kte)
    except KafkaError as ke:
      logger.exception('KafkaLogsProducer error sending log to Kafka: %s', ke)
      raise LogSendException('KafkaLogsProducer error sending log to Kafka: %s' % ke)
    except Exception as e:
      logger.exception('KafkaLogsProducer exception sending log to Kafka: %s', e)
      raise LogSendException('KafkaLogsProducer exception sending log to Kafka: %s' % e)
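A sketch of constructing and using this producer. The broker address, topic, and client id below are placeholders, and `logentry` stands in for an object exposing to_dict() and a datetime attribute, as the producers in this commit expect; reachable brokers and an existing topic are assumed.

# Sketch only; broker address, topic, and client id are placeholders.
from data.logs_model.logs_producer import LogSendException
from data.logs_model.logs_producer.kafka_logs_producer import KafkaLogsProducer

producer = KafkaLogsProducer(bootstrap_servers=['kafka:9092'],
                             topic='logentry',
                             client_id='quay-logs',
                             max_block_seconds=5)
try:
  producer.send(logentry)  # `logentry` must expose to_dict() and .datetime
except LogSendException:
  pass  # timeouts and broker errors all surface as LogSendException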
@@ -0,0 +1,75 @@
import logging
import hashlib
import random

import boto3
from botocore.exceptions import ClientError
from botocore.client import Config

from data.logs_model.logs_producer.interface import LogProducerInterface
from data.logs_model.logs_producer.util import logs_json_serializer
from data.logs_model.logs_producer import LogSendException


logger = logging.getLogger(__name__)

KINESIS_PARTITION_KEY_PREFIX = 'logentry_partition_key_'
DEFAULT_CONNECT_TIMEOUT = 5
DEFAULT_READ_TIMEOUT = 5
MAX_RETRY_ATTEMPTS = 5
DEFAULT_MAX_POOL_CONNECTIONS = 10


def _partition_key(number_of_shards=None):
  """ Generate a partition key for AWS Kinesis stream.
  If the number of shards is specified, generate keys where the size of the key space is
  the number of shards.
  """
  key = None
  if number_of_shards is not None:
    shard_number = random.randrange(0, number_of_shards)
    key = hashlib.sha1(KINESIS_PARTITION_KEY_PREFIX + str(shard_number)).hexdigest()
  else:
    key = hashlib.sha1(KINESIS_PARTITION_KEY_PREFIX + str(random.getrandbits(256))).hexdigest()

  return key


class KinesisStreamLogsProducer(LogProducerInterface):
  """ Log producer writing log entries to an Amazon Kinesis Data Stream. """
  def __init__(self, stream_name, aws_region, aws_access_key=None, aws_secret_key=None,
               connect_timeout=None, read_timeout=None, max_retries=None,
               max_pool_connections=None):
    self._stream_name = stream_name
    self._aws_region = aws_region
    self._aws_access_key = aws_access_key
    self._aws_secret_key = aws_secret_key
    self._connect_timeout = connect_timeout or DEFAULT_CONNECT_TIMEOUT
    self._read_timeout = read_timeout or DEFAULT_READ_TIMEOUT
    self._max_retries = max_retries or MAX_RETRY_ATTEMPTS
    self._max_pool_connections = max_pool_connections or DEFAULT_MAX_POOL_CONNECTIONS

    client_config = Config(connect_timeout=self._connect_timeout,
                           read_timeout=self._read_timeout,
                           retries={'max_attempts': self._max_retries},
                           max_pool_connections=self._max_pool_connections)
    self._producer = boto3.client('kinesis', use_ssl=True,
                                  region_name=self._aws_region,
                                  aws_access_key_id=self._aws_access_key,
                                  aws_secret_access_key=self._aws_secret_key,
                                  config=client_config)

  def send(self, logentry):
    try:
      data = logs_json_serializer(logentry)
      self._producer.put_record(
        StreamName=self._stream_name,
        Data=data,
        PartitionKey=_partition_key()
      )
    except ClientError as ce:
      logger.exception('KinesisStreamLogsProducer client error sending log to Kinesis: %s', ce)
      raise LogSendException('KinesisStreamLogsProducer client error sending log to Kinesis: %s' % ce)
    except Exception as e:
      logger.exception('KinesisStreamLogsProducer exception sending log to Kinesis: %s', e)
      raise LogSendException('KinesisStreamLogsProducer exception sending log to Kinesis: %s' % e)
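A sketch of wiring up the Kinesis producer, assuming the stream already exists. The stream name and region below are placeholders, and `logentry` is again a stand-in; leaving the key arguments as None lets boto3 fall back to its default credential chain (environment, shared config, or instance role).

# Sketch only; stream name and region are placeholders.
producer = KinesisStreamLogsProducer(stream_name='logentry-stream',
                                     aws_region='us-east-1',
                                     aws_access_key=None,  # None: use boto3's default credential chain
                                     aws_secret_key=None)
producer.send(logentry)  # serialized with logs_json_serializer, routed by _partition_key()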
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-

import logging
import json
from datetime import datetime
import pytest

from data.logs_model.logs_producer.util import logs_json_serializer
from data.logs_model.elastic_logs import LogEntry


logger = logging.getLogger(__name__)


TEST_DATETIME = datetime.utcnow()

TEST_JSON_STRING = '{"a": "b", "c": "d"}'
TEST_JSON_STRING_WITH_UNICODE = u'{"éëê": "îôû"}'

VALID_LOGENTRY = LogEntry(random_id='123-45', ip='0.0.0.0', metadata_json=TEST_JSON_STRING, datetime=TEST_DATETIME)
VALID_LOGENTRY_WITH_UNICODE = LogEntry(random_id='123-45', ip='0.0.0.0', metadata_json=TEST_JSON_STRING_WITH_UNICODE, datetime=TEST_DATETIME)

VALID_LOGENTRY_EXPECTED_OUTPUT = '{"datetime": "%s", "ip": "0.0.0.0", "metadata_json": "{\\"a\\": \\"b\\", \\"c\\": \\"d\\"}", "random_id": "123-45"}' % TEST_DATETIME.isoformat()
VALID_LOGENTRY_WITH_UNICODE_EXPECTED_OUTPUT = '{"datetime": "%s", "ip": "0.0.0.0", "metadata_json": "{\\"\\u00e9\\u00eb\\u00ea\\": \\"\\u00ee\\u00f4\\u00fb\\"}", "random_id": "123-45"}' % TEST_DATETIME.isoformat()


@pytest.mark.parametrize(
  'is_valid, given_input, expected_output',
  [
    # Valid inputs
    pytest.param(True, VALID_LOGENTRY, VALID_LOGENTRY_EXPECTED_OUTPUT),
    # With unicode
    pytest.param(True, VALID_LOGENTRY_WITH_UNICODE, VALID_LOGENTRY_WITH_UNICODE_EXPECTED_OUTPUT),
  ])
def test_logs_json_serializer(is_valid, given_input, expected_output):
  if not is_valid:
    with pytest.raises(ValueError) as ve:
      data = logs_json_serializer(given_input)
  else:
    data = logs_json_serializer(given_input, sort_keys=True)
    assert data == expected_output

    # Make sure the datetime was serialized in the correct ISO8601
    datetime_str = json.loads(data)['datetime']
    assert datetime_str == TEST_DATETIME.isoformat()
data/logs_model/logs_producer/util.py (new file, 15 lines)
@@ -0,0 +1,15 @@
import json
from datetime import datetime

class LogEntryJSONEncoder(json.JSONEncoder):
  """ JSON encoder to encode datetimes to ISO8601 format. """
  def default(self, obj):
    if isinstance(obj, datetime):
      return obj.isoformat()

    return super(LogEntryJSONEncoder, self).default(obj)

def logs_json_serializer(logentry, sort_keys=False):
  """ Serializes a LogEntry to json bytes. """
  return json.dumps(logentry.to_dict(), cls=LogEntryJSONEncoder,
                    ensure_ascii=True, sort_keys=sort_keys).encode('ascii')
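A short sketch of what the serializer produces. Any object exposing to_dict() works, since only that dict is encoded; the SimpleEntry class below is illustrative and not part of this commit.

# Illustrative only: any object exposing to_dict() can be serialized this way.
from datetime import datetime
from data.logs_model.logs_producer.util import logs_json_serializer

class SimpleEntry(object):
  def __init__(self, when):
    self.datetime = when

  def to_dict(self):
    return {'datetime': self.datetime, 'ip': '0.0.0.0'}

data = logs_json_serializer(SimpleEntry(datetime(2019, 1, 1)), sort_keys=True)
# data == b'{"datetime": "2019-01-01T00:00:00", "ip": "0.0.0.0"}'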