# quay/data/logs_model/elastic_logs.py
import os
import logging
import re
from datetime import datetime, timedelta
from requests_aws4auth import AWS4Auth
from elasticsearch import RequestsHttpConnection
from elasticsearch.exceptions import NotFoundError, AuthorizationException
from elasticsearch_dsl import Index, Document, Integer, Date, Text, Ip, Keyword
from elasticsearch_dsl.connections import connections
logger = logging.getLogger(__name__)
# Name of the connection used for Elasticsearch's template API
ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS = 'logentry_template'
# Prefix of autogenerated indices
INDEX_NAME_PREFIX = 'logentry_'
# Time-based index date format
INDEX_DATE_FORMAT = '%Y-%m-%d'
# Timeout for default connection
ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT = 15
# Timeout for template api Connection
ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT = 60
# Force an index template update
ELASTICSEARCH_FORCE_INDEX_TEMPLATE_UPDATE = os.environ.get('FORCE_INDEX_TEMPLATE_UPDATE', '')
# Valid index prefix pattern
VALID_INDEX_PATTERN = r'^((?!\.$|\.\.$|[-_+])([^A-Z:\/*?\"<>|,# ]){1,255})$'
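# Illustrative examples (not part of the original source): 'logentry_' and
# 'prodlogs' match this pattern, while '_logs' (leading underscore),
# 'LogEntry_' (contains uppercase) and 'log entry' (contains a space) do not.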
class LogEntry(Document):
# random_id is the tie-breaker for sorting in pagination.
# random_id is also used for deduplication of records when using an "at-least-once" delivery stream.
# Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-after.html
#
# We don't use the _id of a document since `doc_values` are not built for this field:
# An on-disk data structure that stores the same data in a columnar format
# for optimized sorting and aggregations.
# Reference: https://github.com/elastic/elasticsearch/issues/35369
random_id = Text(fields={'keyword': Keyword()})
kind_id = Integer()
account_id = Integer()
performer_id = Integer()
repository_id = Integer()
ip = Ip()
metadata_json = Text()
datetime = Date()
_initialized = False
@classmethod
def init(cls, index_prefix, index_settings=None, skip_template_init=False):
"""
Create the index template, and populate LogEntry's mapping and index settings.
"""
wildcard_index = Index(name=index_prefix + '*')
wildcard_index.settings(**(index_settings or {}))
wildcard_index.document(cls)
cls._index = wildcard_index
cls._index_prefix = index_prefix
if not skip_template_init:
cls.create_or_update_template()
# Since the elasticsearch-dsl API requires the document's index to be defined as an inner class at the class level,
# this method must be called before `save` can be used.
cls._initialized = True
@classmethod
def create_or_update_template(cls):
assert cls._index and cls._index_prefix
index_template = cls._index.as_template(cls._index_prefix)
index_template.save(using=ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS)
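# Sketch of the resulting API call, assuming the default 'logentry_' prefix:
# saving the template issues roughly
#   PUT _template/logentry_
#   {"index_patterns": ["logentry_*"], "mappings": {...}, "settings": {...}}
# so any index later created under that prefix picks up LogEntry's mapping.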
def save(self, **kwargs):
# We group the logs into separate indices by year, month and day, so that
# dropping indices that fall outside the retention range is easy.
#
# NOTE: This is only used if logging directly to Elasticsearch
# When using Kinesis or Kafka, the consumer of these streams
# will be responsible for the management of the indices' lifecycle.
assert LogEntry._initialized
kwargs['index'] = self.datetime.strftime(self._index_prefix + INDEX_DATE_FORMAT)
return super(LogEntry, self).save(**kwargs)
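# For example (dates illustrative only): an entry whose datetime falls on
# 2020-03-24 is written to the index 'logentry_2020-03-24' under the default prefix.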
class ElasticsearchLogs(object):
"""
Model for logs operations stored in an Elasticsearch cluster.
"""
def __init__(self, host=None, port=None, access_key=None, secret_key=None, aws_region=None,
index_settings=None, use_ssl=True, index_prefix=INDEX_NAME_PREFIX):
# For options in index_settings, refer to:
# https://www.elastic.co/guide/en/elasticsearch/guide/master/_index_settings.html
# Some index settings are set at index creation time and therefore should NOT
# be changed once the index has been created.
self._host = host
self._port = port
self._access_key = access_key
self._secret_key = secret_key
self._aws_region = aws_region
self._index_prefix = index_prefix
self._index_settings = index_settings
self._use_ssl = use_ssl
self._client = None
self._initialized = False
def _initialize(self):
"""
Initialize a connection to an ES cluster and
create an index template if it does not exist.
"""
if not self._initialized:
http_auth = None
if self._access_key and self._secret_key and self._aws_region:
http_auth = AWS4Auth(self._access_key, self._secret_key, self._aws_region, 'es')
elif self._access_key and self._secret_key:
http_auth = (self._access_key, self._secret_key)
else:
logger.warning("Connecting to Elasticsearch without HTTP auth")
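# A note on the fallbacks above (descriptive only): with an access key, secret key
# and AWS region, requests are SigV4-signed for the managed 'es' service via
# AWS4Auth; with only an access/secret pair, the tuple is sent as HTTP basic auth.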
self._client = connections.create_connection(
hosts=[{
'host': self._host,
'port': self._port
}],
http_auth=http_auth,
use_ssl=self._use_ssl,
verify_certs=True,
connection_class=RequestsHttpConnection,
timeout=ELASTICSEARCH_DEFAULT_CONNECTION_TIMEOUT,
)
# Create a second connection with a timeout of 60s instead of the default 15s.
# For some reason the PUT template API can take anywhere between
# 10s and 30s on the test cluster.
# This only needs to be done once to initialize the index template
connections.create_connection(
alias=ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS,
hosts=[{
'host': self._host,
'port': self._port
}],
http_auth=http_auth,
use_ssl=self._use_ssl,
verify_certs=True,
connection_class=RequestsHttpConnection,
timeout=ELASTICSEARCH_TEMPLATE_CONNECTION_TIMEOUT,
)
try:
force_template_update = ELASTICSEARCH_FORCE_INDEX_TEMPLATE_UPDATE.lower() == 'true'
self._client.indices.get_template(self._index_prefix)
LogEntry.init(self._index_prefix, self._index_settings,
skip_template_init=not force_template_update)
except NotFoundError:
LogEntry.init(self._index_prefix, self._index_settings, skip_template_init=False)
finally:
try:
connections.remove_connection(ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS)
except KeyError as ke:
logger.exception('Elasticsearch connection not found to remove %s: %s',
ELASTICSEARCH_TEMPLATE_CONNECTION_ALIAS, ke)
self._initialized = True
def index_name(self, day):
""" Return an index name for the given day. """
return self._index_prefix + day.strftime(INDEX_DATE_FORMAT)
def index_exists(self, index):
try:
return index in self._client.indices.get(index)
except NotFoundError:
return False
@staticmethod
def _valid_index_prefix(prefix):
""" Check that the given index prefix is valid with the set of
indices used by this class.
"""
return re.match(VALID_INDEX_PATTERN, prefix) is not None
def _valid_index_name(self, index):
""" Check that the given index name is valid and follows the format:
<index_prefix>YYYY-MM-DD
"""
if not ElasticsearchLogs._valid_index_prefix(index):
return False
if not index.startswith(self._index_prefix) or len(index) > 255:
return False
index_dt_str = index.split(self._index_prefix, 1)[-1]
try:
datetime.strptime(index_dt_str, INDEX_DATE_FORMAT)
return True
except ValueError:
logger.exception('Invalid date format (YYYY-MM-DD) for index: %s', index)
return False
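# Illustrative examples (assuming the default 'logentry_' prefix):
# 'logentry_2020-03-24' is valid, while 'logentry_20200324' (wrong date format)
# and 'otherprefix_2020-03-24' (wrong prefix) are not.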
def can_delete_index(self, index, cutoff_date):
""" Check if the given index can be deleted based on the given index's date and cutoff date. """
assert self._valid_index_name(index)
index_dt = datetime.strptime(index[len(self._index_prefix):], INDEX_DATE_FORMAT)
return index_dt < cutoff_date and cutoff_date - index_dt >= timedelta(days=1)
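# Worked example (dates illustrative only): with a cutoff_date of 2020-03-24 12:00,
# 'logentry_2020-03-23' can be deleted (a full day before the cutoff), while
# 'logentry_2020-03-24' cannot (only 12 hours before the cutoff).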
def list_indices(self):
self._initialize()
try:
return self._client.indices.get(self._index_prefix + '*').keys()
except NotFoundError as nfe:
logger.exception('`%s` indices not found: %s', self._index_prefix, nfe.info)
return []
except AuthorizationException as ae:
logger.exception('Unauthorized for indices `%s`: %s', self._index_prefix, ae.info)
return None
def delete_index(self, index):
self._initialize()
assert self._valid_index_name(index)
try:
self._client.indices.delete(index)
return index
except NotFoundError as nfe:
logger.exception('`%s` index not found: %s', index, nfe.info)
return None
except AuthorizationException as ae:
logger.exception('Unauthorized to delete index `%s`: %s', index, ae.info)
return None
def configure_es(host, port, access_key=None, secret_key=None, aws_region=None,
index_prefix=None, use_ssl=True, index_settings=None):
"""
For options in index_settings, refer to:
https://www.elastic.co/guide/en/elasticsearch/guide/master/_index_settings.html
Some index settings are set at index creation time and therefore should NOT
be changed once the index has been created.
"""
es_client = ElasticsearchLogs(host=host, port=port, access_key=access_key, secret_key=secret_key,
aws_region=aws_region, index_prefix=index_prefix or INDEX_NAME_PREFIX,
use_ssl=use_ssl, index_settings=index_settings)
es_client._initialize()
return es_client
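# A minimal usage sketch (not part of the original module); the host, port and
# credentials below are hypothetical placeholders:
#
#   es = configure_es('es.example.internal', 9200,
#                     access_key='elastic', secret_key='changeme',
#                     index_prefix='logentry_')
#   entry = LogEntry(random_id='abc123', kind_id=1, account_id=42,
#                    repository_id=7, ip='10.0.0.1', metadata_json='{}',
#                    datetime=datetime.utcnow())
#   entry.save()
#   es.list_indices()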