Refactor prometheus integration
Move prometheus to SaaS and make it a plugin. Move static callers to use metrics_queue plugin. Change local-docker to support different quay clone dirnames. Change prom_aggregator to use logrus.
This commit is contained in:
parent
3d9acf2fff
commit
668a8edc50
10 changed files with 216 additions and 161 deletions
|
@ -4,20 +4,32 @@ import time
|
|||
|
||||
from functools import wraps
|
||||
from Queue import Queue, Full
|
||||
from util.prometheus import Histogram, Counter
|
||||
|
||||
from flask import g, request
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
resp_time = Histogram('response_time', 'HTTP response time in seconds', labelnames=['endpoint'])
|
||||
resp_code = Counter('response_code', 'HTTP response code', labelnames=['endpoint', 'code'])
|
||||
non_200 = Counter('response_non200', 'Non-200 HTTP response codes', labelnames=['endpoint'])
|
||||
|
||||
API_BUCKETS = [.01, .025, .05, .1, .25, .5, 1.0, 2.5, 5.0]
|
||||
|
||||
|
||||
class MetricQueue(object):
|
||||
def __init__(self):
|
||||
def __init__(self, prom):
    """Create the metric queue and the prometheus collectors it exposes.

    Args:
        prom: object providing ``create_histogram``, ``create_counter`` and
            ``create_gauge`` factories. Each collector returned is stored as
            a public attribute so callers can report through the queue
            object (for example ``metric_queue.resp_time``).
    """
    # The queue itself is only allocated by enable().
    self._queue = None

    # Per-endpoint HTTP metrics.
    self.resp_time = prom.create_histogram('response_time', 'HTTP response time in seconds',
                                           labelnames=['endpoint'], buckets=API_BUCKETS)
    self.resp_code = prom.create_counter('response_code', 'HTTP response code',
                                         labelnames=['endpoint', 'code'])
    self.non_200 = prom.create_counter('response_non200', 'Non-200 HTTP response codes',
                                       labelnames=['endpoint'])

    # Multipart upload metrics. Metric names must be plain identifiers
    # (letters, digits, underscores, colons), so the end counter uses the
    # same bare naming scheme as the start counter.
    self.multipart_upload_start = prom.create_counter('multipart_upload_start',
                                                      'Multipart upload starts.')
    self.multipart_upload_end = prom.create_counter('multipart_upload_end',
                                                    'Multipart upload ends.',
                                                    labelnames=['type'])

    # Build system gauges.
    self.build_capacity_shortage = prom.create_gauge('build_capacity_shortage',
                                                     'Build capacity shortage.')
    self.percent_building = prom.create_gauge('build_percent_building', 'Percent building.')
|
||||
|
||||
def enable(self, maxsize=10000):
|
||||
self._queue = Queue(maxsize)
|
||||
|
@ -40,13 +52,16 @@ class MetricQueue(object):
|
|||
def get_nowait(self):
    """Pop and return the next queued item without blocking.

    Delegates to the underlying queue's ``get_nowait``; anything that call
    raises is propagated to the caller unchanged.
    """
    pending = self._queue
    return pending.get_nowait()
|
||||
|
||||
|
||||
def time_blueprint(bp, metric_queue):
    """Install request-timing hooks on a blueprint.

    Registers ``time_before_request`` as the blueprint's before-request hook
    and the callable produced by ``time_after_request`` (built from the
    blueprint's name and ``metric_queue``) as its after-request hook.
    """
    bp.before_request(time_before_request)
    after_hook = time_after_request(bp.name, metric_queue)
    bp.after_request(after_hook)
|
||||
|
||||
|
||||
def time_before_request():
    """Stamp the current wall-clock time on ``g`` as ``_request_start_time``."""
    started_at = time.time()
    setattr(g, '_request_start_time', started_at)
|
||||
|
||||
|
||||
def time_after_request(name, metric_queue):
|
||||
def f(r):
|
||||
start = getattr(g, '_request_start_time', None)
|
||||
|
@ -59,18 +74,19 @@ def time_after_request(name, metric_queue):
|
|||
metric_queue.put('ResponseTime', dur, dimensions=dims, unit='Seconds')
|
||||
metric_queue.put('ResponseCode', r.status_code, dimensions=dims)
|
||||
|
||||
resp_time.Observe(dur, labelvalues=[request.endpoint])
|
||||
resp_code.Inc(labelvalues=[request.endpoint, r.status_code])
|
||||
metric_queue.resp_time.Observe(dur, labelvalues=[request.endpoint])
|
||||
metric_queue.resp_code.Inc(labelvalues=[request.endpoint, r.status_code])
|
||||
|
||||
if r.status_code >= 500:
|
||||
metric_queue.put('5XXResponse', 1, dimensions={'name': name})
|
||||
elif r.status_code < 200 or r.status_code >= 300:
|
||||
metric_queue.put('Non200Response', 1, dimensions={'name': name})
|
||||
non_200.Inc(labelvalues=[request.endpoint])
|
||||
metric_queue.non_200.Inc(labelvalues=[request.endpoint])
|
||||
|
||||
return r
|
||||
return f
|
||||
|
||||
|
||||
def time_decorator(name, metric_queue):
|
||||
after = time_after_request(name, metric_queue)
|
||||
def decorator(func):
|
||||
|
|
157
util/saas/prometheus.py
Normal file
157
util/saas/prometheus.py
Normal file
|
@ -0,0 +1,157 @@
|
|||
import datetime
|
||||
import json
|
||||
import logging
|
||||
|
||||
from Queue import Queue, Full, Empty
|
||||
from threading import Thread
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
QUEUE_MAX = 1000
|
||||
MAX_BATCH_SIZE = 100
|
||||
REGISTER_WAIT = datetime.timedelta(hours=1)
|
||||
|
||||
|
||||
class _QueueSender(Thread):
    """Daemon thread that drains a queue of serialized requests and POSTs
    them in newline-joined batches to ``<url>/call``.
    """

    def __init__(self, queue, url, registered):
        """
        Args:
            queue: queue of already-serialized (string) requests to send.
            url: base URL of the aggregator; batches go to ``url + '/call'``.
            registered: list of serialized registration requests. It is
                replayed to the aggregator when a batch is answered with a
                500, at most once per REGISTER_WAIT.
        """
        Thread.__init__(self)
        self.daemon = True
        # Earliest time at which registrations may be replayed again.
        self.next_register = datetime.datetime.now()
        self._queue = queue
        self._url = url
        self._registered = registered

    def _next_batch(self):
        """Block until one request is available, then gather up to
        MAX_BATCH_SIZE requests in total without blocking further.
        """
        batch = [self._queue.get()]
        while len(batch) < MAX_BATCH_SIZE:
            try:
                batch.append(self._queue.get_nowait())
            except Empty:
                break
        return batch

    def run(self):
        """Send batches forever; failures are logged and never end the loop."""
        while True:
            reqs = self._next_batch()

            try:
                # NOTE(review): no timeout is passed to requests.post, so a
                # hung aggregator blocks this thread indefinitely -- confirm
                # whether a timeout is wanted here.
                resp = requests.post(self._url + '/call', '\n'.join(reqs))
                if resp.status_code == 500 and self.next_register <= datetime.datetime.now():
                    # Replay every registration. The batch that got the 500
                    # is not re-sent afterwards.
                    resp = requests.post(self._url + '/call', '\n'.join(self._registered))
                    self.next_register = datetime.datetime.now() + REGISTER_WAIT
                    logger.debug('Register returned %s for %s metrics; setting next to %s',
                                 resp.status_code, len(self._registered), self.next_register)
                elif resp.status_code != 200:
                    logger.debug('Failed sending to prometheus: %s: %s: %s', resp.status_code,
                                 resp.text, ', '.join(reqs))
                else:
                    logger.debug('Sent %d prometheus metrics', len(reqs))
            except Exception:
                # Catch Exception rather than everything, so interpreter-exit
                # and kill signals (BaseException subclasses) are not
                # swallowed by the sender loop.
                logger.exception('Failed to write to prometheus aggregator: %s', reqs)
|
||||
|
||||
|
||||
class Prometheus(object):
    """Front end for the prometheus aggregator.

    Collector calls are serialized to JSON and handed to a background
    ``_QueueSender`` thread. When constructed with a url of None no thread is
    started and ``enqueue`` does nothing.
    """

    def __init__(self, url):
        """Start a sender thread for ``url``, or stay disabled if it is None."""
        self._registered = []
        self._url = url

        if url is None:
            self._queue = None
            logger.debug('Prometheus aggregator disabled')
            return

        self._queue = Queue(QUEUE_MAX)
        self._sender = _QueueSender(self._queue, url, self._registered)
        self._sender.start()
        logger.debug('Prometheus aggregator sending to %s', url)

    def enqueue(self, call, data):
        """Serialize one call and hand it to the sender.

        'register' calls are remembered in the registration list rather than
        queued; every other call is put on the queue without blocking and is
        dropped if the queue is full. Does nothing when no queue exists.
        """
        if not self._queue:
            return

        payload = json.dumps({'Call': call, 'Data': data})

        if call == 'register':
            self._registered.append(payload)
        else:
            try:
                self._queue.put_nowait(payload)
            except Full:
                # A full queue means either that no aggregator was enabled
                # (probably enterprise mode, where we do not care) or that
                # the aggregator is slow to respond, in which case the
                # response timeout error is reported elsewhere. Neither case
                # needs an error printed here.
                pass

    def _create_collector(self, collector_type, args, kwargs):
        """Build a ``_Collector`` of the given type that reports via ``enqueue``."""
        return _Collector(self.enqueue, collector_type, *args, **kwargs)

    def create_gauge(self, *args, **kwargs):
        """Return a collector of type 'Gauge'."""
        return self._create_collector('Gauge', args, kwargs)

    def create_counter(self, *args, **kwargs):
        """Return a collector of type 'Counter'."""
        return self._create_collector('Counter', args, kwargs)

    def create_summary(self, *args, **kwargs):
        """Return a collector of type 'Summary'."""
        return self._create_collector('Summary', args, kwargs)

    def create_histogram(self, *args, **kwargs):
        """Return a collector of type 'Histogram'."""
        return self._create_collector('Histogram', args, kwargs)

    def create_untyped(self, *args, **kwargs):
        """Return a collector of type 'Untyped'."""
        return self._create_collector('Untyped', args, kwargs)
|
||||
|
||||
|
||||
class PrometheusPlugin(object):
    """Application plugin that owns a ``Prometheus`` instance and forwards
    unknown attribute lookups to it.
    """

    def __init__(self, app=None):
        """Initialize immediately when ``app`` is given; otherwise leave
        ``state`` as None.
        """
        self.app = app
        if app is not None:
            self.state = self.init_app(app)
        else:
            self.state = None

    def init_app(self, app):
        """Create a ``Prometheus`` from the app config and register it.

        Reads ``PROMETHEUS_AGGREGATOR_URL`` from ``app.config``, stores the
        new instance under ``app.extensions['prometheus']`` and returns it.
        """
        prom_url = app.config.get('PROMETHEUS_AGGREGATOR_URL')
        logger.debug('Initializing prometheus with aggregator url: %s', prom_url)
        prometheus = Prometheus(prom_url)

        # register extension with app
        app.extensions = getattr(app, 'extensions', {})
        app.extensions['prometheus'] = prometheus
        return prometheus

    def __getattr__(self, name):
        """Forward lookups that fail on the plugin to ``state``.

        Returns None when ``state`` is None or lacks the attribute.
        """
        # __getattr__ only runs after normal lookup has failed. Reading
        # self.state here would re-enter this method without bound if
        # 'state' is not in the instance dict yet (for example on an
        # instance made via __new__), so read the instance dict directly.
        state = self.__dict__.get('state')
        return getattr(state, name, None)
|
||||
|
||||
|
||||
class _Collector(object):
    """Proxy for one remote collector.

    Construction enqueues a 'register' call. Afterwards any public attribute
    access yields a function that enqueues a 'put' call whose 'Method' is the
    attribute name.
    """

    def __init__(self, enqueue_method, c_type, name, c_help, namespace='', subsystem='', **kwargs):
        """
        Args:
            enqueue_method: callable taking ``(call, data)``.
            c_type: collector type string, sent as 'Type'.
            name: metric name, sent as 'Name'.
            c_help: help text, sent as 'Help' with the registration only.
            namespace: sent as 'Namespace'.
            subsystem: sent as 'Subsystem'.
            **kwargs: extra registration parameters, passed through as given.
        """
        self._enqueue_method = enqueue_method

        # Identity fields included in every call made for this collector.
        self._base_args = {
            'Name': name,
            'Namespace': namespace,
            'Subsystem': subsystem,
            'Type': c_type,
        }

        # The identity fields and 'Help' override same-named extra kwargs.
        registration_params = dict(kwargs)
        registration_params.update(self._base_args)
        registration_params['Help'] = c_help

        self._enqueue_method('register', registration_params)

    def __getattr__(self, method):
        """Return a function that enqueues a 'put' for ``method``.

        Raises:
            AttributeError: for underscore-prefixed names, so protocol probes
                (``__deepcopy__``, ``__getstate__``, ...) and unset private
                attributes are not mistaken for remote methods.
        """
        if method.startswith('_'):
            raise AttributeError(method)

        def f(value=0, labelvalues=()):
            data = dict(self._base_args)
            data.update({
                'Value': value,
                # Label values are always sent as strings.
                'LabelValues': [str(i) for i in labelvalues],
                'Method': method,
            })
            self._enqueue_method('put', data)

        return f
|
Reference in a new issue