Use prometheus as a metric backend
This entails writing a metric aggregation program: under Python gunicorn, each worker process has its own memory and therefore its own copy of the metrics. The Python client is a simple wrapper that forwards metrics to the aggregator via web requests.
This commit is contained in:
parent
781f2eec72
commit
3d9acf2fff
10 changed files with 502 additions and 0 deletions
122
util/prometheus/__init__.py
Normal file
122
util/prometheus/__init__.py
Normal file
|
@ -0,0 +1,122 @@
|
|||
import datetime
|
||||
import json
|
||||
import logging
|
||||
|
||||
from Queue import Queue, Full, Empty
|
||||
from threading import Thread
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)


# Base URL of the metric aggregator service; populated by init().
URL = None
# Maximum number of pending serialized calls; once full, enqueue() drops new ones.
QUEUE_MAX = 1000
# Upper bound on the number of serialized calls POSTed per request.
MAX_BATCH_SIZE = 100
# Minimum interval between replays of the accumulated 'register' calls
# after the aggregator answers with a 500.
REGISTER_WAIT = datetime.timedelta(hours=1)

# Pending serialized calls for the sender thread; set to None by init()
# when no aggregator URL is configured, turning enqueue() into a no-op.
queue = Queue(QUEUE_MAX)
# Every 'register' call seen so far, kept for replay to the aggregator.
registered = []
|
||||
|
||||
def init(url):
    """Configure the aggregator endpoint and start the background sender.

    When `url` is empty or None the aggregator is disabled: the module-level
    queue is replaced with None so that enqueue() silently drops all metrics.
    """
    global URL, queue

    if url:
        URL = url
        worker = _QueueSender()
        worker.start()
        logger.debug('Prometheus aggregator sending to %s', URL)
    else:
        # No aggregator configured (e.g. enterprise mode); disable queueing.
        logger.debug('Prometheus aggregator not started: empty URL')
        queue = None
|
||||
|
||||
def enqueue(call, data):
    """Serialize an aggregator call and hand it to the sender thread.

    'register' calls are not queued; they are remembered in the module-level
    `registered` list so the sender can replay them to the aggregator later.
    All other calls are dropped silently when the queue is disabled or full.
    """
    if not queue:
        return

    serialized = json.dumps({
        'Call': call,
        'Data': data,
    })

    if call == 'register':
        registered.append(serialized)
        return

    try:
        queue.put_nowait(serialized)
    except Full:
        # A full queue means either 1) no aggregator was enabled, or 2) the
        # aggregator is responding slowly. Case 1 is probably enterprise mode
        # and we don't care; in case 2 the response timeout error is already
        # reported elsewhere. Either way, no error needs to be printed here.
        pass
|
||||
|
||||
class _QueueSender(Thread):
    """Daemon thread that drains the metric queue and POSTs batches to the aggregator.

    When the aggregator answers with HTTP 500 (e.g. it restarted and lost the
    metric definitions), the accumulated 'register' calls are replayed, rate
    limited to at most once per REGISTER_WAIT.
    """

    def __init__(self):
        Thread.__init__(self)
        # Daemon thread: never blocks interpreter shutdown.
        self.daemon = True
        # Earliest time at which a re-registration replay may be attempted.
        self.next_register = datetime.datetime.now()

    def run(self):
        while True:
            # Block until at least one item is available, then opportunistically
            # batch up to MAX_BATCH_SIZE more without waiting.
            reqs = []
            reqs.append(queue.get())

            while len(reqs) < MAX_BATCH_SIZE:
                try:
                    req = queue.get_nowait()
                    reqs.append(req)
                except Empty:
                    break

            try:
                resp = requests.post(URL + '/call', '\n'.join(reqs))
                if resp.status_code == 500 and self.next_register <= datetime.datetime.now():
                    # Aggregator doesn't recognize our metrics; replay all
                    # registrations, throttled by REGISTER_WAIT.
                    resp = requests.post(URL + '/call', '\n'.join(registered))
                    self.next_register = datetime.datetime.now() + REGISTER_WAIT
                    logger.debug('Register returned %s for %s metrics; setting next to %s',
                                 resp.status_code, len(registered), self.next_register)
                elif resp.status_code != 200:
                    logger.debug('Failed sending to prometheus: %s: %s: %s',
                                 resp.status_code, resp.text, ', '.join(reqs))
                else:
                    logger.debug('Sent %d prometheus metrics', len(reqs))
            except Exception:
                # Was a bare `except:`, which would also swallow SystemExit and
                # KeyboardInterrupt inside this daemon thread; limit the catch
                # to ordinary errors (connection failures, timeouts, ...).
                logger.exception('Failed to write to prometheus aggregator: %s', reqs)
|
||||
|
||||
class _Collector(object):
    """Base class for metric types: registers itself, then proxies calls.

    Any attribute access (e.g. `.Inc`, `.Observe`) yields a function that
    ships a 'put' call to the aggregator, so the supported methods are
    whatever the aggregator understands for the metric's type.
    """

    def __init__(self, name, help, namespace='', subsystem='', **kwargs):
        self._name = name
        self._namespace = namespace
        self._subsystem = subsystem
        kwargs.update({
            'Name': name,
            'Namespace': namespace,
            'Subsystem': subsystem,
            'Type': self.__class__.__name__,
            'Help': help,
        })
        enqueue('register', kwargs)

    def __getattr__(self, method):
        # Every unknown attribute becomes a sender for the aggregator
        # method of the same name.
        def proxy(value=0, labelvalues=()):
            enqueue('put', {
                'Name': self._name,
                'Subsystem': self._subsystem,
                'Namespace': self._namespace,
                'Type': self.__class__.__name__,
                'Value': value,
                'LabelValues': [str(lv) for lv in labelvalues],
                'Method': method,
            })
        return proxy
|
||||
|
||||
class Gauge(_Collector):
    """A metric value that can arbitrarily go up and down."""
    pass
|
||||
|
||||
class Counter(_Collector):
    """A cumulative, monotonically increasing metric value."""
    pass
|
||||
|
||||
class Summary(_Collector):
    """A metric that samples observations (e.g. durations or sizes)."""
    pass
|
||||
|
||||
class Histogram(_Collector):
    """A metric that counts observations into configurable buckets."""
    pass
|
||||
|
||||
class Untyped(_Collector):
    """A metric with no declared Prometheus type."""
    pass
|
|
@ -4,12 +4,17 @@ import time
|
|||
|
||||
from functools import wraps
|
||||
from Queue import Queue, Full
|
||||
from util.prometheus import Histogram, Counter
|
||||
|
||||
from flask import g, request
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Per-endpoint HTTP collectors sent to the Prometheus aggregator.
resp_time = Histogram('response_time', 'HTTP response time in seconds', labelnames=['endpoint'])
resp_code = Counter('response_code', 'HTTP response code', labelnames=['endpoint', 'code'])
non_200 = Counter('response_non200', 'Non-200 HTTP response codes', labelnames=['endpoint'])
|
||||
|
||||
class MetricQueue(object):
|
||||
    def __init__(self):
        # Backing queue is not created here; it stays None until configured
        # elsewhere — presumably puts before setup are dropped. TODO(review):
        # confirm against the rest of the class (not visible in this hunk).
        self._queue = None
|
||||
|
@ -54,10 +59,14 @@ def time_after_request(name, metric_queue):
|
|||
metric_queue.put('ResponseTime', dur, dimensions=dims, unit='Seconds')
|
||||
metric_queue.put('ResponseCode', r.status_code, dimensions=dims)
|
||||
|
||||
resp_time.Observe(dur, labelvalues=[request.endpoint])
|
||||
resp_code.Inc(labelvalues=[request.endpoint, r.status_code])
|
||||
|
||||
if r.status_code >= 500:
|
||||
metric_queue.put('5XXResponse', 1, dimensions={'name': name})
|
||||
elif r.status_code < 200 or r.status_code >= 300:
|
||||
metric_queue.put('Non200Response', 1, dimensions={'name': name})
|
||||
non_200.Inc(labelvalues=[request.endpoint])
|
||||
|
||||
return r
|
||||
return f
|
||||
|
|
Reference in a new issue