Use prometheus as a metric backend

This entails writing a metric aggregation program, because under gunicorn
each Python worker process has its own memory and therefore its own metrics.
The Python client is a simple wrapper that makes web requests to the aggregator.
This commit is contained in:
Matt Jibson 2015-11-20 15:32:17 -05:00 committed by Joseph Schorr
parent 781f2eec72
commit 3d9acf2fff
10 changed files with 502 additions and 0 deletions

122
util/prometheus/__init__.py Normal file
View file

@ -0,0 +1,122 @@
import datetime
import json
import logging
from Queue import Queue, Full, Empty
from threading import Thread
import requests
# Module-level logger for the Prometheus aggregator client.
logger = logging.getLogger(__name__)
# Base URL of the aggregator; stays None until init() receives a non-empty URL.
URL = None
# Capacity of the pending-request queue; enqueue() silently drops requests
# once the queue is full.
QUEUE_MAX = 1000
# Upper bound on how many queued requests the sender posts in one HTTP call.
MAX_BATCH_SIZE = 100
# Minimum interval between re-sends of the 'register' requests after the
# aggregator answers with a 500.
REGISTER_WAIT = datetime.timedelta(hours=1)
# Serialized requests waiting to be sent; init() replaces this with None when
# no URL is configured, which turns enqueue() into a no-op.
queue = Queue(QUEUE_MAX)
# Serialized 'register' requests, kept so the sender can post them again later.
registered = []
def init(url):
    """Configure the aggregator endpoint and start the background sender.

    When `url` is empty, nothing is started and the module-level queue is
    replaced with None, which makes enqueue() a no-op. Otherwise the URL is
    stored in the module-level `URL` and a `_QueueSender` thread is started.
    """
    global URL, queue

    if url:
        URL = url
        _QueueSender().start()
        logger.debug('Prometheus aggregator sending to %s', URL)
    else:
        logger.debug('Prometheus aggregator not started: empty URL')
        queue = None
def enqueue(call, data):
    """Serialize one aggregator request and hand it off for sending.

    The request is encoded as a JSON object with the keys 'Call' and 'Data'.
    'register' requests are only recorded in the module-level `registered`
    list; every other request is placed on the module-level queue without
    blocking. Does nothing when the queue has been disabled.
    """
    if not queue:
        return

    payload = json.dumps({
        'Call': call,
        'Data': data,
    })

    if call == 'register':
        registered.append(payload)
    else:
        try:
            queue.put_nowait(payload)
        except Full:
            # A full queue means either 1) no aggregator was enabled, or 2) the
            # aggregator is slow to respond to requests. Case 1 is probably
            # enterprise mode, where we don't care. In case 2 the response
            # timeout error is reported elsewhere. Either way there is nothing
            # worth logging here, so the request is dropped.
            pass
class _QueueSender(Thread):
    """Daemon thread that drains the module-level queue and posts batches
    of serialized requests to the aggregator's '/call' endpoint."""

    # Seconds to wait for the aggregator before giving up on a request.
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would stall this thread forever while the queue fills up
    # and metrics are dropped.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        Thread.__init__(self)
        # Daemon thread: it must never keep the process alive on shutdown.
        self.daemon = True
        # Earliest time at which the 'register' requests may be re-sent.
        self.next_register = datetime.datetime.now()

    def run(self):
        """Loop forever: block for one request, batch up whatever else is
        immediately available (up to MAX_BATCH_SIZE) and post the batch."""
        while True:
            # Block until at least one request is available...
            reqs = [queue.get()]
            # ...then opportunistically take more without blocking.
            while len(reqs) < MAX_BATCH_SIZE:
                try:
                    reqs.append(queue.get_nowait())
                except Empty:
                    break

            try:
                resp = requests.post(URL + '/call', '\n'.join(reqs),
                                     timeout=self.REQUEST_TIMEOUT)
                if resp.status_code == 500 and self.next_register <= datetime.datetime.now():
                    # On a 500, re-send every recorded 'register' request, at
                    # most once per REGISTER_WAIT.
                    # NOTE(review): presumably a 500 means the aggregator does
                    # not know the metrics (e.g. it restarted) -- confirm.
                    resp = requests.post(URL + '/call', '\n'.join(registered),
                                         timeout=self.REQUEST_TIMEOUT)
                    self.next_register = datetime.datetime.now() + REGISTER_WAIT
                    logger.debug('Register returned %s for %s metrics; setting next to %s', resp.status_code, len(registered), self.next_register)
                elif resp.status_code != 200:
                    logger.debug('Failed sending to prometheus: %s: %s: %s', resp.status_code, resp.text, ', '.join(reqs))
                else:
                    logger.debug('Sent %d prometheus metrics', len(reqs))
            except Exception:
                # Best effort: log the failure (including timeouts) and keep
                # the sender alive. `Exception` rather than a bare except so
                # SystemExit/KeyboardInterrupt are not swallowed.
                logger.exception('Failed to write to prometheus aggregator: %s', reqs)
class _Collector(object):
    """Client-side proxy for a metric held by the aggregator.

    Creating an instance enqueues a 'register' request describing the metric.
    Any public attribute access returns a callable that enqueues a 'put'
    request naming that attribute as the method to invoke.
    """

    def __init__(self, name, help, namespace='', subsystem='', **kwargs):
        """Record the metric's identity and enqueue its registration.

        `name`, `help`, `namespace` and `subsystem` are sent under the keys
        'Name', 'Help', 'Namespace' and 'Subsystem'; 'Type' is the concrete
        class name. Extra keyword arguments (for example `labelnames`) are
        forwarded unchanged in the same payload.
        """
        self._name = name
        self._namespace = namespace
        self._subsystem = subsystem
        kwargs['Name'] = name
        kwargs['Namespace'] = namespace
        kwargs['Subsystem'] = subsystem
        kwargs['Type'] = self.__class__.__name__
        kwargs['Help'] = help
        enqueue('register', kwargs)

    def __getattr__(self, method):
        """Return a callable that enqueues a 'put' request for `method`.

        Raises AttributeError for names starting with an underscore.
        """
        # __getattr__ is only consulted for attributes that are missing.
        # Answering dunder/private lookups with a metric sender would make
        # protocol probes such as getattr(obj, '__deepcopy__', None) receive
        # a callable that enqueues a bogus metric, so report those as absent.
        if method.startswith('_'):
            raise AttributeError(method)

        def f(value=0, labelvalues=()):
            data = {
                'Name': self._name,
                'Subsystem': self._subsystem,
                'Namespace': self._namespace,
                'Type': self.__class__.__name__,
                'Value': value,
                # Label values are always sent as strings.
                'LabelValues': [str(i) for i in labelvalues],
                'Method': method,
            }
            enqueue('put', data)
        return f
class Gauge(_Collector):
    """Collector whose requests carry the type name 'Gauge'."""
class Counter(_Collector):
    """Collector whose requests carry the type name 'Counter'."""
class Summary(_Collector):
    """Collector whose requests carry the type name 'Summary'."""
class Histogram(_Collector):
    """Collector whose requests carry the type name 'Histogram'."""
class Untyped(_Collector):
    """Collector whose requests carry the type name 'Untyped'."""

View file

@ -4,12 +4,17 @@ import time
from functools import wraps
from Queue import Queue, Full
from util.prometheus import Histogram, Counter
from flask import g, request
logger = logging.getLogger(__name__)
# HTTP response time in seconds, labelled by endpoint.
resp_time = Histogram('response_time', 'HTTP response time in seconds', labelnames=['endpoint'])
# HTTP response codes, labelled by endpoint and status code.
resp_code = Counter('response_code', 'HTTP response code', labelnames=['endpoint', 'code'])
# Non-200 HTTP responses, labelled by endpoint.
non_200 = Counter('response_non200', 'Non-200 HTTP response codes', labelnames=['endpoint'])
class MetricQueue(object):
def __init__(self):
self._queue = None
@ -54,10 +59,14 @@ def time_after_request(name, metric_queue):
metric_queue.put('ResponseTime', dur, dimensions=dims, unit='Seconds')
metric_queue.put('ResponseCode', r.status_code, dimensions=dims)
resp_time.Observe(dur, labelvalues=[request.endpoint])
resp_code.Inc(labelvalues=[request.endpoint, r.status_code])
if r.status_code >= 500:
metric_queue.put('5XXResponse', 1, dimensions={'name': name})
elif r.status_code < 200 or r.status_code >= 300:
metric_queue.put('Non200Response', 1, dimensions={'name': name})
non_200.Inc(labelvalues=[request.endpoint])
return r
return f