util/metrics: remove metricqueue abstraction

This change replaces the metricqueue library with a native Prometheus
client implementation, with the intention of aggregating results with
the Prometheus PushGateway.
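
For context, a minimal sketch of that flow with prometheus_client — the gateway
address, job name, and queue name below are illustrative assumptions, not values
from this commit:

    from prometheus_client import CollectorRegistry, Counter, push_to_gateway

    registry = CollectorRegistry()
    items_put = Counter('quay_queue_item_puts_total',
                        'number of items that have been added to the queue',
                        labelnames=['queue_name'], registry=registry)

    items_put.labels('buildlogs').inc()

    # Each process pushes its own samples; the PushGateway holds them for
    # Prometheus to scrape in aggregate. Address and job name are assumed.
    push_to_gateway('localhost:9091', job='quay', registry=registry)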

This change also adds instrumentation for greenlet context switches.
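
A rough sketch of how such instrumentation can look using greenlet's trace hook —
the metric name and callback are assumptions about the approach, not code from
this commit:

    import greenlet
    from prometheus_client import Counter

    greenlet_switches = Counter('greenlet_context_switches_total',
                                'number of greenlet context switches')

    def _trace_greenlets(event, args):
        # greenlet invokes this hook on every 'switch' and 'throw' event.
        if event in ('switch', 'throw'):
            greenlet_switches.inc()

    greenlet.settrace(_trace_greenlets)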
Jimmy Zelinskie 2019-11-13 14:50:33 -05:00
parent 23c5120790
commit 4bf4ce33c9
57 changed files with 526 additions and 690 deletions


@@ -3,36 +3,41 @@ import uuid
 from datetime import datetime, timedelta
 from contextlib import contextmanager

+from prometheus_client import Counter, Gauge
+
 from data.database import QueueItem, db, db_for_update, db_random_func
 from util.morecollections import AttrDict

+queue_item_puts = Counter('quay_queue_item_puts_total',
+                          'number of items that have been added to the queue',
+                          labelnames=['queue_name'])
+queue_item_gets = Counter('quay_queue_item_gets_total',
+                          'number of times get() has been called on queue',
+                          labelnames=['queue_name', 'availability'])
+queue_item_deletes = Counter('quay_queue_item_deletes_total',
+                             'number of expired queue items that have been deleted')
+
+queue_items_locked = Gauge('quay_queue_items_locked',
+                           'number of queue items that have been acquired',
+                           labelnames=['queue_name'])
+queue_items_available = Gauge('quay_queue_items_available',
+                              'number of queue items that have not expired',
+                              labelnames=['queue_name'])
+queue_items_available_unlocked = Gauge('quay_queue_items_available_unlocked',
+                                       'number of queue items that have not expired and are not locked',
+                                       labelnames=['queue_name'])
+
 MINIMUM_EXTENSION = timedelta(seconds=20)
 DEFAULT_BATCH_SIZE = 1000

-class BuildMetricQueueReporter(object):
-  """ Metric queue reporter for the build system. """
-  def __init__(self, metric_queue):
-    self._metric_queue = metric_queue
-
-  def __call__(self, currently_processing, running_count, total_count):
-    need_capacity_count = total_count - running_count
-    self._metric_queue.put_deprecated('BuildCapacityShortage', need_capacity_count, unit='Count')
-    self._metric_queue.build_capacity_shortage.Set(need_capacity_count)
-
-    building_percent = 100 if currently_processing else 0
-    self._metric_queue.percent_building.Set(building_percent)
-
 class WorkQueue(object):
   """ Work queue defines methods for interacting with a queue backed by the database. """
   def __init__(self, queue_name, transaction_factory,
-               canonical_name_match_list=None, reporter=None, metric_queue=None,
-               has_namespace=False):
+               canonical_name_match_list=None, has_namespace=False):
     self._queue_name = queue_name
-    self._reporter = reporter
-    self._metric_queue = metric_queue
     self._transaction_factory = transaction_factory
     self._currently_processing = False
     self._has_namespaced_items = has_namespace
@@ -129,21 +134,10 @@ class WorkQueue(object):
     return (running_count, available_not_running_count, available_count)

   def update_metrics(self):
-    if self._reporter is None and self._metric_queue is None:
-      return
-
     (running_count, available_not_running_count, available_count) = self.get_metrics()

-    if self._metric_queue:
-      self._metric_queue.work_queue_running.Set(running_count, labelvalues=[self._queue_name])
-      self._metric_queue.work_queue_available.Set(available_count, labelvalues=[self._queue_name])
-      self._metric_queue.work_queue_available_not_running.Set(available_not_running_count,
-                                                              labelvalues=[self._queue_name])
-
-    if self._reporter:
-      self._reporter(self._currently_processing, running_count,
-                     running_count + available_not_running_count)
+    queue_items_locked.labels(self._queue_name).set(running_count)
+    queue_items_available.labels(self._queue_name).set(available_count)
+    queue_items_available_unlocked.labels(self._queue_name).set(available_not_running_count)

   def has_retries_remaining(self, item_id):
     """ Returns whether the queue item with the given id has any retries remaining. If the
@@ -204,7 +198,9 @@ class WorkQueue(object):
     # Chunk the inserted items into batch_size chunks and insert_many
     remaining = list(items_to_insert)
     while remaining:
-      QueueItem.insert_many(remaining[0:batch_size]).execute()
+      current_batch = remaining[0:batch_size]
+      QueueItem.insert_many(current_batch).execute()
+      queue_item_puts.labels(self._queue_name).inc(len(current_batch))
       remaining = remaining[batch_size:]

   def put(self, canonical_name_list, message, available_after=0, retries_remaining=5):
@@ -214,6 +210,7 @@
     """
     item = QueueItem.create(**self._queue_dict(canonical_name_list, message, available_after,
                                                retries_remaining))
+    queue_item_puts.labels(self._queue_name).inc()
     return str(item.id)

   def _select_available_item(self, ordering_required, now):
@@ -289,15 +286,18 @@
     db_item = self._select_available_item(ordering_required, now)
     if db_item is None:
       self._currently_processing = False
+      queue_item_gets.labels(self._queue_name, 'nonexistant').inc()
       return None

     # Attempt to claim the item for this instance.
     was_claimed = self._attempt_to_claim_item(db_item, now, processing_time)
     if not was_claimed:
       self._currently_processing = False
+      queue_item_gets.labels(self._queue_name, 'claimed').inc()
       return None

     self._currently_processing = True
+    queue_item_gets.labels(self._queue_name, 'acquired').inc()

     # Return a view of the queue item rather than an active db object
     return AttrDict({
@@ -307,8 +307,8 @@
     })

   def cancel(self, item_id):
-    """ Attempts to cancel the queue item with the given ID from the queue. Returns true on success
-        and false if the queue item could not be canceled.
+    """ Attempts to cancel the queue item with the given ID from the queue.
+        Returns true on success and false if the queue item could not be canceled.
     """
     count_removed = QueueItem.delete().where(QueueItem.id == item_id).execute()
     return count_removed > 0
@@ -375,4 +375,5 @@ def delete_expired(expiration_threshold, deletion_threshold, batch_size):
     return 0

   QueueItem.delete().where(QueueItem.id << to_delete).execute()
+  queue_item_deletes.inc(len(to_delete))
   return len(to_delete)
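
With the metricqueue indirection removed, the collectors above are ordinary
module-level prometheus_client objects; a small sketch of how they behave (the
queue name is an illustrative assumption):

    from prometheus_client import generate_latest

    queue_item_puts.labels('dockerfilebuild').inc()
    queue_items_locked.labels('dockerfilebuild').set(3)

    # generate_latest() renders the default registry in the text exposition
    # format, e.g.:
    #   quay_queue_item_puts_total{queue_name="dockerfilebuild"} 1.0
    #   quay_queue_items_locked{queue_name="dockerfilebuild"} 3.0
    print(generate_latest().decode())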


@@ -7,6 +7,8 @@ from collections import namedtuple
 import bitmath
 import resumablehashlib

+from prometheus_client import Counter, Histogram
+
 from data.registry_model import registry_model
 from data.database import CloseForLongOperation, db_transaction
 from digest import digest_tools
@@ -18,6 +20,13 @@ from util.registry.torrent import PieceHasher
 logger = logging.getLogger(__name__)

+chunk_upload_duration = Histogram('quay_chunk_upload_duration_seconds',
+                                  'number of seconds for a chunk to be uploaded to the registry',
+                                  labelnames=['region'])
+
+pushed_bytes_total = Counter('quay_registry_pushed_bytes_total',
+                             'number of bytes pushed to the registry')
+
 BLOB_CONTENT_TYPE = 'application/octet-stream'
@@ -125,13 +134,10 @@ class _BlobUploadManager(object):
     """ Returns the unique ID for the blob upload. """
     return self.blob_upload.upload_id

-  def upload_chunk(self, app_config, input_fp, start_offset=0, length=-1, metric_queue=None):
+  def upload_chunk(self, app_config, input_fp, start_offset=0, length=-1):
     """ Uploads a chunk of data found in the given input file-like interface. start_offset and
         length are optional and should match a range header if any was given.

-        If metric_queue is given, the upload time and chunk size are written into the metrics in
-        the queue.
-
         Returns the total number of bytes uploaded after this upload has completed. Raises
         a BlobUploadException if the upload failed.
     """
@@ -207,11 +213,8 @@
       raise BlobUploadException(upload_error)

     # Update the chunk upload time and push bytes metrics.
-    if metric_queue is not None:
-      metric_queue.chunk_upload_time.Observe(time.time() - start_time, labelvalues=[
-        length_written, list(location_set)[0]])
-
-      metric_queue.push_byte_count.Inc(length_written)
+    chunk_upload_duration.labels(list(location_set)[0]).observe(time.time() - start_time)
+    pushed_bytes_total.inc(length_written)

     # Ensure we have not gone beyond the max layer size.
     new_blob_bytes = self.blob_upload.byte_count + length_written
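
The upload path above times each chunk by hand with time.time(); for reference,
the same bookkeeping can be done with the client library's built-in timer — the
region name here is an assumed example:

    import time

    start_time = time.time()
    # ... write the chunk to storage ...
    chunk_upload_duration.labels('local_us').observe(time.time() - start_time)

    # Equivalent, using Histogram's context-manager timer:
    with chunk_upload_duration.labels('local_us').time():
        pass  # ... write the chunk to storage ...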