util/metrics: remove metricqueue abstraction
This change replaces the metricqueue library with a native Prometheus client implementation, with the intention of aggregating results via the Prometheus PushGateway. This change also adds instrumentation for greenlet context switches.
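The hunks below only define and update the collectors; how they reach Prometheus is wired up elsewhere. As a rough sketch of the push model described above (the gateway address, job name, and queue name are illustrative, not taken from this commit), a worker registers its collectors against a registry and periodically pushes a snapshot to the PushGateway, which Prometheus then scrapes:

from prometheus_client import CollectorRegistry, Counter, push_to_gateway

# A per-process registry keeps the push limited to this worker's series.
registry = CollectorRegistry()
queue_item_puts = Counter('quay_queue_item_puts_total',
                          'number of items that have been added to the queue',
                          labelnames=['queue_name'], registry=registry)

queue_item_puts.labels('dockerfilebuild').inc()  # illustrative queue name

# Push the current values; the gateway holds them for Prometheus to scrape.
push_to_gateway('localhost:9091', job='quay_worker', registry=registry)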
parent 23c5120790
commit 4bf4ce33c9
57 changed files with 526 additions and 690 deletions
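The greenlet context-switch instrumentation mentioned in the message does not appear in the hunks quoted below. A minimal sketch of one common approach, using greenlet's settrace hook (the counter name here is hypothetical):

import greenlet

from prometheus_client import Counter

greenlet_switch_total = Counter('quay_greenlet_switch_total',
                                'number of greenlet context switches')

def _trace_greenlet(event, args):
  # settrace invokes this callback on every transfer of control between
  # greenlets; 'throw' is a switch that delivers an exception.
  if event in ('switch', 'throw'):
    greenlet_switch_total.inc()

greenlet.settrace(_trace_greenlet)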
@@ -3,36 +3,41 @@ import uuid
 
 from datetime import datetime, timedelta
 from contextlib import contextmanager
 
+from prometheus_client import Counter, Gauge
+
 from data.database import QueueItem, db, db_for_update, db_random_func
 from util.morecollections import AttrDict
 
 
+queue_item_puts = Counter('quay_queue_item_puts_total',
+                          'number of items that have been added to the queue',
+                          labelnames=['queue_name'])
+queue_item_gets = Counter('quay_queue_item_gets_total',
+                          'number of times get() has been called on queue',
+                          labelnames=['queue_name', 'availability'])
+queue_item_deletes = Counter('quay_queue_item_deletes_total',
+                             'number of expired queue items that have been deleted')
+
+queue_items_locked = Gauge('quay_queue_items_locked',
+                           'number of queue items that have been acquired',
+                           labelnames=['queue_name'])
+queue_items_available = Gauge('quay_queue_items_available',
+                              'number of queue items that have not expired',
+                              labelnames=['queue_name'])
+queue_items_available_unlocked = Gauge('quay_queue_items_available_unlocked',
+                                       'number of queue items that have not expired and are not locked',
+                                       labelnames=['queue_name'])
+
+
 MINIMUM_EXTENSION = timedelta(seconds=20)
 DEFAULT_BATCH_SIZE = 1000
 
 
-class BuildMetricQueueReporter(object):
-  """ Metric queue reporter for the build system. """
-  def __init__(self, metric_queue):
-    self._metric_queue = metric_queue
-
-  def __call__(self, currently_processing, running_count, total_count):
-    need_capacity_count = total_count - running_count
-    self._metric_queue.put_deprecated('BuildCapacityShortage', need_capacity_count, unit='Count')
-    self._metric_queue.build_capacity_shortage.Set(need_capacity_count)
-
-    building_percent = 100 if currently_processing else 0
-    self._metric_queue.percent_building.Set(building_percent)
-
-
 class WorkQueue(object):
   """ Work queue defines methods for interacting with a queue backed by the database. """
   def __init__(self, queue_name, transaction_factory,
-               canonical_name_match_list=None, reporter=None, metric_queue=None,
-               has_namespace=False):
+               canonical_name_match_list=None, has_namespace=False):
     self._queue_name = queue_name
-    self._reporter = reporter
-    self._metric_queue = metric_queue
     self._transaction_factory = transaction_factory
    self._currently_processing = False
    self._has_namespaced_items = has_namespace
@@ -129,21 +134,10 @@ class WorkQueue(object):
     return (running_count, available_not_running_count, available_count)
 
   def update_metrics(self):
-    if self._reporter is None and self._metric_queue is None:
-      return
-
     (running_count, available_not_running_count, available_count) = self.get_metrics()
 
-    if self._metric_queue:
-      self._metric_queue.work_queue_running.Set(running_count, labelvalues=[self._queue_name])
-      self._metric_queue.work_queue_available.Set(available_count, labelvalues=[self._queue_name])
-      self._metric_queue.work_queue_available_not_running.Set(available_not_running_count,
-                                                              labelvalues=[self._queue_name])
-
-    if self._reporter:
-      self._reporter(self._currently_processing, running_count,
-                     running_count + available_not_running_count)
+    queue_items_locked.labels(self._queue_name).set(running_count)
+    queue_items_available.labels(self._queue_name).set(available_count)
+    queue_items_available_unlocked.labels(self._queue_name).set(available_not_running_count)
 
   def has_retries_remaining(self, item_id):
     """ Returns whether the queue item with the given id has any retries remaining. If the
@@ -204,7 +198,9 @@ class WorkQueue(object):
     # Chunk the inserted items into batch_size chunks and insert_many
     remaining = list(items_to_insert)
     while remaining:
-      QueueItem.insert_many(remaining[0:batch_size]).execute()
+      current_batch = remaining[0:batch_size]
+      QueueItem.insert_many(current_batch).execute()
+      queue_item_puts.labels(self._queue_name).inc(len(current_batch))
       remaining = remaining[batch_size:]
 
   def put(self, canonical_name_list, message, available_after=0, retries_remaining=5):
@@ -214,6 +210,7 @@
     """
     item = QueueItem.create(**self._queue_dict(canonical_name_list, message, available_after,
                                                retries_remaining))
+    queue_item_puts.labels(self._queue_name).inc()
     return str(item.id)
 
   def _select_available_item(self, ordering_required, now):
@@ -289,15 +286,18 @@
     db_item = self._select_available_item(ordering_required, now)
     if db_item is None:
       self._currently_processing = False
+      queue_item_gets.labels(self._queue_name, 'nonexistant').inc()
       return None
 
     # Attempt to claim the item for this instance.
     was_claimed = self._attempt_to_claim_item(db_item, now, processing_time)
     if not was_claimed:
       self._currently_processing = False
+      queue_item_gets.labels(self._queue_name, 'claimed').inc()
       return None
 
     self._currently_processing = True
+    queue_item_gets.labels(self._queue_name, 'acquired').inc()
 
     # Return a view of the queue item rather than an active db object
     return AttrDict({
@@ -307,8 +307,8 @@
     })
 
   def cancel(self, item_id):
-    """ Attempts to cancel the queue item with the given ID from the queue. Returns true on success
-        and false if the queue item could not be canceled.
+    """ Attempts to cancel the queue item with the given ID from the queue.
+        Returns true on success and false if the queue item could not be canceled.
     """
     count_removed = QueueItem.delete().where(QueueItem.id == item_id).execute()
     return count_removed > 0
@@ -375,4 +375,5 @@ def delete_expired(expiration_threshold, deletion_threshold, batch_size):
     return 0
 
   QueueItem.delete().where(QueueItem.id << to_delete).execute()
+  queue_item_deletes.inc(len(to_delete))
   return len(to_delete)
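For reference, the metric calls in the hunks above follow the standard prometheus_client labeling pattern: .labels() resolves (and caches) the child time series for the given label values, and .inc() or .set() then updates that child. A standalone sketch with an illustrative queue name:

from prometheus_client import Counter

queue_item_gets = Counter('quay_queue_item_gets_total',
                          'number of times get() has been called on queue',
                          labelnames=['queue_name', 'availability'])

# One child series per (queue_name, availability) pair; counters only increase.
queue_item_gets.labels('dockerfilebuild', 'acquired').inc()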
@@ -7,6 +7,8 @@ from collections import namedtuple
 import bitmath
 import resumablehashlib
 
+from prometheus_client import Counter, Histogram
+
 from data.registry_model import registry_model
 from data.database import CloseForLongOperation, db_transaction
 from digest import digest_tools
@@ -18,6 +20,13 @@ from util.registry.torrent import PieceHasher
 
 logger = logging.getLogger(__name__)
 
 
+chunk_upload_duration = Histogram('quay_chunk_upload_duration_seconds',
+                                  'number of seconds for a chunk to be uploaded to the registry',
+                                  labelnames=['region'])
+pushed_bytes_total = Counter('quay_registry_pushed_bytes_total',
+                             'number of bytes pushed to the registry')
+
+
 BLOB_CONTENT_TYPE = 'application/octet-stream'
@@ -125,13 +134,10 @@ class _BlobUploadManager(object):
     """ Returns the unique ID for the blob upload. """
     return self.blob_upload.upload_id
 
-  def upload_chunk(self, app_config, input_fp, start_offset=0, length=-1, metric_queue=None):
+  def upload_chunk(self, app_config, input_fp, start_offset=0, length=-1):
     """ Uploads a chunk of data found in the given input file-like interface. start_offset and
        length are optional and should match a range header if any was given.
 
-    If metric_queue is given, the upload time and chunk size are written into the metrics in
-    the queue.
-
     Returns the total number of bytes uploaded after this upload has completed. Raises
     a BlobUploadException if the upload failed.
     """
@@ -207,11 +213,8 @@ class _BlobUploadManager(object):
       raise BlobUploadException(upload_error)
 
     # Update the chunk upload time and push bytes metrics.
-    if metric_queue is not None:
-      metric_queue.chunk_upload_time.Observe(time.time() - start_time, labelvalues=[
-        length_written, list(location_set)[0]])
-
-      metric_queue.push_byte_count.Inc(length_written)
+    chunk_upload_duration.labels(list(location_set)[0]).observe(time.time() - start_time)
+    pushed_bytes_total.inc(length_written)
 
     # Ensure we have not gone beyond the max layer size.
     new_blob_bytes = self.blob_upload.byte_count + length_written
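The histogram in the last hunk follows the usual prometheus_client timing idiom; isolated below with an illustrative region label (the storage location name is an assumption, not taken from this commit):

import time

from prometheus_client import Histogram

chunk_upload_duration = Histogram('quay_chunk_upload_duration_seconds',
                                  'number of seconds for a chunk to be uploaded to the registry',
                                  labelnames=['region'])

start_time = time.time()
# ... upload the chunk to the storage backend ...
chunk_upload_duration.labels('local_us').observe(time.time() - start_time)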