util/metrics: remove metricqueue abstraction
This change replaces the metricqueue library with a native Prometheus client implementation, with the intention of aggregating results via the Prometheus PushGateway. It also adds instrumentation for greenlet context switches.
parent 23c5120790
commit 4bf4ce33c9
57 changed files with 526 additions and 690 deletions
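
The greenlet context-switch instrumentation mentioned in the message is not visible in the hunks below. One plausible shape for it, using greenlet's trace hook together with a native prometheus_client counter (the metric name and hook function are illustrative assumptions, not taken from this commit):

    import greenlet
    from prometheus_client import Counter

    # Hypothetical metric; the commit's actual greenlet metrics are not shown
    # on this page.
    greenlet_switches = Counter('greenlet_context_switches',
                                'number of greenlet context switches')

    def _trace_greenlets(event, args):
        # greenlet.settrace() invokes this hook on every 'switch' and 'throw'.
        if event in ('switch', 'throw'):
            greenlet_switches.inc()

    greenlet.settrace(_trace_greenlets)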
@@ -2,8 +2,8 @@ from datetime import datetime, timedelta
 from data import model
 from data.database import BlobUpload as BlobUploadTable
-from workers.blobuploadcleanupworker.models_interface import (
-  BlobUpload, BlobUploadCleanupWorkerDataInterface)
+from workers.blobuploadcleanupworker.models_interface import (BlobUpload,
+                                                               BlobUploadCleanupWorkerDataInterface)


 class PreOCIModel(BlobUploadCleanupWorkerDataInterface):

@@ -10,12 +10,14 @@ from util.streamingjsonencoder import StreamingJSONEncoder
 from workers.buildlogsarchiver.models_pre_oci import pre_oci_model as model
 from workers.worker import Worker

-POLL_PERIOD_SECONDS = 30
-MEMORY_TEMPFILE_SIZE = 64 * 1024 # Large enough to handle approximately 99% of builds in memory

 logger = logging.getLogger(__name__)


+POLL_PERIOD_SECONDS = 30
+MEMORY_TEMPFILE_SIZE = 64 * 1024 # Large enough to handle approximately 99% of builds in memory


 class ArchiveBuildLogsWorker(Worker):
   def __init__(self):
     super(ArchiveBuildLogsWorker, self).__init__()

@@ -9,10 +9,13 @@ from workers.worker import Worker
 from util.log import logfile_path
 from util.timedeltastring import convert_to_timedelta

-POLL_PERIOD_SECONDS = 60 * 60 # 1 hour

 logger = logging.getLogger(__name__)


+POLL_PERIOD_SECONDS = 60 * 60 # 1 hour


 class ExpiredAppSpecificTokenWorker(Worker):
   def __init__(self):
     super(ExpiredAppSpecificTokenWorker, self).__init__()

@@ -38,7 +41,7 @@ if __name__ == "__main__":
     logger.debug('App specific tokens disabled; skipping')
     while True:
       time.sleep(100000)

   if app.config.get('EXPIRED_APP_SPECIFIC_TOKEN_GC') is None:
     logger.debug('GC of App specific tokens is disabled; skipping')
     while True:

@@ -15,10 +15,11 @@ from app import app, export_action_logs_queue, storage as app_storage, get_app_u
 from endpoints.api import format_date
 from data.logs_model import logs_model
 from data.logs_model.interface import LogsIterationTimeout
-from workers.queueworker import QueueWorker, JobException
+from workers.queueworker import QueueWorker
 from util.log import logfile_path
 from util.useremails import send_logs_exported_email


 logger = logging.getLogger(__name__)

@@ -9,8 +9,10 @@ from data.model.repository import find_repository_with_garbage, get_random_gc_po
 from data.model.gc import garbage_collect_repo
 from workers.worker import Worker

+
 logger = logging.getLogger(__name__)

+
 class GarbageCollectionWorker(Worker):
   def __init__(self):
     super(GarbageCollectionWorker, self).__init__()

@@ -1,15 +1,25 @@
 import logging
 import time

-from app import app, metric_queue
+from prometheus_client import Gauge
+
+from app import app
 from data.database import UseThenDisconnect
-from workers.globalpromstats.models_pre_oci import pre_oci_model as model
 from util.locking import GlobalLock, LockNotAcquiredException
 from util.log import logfile_path
+from workers.globalpromstats.models_pre_oci import pre_oci_model as model
 from workers.worker import Worker


 logger = logging.getLogger(__name__)


+repository_rows = Gauge('quay_repository_rows', 'number of repositories in the database')
+user_rows = Gauge('quay_user_rows', 'number of users in the database')
+org_rows = Gauge('quay_org_rows', 'number of organizations in the database')
+robot_rows = Gauge('quay_robot_rows', 'number of robot accounts in the database')
+
+
 WORKER_FREQUENCY = app.config.get('GLOBAL_PROMETHEUS_STATS_FREQUENCY', 60 * 60)

@@ -33,13 +43,10 @@ class GlobalPrometheusStatsWorker(Worker):
   def _report_stats(self):
     logger.debug('Reporting global stats')
     with UseThenDisconnect(app.config):
-      # Repository count.
-      metric_queue.repository_count.Set(model.get_repository_count())
-
-      # User counts.
-      metric_queue.user_count.Set(model.get_active_user_count())
-      metric_queue.org_count.Set(model.get_active_org_count())
-      metric_queue.robot_count.Set(model.get_robot_count())
+      repository_rows.set(model.get_repository_count())
+      user_rows.set(model.get_active_user_count())
+      org_rows.set(model.get_active_org_count())
+      robot_rows.set(model.get_robot_count())


 def main():
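
The reworked stats worker above now sets plain prometheus_client gauges inside its own process; per the commit message, such per-process results are intended to be aggregated through the Prometheus PushGateway. A minimal sketch of that push pattern, with a placeholder gateway address and job name (neither value comes from this commit):

    from prometheus_client import CollectorRegistry, Gauge, push_to_gateway

    # A dedicated registry keeps the sketch self-contained; the worker's gauges
    # above are registered on prometheus_client's default registry instead.
    registry = CollectorRegistry()
    repository_rows = Gauge('quay_repository_rows',
                            'number of repositories in the database',
                            registry=registry)

    def report_stats(repository_count):
        repository_rows.set(repository_count)
        # Push the whole registry so a central Prometheus can scrape the
        # PushGateway; the address and job name here are placeholders.
        push_to_gateway('localhost:9091', job='quay_globalpromstats', registry=registry)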
@@ -1,15 +0,0 @@
-from mock import patch, Mock
-
-from workers.globalpromstats.globalpromstats import GlobalPrometheusStatsWorker
-
-from test.fixtures import *
-
-def test_reportstats(initialized_db):
-  mock = Mock()
-  with patch('workers.globalpromstats.globalpromstats.metric_queue', mock):
-    worker = GlobalPrometheusStatsWorker()
-    worker._report_stats()
-
-  mock.repository_count.Set.assert_called_once()
-  mock.org_count.Set.assert_called_once()
-  mock.robot_count.Set.assert_called_once()

@@ -12,10 +12,13 @@ from workers.worker import Worker
 from util.log import logfile_path
 from util.migrate.allocator import yield_random_entries

+
 logger = logging.getLogger(__name__)

+
 WORKER_TIMEOUT = 600

+
 class LabelBackfillWorker(Worker):
   def __init__(self):
     super(LabelBackfillWorker, self).__init__()

@@ -16,8 +16,10 @@ from util.streamingjsonencoder import StreamingJSONEncoder
 from util.timedeltastring import convert_to_timedelta
 from workers.worker import Worker

+
 logger = logging.getLogger(__name__)

+
 JSON_MIMETYPE = 'application/json'
 MIN_LOGS_PER_ROTATION = 5000
 MEMORY_TEMPFILE_SIZE = 12 * 1024 * 1024

@@ -14,6 +14,7 @@ from test.fixtures import *

 from workers.notificationworker.models_pre_oci import pre_oci_model as model

+
 def test_basic_notification_endtoend(initialized_db):
   # Ensure the public user doesn't have any notifications.
   assert not model.user_has_local_notifications('public')

@@ -7,8 +7,10 @@ from app import app
 from data.database import CloseForLongOperation
 from workers.worker import Worker

+
 logger = logging.getLogger(__name__)

+
 class JobException(Exception):
   """ A job exception is an exception that is caused by something being malformed in the job. When
       a worker raises this exception the job will be terminated and the retry will not be returned

@@ -4,9 +4,11 @@ import traceback
 import fnmatch
 import logging.config

+from prometheus_client import Gauge
+
 import features

-from app import app, prometheus
+from app import app
 from data import database
 from data.model.repo_mirror import claim_mirror, release_mirror
 from data.logs_model import logs_model

@@ -16,12 +18,15 @@ from data.model.oci.tag import delete_tag, retarget_tag, lookup_alive_tags_shall
 from notifications import spawn_notification
 from util.audit import wrap_repository


 from workers.repomirrorworker.repo_mirror_model import repo_mirror_model as model


 logger = logging.getLogger(__name__)
-unmirrored_repositories_gauge = prometheus.create_gauge('unmirrored_repositories',
-                                                        'Number of repositories that need to be scanned.')
+
+unmirrored_repositories = Gauge('quay_repository_rows_unmirrored',
+                                'number of repositories in the database that have not yet been mirrored')


 class PreemptedException(Exception):
   """ Exception raised if another worker analyzed the image before this worker was able to do so.

@@ -61,7 +66,7 @@ def process_mirrors(skopeo, token=None):
     logger.exception('Repository Mirror service unavailable')
     return None

-  unmirrored_repositories_gauge.Set(num_remaining)
+  unmirrored_repositories.set(num_remaining)

   return next_token

@@ -7,10 +7,13 @@ from data import model
 from data.logs_model import logs_model
 from workers.worker import Worker, with_exponential_backoff

-POLL_PERIOD_SECONDS = 10

 logger = logging.getLogger(__name__)


+POLL_PERIOD_SECONDS = 10


 class RepositoryActionCountWorker(Worker):
   def __init__(self):
     super(RepositoryActionCountWorker, self).__init__()

@@ -1,14 +1,19 @@
 import logging.config

+from prometheus_client import Gauge
+
 from app import app, prometheus
 from data.database import UseThenDisconnect
 from workers.securityworker.models_pre_oci import pre_oci_model as model
 from util.secscan.api import APIRequestFailure
 from util.secscan.analyzer import PreemptedException


 logger = logging.getLogger(__name__)
-unscanned_images_gauge = prometheus.create_gauge('unscanned_images',
-                                                 'Number of images that clair needs to scan.')
+
+unscanned_images = Gauge('quay_security_scanning_unscanned_images_remaining',
+                         'number of images that are not scanned by the latest security scanner')


 def index_images(target_version, analyzer, token=None):

@@ -31,6 +36,6 @@ def index_images(target_version, analyzer, token=None):
     logger.exception('Security scanner service unavailable')
     return

-  unscanned_images_gauge.Set(num_remaining)
+  unscanned_images.set(num_remaining)

   return next_token

@@ -11,8 +11,10 @@ from util.secscan.analyzer import LayerAnalyzer
 from util.log import logfile_path
 from endpoints.v2 import v2_bp

+
 logger = logging.getLogger(__name__)

+
 DEFAULT_INDEXING_INTERVAL = 30

@@ -1,13 +1,21 @@
 import logging
 from datetime import datetime, timedelta

-from app import app, instance_keys, metric_queue
+from prometheus_client import Counter
+
+from app import app, instance_keys
 from workers.servicekeyworker.models_pre_oci import pre_oci_model as model
 from workers.worker import Worker


 logger = logging.getLogger(__name__)


+instance_key_renewal_self = Counter('quay_instance_key_renewal_self_total',
+                                    'number of times a Quay instance renews its own key',
+                                    labelnames=['success'])
+
+
 class ServiceKeyWorker(Worker):
   def __init__(self):
     super(ServiceKeyWorker, self).__init__()

@@ -28,12 +36,12 @@ class ServiceKeyWorker(Worker):
     except Exception as ex:
       logger.exception('Failure for automatic refresh of service key %s with new expiration %s',
                        instance_keys.local_key_id, new_expiration)
-      metric_queue.instance_key_renewal_failure.Inc(labelvalues=[instance_keys.local_key_id])
+      instance_key_renewal_self.labels(False).inc()
       raise ex

     logger.debug('Finished automatic refresh of service key %s with new expiration %s',
                  instance_keys.local_key_id, new_expiration)
-    metric_queue.instance_key_renewal_success.Inc(labelvalues=[instance_keys.local_key_id])
+    instance_key_renewal_self.labels(True).inc()


 if __name__ == "__main__":
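
The mapping in this hunk is the same one applied throughout the commit: the metricqueue wrapper's Set()/Inc(labelvalues=...) calls become direct calls on module-level prometheus_client metrics. A rough before/after sketch with made-up metric names (the old calls are paraphrased from the deleted lines, not a documented API):

    from prometheus_client import Counter, Gauge

    # New style: module-level prometheus_client metrics used directly.
    repository_rows = Gauge('example_repository_rows', 'number of repositories')
    key_renewal = Counter('example_key_renewal_total', 'number of key renewals',
                          labelnames=['success'])

    repository_rows.set(42)          # was: metric_queue.repository_count.Set(42)
    key_renewal.labels(True).inc()   # was: metric_queue.instance_key_renewal_success.Inc(labelvalues=[key_id])

Note that prometheus_client stringifies label values, so labels(True) is exported with success="True".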
@@ -9,11 +9,14 @@ from data import model
 from workers.queueworker import QueueWorker, WorkerUnhealthyException, JobException
 from util.log import logfile_path

+
 logger = logging.getLogger(__name__)

+
 POLL_PERIOD_SECONDS = 10
 RESERVATION_SECONDS = app.config.get('STORAGE_REPLICATION_PROCESSING_SECONDS', 60*20)

+
 class StorageReplicationWorker(QueueWorker):
   def process_queue_item(self, job_details):
     storage_uuid = job_details['storage_id']

@@ -26,8 +26,10 @@ from util.bytes import Bytes
 from util.log import logfile_path
 from util.migrate.allocator import yield_random_entries

+
 logger = logging.getLogger(__name__)

+
 WORKER_TIMEOUT = app.config.get('BACKFILL_TAGS_TIMEOUT', 6000)

@@ -9,11 +9,14 @@ from workers.worker import Worker
 from util.timedeltastring import convert_to_timedelta
 from util.log import logfile_path

+
 logger = logging.getLogger(__name__)

+
 WORKER_FREQUENCY = app.config.get('TEAM_SYNC_WORKER_FREQUENCY', 60)
 STALE_CUTOFF = convert_to_timedelta(app.config.get('TEAM_RESYNC_STALE_TIME', '30m'))

+
 class TeamSynchronizationWorker(Worker):
   """ Worker which synchronizes teams with their backing groups in LDAP/Keystone/etc.
   """

@@ -16,12 +16,13 @@ from workers.exportactionlogsworker import ExportActionLogsWorker, POLL_PERIOD_S

 from test.fixtures import *

+
 _TEST_CONTENT = os.urandom(1024)
 _TEST_BUCKET = 'some_bucket'
 _TEST_USER = 'someuser'
 _TEST_PASSWORD = 'somepassword'
 _TEST_PATH = 'some/cool/path'
-_TEST_CONTEXT = StorageContext('nyc', None, None, None, None)
+_TEST_CONTEXT = StorageContext('nyc', None, None, None)


 @pytest.fixture(params=['test', 'mock_s3'])

@@ -1,4 +1,5 @@
 import os.path

 from datetime import datetime, timedelta
+
 from app import storage