util/metrics: remove metricqueue abstraction

This change replaces the metricqueue library with a native Prometheus
client implementation, with the intention of aggregating results with the
Prometheus PushGateway.
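
A minimal sketch of the pattern this moves to (the gateway address, job name,
and helper below are illustrative, not Quay's actual configuration): each
worker keeps plain prometheus_client metrics and pushes its registry to the
gateway so results from short-lived processes can be aggregated.

    from prometheus_client import CollectorRegistry, Gauge, push_to_gateway

    registry = CollectorRegistry()
    repository_rows = Gauge('quay_repository_rows',
                            'number of repositories in the database',
                            registry=registry)

    def report_stats(repository_count):
        # Record the value locally, then push the whole registry to the
        # PushGateway, which holds it for Prometheus to scrape.
        repository_rows.set(repository_count)
        push_to_gateway('localhost:9091', job='globalpromstats', registry=registry)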

This change also adds instrumentation for greenlet context switches.
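
The greenlet instrumentation itself does not appear in the hunks excerpted
below; as a hedged sketch of one way such a counter could be wired up (the
metric name and callback are hypothetical, not necessarily what this commit
adds), a greenlet trace hook can count context switches:

    import greenlet
    from prometheus_client import Counter

    greenlet_switch = Counter('quay_greenlet_switch_total',
                              'number of greenlet context switches')

    def _trace_greenlets(event, args):
        # greenlet.settrace fires this callback on every cooperative switch
        # ('switch') and on exception-driven switches ('throw').
        if event in ('switch', 'throw'):
            greenlet_switch.inc()

    greenlet.settrace(_trace_greenlets)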
Jimmy Zelinskie 2019-11-13 14:50:33 -05:00
parent 23c5120790
commit 4bf4ce33c9
57 changed files with 526 additions and 690 deletions


@@ -2,8 +2,8 @@ from datetime import datetime, timedelta
from data import model
from data.database import BlobUpload as BlobUploadTable
from workers.blobuploadcleanupworker.models_interface import (
BlobUpload, BlobUploadCleanupWorkerDataInterface)
from workers.blobuploadcleanupworker.models_interface import (BlobUpload,
BlobUploadCleanupWorkerDataInterface)
class PreOCIModel(BlobUploadCleanupWorkerDataInterface):


@@ -10,12 +10,14 @@ from util.streamingjsonencoder import StreamingJSONEncoder
from workers.buildlogsarchiver.models_pre_oci import pre_oci_model as model
from workers.worker import Worker
POLL_PERIOD_SECONDS = 30
MEMORY_TEMPFILE_SIZE = 64 * 1024 # Large enough to handle approximately 99% of builds in memory
logger = logging.getLogger(__name__)
POLL_PERIOD_SECONDS = 30
MEMORY_TEMPFILE_SIZE = 64 * 1024 # Large enough to handle approximately 99% of builds in memory
class ArchiveBuildLogsWorker(Worker):
def __init__(self):
super(ArchiveBuildLogsWorker, self).__init__()


@@ -9,10 +9,13 @@ from workers.worker import Worker
from util.log import logfile_path
from util.timedeltastring import convert_to_timedelta
POLL_PERIOD_SECONDS = 60 * 60 # 1 hour
logger = logging.getLogger(__name__)
POLL_PERIOD_SECONDS = 60 * 60 # 1 hour
class ExpiredAppSpecificTokenWorker(Worker):
def __init__(self):
super(ExpiredAppSpecificTokenWorker, self).__init__()
@@ -38,7 +41,7 @@ if __name__ == "__main__":
logger.debug('App specific tokens disabled; skipping')
while True:
time.sleep(100000)
if app.config.get('EXPIRED_APP_SPECIFIC_TOKEN_GC') is None:
logger.debug('GC of App specific tokens is disabled; skipping')
while True:


@@ -15,10 +15,11 @@ from app import app, export_action_logs_queue, storage as app_storage, get_app_u
from endpoints.api import format_date
from data.logs_model import logs_model
from data.logs_model.interface import LogsIterationTimeout
from workers.queueworker import QueueWorker, JobException
from workers.queueworker import QueueWorker
from util.log import logfile_path
from util.useremails import send_logs_exported_email
logger = logging.getLogger(__name__)


@@ -9,8 +9,10 @@ from data.model.repository import find_repository_with_garbage, get_random_gc_po
from data.model.gc import garbage_collect_repo
from workers.worker import Worker
logger = logging.getLogger(__name__)
class GarbageCollectionWorker(Worker):
def __init__(self):
super(GarbageCollectionWorker, self).__init__()


@@ -1,15 +1,25 @@
import logging
import time
from app import app, metric_queue
from prometheus_client import Gauge
from app import app
from data.database import UseThenDisconnect
from workers.globalpromstats.models_pre_oci import pre_oci_model as model
from util.locking import GlobalLock, LockNotAcquiredException
from util.log import logfile_path
from workers.globalpromstats.models_pre_oci import pre_oci_model as model
from workers.worker import Worker
logger = logging.getLogger(__name__)
repository_rows = Gauge('quay_repository_rows', 'number of repositories in the database')
user_rows = Gauge('quay_user_rows', 'number of users in the database')
org_rows = Gauge('quay_org_rows', 'number of organizations in the database')
robot_rows = Gauge('quay_robot_rows', 'number of robot accounts in the database')
WORKER_FREQUENCY = app.config.get('GLOBAL_PROMETHEUS_STATS_FREQUENCY', 60 * 60)
@@ -33,13 +43,10 @@ class GlobalPrometheusStatsWorker(Worker):
def _report_stats(self):
logger.debug('Reporting global stats')
with UseThenDisconnect(app.config):
# Repository count.
metric_queue.repository_count.Set(model.get_repository_count())
# User counts.
metric_queue.user_count.Set(model.get_active_user_count())
metric_queue.org_count.Set(model.get_active_org_count())
metric_queue.robot_count.Set(model.get_robot_count())
repository_rows.set(model.get_repository_count())
user_rows.set(model.get_active_user_count())
org_rows.set(model.get_active_org_count())
robot_rows.set(model.get_robot_count())
def main():


@@ -1,15 +0,0 @@
from mock import patch, Mock
from workers.globalpromstats.globalpromstats import GlobalPrometheusStatsWorker
from test.fixtures import *
def test_reportstats(initialized_db):
mock = Mock()
with patch('workers.globalpromstats.globalpromstats.metric_queue', mock):
worker = GlobalPrometheusStatsWorker()
worker._report_stats()
mock.repository_count.Set.assert_called_once()
mock.org_count.Set.assert_called_once()
mock.robot_count.Set.assert_called_once()


@@ -12,10 +12,13 @@ from workers.worker import Worker
from util.log import logfile_path
from util.migrate.allocator import yield_random_entries
logger = logging.getLogger(__name__)
WORKER_TIMEOUT = 600
class LabelBackfillWorker(Worker):
def __init__(self):
super(LabelBackfillWorker, self).__init__()


@@ -16,8 +16,10 @@ from util.streamingjsonencoder import StreamingJSONEncoder
from util.timedeltastring import convert_to_timedelta
from workers.worker import Worker
logger = logging.getLogger(__name__)
JSON_MIMETYPE = 'application/json'
MIN_LOGS_PER_ROTATION = 5000
MEMORY_TEMPFILE_SIZE = 12 * 1024 * 1024


@@ -14,6 +14,7 @@ from test.fixtures import *
from workers.notificationworker.models_pre_oci import pre_oci_model as model
def test_basic_notification_endtoend(initialized_db):
# Ensure the public user doesn't have any notifications.
assert not model.user_has_local_notifications('public')


@@ -7,8 +7,10 @@ from app import app
from data.database import CloseForLongOperation
from workers.worker import Worker
logger = logging.getLogger(__name__)
class JobException(Exception):
""" A job exception is an exception that is caused by something being malformed in the job. When
a worker raises this exception the job will be terminated and the retry will not be returned


@@ -4,9 +4,11 @@ import traceback
import fnmatch
import logging.config
from prometheus_client import Gauge
import features
from app import app, prometheus
from app import app
from data import database
from data.model.repo_mirror import claim_mirror, release_mirror
from data.logs_model import logs_model
@@ -16,12 +18,15 @@ from data.model.oci.tag import delete_tag, retarget_tag, lookup_alive_tags_shall
from notifications import spawn_notification
from util.audit import wrap_repository
from workers.repomirrorworker.repo_mirror_model import repo_mirror_model as model
logger = logging.getLogger(__name__)
unmirrored_repositories_gauge = prometheus.create_gauge('unmirrored_repositories',
'Number of repositories that need to be scanned.')
unmirrored_repositories = Gauge('quay_repository_rows_unmirrored',
'number of repositories in the database that have not yet been mirrored')
class PreemptedException(Exception):
""" Exception raised if another worker analyzed the image before this worker was able to do so.
@@ -61,7 +66,7 @@ def process_mirrors(skopeo, token=None):
logger.exception('Repository Mirror service unavailable')
return None
unmirrored_repositories_gauge.Set(num_remaining)
unmirrored_repositories.set(num_remaining)
return next_token


@@ -7,10 +7,13 @@ from data import model
from data.logs_model import logs_model
from workers.worker import Worker, with_exponential_backoff
POLL_PERIOD_SECONDS = 10
logger = logging.getLogger(__name__)
POLL_PERIOD_SECONDS = 10
class RepositoryActionCountWorker(Worker):
def __init__(self):
super(RepositoryActionCountWorker, self).__init__()


@@ -1,14 +1,19 @@
import logging.config
from prometheus_client import Gauge
from app import app, prometheus
from data.database import UseThenDisconnect
from workers.securityworker.models_pre_oci import pre_oci_model as model
from util.secscan.api import APIRequestFailure
from util.secscan.analyzer import PreemptedException
logger = logging.getLogger(__name__)
unscanned_images_gauge = prometheus.create_gauge('unscanned_images',
'Number of images that clair needs to scan.')
unscanned_images = Gauge('quay_security_scanning_unscanned_images_remaining',
'number of images that are not scanned by the latest security scanner')
def index_images(target_version, analyzer, token=None):
@@ -31,6 +36,6 @@ def index_images(target_version, analyzer, token=None):
logger.exception('Security scanner service unavailable')
return
unscanned_images_gauge.Set(num_remaining)
unscanned_images.set(num_remaining)
return next_token


@@ -11,8 +11,10 @@ from util.secscan.analyzer import LayerAnalyzer
from util.log import logfile_path
from endpoints.v2 import v2_bp
logger = logging.getLogger(__name__)
DEFAULT_INDEXING_INTERVAL = 30


@@ -1,13 +1,21 @@
import logging
from datetime import datetime, timedelta
from app import app, instance_keys, metric_queue
from prometheus_client import Counter
from app import app, instance_keys
from workers.servicekeyworker.models_pre_oci import pre_oci_model as model
from workers.worker import Worker
logger = logging.getLogger(__name__)
instance_key_renewal_self = Counter('quay_instance_key_renewal_self_total',
'number of times a Quay instance renews its own key',
labelnames=['success'])
class ServiceKeyWorker(Worker):
def __init__(self):
super(ServiceKeyWorker, self).__init__()
@@ -28,12 +36,12 @@ class ServiceKeyWorker(Worker):
except Exception as ex:
logger.exception('Failure for automatic refresh of service key %s with new expiration %s',
instance_keys.local_key_id, new_expiration)
metric_queue.instance_key_renewal_failure.Inc(labelvalues=[instance_keys.local_key_id])
instance_key_renewal_self.labels(False).inc()
raise ex
logger.debug('Finished automatic refresh of service key %s with new expiration %s',
instance_keys.local_key_id, new_expiration)
metric_queue.instance_key_renewal_success.Inc(labelvalues=[instance_keys.local_key_id])
instance_key_renewal_self.labels(True).inc()
if __name__ == "__main__":


@@ -9,11 +9,14 @@ from data import model
from workers.queueworker import QueueWorker, WorkerUnhealthyException, JobException
from util.log import logfile_path
logger = logging.getLogger(__name__)
POLL_PERIOD_SECONDS = 10
RESERVATION_SECONDS = app.config.get('STORAGE_REPLICATION_PROCESSING_SECONDS', 60*20)
class StorageReplicationWorker(QueueWorker):
def process_queue_item(self, job_details):
storage_uuid = job_details['storage_id']


@@ -26,8 +26,10 @@ from util.bytes import Bytes
from util.log import logfile_path
from util.migrate.allocator import yield_random_entries
logger = logging.getLogger(__name__)
WORKER_TIMEOUT = app.config.get('BACKFILL_TAGS_TIMEOUT', 6000)


@@ -9,11 +9,14 @@ from workers.worker import Worker
from util.timedeltastring import convert_to_timedelta
from util.log import logfile_path
logger = logging.getLogger(__name__)
WORKER_FREQUENCY = app.config.get('TEAM_SYNC_WORKER_FREQUENCY', 60)
STALE_CUTOFF = convert_to_timedelta(app.config.get('TEAM_RESYNC_STALE_TIME', '30m'))
class TeamSynchronizationWorker(Worker):
""" Worker which synchronizes teams with their backing groups in LDAP/Keystone/etc.
"""


@@ -16,12 +16,13 @@ from workers.exportactionlogsworker import ExportActionLogsWorker, POLL_PERIOD_S
from test.fixtures import *
_TEST_CONTENT = os.urandom(1024)
_TEST_BUCKET = 'some_bucket'
_TEST_USER = 'someuser'
_TEST_PASSWORD = 'somepassword'
_TEST_PATH = 'some/cool/path'
_TEST_CONTEXT = StorageContext('nyc', None, None, None, None)
_TEST_CONTEXT = StorageContext('nyc', None, None, None)
@pytest.fixture(params=['test', 'mock_s3'])


@@ -1,4 +1,5 @@
import os.path
from datetime import datetime, timedelta
from app import storage