Merge pull request #1830 from coreos-inc/superuser-dashboard

Add prometheus stats to enable better dashboarding
josephschorr authored 2016-09-26 17:19:22 +02:00, committed by GitHub
commit ad4efba802
18 changed files with 128 additions and 199 deletions

View file

@@ -72,6 +72,11 @@ class BuildJob(object):
     """ Returns the namespace under which this build is running. """
     return self.repo_build.repository.namespace_user.username
 
+  @property
+  def repo_name(self):
+    """ Returns the name of the repository under which this build is running. """
+    return self.repo_build.repository.name
+
   @property
   def repo_build(self):
     return self._load_repo_build()
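
The new repo_name property mirrors the namespace property directly above it; together they let metric reporters label counters per repository without reaching through repo_build.repository at every call site. A minimal sketch of the intended pairing (the helper itself is illustrative, not from this hunk):

    def build_labels(build_job):
      # Pair the two BuildJob properties to label per-repository build
      # metrics, as _report_completion_status does later in this commit.
      return [build_job.namespace, build_job.repo_name]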

View file

@@ -182,6 +182,7 @@ class EphemeralBuilderManager(BaseManager):
       self._build_uuid_to_info.pop(build_job.build_uuid, None)
       raise Return()
 
+    executor_name = build_info.executor_name
     execution_id = build_info.execution_id
 
     # If we have not yet received a heartbeat, then the node failed to boot in some way. We mark
@@ -196,7 +197,7 @@ class EphemeralBuilderManager(BaseManager):
                                                        execution_id))
       if got_lock:
         logger.warning('Marking job %s as incomplete', build_job.build_uuid)
-        self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE)
+        self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name)
 
     # Finally, we terminate the build execution for the job. We don't do this under a lock as
     # terminating a node is an atomic operation; better to make sure it is terminated than not.
@@ -550,7 +551,10 @@ class EphemeralBuilderManager(BaseManager):
                  build_job.build_uuid, job_status)
 
     # Mark the job as completed.
-    self.job_complete_callback(build_job, job_status)
+    build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
+    executor_name = build_info.executor_name if build_info else None
+    self.job_complete_callback(build_job, job_status, executor_name)
 
     # Kill the ephmeral builder.
     yield From(self.kill_builder_executor(build_job.build_uuid))
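
Both call sites now thread the executor's name into the completion callback; because the build-info entry may already have been evicted by the time a job completes, the second site falls back to None, so consumers must treat the argument as optional. A minimal sketch of the contract this implies (the logging body is illustrative, not from this commit):

    import logging

    logger = logging.getLogger(__name__)

    def job_complete_callback(build_job, job_status, executor_name=None):
      # executor_name may be None when the build info was already evicted.
      logger.debug('Job %s finished with status %s on executor %s',
                   build_job.build_uuid, job_status, executor_name or 'unknown')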

View file

@@ -141,7 +141,7 @@ class BuilderServer(object):
     self._queue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS,
                                   minimum_extension=MINIMUM_JOB_EXTENSION)
 
-  def _job_complete(self, build_job, job_status):
+  def _job_complete(self, build_job, job_status, executor_name=None):
     if job_status == BuildJobResult.INCOMPLETE:
       self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30)
     else:
@@ -152,7 +152,7 @@ class BuilderServer(object):
     if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
       self._shutdown_event.set()
 
-    report_completion_status(job_status)
+    _report_completion_status(build_job, job_status, executor_name)
 
   @trollius.coroutine
   def _work_checker(self):
@@ -229,7 +229,10 @@ class BuilderServer(object):
     # Initialize the work queue checker.
     yield From(self._work_checker())
 
-def report_completion_status(status):
+def _report_completion_status(build_job, status, executor_name):
+  metric_queue.build_counter.Inc(labelvalues=[status])
+  metric_queue.repository_build_completed.Inc(labelvalues=[build_job.namespace, build_job.repo_name,
+                                                           status, executor_name or 'executor'])
   if status == BuildJobResult.COMPLETE:
     status_name = 'CompleteBuilds'
   elif status == BuildJobResult.ERROR:
@@ -240,4 +243,3 @@ def report_completion_status(status):
     return
 
   metric_queue.put_deprecated(status_name, 1, unit='Count')
-  metric_queue.build_counter.Inc(labelvalues=[status_name])
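
The renamed helper now reports completion twice: the existing single-label build_counter moves to the top and is keyed by the raw status rather than the CloudWatch-style status_name, and the new repository_build_completed counter records namespace, repository, status, and executor, with the literal string 'executor' as the fallback label when no executor name is known. The CloudWatch path through put_deprecated survives but stays marked deprecated. Quay's MetricQueue wraps its own aggregator client, but the semantics map onto the standard labeled-counter pattern of the upstream prometheus_client library; a rough equivalent for orientation only, with invented label values:

    from prometheus_client import Counter

    # The label order declared here must match the order of the values
    # supplied at increment time, the same contract MetricQueue enforces
    # with its labelnames/labelvalues pair.
    build_completed = Counter('repository_build_completed',
                              'Repository Build Complete Count',
                              ['namespace', 'repo_name', 'status', 'executor'])

    build_completed.labels('buynlarge', 'orgrepo', 'complete', 'ec2').inc()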

View file

@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# Ensure dependencies start before the logger
+sv check syslog-ng > /dev/null || exit 1
+
+# Start the logger
+exec logger -i -t globalpromstats

View file

@@ -0,0 +1,8 @@
+#! /bin/bash
+
+echo 'Starting global prometheus stats worker'
+
+cd /
+venv/bin/python -m workers.globalpromstats
+
+echo 'Global prometheus stats exited'

View file

@@ -137,6 +137,10 @@ def get_organizations():
   return User.select().where(User.organization == True, User.robot == False)
 
 
+def get_active_org_count():
+  return get_organizations().count()
+
+
 def add_user_as_admin(user_obj, org_obj):
   try:
     admin_role = TeamRole.get(name='admin')

View file

@@ -16,6 +16,9 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Us
 logger = logging.getLogger(__name__)
 
 
+def get_repository_count():
+  return Repository.select().count()
+
 def get_public_repo_visibility():
   return _basequery.get_public_repo_visibility()

View file

@@ -643,6 +643,10 @@ def get_active_user_count():
   return get_active_users().count()
 
 
+def get_robot_count():
+  return User.select().where(User.robot == True).count()
+
+
 def detach_external_login(user, service_name):
   try:
     service = LoginService.get(name=service_name)

View file

@@ -4,63 +4,14 @@ import json
 from flask import request, Blueprint, abort, Response
 from flask.ext.login import current_user
 
 from auth.auth import require_session_login
-from endpoints.common import route_show_if
-from app import app, userevents
-from auth.permissions import SuperUserPermission
+from app import userevents
 from data.userevent import CannotReadUserEventsException
 
-import features
-import psutil
-import time
-
 logger = logging.getLogger(__name__)
 
 realtime = Blueprint('realtime', __name__)
 
-@realtime.route("/ps")
-@route_show_if(features.SUPER_USERS)
-@require_session_login
-def ps():
-  if not SuperUserPermission().can():
-    abort(403)
-
-  def generator():
-    while True:
-      build_status = {}
-      try:
-        builder_data = app.config['HTTPCLIENT'].get('http://localhost:8686/status', timeout=1)
-        if builder_data.status_code == 200:
-          build_status = json.loads(builder_data.text)
-      except:
-        pass
-
-      try:
-        data = {
-          'count': {
-            'cpu': psutil.cpu_percent(interval=1, percpu=True),
-            'virtual_mem': psutil.virtual_memory(),
-            'swap_mem': psutil.swap_memory(),
-            'connections': len(psutil.net_connections()),
-            'processes': len(psutil.pids()),
-            'network': psutil.net_io_counters()
-          },
-          'build': build_status
-        }
-      except psutil.AccessDenied:
-        data = {}
-
-      json_string = json.dumps(data)
-      yield 'data: %s\n\n' % json_string
-      time.sleep(1)
-
-  try:
-    return Response(generator(), mimetype="text/event-stream")
-  except:
-    pass
-
 @realtime.route("/user/")
 @require_session_login
 def index():

View file

@@ -7,7 +7,7 @@ from functools import wraps
 from flask import request, make_response, jsonify, session
 
 from data import model
-from app import authentication, userevents
+from app import authentication, userevents, metric_queue
 from auth.auth import process_auth, generate_signed_token
 from auth.auth_context import get_authenticated_user, get_validated_token, get_validated_oauth_token
 from auth.permissions import (ModifyRepositoryPermission, UserAdminPermission,
@@ -247,6 +247,7 @@ def update_images(namespace_name, repo_name):
     track_and_log('push_repo', repo)
     spawn_notification(repo, 'repo_push', event_data)
+    metric_queue.repository_push.Inc(labelvalues=[namespace_name, repo_name, 'v1'])
 
     return make_response('Updated', 204)
 
   abort(403)
@@ -273,6 +274,7 @@ def get_repository_images(namespace_name, repo_name):
     resp.mimetype = 'application/json'
 
     track_and_log('pull_repo', repo, analytics_name='pull_repo_100x', analytics_sample=0.01)
+    metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'v1'])
     return resp
 
   abort(403)

View file

@@ -14,7 +14,7 @@ from jwkest.jws import SIGNER_ALGS, keyrep
 import features
 
-from app import docker_v2_signing_key, app
+from app import docker_v2_signing_key, app, metric_queue
 from auth.registry_jwt_auth import process_registry_jwt_auth
 from endpoints.common import parse_repository_name
 from endpoints.decorators import anon_protect
@@ -261,6 +261,7 @@ def fetch_manifest_by_tagname(namespace_name, repo_name, manifest_ref):
   repo = model.repository.get_repository(namespace_name, repo_name)
   if repo is not None:
     track_and_log('pull_repo', repo, analytics_name='pull_repo_100x', analytics_sample=0.01)
+    metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'v2'])
 
   response = make_response(manifest.json_data, 200)
   response.headers['Content-Type'] = MANIFEST_CONTENT_TYPE
@@ -283,6 +284,7 @@ def fetch_manifest_by_digest(namespace_name, repo_name, manifest_ref):
   repo = model.repository.get_repository(namespace_name, repo_name)
   if repo is not None:
     track_and_log('pull_repo', repo)
+    metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'v2'])
 
   response = make_response(manifest.json_data, 200)
   response.headers['Content-Type'] = MANIFEST_CONTENT_TYPE
@@ -487,6 +489,7 @@ def _write_manifest(namespace_name, repo_name, manifest):
   track_and_log('push_repo', repo, tag=tag_name)
   spawn_notification(repo, 'repo_push', event_data)
+  metric_queue.repository_push.Inc(labelvalues=[namespace_name, repo_name, 'v2'])
 
   response = make_response('OK', 202)
   response.headers['Docker-Content-Digest'] = manifest_digest

View file

@@ -249,10 +249,12 @@ def _repo_verb(namespace, repository, tag, verb, formatter, sign=False, checker=
   # Check for torrent. If found, we return a torrent for the repo verb image (if the derived
   # image already exists).
   if request.accept_mimetypes.best == 'application/x-bittorrent':
+    metric_queue.repository_pull.Inc(labelvalues=[namespace, repository, verb + '+torrent'])
     return _torrent_repo_verb(repo_image, tag, verb, **kwargs)
 
   # Log the action.
   track_and_log('repo_verb', repo_image.repository, tag=tag, verb=verb, **kwargs)
+  metric_queue.repository_pull.Inc(labelvalues=[namespace, repository, verb])
 
   # Lookup/create the derived image storage for the verb and repo image.
   derived = model.image.find_or_create_derived_storage(repo_image, verb,
@@ -402,4 +404,5 @@ def get_tag_torrent(namespace_name, repo_name, digest):
   except model.BlobDoesNotExist:
     abort(404)
 
+  metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'torrent'])
   return _torrent_for_storage(blob, public_repo)
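
With the verbs endpoints instrumented, every pull path now increments the same repository_pull counter and is distinguished only by its protocol label, so per-protocol dashboard breakdowns become a single query. Collected here as a sketch for reference:

    # Distinct 'protocol' label values emitted by this commit's call sites.
    # The concrete verb names are illustrative; the diff only ever passes
    # the generic verb variable through.
    PULL_PROTOCOLS = ['v1', 'v2', 'torrent', 'squash', 'squash+torrent']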

View file

@@ -1,75 +0,0 @@
-<div class="ps-usage-graph-element">
-  <!-- Build Charts -->
-  <div quay-show="Features.BUILD_SUPPORT">
-    <div class="alert alert-warning" ng-if="data.build && data.build.job_total == null">
-      Cannot load build system status. Please restart your container.
-    </div>
-
-    <div ng-if="data.build && data.build.job_total >= 0">
-      <div class="col-md-6 chart-col">
-        <h4>Build Queue</h4>
-        <h5>
-          Running Jobs: {{ data.build.running_total }} | Total Jobs: {{ data.build.job_total }}
-        </h5>
-        <div class="realtime-area-chart"
-             data="[data.build.job_total, data.build.running_total]"
-             labels="['Queued Build Jobs', 'Running Build Jobs']"
-             colors="['rgb(157, 194, 211)', 'rgb(56, 122, 163)']"
-             counter="counter"
-             minimum="-10"
-             maximum="auto"></div>
-      </div>
-
-      <div class="col-md-6 chart-col">
-        <h4>Local Build Workers</h4>
-        <h5>
-          Local Workers: {{ data.build.workers }} | Working: {{ data.build.running_local }}
-        </h5>
-        <div class="realtime-area-chart"
-             data="[data.build.job_total, data.build.workers, data.build.running_local]"
-             labels="['Queued Build Jobs', 'Build Workers (local)', 'Running Build Jobs (local)']"
-             colors="['rgb(157, 194, 211)', 'rgb(161, 208, 93)', 'rgb(210, 237, 130)']"
-             counter="counter"
-             minimum="-10"
-             maximum="auto"></div>
-      </div>
-    </div>
-  </div>
-
-  <!-- CPU, Memory and Network -->
-  <div class="col-md-4 chart-col">
-    <h4>CPU Usage %</h4>
-    <div class="realtime-line-chart" data="data.count.cpu" counter="counter"
-         label-template="CPU #{x} %"
-         minimum="-10" maximum="110"></div>
-  </div>
-
-  <div class="col-md-4 chart-col">
-    <h4>Process Count</h4>
-    <div class="realtime-line-chart" data="data.count.processes" counter="counter"
-         label-template="Process Count"></div>
-  </div>
-
-  <div class="col-md-4 chart-col">
-    <h4>Virtual Memory %</h4>
-    <div class="realtime-line-chart" data="data.count.virtual_mem[2]" counter="counter"
-         label-template="Virtual Memory %"
-         minimum="-10" maximum="110"></div>
-  </div>
-
-  <div class="col-md-4 chart-col">
-    <h4>Swap Memory</h4>
-    <div class="realtime-line-chart" data="data.count.swap_mem[3]" counter="counter"
-         label-template="Swap Memory %"></div>
-  </div>
-
-  <div class="col-md-4 chart-col">
-    <h4>Network Connections</h4>
-    <div class="realtime-line-chart" data="data.count.connections" counter="counter"
-         label-template="Network Connection Count"></div>
-  </div>
-
-  <div class="col-md-4 chart-col">
-    <h4>Network Usage (Bytes)</h4>
-    <div class="realtime-line-chart" data="data.count.network" labels="['Bytes In', 'Bytes Out']" counter="counter"></div>
-  </div>
-</div>

View file

@@ -1,50 +0,0 @@
-/**
- * An element which displays charts and graphs representing the current installation of the
- * application. This control requires superuser access and *must be disabled when not visible*.
- */
-angular.module('quay').directive('psUsageGraph', function () {
-  var directiveDefinitionObject = {
-    priority: 0,
-    templateUrl: '/static/directives/ps-usage-graph.html',
-    replace: false,
-    transclude: false,
-    restrict: 'C',
-    scope: {
-      'isEnabled': '=isEnabled'
-    },
-    controller: function($scope, $element) {
-      $scope.counter = -1;
-      $scope.data = null;
-
-      var source = null;
-
-      var connect = function() {
-        if (source) { return; }
-        source = new EventSource('/realtime/ps');
-        source.onmessage = function(e) {
-          $scope.$apply(function() {
-            $scope.counter++;
-            $scope.data = JSON.parse(e.data);
-          });
-        };
-      };
-
-      var disconnect = function() {
-        if (!source) { return; }
-        source.close();
-        source = null;
-      };
-
-      $scope.$watch('isEnabled', function(value) {
-        if (value) {
-          connect();
-        } else {
-          disconnect();
-        }
-      });
-
-      $scope.$on("$destroy", disconnect);
-    }
-  };
-  return directiveDefinitionObject;
-});

View file

@@ -29,15 +29,10 @@
     $scope.pollChannel = null;
     $scope.logsScrolled = false;
     $scope.csrf_token = encodeURIComponent(window.__token);
-    $scope.dashboardActive = false;
     $scope.currentConfig = null;
     $scope.serviceKeysActive = false;
     $scope.takeOwnershipInfo = null;
 
-    $scope.setDashboardActive = function(active) {
-      $scope.dashboardActive = active;
-    };
-
    $scope.configurationSaved = function(config) {
      $scope.currentConfig = config;
      $scope.requiresRestart = true;

View file

@@ -28,10 +28,6 @@
           tab-target="#servicekeys" tab-init="loadServiceKeys()">
       <i class="fa fa-key"></i>
     </span>
-    <span class="cor-tab" tab-title="Dashboard" tab-target="#dashboard"
-          tab-shown="setDashboardActive(true)" tab-hidden="setDashboardActive(false)">
-      <i class="fa fa-tachometer"></i>
-    </span>
     <span class="cor-tab" tab-title="Change Log" tab-target="#change-log" tab-init="getChangeLog()">
       <i class="fa fa-rss"></i>
     </span>
@@ -59,11 +55,6 @@
       <div class="service-keys-manager" is-enabled="serviceKeysActive"></div>
     </div>
 
-    <!-- Dashboard tab -->
-    <div id="dashboard" class="tab-pane">
-      <div class="ps-usage-graph" is-enabled="dashboardActive"></div>
-    </div>
-
     <!-- Debugging tab -->
     <div id="debug" class="tab-pane">
       <div class="cor-loader" ng-show="!debugServices"></div>

View file

@@ -28,8 +28,8 @@ class MetricQueue(object):
     self.non_200 = prom.create_counter('response_non200', 'Non-200 HTTP response codes',
                                        labelnames=['endpoint'])
     self.multipart_upload_start = prom.create_counter('multipart_upload_start',
-                                                      'Multipart upload startse')
-    self.multipart_upload_end = prom.create_counter('self._metric_queue.multipart_upload_end',
+                                                      'Multipart upload started')
+    self.multipart_upload_end = prom.create_counter('multipart_upload_end',
                                                     'Multipart upload ends.', labelnames=['type'])
     self.build_capacity_shortage = prom.create_gauge('build_capacity_shortage',
                                                      'Build capacity shortage.')
@@ -46,6 +46,22 @@ class MetricQueue(object):
                                              'Available items in a queue',
                                              labelnames=['queue_name'])
 
+    self.repository_pull = prom.create_counter('repository_pull', 'Repository Pull Count',
+                                               labelnames=['namespace', 'repo_name', 'protocol'])
+    self.repository_push = prom.create_counter('repository_push', 'Repository Push Count',
+                                               labelnames=['namespace', 'repo_name', 'protocol'])
+    self.repository_build_completed = prom.create_counter('repository_build_completed',
+                                                          'Repository Build Complete Count',
+                                                          labelnames=['namespace', 'repo_name',
+                                                                      'status', 'executor'])
+
+    self.repository_count = prom.create_gauge('repository_count', 'Number of repositories')
+    self.user_count = prom.create_gauge('user_count', 'Number of users')
+    self.org_count = prom.create_gauge('org_count', 'Number of Organizations')
+    self.robot_count = prom.create_gauge('robot_count', 'Number of robot accounts')
+
     # Deprecated: Define an in-memory queue for reporting metrics to CloudWatch or another
     # provider.
     self._queue = None
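
The first hunk also fixes two pre-existing slips: the 'Multipart upload startse' description typo, and a multipart_upload_end metric that had accidentally been registered under the name 'self._metric_queue.multipart_upload_end'. The new attributes are consumed elsewhere in this commit; a condensed sketch of both call patterns, with illustrative namespace and repository values:

    from app import metric_queue

    # Counters take an ordered labelvalues list that must line up with the
    # labelnames declared above; gauges are set to an absolute value.
    metric_queue.repository_pull.Inc(labelvalues=['devtable', 'simple', 'v2'])
    metric_queue.repository_count.set(42)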

View file

@@ -0,0 +1,56 @@
+import logging
+import time
+
+from app import app, metric_queue
+from data.database import UseThenDisconnect
+from data import model
+from util.locking import GlobalLock, LockNotAcquiredException
+from workers.worker import Worker
+
+logger = logging.getLogger(__name__)
+
+WORKER_FREQUENCY = app.config.get('GLOBAL_PROMETHEUS_STATS_FREQUENCY', 60 * 60)
+
+
+class GlobalPrometheusStatsWorker(Worker):
+  """ Worker which reports global stats (# of users, orgs, repos, etc) to Prometheus periodically.
+  """
+  def __init__(self):
+    super(GlobalPrometheusStatsWorker, self).__init__()
+    self.add_operation(self._try_report_stats, WORKER_FREQUENCY)
+
+  def _try_report_stats(self):
+    logger.debug('Attempting to report stats')
+
+    try:
+      with GlobalLock('GLOBAL_PROM_STATS'):
+        self._report_stats()
+    except LockNotAcquiredException:
+      return
+
+  def _report_stats(self):
+    logger.debug('Reporting global stats')
+    with UseThenDisconnect(app.config):
+      # Repository count.
+      metric_queue.repository_count.set(model.repository.get_repository_count())
+
+      # User counts.
+      metric_queue.user_count.set(model.user.get_active_user_count())
+      metric_queue.org_count.set(model.organization.get_active_org_count())
+      metric_queue.robot_count.set(model.user.get_robot_count())
+
+
+def main():
+  logging.config.fileConfig('conf/logging_debug.conf', disable_existing_loggers=False)
+
+  if not app.config.get('PROMETHEUS_AGGREGATOR_URL'):
+    logger.debug('Prometheus not enabled; skipping global stats reporting')
+    while True:
+      time.sleep(100000)
+
+  worker = GlobalPrometheusStatsWorker()
+  worker.start()
+
+
+if __name__ == "__main__":
+  main()
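
The worker takes a GlobalLock so that only one instance across a fleet reports the gauges each cycle, and wraps its database queries in UseThenDisconnect so connections are not held between runs. One small caveat: main() calls logging.config.fileConfig, but the module only does "import logging", so it relies on logging.config having been imported transitively, presumably via app or workers.worker. A defensive variant would import the submodule explicitly; a minimal sketch:

    import logging
    import logging.config  # explicit, rather than relying on a transitive import

    logging.config.fileConfig('conf/logging_debug.conf', disable_existing_loggers=False)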