diff --git a/buildman/jobutil/buildjob.py b/buildman/jobutil/buildjob.py index eecac1000..83b4c42ef 100644 --- a/buildman/jobutil/buildjob.py +++ b/buildman/jobutil/buildjob.py @@ -72,6 +72,11 @@ class BuildJob(object): """ Returns the namespace under which this build is running. """ return self.repo_build.repository.namespace_user.username + @property + def repo_name(self): + """ Returns the name of the repository under which this build is running. """ + return self.repo_build.repository.name + @property def repo_build(self): return self._load_repo_build() diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index 7bcdf3a80..d02b164cb 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -182,6 +182,7 @@ class EphemeralBuilderManager(BaseManager): self._build_uuid_to_info.pop(build_job.build_uuid, None) raise Return() + executor_name = build_info.executor_name execution_id = build_info.execution_id # If we have not yet received a heartbeat, then the node failed to boot in some way. We mark @@ -196,7 +197,7 @@ class EphemeralBuilderManager(BaseManager): execution_id)) if got_lock: logger.warning('Marking job %s as incomplete', build_job.build_uuid) - self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE) + self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name) # Finally, we terminate the build execution for the job. We don't do this under a lock as # terminating a node is an atomic operation; better to make sure it is terminated than not. @@ -550,7 +551,10 @@ class EphemeralBuilderManager(BaseManager): build_job.build_uuid, job_status) # Mark the job as completed. - self.job_complete_callback(build_job, job_status) + build_info = self._build_uuid_to_info.get(build_job.build_uuid, None) + executor_name = build_info.executor_name if build_info else None + + self.job_complete_callback(build_job, job_status, executor_name) # Kill the ephmeral builder. yield From(self.kill_builder_executor(build_job.build_uuid)) diff --git a/buildman/server.py b/buildman/server.py index fcc909714..072dc2c98 100644 --- a/buildman/server.py +++ b/buildman/server.py @@ -141,7 +141,7 @@ class BuilderServer(object): self._queue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS, minimum_extension=MINIMUM_JOB_EXTENSION) - def _job_complete(self, build_job, job_status): + def _job_complete(self, build_job, job_status, executor_name=None): if job_status == BuildJobResult.INCOMPLETE: self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30) else: @@ -152,7 +152,7 @@ class BuilderServer(object): if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count: self._shutdown_event.set() - report_completion_status(job_status) + _report_completion_status(build_job, job_status, executor_name) @trollius.coroutine def _work_checker(self): @@ -229,7 +229,10 @@ class BuilderServer(object): # Initialize the work queue checker. 
yield From(self._work_checker()) -def report_completion_status(status): +def _report_completion_status(build_job, status, executor_name): + metric_queue.build_counter.Inc(labelvalues=[status]) + metric_queue.repository_build_completed.Inc(labelvalues=[build_job.namespace, build_job.repo_name, + status, executor_name or 'executor']) if status == BuildJobResult.COMPLETE: status_name = 'CompleteBuilds' elif status == BuildJobResult.ERROR: @@ -240,4 +243,3 @@ def report_completion_status(status): return metric_queue.put_deprecated(status_name, 1, unit='Count') - metric_queue.build_counter.Inc(labelvalues=[status_name]) diff --git a/conf/init/service/globalpromstats/log/run b/conf/init/service/globalpromstats/log/run new file mode 100755 index 000000000..2f6152e32 --- /dev/null +++ b/conf/init/service/globalpromstats/log/run @@ -0,0 +1,7 @@ +#!/bin/sh + +# Ensure dependencies start before the logger +sv check syslog-ng > /dev/null || exit 1 + +# Start the logger +exec logger -i -t globalpromstats diff --git a/conf/init/service/globalpromstats/run b/conf/init/service/globalpromstats/run new file mode 100755 index 000000000..fab1258fc --- /dev/null +++ b/conf/init/service/globalpromstats/run @@ -0,0 +1,8 @@ +#! /bin/bash + +echo 'Starting global prometheus stats worker' + +cd / +venv/bin/python -m workers.globalpromstats + +echo 'Global prometheus stats exited' diff --git a/data/model/organization.py b/data/model/organization.py index 6fc556f50..2d7d21593 100644 --- a/data/model/organization.py +++ b/data/model/organization.py @@ -137,6 +137,10 @@ def get_organizations(): return User.select().where(User.organization == True, User.robot == False) +def get_active_org_count(): + return get_organizations().count() + + def add_user_as_admin(user_obj, org_obj): try: admin_role = TeamRole.get(name='admin') diff --git a/data/model/repository.py b/data/model/repository.py index 24d82e697..0281bc592 100644 --- a/data/model/repository.py +++ b/data/model/repository.py @@ -16,6 +16,9 @@ from data.database import (Repository, Namespace, RepositoryTag, Star, Image, Us logger = logging.getLogger(__name__) +def get_repository_count(): + return Repository.select().count() + def get_public_repo_visibility(): return _basequery.get_public_repo_visibility() diff --git a/data/model/user.py b/data/model/user.py index 28620b5a1..a711efda7 100644 --- a/data/model/user.py +++ b/data/model/user.py @@ -643,6 +643,10 @@ def get_active_user_count(): return get_active_users().count() +def get_robot_count(): + return User.select().where(User.robot == True).count() + + def detach_external_login(user, service_name): try: service = LoginService.get(name=service_name) diff --git a/endpoints/realtime.py b/endpoints/realtime.py index 23d41ee28..b1e5151b2 100644 --- a/endpoints/realtime.py +++ b/endpoints/realtime.py @@ -4,63 +4,14 @@ import json from flask import request, Blueprint, abort, Response from flask.ext.login import current_user from auth.auth import require_session_login -from endpoints.common import route_show_if -from app import app, userevents -from auth.permissions import SuperUserPermission +from app import userevents from data.userevent import CannotReadUserEventsException -import features -import psutil -import time - logger = logging.getLogger(__name__) realtime = Blueprint('realtime', __name__) -@realtime.route("/ps") -@route_show_if(features.SUPER_USERS) -@require_session_login -def ps(): - if not SuperUserPermission().can(): - abort(403) - - def generator(): - while True: - build_status = {} - try: - 
builder_data = app.config['HTTPCLIENT'].get('http://localhost:8686/status', timeout=1) - if builder_data.status_code == 200: - build_status = json.loads(builder_data.text) - except: - pass - - try: - data = { - 'count': { - 'cpu': psutil.cpu_percent(interval=1, percpu=True), - 'virtual_mem': psutil.virtual_memory(), - 'swap_mem': psutil.swap_memory(), - 'connections': len(psutil.net_connections()), - 'processes': len(psutil.pids()), - 'network': psutil.net_io_counters() - }, - 'build': build_status - } - except psutil.AccessDenied: - data = {} - - json_string = json.dumps(data) - yield 'data: %s\n\n' % json_string - time.sleep(1) - - try: - return Response(generator(), mimetype="text/event-stream") - except: - pass - - - @realtime.route("/user/") @require_session_login def index(): diff --git a/endpoints/v1/index.py b/endpoints/v1/index.py index 2440cf11a..82d26837e 100644 --- a/endpoints/v1/index.py +++ b/endpoints/v1/index.py @@ -7,7 +7,7 @@ from functools import wraps from flask import request, make_response, jsonify, session from data import model -from app import authentication, userevents +from app import authentication, userevents, metric_queue from auth.auth import process_auth, generate_signed_token from auth.auth_context import get_authenticated_user, get_validated_token, get_validated_oauth_token from auth.permissions import (ModifyRepositoryPermission, UserAdminPermission, @@ -247,6 +247,7 @@ def update_images(namespace_name, repo_name): track_and_log('push_repo', repo) spawn_notification(repo, 'repo_push', event_data) + metric_queue.repository_push.Inc(labelvalues=[namespace_name, repo_name, 'v1']) return make_response('Updated', 204) abort(403) @@ -273,6 +274,7 @@ def get_repository_images(namespace_name, repo_name): resp.mimetype = 'application/json' track_and_log('pull_repo', repo, analytics_name='pull_repo_100x', analytics_sample=0.01) + metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'v1']) return resp abort(403) diff --git a/endpoints/v2/manifest.py b/endpoints/v2/manifest.py index eb07bcf11..b1302d1b8 100644 --- a/endpoints/v2/manifest.py +++ b/endpoints/v2/manifest.py @@ -14,7 +14,7 @@ from jwkest.jws import SIGNER_ALGS, keyrep import features -from app import docker_v2_signing_key, app +from app import docker_v2_signing_key, app, metric_queue from auth.registry_jwt_auth import process_registry_jwt_auth from endpoints.common import parse_repository_name from endpoints.decorators import anon_protect @@ -261,6 +261,7 @@ def fetch_manifest_by_tagname(namespace_name, repo_name, manifest_ref): repo = model.repository.get_repository(namespace_name, repo_name) if repo is not None: track_and_log('pull_repo', repo, analytics_name='pull_repo_100x', analytics_sample=0.01) + metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'v2']) response = make_response(manifest.json_data, 200) response.headers['Content-Type'] = MANIFEST_CONTENT_TYPE @@ -283,6 +284,7 @@ def fetch_manifest_by_digest(namespace_name, repo_name, manifest_ref): repo = model.repository.get_repository(namespace_name, repo_name) if repo is not None: track_and_log('pull_repo', repo) + metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'v2']) response = make_response(manifest.json_data, 200) response.headers['Content-Type'] = MANIFEST_CONTENT_TYPE @@ -487,6 +489,7 @@ def _write_manifest(namespace_name, repo_name, manifest): track_and_log('push_repo', repo, tag=tag_name) spawn_notification(repo, 'repo_push', event_data) + 
metric_queue.repository_push.Inc(labelvalues=[namespace_name, repo_name, 'v2']) response = make_response('OK', 202) response.headers['Docker-Content-Digest'] = manifest_digest diff --git a/endpoints/verbs.py b/endpoints/verbs.py index 8a3a9a1bb..eff2ca35c 100644 --- a/endpoints/verbs.py +++ b/endpoints/verbs.py @@ -249,10 +249,12 @@ def _repo_verb(namespace, repository, tag, verb, formatter, sign=False, checker= # Check for torrent. If found, we return a torrent for the repo verb image (if the derived # image already exists). if request.accept_mimetypes.best == 'application/x-bittorrent': + metric_queue.repository_pull.Inc(labelvalues=[namespace, repository, verb + '+torrent']) return _torrent_repo_verb(repo_image, tag, verb, **kwargs) # Log the action. track_and_log('repo_verb', repo_image.repository, tag=tag, verb=verb, **kwargs) + metric_queue.repository_pull.Inc(labelvalues=[namespace, repository, verb]) # Lookup/create the derived image storage for the verb and repo image. derived = model.image.find_or_create_derived_storage(repo_image, verb, @@ -402,4 +404,5 @@ def get_tag_torrent(namespace_name, repo_name, digest): except model.BlobDoesNotExist: abort(404) + metric_queue.repository_pull.Inc(labelvalues=[namespace_name, repo_name, 'torrent']) return _torrent_for_storage(blob, public_repo) diff --git a/static/directives/ps-usage-graph.html b/static/directives/ps-usage-graph.html deleted file mode 100644 index a02eaa83c..000000000 --- a/static/directives/ps-usage-graph.html +++ /dev/null @@ -1,75 +0,0 @@ -
-<!-- ps-usage-graph panels: error banner "Cannot load build system status. Please restart your container.";
-     Build Queue (Running Jobs: {{ data.build.running_total }} | Total Jobs: {{ data.build.job_total }});
-     Local Build Workers (Local Workers: {{ data.build.workers }} | Working: {{ data.build.running_local }});
-     realtime charts: CPU Usage %, Process Count, Virtual Memory %, Swap Memory,
-     Network Connections, Network Usage (Bytes) -->
diff --git a/static/js/directives/ui/ps-usage-graph.js b/static/js/directives/ui/ps-usage-graph.js deleted file mode 100644 index 7e1629b34..000000000 --- a/static/js/directives/ui/ps-usage-graph.js +++ /dev/null @@ -1,50 +0,0 @@ -/** - * An element which displays charts and graphs representing the current installation of the - * application. This control requires superuser access and *must be disabled when not visible*. - */ -angular.module('quay').directive('psUsageGraph', function () { - var directiveDefinitionObject = { - priority: 0, - templateUrl: '/static/directives/ps-usage-graph.html', - replace: false, - transclude: false, - restrict: 'C', - scope: { - 'isEnabled': '=isEnabled' - }, - controller: function($scope, $element) { - $scope.counter = -1; - $scope.data = null; - - var source = null; - - var connect = function() { - if (source) { return; } - source = new EventSource('/realtime/ps'); - source.onmessage = function(e) { - $scope.$apply(function() { - $scope.counter++; - $scope.data = JSON.parse(e.data); - }); - }; - }; - - var disconnect = function() { - if (!source) { return; } - source.close(); - source = null; - }; - - $scope.$watch('isEnabled', function(value) { - if (value) { - connect(); - } else { - disconnect(); - } - }); - - $scope.$on("$destroy", disconnect); - } - }; - return directiveDefinitionObject; -}); \ No newline at end of file diff --git a/static/js/pages/superuser.js b/static/js/pages/superuser.js index 1132f27a7..f5b6d19a2 100644 --- a/static/js/pages/superuser.js +++ b/static/js/pages/superuser.js @@ -29,15 +29,10 @@ $scope.pollChannel = null; $scope.logsScrolled = false; $scope.csrf_token = encodeURIComponent(window.__token); - $scope.dashboardActive = false; $scope.currentConfig = null; $scope.serviceKeysActive = false; $scope.takeOwnershipInfo = null; - $scope.setDashboardActive = function(active) { - $scope.dashboardActive = active; - }; - $scope.configurationSaved = function(config) { $scope.currentConfig = config; $scope.requiresRestart = true; diff --git a/static/partials/super-user.html b/static/partials/super-user.html index 56d83adeb..ab640736b 100644 --- a/static/partials/super-user.html +++ b/static/partials/super-user.html @@ -28,10 +28,6 @@ tab-target="#servicekeys" tab-init="loadServiceKeys()"> - - - @@ -59,11 +55,6 @@
diff --git a/util/metrics/metricqueue.py b/util/metrics/metricqueue.py index 303f677ef..a9c8a87ad 100644 --- a/util/metrics/metricqueue.py +++ b/util/metrics/metricqueue.py @@ -28,8 +28,8 @@ class MetricQueue(object): self.non_200 = prom.create_counter('response_non200', 'Non-200 HTTP response codes', labelnames=['endpoint']) self.multipart_upload_start = prom.create_counter('multipart_upload_start', - 'Multipart upload startse') - self.multipart_upload_end = prom.create_counter('self._metric_queue.multipart_upload_end', + 'Multipart upload started') + self.multipart_upload_end = prom.create_counter('multipart_upload_end', 'Multipart upload ends.', labelnames=['type']) self.build_capacity_shortage = prom.create_gauge('build_capacity_shortage', 'Build capacity shortage.') @@ -46,6 +46,22 @@ class MetricQueue(object): 'Available items in a queue', labelnames=['queue_name']) + self.repository_pull = prom.create_counter('repository_pull', 'Repository Pull Count', + labelnames=['namespace', 'repo_name', 'protocol']) + + self.repository_push = prom.create_counter('repository_push', 'Repository Push Count', + labelnames=['namespace', 'repo_name', 'protocol']) + + self.repository_build_completed = prom.create_counter('repository_build_completed', + 'Repository Build Complete Count', + labelnames=['namespace', 'repo_name', + 'status', 'executor']) + + self.repository_count = prom.create_gauge('repository_count', 'Number of repositories') + self.user_count = prom.create_gauge('user_count', 'Number of users') + self.org_count = prom.create_gauge('org_count', 'Number of Organizations') + self.robot_count = prom.create_gauge('robot_count', 'Number of robot accounts') + # Deprecated: Define an in-memory queue for reporting metrics to CloudWatch or another # provider. self._queue = None diff --git a/workers/globalpromstats.py b/workers/globalpromstats.py new file mode 100644 index 000000000..4b5c24242 --- /dev/null +++ b/workers/globalpromstats.py @@ -0,0 +1,56 @@ +import logging +import time + +from app import app, metric_queue +from data.database import UseThenDisconnect +from data import model +from util.locking import GlobalLock, LockNotAcquiredException +from workers.worker import Worker + +logger = logging.getLogger(__name__) + +WORKER_FREQUENCY = app.config.get('GLOBAL_PROMETHEUS_STATS_FREQUENCY', 60 * 60) + + +class GlobalPrometheusStatsWorker(Worker): + """ Worker which reports global stats (# of users, orgs, repos, etc) to Prometheus periodically. + """ + def __init__(self): + super(GlobalPrometheusStatsWorker, self).__init__() + self.add_operation(self._try_report_stats, WORKER_FREQUENCY) + + def _try_report_stats(self): + logger.debug('Attempting to report stats') + + try: + with GlobalLock('GLOBAL_PROM_STATS'): + self._report_stats() + except LockNotAcquiredException: + return + + def _report_stats(self): + logger.debug('Reporting global stats') + with UseThenDisconnect(app.config): + # Repository count. + metric_queue.repository_count.set(model.repository.get_repository_count()) + + # User counts. 
+ metric_queue.user_count.set(model.user.get_active_user_count()) + metric_queue.org_count.set(model.organization.get_active_org_count()) + metric_queue.robot_count.set(model.user.get_robot_count()) + + +def main(): + logging.config.fileConfig('conf/logging_debug.conf', disable_existing_loggers=False) + + if not app.config.get('PROMETHEUS_AGGREGATOR_URL'): + logger.debug('Prometheus not enabled; skipping global stats reporting') + while True: + time.sleep(100000) + + worker = GlobalPrometheusStatsWorker() + worker.start() + + +if __name__ == "__main__": + main()
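The new MetricQueue fields all follow the standard Prometheus pattern: labeled counters for pulls, pushes, and build completions, plus plain gauges for the totals the globalpromstats worker sets on each run. Below is a minimal sketch of that pattern using the stock `prometheus_client` package rather than Quay's `prom.create_counter`/`create_gauge` wrappers (which report to the configured aggregator instead of serving `/metrics` directly); the metric and label names mirror the diff, while the port and label values are illustrative only.

```python
# Minimal sketch of the counter/gauge pattern used above, with the stock
# prometheus_client package standing in for Quay's internal prom helpers.
from prometheus_client import Counter, Gauge, start_http_server

# Labeled counters: one metric covers every namespace/repo/protocol combination.
repository_pull = Counter('repository_pull', 'Repository Pull Count',
                          ['namespace', 'repo_name', 'protocol'])
repository_build_completed = Counter('repository_build_completed',
                                     'Repository Build Complete Count',
                                     ['namespace', 'repo_name', 'status', 'executor'])

# Gauges hold the latest absolute value, which is what the stats worker reports.
repository_count = Gauge('repository_count', 'Number of repositories')

if __name__ == '__main__':
    start_http_server(9090)  # expose /metrics on :9090 (illustrative port)
    repository_pull.labels(namespace='devtable', repo_name='simple', protocol='v2').inc()
    repository_build_completed.labels(namespace='devtable', repo_name='simple',
                                      status='complete', executor='ec2').inc()
    repository_count.set(42)
```

One design note grounded in the diff itself: `_report_completion_status` falls back to the literal string `'executor'` when no executor name is known, so the `executor` label is always populated and the label set stays consistent across samples.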