Merge pull request #1830 from coreos-inc/superuser-dashboard

Add prometheus stats to enable better dashboarding
This commit is contained in:
josephschorr 2016-09-26 17:19:22 +02:00 committed by GitHub
commit ad4efba802
18 changed files with 128 additions and 199 deletions

View file

@ -72,6 +72,11 @@ class BuildJob(object):
""" Returns the namespace under which this build is running. """
return self.repo_build.repository.namespace_user.username
@property
def repo_name(self):
    """ Returns the name of the repository under which this build is running.

        The value is read from the `repository.name` attribute of the object exposed by
        the `repo_build` property.
    """
    return self.repo_build.repository.name
@property
def repo_build(self):
    """ Returns the repository build for this job, as produced by `_load_repo_build()`. """
    return self._load_repo_build()

View file

@ -182,6 +182,7 @@ class EphemeralBuilderManager(BaseManager):
self._build_uuid_to_info.pop(build_job.build_uuid, None)
raise Return()
executor_name = build_info.executor_name
execution_id = build_info.execution_id
# If we have not yet received a heartbeat, then the node failed to boot in some way. We mark
@ -196,7 +197,7 @@ class EphemeralBuilderManager(BaseManager):
execution_id))
if got_lock:
logger.warning('Marking job %s as incomplete', build_job.build_uuid)
self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE)
self.job_complete_callback(build_job, BuildJobResult.INCOMPLETE, executor_name)
# Finally, we terminate the build execution for the job. We don't do this under a lock as
# terminating a node is an atomic operation; better to make sure it is terminated than not.
@ -550,7 +551,10 @@ class EphemeralBuilderManager(BaseManager):
build_job.build_uuid, job_status)
# Mark the job as completed.
self.job_complete_callback(build_job, job_status)
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
executor_name = build_info.executor_name if build_info else None
self.job_complete_callback(build_job, job_status, executor_name)
# Kill the ephemeral builder.
yield From(self.kill_builder_executor(build_job.build_uuid))

View file

@ -141,7 +141,7 @@ class BuilderServer(object):
self._queue.extend_processing(build_job.job_item, seconds_from_now=JOB_TIMEOUT_SECONDS,
minimum_extension=MINIMUM_JOB_EXTENSION)
def _job_complete(self, build_job, job_status):
def _job_complete(self, build_job, job_status, executor_name=None):
if job_status == BuildJobResult.INCOMPLETE:
self._queue.incomplete(build_job.job_item, restore_retry=False, retry_after=30)
else:
@ -152,7 +152,7 @@ class BuilderServer(object):
if self._current_status == BuildServerStatus.SHUTDOWN and not self._job_count:
self._shutdown_event.set()
report_completion_status(job_status)
_report_completion_status(build_job, job_status, executor_name)
@trollius.coroutine
def _work_checker(self):
@ -229,7 +229,10 @@ class BuilderServer(object):
# Initialize the work queue checker.
yield From(self._work_checker())
def report_completion_status(status):
def _report_completion_status(build_job, status, executor_name):
metric_queue.build_counter.Inc(labelvalues=[status])
metric_queue.repository_build_completed.Inc(labelvalues=[build_job.namespace, build_job.repo_name,
status, executor_name or 'executor'])
if status == BuildJobResult.COMPLETE:
status_name = 'CompleteBuilds'
elif status == BuildJobResult.ERROR:
@ -240,4 +243,3 @@ def report_completion_status(status):
return
metric_queue.put_deprecated(status_name, 1, unit='Count')
metric_queue.build_counter.Inc(labelvalues=[status_name])