Record metrics in a separate etcd record

This commit is contained in:
Evan Cordell 2016-10-03 12:53:36 -04:00
parent 56c5bab017
commit 42ebb0a6c3
2 changed files with 55 additions and 25 deletions

View file

@ -75,6 +75,7 @@ class EphemeralBuilderManager(BaseManager):
self._etcd_realm_prefix = None
self._etcd_job_prefix = None
self._etcd_lock_prefix = None
self._etcd_metric_prefix = None
self._ephemeral_api_timeout = DEFAULT_EPHEMERAL_API_TIMEOUT
self._ephemeral_setup_timeout = DEFAULT_EPHEMERAL_SETUP_TIMEOUT
@ -356,6 +357,7 @@ class EphemeralBuilderManager(BaseManager):
restarter=self._register_existing_realms)
self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'lock/')
self._etcd_metric_prefix = self._manager_config.get('ETCD_METRIC_PREFIX', 'metric/')
self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT',
DEFAULT_EPHEMERAL_API_TIMEOUT)
@ -494,6 +496,21 @@ class EphemeralBuilderManager(BaseManager):
# Job was started!
logger.debug('Started execution with ID %s for job: %s with executor: %s',
execution_id, build_uuid, started_with_executor.name)
# Store metric data
metric_spec = json.dumps({
'executor_name': started_with_executor.name,
'start_time': time.time(),
})
try:
yield From(self._etcd_client.write(self._etcd_metric_key(realm), metric_spec, prevExist=False,
ttl=setup_time))
except (KeyError, etcd.EtcdKeyError):
logger.error('Realm %s already exists in etcd for job %s ' +
'UUID collision or something is very very wrong.', realm, build_uuid)
except etcd.EtcdException:
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
# Store the realm spec which will allow any manager to accept this builder when it connects
realm_spec = json.dumps({
@ -502,7 +519,6 @@ class EphemeralBuilderManager(BaseManager):
'execution_id': execution_id,
'executor_name': started_with_executor.name,
'job_queue_item': build_job.job_item,
'start_time': time.time(),
})
try:
@ -539,14 +555,7 @@ class EphemeralBuilderManager(BaseManager):
job.build_uuid, build_component.builder_realm)
yield From(build_component.start_build(job))
try:
# log start time to prometheus
realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
parsed_realm_data = json.loads(realm_data.value)
start_time = parsed_realm_data['start_time']
metric_queue.builder_time_to_build.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')])
except Exception:
logger.exception("Could not write start metric for realm key %s", build_component.builder_realm)
yield From(self._write_duration_metric(metric_queue.builder_time_to_build, build_component.builder_realm))
try:
# Clean up the bookkeeping for allowing any manager to take the job.
@ -563,6 +572,8 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Calling job_completed for job %s with status: %s',
build_job.build_uuid, job_status)
yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))
# Mark the job as completed. Since this is being invoked from the component, we don't need
# to ask for the phase to be updated as well.
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
@ -571,15 +582,6 @@ class EphemeralBuilderManager(BaseManager):
# Kill the ephemeral builder.
yield From(self.kill_builder_executor(build_job.build_uuid))
try:
# log build time to prometheus
realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
parsed_realm_data = json.loads(realm_data.value)
start_time = parsed_realm_data['start_time']
metric_queue.build_time.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')])
except Exception:
logger.exception("Could not write build_time metric for realm key %s", build_component.builder_realm)
# Delete the build job from etcd.
job_key = self._etcd_job_key(build_job)
@ -587,6 +589,13 @@ class EphemeralBuilderManager(BaseManager):
yield From(self._etcd_client.delete(job_key))
except (KeyError, etcd.EtcdKeyError):
logger.debug('Builder is asking for job to be removed, but work already completed')
# Delete the metric from etcd
metric_key = self._etcd_metric_key(build_component.builder_realm)
try:
yield From(self._etcd_client.delete(metric_key))
except (KeyError, etcd.EtcdKeyError):
logger.debug('Builder is asking for metric to be removed, but key not found')
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
@ -662,7 +671,25 @@ class EphemeralBuilderManager(BaseManager):
raise Return(True)
except (KeyError, etcd.EtcdKeyError):
raise Return(False)
@coroutine
def _write_duration_metric(self, metric, realm):
""" Returns true if the metric was written and and false otherwise.
"""
try:
metric_data = yield From(self._etcd_client.read(self._etcd_metric_key(realm)))
parsed_metric_data = json.loads(metric_data.value)
start_time = parsed_metric_data['start_time']
metric.Observe(time.time() - start_time, labelvalues=[parsed_metric_data.get('executor_name', 'unknown')])
raise Return(True)
except Exception:
logger.exception("Could not write metric for realm %s", realm)
raise Return(False)
def _etcd_metric_key(self, realm):
""" Create a key which is used to track a job in etcd.
"""
return os.path.join(self._etcd_metric_prefix, realm)
def _etcd_job_key(self, build_job):
""" Create a key which is used to track a job in etcd.

View file

@ -6,7 +6,7 @@ import uuid
import os
from trollius import coroutine, get_event_loop, From, Future, Return
from mock import Mock, ANY
from mock import Mock, ANY, call
from buildman.manager.executor import BuilderExecutor, ExecutorException
from buildman.manager.ephemeral import (EphemeralBuilderManager, EtcdAction,
@ -179,10 +179,11 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
self.assertTrue(is_scheduled)
self.assertEqual(self.test_executor.start_builder.call_count, 1)
# Ensure the job and realm were added to etcd.
# Ensure the job and realm and metric were added to etcd.
self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
self.assertTrue(self.etcd_client_mock.write.call_args_list[1][0][0].find('realm/') == 0)
realm_data = json.loads(self.etcd_client_mock.write.call_args_list[1][0][1])
self.assertTrue(self.etcd_client_mock.write.call_args_list[1][0][0].find('metric/') == 0)
self.assertTrue(self.etcd_client_mock.write.call_args_list[2][0][0].find('realm/') == 0)
realm_data = json.loads(self.etcd_client_mock.write.call_args_list[2][0][1])
realm_data['realm'] = REALM_ID
# Right now the job is not registered with any managers because etcd has not accepted the job
@ -212,9 +213,11 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
# Take the job ourselves
yield From(self.manager.build_component_ready(test_component))
read_calls = [call('building/', recursive=True), call(os.path.join('metric/', REALM_ID))]
self.etcd_client_mock.read.assert_has_calls(read_calls)
self.etcd_client_mock.read.assert_called_with(os.path.join('realm/', REALM_ID))
self.etcd_client_mock.delete.assert_called_once_with(os.path.join('realm/', REALM_ID))
delete_calls = [call('building/', recursive=True), call(os.path.join('metric/', REALM_ID))]
self.etcd_client_mock.read.assert_has_calls(delete_calls)
self.etcd_client_mock.delete.reset_mock()
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
@ -224,7 +227,7 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
# Ensure that the executor kills the job.
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key)
self.etcd_client_mock.delete.assert_has_calls([call(self.mock_job_key)])
# Ensure the build information is cleaned up.
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))