Record metrics in a separate etcd record

This commit is contained in:
Evan Cordell 2016-10-03 12:53:36 -04:00
parent 56c5bab017
commit 42ebb0a6c3
2 changed files with 55 additions and 25 deletions

View file

@ -75,6 +75,7 @@ class EphemeralBuilderManager(BaseManager):
self._etcd_realm_prefix = None self._etcd_realm_prefix = None
self._etcd_job_prefix = None self._etcd_job_prefix = None
self._etcd_lock_prefix = None self._etcd_lock_prefix = None
self._etcd_metric_prefix = None
self._ephemeral_api_timeout = DEFAULT_EPHEMERAL_API_TIMEOUT self._ephemeral_api_timeout = DEFAULT_EPHEMERAL_API_TIMEOUT
self._ephemeral_setup_timeout = DEFAULT_EPHEMERAL_SETUP_TIMEOUT self._ephemeral_setup_timeout = DEFAULT_EPHEMERAL_SETUP_TIMEOUT
@ -356,6 +357,7 @@ class EphemeralBuilderManager(BaseManager):
restarter=self._register_existing_realms) restarter=self._register_existing_realms)
self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'lock/') self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'lock/')
self._etcd_metric_prefix = self._manager_config.get('ETCD_METRIC_PREFIX', 'metric/')
self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT', self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT',
DEFAULT_EPHEMERAL_API_TIMEOUT) DEFAULT_EPHEMERAL_API_TIMEOUT)
@ -495,6 +497,21 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Started execution with ID %s for job: %s with executor: %s', logger.debug('Started execution with ID %s for job: %s with executor: %s',
execution_id, build_uuid, started_with_executor.name) execution_id, build_uuid, started_with_executor.name)
# Store metric data
metric_spec = json.dumps({
'executor_name': started_with_executor.name,
'start_time': time.time(),
})
try:
yield From(self._etcd_client.write(self._etcd_metric_key(realm), metric_spec, prevExist=False,
ttl=setup_time))
except (KeyError, etcd.EtcdKeyError):
logger.error('Realm %s already exists in etcd for job %s ' +
'UUID collision or something is very very wrong.', realm, build_uuid)
except etcd.EtcdException:
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
# Store the realm spec which will allow any manager to accept this builder when it connects # Store the realm spec which will allow any manager to accept this builder when it connects
realm_spec = json.dumps({ realm_spec = json.dumps({
'realm': realm, 'realm': realm,
@ -502,7 +519,6 @@ class EphemeralBuilderManager(BaseManager):
'execution_id': execution_id, 'execution_id': execution_id,
'executor_name': started_with_executor.name, 'executor_name': started_with_executor.name,
'job_queue_item': build_job.job_item, 'job_queue_item': build_job.job_item,
'start_time': time.time(),
}) })
try: try:
@ -539,14 +555,7 @@ class EphemeralBuilderManager(BaseManager):
job.build_uuid, build_component.builder_realm) job.build_uuid, build_component.builder_realm)
yield From(build_component.start_build(job)) yield From(build_component.start_build(job))
try: yield From(self._write_duration_metric(metric_queue.builder_time_to_build, build_component.builder_realm))
# log start time to prometheus
realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
parsed_realm_data = json.loads(realm_data.value)
start_time = parsed_realm_data['start_time']
metric_queue.builder_time_to_build.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')])
except Exception:
logger.exception("Could not write start metric for realm key %s", build_component.builder_realm)
try: try:
# Clean up the bookkeeping for allowing any manager to take the job. # Clean up the bookkeeping for allowing any manager to take the job.
@ -563,6 +572,8 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Calling job_completed for job %s with status: %s', logger.debug('Calling job_completed for job %s with status: %s',
build_job.build_uuid, job_status) build_job.build_uuid, job_status)
yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))
# Mark the job as completed. Since this is being invoked from the component, we don't need # Mark the job as completed. Since this is being invoked from the component, we don't need
# to ask for the phase to be updated as well. # to ask for the phase to be updated as well.
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None) build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
@ -572,15 +583,6 @@ class EphemeralBuilderManager(BaseManager):
# Kill the ephemeral builder. # Kill the ephemeral builder.
yield From(self.kill_builder_executor(build_job.build_uuid)) yield From(self.kill_builder_executor(build_job.build_uuid))
try:
# log build time to prometheus
realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
parsed_realm_data = json.loads(realm_data.value)
start_time = parsed_realm_data['start_time']
metric_queue.build_time.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')])
except Exception:
logger.exception("Could not write build_time metric for realm key %s", build_component.builder_realm)
# Delete the build job from etcd. # Delete the build job from etcd.
job_key = self._etcd_job_key(build_job) job_key = self._etcd_job_key(build_job)
try: try:
@ -588,6 +590,13 @@ class EphemeralBuilderManager(BaseManager):
except (KeyError, etcd.EtcdKeyError): except (KeyError, etcd.EtcdKeyError):
logger.debug('Builder is asking for job to be removed, but work already completed') logger.debug('Builder is asking for job to be removed, but work already completed')
# Delete the metric from etcd
metric_key = self._etcd_metric_key(build_component.builder_realm)
try:
yield From(self._etcd_client.delete(metric_key))
except (KeyError, etcd.EtcdKeyError):
logger.debug('Builder is asking for metric to be removed, but key not found')
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status) logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
@coroutine @coroutine
@ -663,6 +672,24 @@ class EphemeralBuilderManager(BaseManager):
except (KeyError, etcd.EtcdKeyError): except (KeyError, etcd.EtcdKeyError):
raise Return(False) raise Return(False)
@coroutine
def _write_duration_metric(self, metric, realm):
""" Returns true if the metric was written and and false otherwise.
"""
try:
metric_data = yield From(self._etcd_client.read(self._etcd_metric_key(realm)))
parsed_metric_data = json.loads(metric_data.value)
start_time = parsed_metric_data['start_time']
metric.Observe(time.time() - start_time, labelvalues=[parsed_metric_data.get('executor_name', 'unknown')])
raise Return(True)
except Exception:
logger.exception("Could not write metric for realm %s", realm)
raise Return(False)
def _etcd_metric_key(self, realm):
""" Create a key which is used to track a job in etcd.
"""
return os.path.join(self._etcd_metric_prefix, realm)
def _etcd_job_key(self, build_job): def _etcd_job_key(self, build_job):
""" Create a key which is used to track a job in etcd. """ Create a key which is used to track a job in etcd.

View file

@ -6,7 +6,7 @@ import uuid
import os import os
from trollius import coroutine, get_event_loop, From, Future, Return from trollius import coroutine, get_event_loop, From, Future, Return
from mock import Mock, ANY from mock import Mock, ANY, call
from buildman.manager.executor import BuilderExecutor, ExecutorException from buildman.manager.executor import BuilderExecutor, ExecutorException
from buildman.manager.ephemeral import (EphemeralBuilderManager, EtcdAction, from buildman.manager.ephemeral import (EphemeralBuilderManager, EtcdAction,
@ -179,10 +179,11 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
self.assertTrue(is_scheduled) self.assertTrue(is_scheduled)
self.assertEqual(self.test_executor.start_builder.call_count, 1) self.assertEqual(self.test_executor.start_builder.call_count, 1)
# Ensure the job and realm were added to etcd. # Ensure the job and realm and metric were added to etcd.
self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key) self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
self.assertTrue(self.etcd_client_mock.write.call_args_list[1][0][0].find('realm/') == 0) self.assertTrue(self.etcd_client_mock.write.call_args_list[1][0][0].find('metric/') == 0)
realm_data = json.loads(self.etcd_client_mock.write.call_args_list[1][0][1]) self.assertTrue(self.etcd_client_mock.write.call_args_list[2][0][0].find('realm/') == 0)
realm_data = json.loads(self.etcd_client_mock.write.call_args_list[2][0][1])
realm_data['realm'] = REALM_ID realm_data['realm'] = REALM_ID
# Right now the job is not registered with any managers because etcd has not accepted the job # Right now the job is not registered with any managers because etcd has not accepted the job
@ -212,9 +213,11 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
# Take the job ourselves # Take the job ourselves
yield From(self.manager.build_component_ready(test_component)) yield From(self.manager.build_component_ready(test_component))
read_calls = [call('building/', recursive=True), call(os.path.join('metric/', REALM_ID))]
self.etcd_client_mock.read.assert_has_calls(read_calls)
self.etcd_client_mock.read.assert_called_with(os.path.join('realm/', REALM_ID)) delete_calls = [call('building/', recursive=True), call(os.path.join('metric/', REALM_ID))]
self.etcd_client_mock.delete.assert_called_once_with(os.path.join('realm/', REALM_ID)) self.etcd_client_mock.read.assert_has_calls(delete_calls)
self.etcd_client_mock.delete.reset_mock() self.etcd_client_mock.delete.reset_mock()
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID)) self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
@ -224,7 +227,7 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
# Ensure that the executor kills the job. # Ensure that the executor kills the job.
self.assertEqual(self.test_executor.stop_builder.call_count, 1) self.assertEqual(self.test_executor.stop_builder.call_count, 1)
self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key) self.etcd_client_mock.delete.assert_has_calls([call(self.mock_job_key)])
# Ensure the build information is cleaned up. # Ensure the build information is cleaned up.
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID)) self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))