Record metrics in a separate etcd record
This commit is contained in:
parent
56c5bab017
commit
42ebb0a6c3
2 changed files with 55 additions and 25 deletions
|
@ -75,6 +75,7 @@ class EphemeralBuilderManager(BaseManager):
|
|||
self._etcd_realm_prefix = None
|
||||
self._etcd_job_prefix = None
|
||||
self._etcd_lock_prefix = None
|
||||
self._etcd_metric_prefix = None
|
||||
|
||||
self._ephemeral_api_timeout = DEFAULT_EPHEMERAL_API_TIMEOUT
|
||||
self._ephemeral_setup_timeout = DEFAULT_EPHEMERAL_SETUP_TIMEOUT
|
||||
|
@ -356,6 +357,7 @@ class EphemeralBuilderManager(BaseManager):
|
|||
restarter=self._register_existing_realms)
|
||||
|
||||
self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'lock/')
|
||||
self._etcd_metric_prefix = self._manager_config.get('ETCD_METRIC_PREFIX', 'metric/')
|
||||
|
||||
self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT',
|
||||
DEFAULT_EPHEMERAL_API_TIMEOUT)
|
||||
|
@ -494,6 +496,21 @@ class EphemeralBuilderManager(BaseManager):
|
|||
# Job was started!
|
||||
logger.debug('Started execution with ID %s for job: %s with executor: %s',
|
||||
execution_id, build_uuid, started_with_executor.name)
|
||||
|
||||
# Store metric data
|
||||
metric_spec = json.dumps({
|
||||
'executor_name': started_with_executor.name,
|
||||
'start_time': time.time(),
|
||||
})
|
||||
try:
|
||||
yield From(self._etcd_client.write(self._etcd_metric_key(realm), metric_spec, prevExist=False,
|
||||
ttl=setup_time))
|
||||
except (KeyError, etcd.EtcdKeyError):
|
||||
logger.error('Realm %s already exists in etcd for job %s ' +
|
||||
'UUID collision or something is very very wrong.', realm, build_uuid)
|
||||
except etcd.EtcdException:
|
||||
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
|
||||
|
||||
|
||||
# Store the realm spec which will allow any manager to accept this builder when it connects
|
||||
realm_spec = json.dumps({
|
||||
|
@ -502,7 +519,6 @@ class EphemeralBuilderManager(BaseManager):
|
|||
'execution_id': execution_id,
|
||||
'executor_name': started_with_executor.name,
|
||||
'job_queue_item': build_job.job_item,
|
||||
'start_time': time.time(),
|
||||
})
|
||||
|
||||
try:
|
||||
|
@ -539,14 +555,7 @@ class EphemeralBuilderManager(BaseManager):
|
|||
job.build_uuid, build_component.builder_realm)
|
||||
yield From(build_component.start_build(job))
|
||||
|
||||
try:
|
||||
# log start time to prometheus
|
||||
realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
|
||||
parsed_realm_data = json.loads(realm_data.value)
|
||||
start_time = parsed_realm_data['start_time']
|
||||
metric_queue.builder_time_to_build.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')])
|
||||
except Exception:
|
||||
logger.exception("Could not write start metric for realm key %s", build_component.builder_realm)
|
||||
yield From(self._write_duration_metric(metric_queue.builder_time_to_build, build_component.builder_realm))
|
||||
|
||||
try:
|
||||
# Clean up the bookkeeping for allowing any manager to take the job.
|
||||
|
@ -563,6 +572,8 @@ class EphemeralBuilderManager(BaseManager):
|
|||
logger.debug('Calling job_completed for job %s with status: %s',
|
||||
build_job.build_uuid, job_status)
|
||||
|
||||
yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))
|
||||
|
||||
# Mark the job as completed. Since this is being invoked from the component, we don't need
|
||||
# to ask for the phase to be updated as well.
|
||||
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
|
||||
|
@ -571,15 +582,6 @@ class EphemeralBuilderManager(BaseManager):
|
|||
|
||||
# Kill the ephemeral builder.
|
||||
yield From(self.kill_builder_executor(build_job.build_uuid))
|
||||
|
||||
try:
|
||||
# log build time to prometheus
|
||||
realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
|
||||
parsed_realm_data = json.loads(realm_data.value)
|
||||
start_time = parsed_realm_data['start_time']
|
||||
metric_queue.build_time.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')])
|
||||
except Exception:
|
||||
logger.exception("Could not write build_time metric for realm key %s", build_component.builder_realm)
|
||||
|
||||
# Delete the build job from etcd.
|
||||
job_key = self._etcd_job_key(build_job)
|
||||
|
@ -587,6 +589,13 @@ class EphemeralBuilderManager(BaseManager):
|
|||
yield From(self._etcd_client.delete(job_key))
|
||||
except (KeyError, etcd.EtcdKeyError):
|
||||
logger.debug('Builder is asking for job to be removed, but work already completed')
|
||||
|
||||
# Delete the metric from etcd
|
||||
metric_key = self._etcd_metric_key(build_component.builder_realm)
|
||||
try:
|
||||
yield From(self._etcd_client.delete(metric_key))
|
||||
except (KeyError, etcd.EtcdKeyError):
|
||||
logger.debug('Builder is asking for metric to be removed, but key not found')
|
||||
|
||||
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
|
||||
|
||||
|
@ -662,7 +671,25 @@ class EphemeralBuilderManager(BaseManager):
|
|||
raise Return(True)
|
||||
except (KeyError, etcd.EtcdKeyError):
|
||||
raise Return(False)
|
||||
|
||||
@coroutine
|
||||
def _write_duration_metric(self, metric, realm):
|
||||
""" Returns true if the metric was written and and false otherwise.
|
||||
"""
|
||||
try:
|
||||
metric_data = yield From(self._etcd_client.read(self._etcd_metric_key(realm)))
|
||||
parsed_metric_data = json.loads(metric_data.value)
|
||||
start_time = parsed_metric_data['start_time']
|
||||
metric.Observe(time.time() - start_time, labelvalues=[parsed_metric_data.get('executor_name', 'unknown')])
|
||||
raise Return(True)
|
||||
except Exception:
|
||||
logger.exception("Could not write metric for realm %s", realm)
|
||||
raise Return(False)
|
||||
|
||||
def _etcd_metric_key(self, realm):
|
||||
""" Create a key which is used to track a job in etcd.
|
||||
"""
|
||||
return os.path.join(self._etcd_metric_prefix, realm)
|
||||
|
||||
def _etcd_job_key(self, build_job):
|
||||
""" Create a key which is used to track a job in etcd.
|
||||
|
|
|
@ -6,7 +6,7 @@ import uuid
|
|||
import os
|
||||
|
||||
from trollius import coroutine, get_event_loop, From, Future, Return
|
||||
from mock import Mock, ANY
|
||||
from mock import Mock, ANY, call
|
||||
|
||||
from buildman.manager.executor import BuilderExecutor, ExecutorException
|
||||
from buildman.manager.ephemeral import (EphemeralBuilderManager, EtcdAction,
|
||||
|
@ -179,10 +179,11 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
|
|||
self.assertTrue(is_scheduled)
|
||||
self.assertEqual(self.test_executor.start_builder.call_count, 1)
|
||||
|
||||
# Ensure the job and realm were added to etcd.
|
||||
# Ensure the job and realm and metric were added to etcd.
|
||||
self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
|
||||
self.assertTrue(self.etcd_client_mock.write.call_args_list[1][0][0].find('realm/') == 0)
|
||||
realm_data = json.loads(self.etcd_client_mock.write.call_args_list[1][0][1])
|
||||
self.assertTrue(self.etcd_client_mock.write.call_args_list[1][0][0].find('metric/') == 0)
|
||||
self.assertTrue(self.etcd_client_mock.write.call_args_list[2][0][0].find('realm/') == 0)
|
||||
realm_data = json.loads(self.etcd_client_mock.write.call_args_list[2][0][1])
|
||||
realm_data['realm'] = REALM_ID
|
||||
|
||||
# Right now the job is not registered with any managers because etcd has not accepted the job
|
||||
|
@ -212,9 +213,11 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
|
|||
|
||||
# Take the job ourselves
|
||||
yield From(self.manager.build_component_ready(test_component))
|
||||
read_calls = [call('building/', recursive=True), call(os.path.join('metric/', REALM_ID))]
|
||||
self.etcd_client_mock.read.assert_has_calls(read_calls)
|
||||
|
||||
self.etcd_client_mock.read.assert_called_with(os.path.join('realm/', REALM_ID))
|
||||
self.etcd_client_mock.delete.assert_called_once_with(os.path.join('realm/', REALM_ID))
|
||||
delete_calls = [call('building/', recursive=True), call(os.path.join('metric/', REALM_ID))]
|
||||
self.etcd_client_mock.read.assert_has_calls(delete_calls)
|
||||
self.etcd_client_mock.delete.reset_mock()
|
||||
|
||||
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
||||
|
@ -224,7 +227,7 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
|
|||
|
||||
# Ensure that the executor kills the job.
|
||||
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
|
||||
self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key)
|
||||
self.etcd_client_mock.delete.assert_has_calls([call(self.mock_job_key)])
|
||||
|
||||
# Ensure the build information is cleaned up.
|
||||
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
||||
|
|
Reference in a new issue