Record metrics in a separate etcd record
This commit is contained in:
parent
56c5bab017
commit
42ebb0a6c3
2 changed files with 55 additions and 25 deletions
|
@ -75,6 +75,7 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
self._etcd_realm_prefix = None
|
self._etcd_realm_prefix = None
|
||||||
self._etcd_job_prefix = None
|
self._etcd_job_prefix = None
|
||||||
self._etcd_lock_prefix = None
|
self._etcd_lock_prefix = None
|
||||||
|
self._etcd_metric_prefix = None
|
||||||
|
|
||||||
self._ephemeral_api_timeout = DEFAULT_EPHEMERAL_API_TIMEOUT
|
self._ephemeral_api_timeout = DEFAULT_EPHEMERAL_API_TIMEOUT
|
||||||
self._ephemeral_setup_timeout = DEFAULT_EPHEMERAL_SETUP_TIMEOUT
|
self._ephemeral_setup_timeout = DEFAULT_EPHEMERAL_SETUP_TIMEOUT
|
||||||
|
@ -356,6 +357,7 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
restarter=self._register_existing_realms)
|
restarter=self._register_existing_realms)
|
||||||
|
|
||||||
self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'lock/')
|
self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'lock/')
|
||||||
|
self._etcd_metric_prefix = self._manager_config.get('ETCD_METRIC_PREFIX', 'metric/')
|
||||||
|
|
||||||
self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT',
|
self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT',
|
||||||
DEFAULT_EPHEMERAL_API_TIMEOUT)
|
DEFAULT_EPHEMERAL_API_TIMEOUT)
|
||||||
|
@ -495,6 +497,21 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
logger.debug('Started execution with ID %s for job: %s with executor: %s',
|
logger.debug('Started execution with ID %s for job: %s with executor: %s',
|
||||||
execution_id, build_uuid, started_with_executor.name)
|
execution_id, build_uuid, started_with_executor.name)
|
||||||
|
|
||||||
|
# Store metric data
|
||||||
|
metric_spec = json.dumps({
|
||||||
|
'executor_name': started_with_executor.name,
|
||||||
|
'start_time': time.time(),
|
||||||
|
})
|
||||||
|
try:
|
||||||
|
yield From(self._etcd_client.write(self._etcd_metric_key(realm), metric_spec, prevExist=False,
|
||||||
|
ttl=setup_time))
|
||||||
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
|
logger.error('Realm %s already exists in etcd for job %s ' +
|
||||||
|
'UUID collision or something is very very wrong.', realm, build_uuid)
|
||||||
|
except etcd.EtcdException:
|
||||||
|
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
|
||||||
|
|
||||||
|
|
||||||
# Store the realm spec which will allow any manager to accept this builder when it connects
|
# Store the realm spec which will allow any manager to accept this builder when it connects
|
||||||
realm_spec = json.dumps({
|
realm_spec = json.dumps({
|
||||||
'realm': realm,
|
'realm': realm,
|
||||||
|
@ -502,7 +519,6 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
'execution_id': execution_id,
|
'execution_id': execution_id,
|
||||||
'executor_name': started_with_executor.name,
|
'executor_name': started_with_executor.name,
|
||||||
'job_queue_item': build_job.job_item,
|
'job_queue_item': build_job.job_item,
|
||||||
'start_time': time.time(),
|
|
||||||
})
|
})
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -539,14 +555,7 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
job.build_uuid, build_component.builder_realm)
|
job.build_uuid, build_component.builder_realm)
|
||||||
yield From(build_component.start_build(job))
|
yield From(build_component.start_build(job))
|
||||||
|
|
||||||
try:
|
yield From(self._write_duration_metric(metric_queue.builder_time_to_build, build_component.builder_realm))
|
||||||
# log start time to prometheus
|
|
||||||
realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
|
|
||||||
parsed_realm_data = json.loads(realm_data.value)
|
|
||||||
start_time = parsed_realm_data['start_time']
|
|
||||||
metric_queue.builder_time_to_build.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')])
|
|
||||||
except Exception:
|
|
||||||
logger.exception("Could not write start metric for realm key %s", build_component.builder_realm)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Clean up the bookkeeping for allowing any manager to take the job.
|
# Clean up the bookkeeping for allowing any manager to take the job.
|
||||||
|
@ -563,6 +572,8 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
logger.debug('Calling job_completed for job %s with status: %s',
|
logger.debug('Calling job_completed for job %s with status: %s',
|
||||||
build_job.build_uuid, job_status)
|
build_job.build_uuid, job_status)
|
||||||
|
|
||||||
|
yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))
|
||||||
|
|
||||||
# Mark the job as completed. Since this is being invoked from the component, we don't need
|
# Mark the job as completed. Since this is being invoked from the component, we don't need
|
||||||
# to ask for the phase to be updated as well.
|
# to ask for the phase to be updated as well.
|
||||||
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
|
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
|
||||||
|
@ -572,15 +583,6 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
# Kill the ephemeral builder.
|
# Kill the ephemeral builder.
|
||||||
yield From(self.kill_builder_executor(build_job.build_uuid))
|
yield From(self.kill_builder_executor(build_job.build_uuid))
|
||||||
|
|
||||||
try:
|
|
||||||
# log build time to prometheus
|
|
||||||
realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
|
|
||||||
parsed_realm_data = json.loads(realm_data.value)
|
|
||||||
start_time = parsed_realm_data['start_time']
|
|
||||||
metric_queue.build_time.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')])
|
|
||||||
except Exception:
|
|
||||||
logger.exception("Could not write build_time metric for realm key %s", build_component.builder_realm)
|
|
||||||
|
|
||||||
# Delete the build job from etcd.
|
# Delete the build job from etcd.
|
||||||
job_key = self._etcd_job_key(build_job)
|
job_key = self._etcd_job_key(build_job)
|
||||||
try:
|
try:
|
||||||
|
@ -588,6 +590,13 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
logger.debug('Builder is asking for job to be removed, but work already completed')
|
logger.debug('Builder is asking for job to be removed, but work already completed')
|
||||||
|
|
||||||
|
# Delete the metric from etcd
|
||||||
|
metric_key = self._etcd_metric_key(build_component.builder_realm)
|
||||||
|
try:
|
||||||
|
yield From(self._etcd_client.delete(metric_key))
|
||||||
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
|
logger.debug('Builder is asking for metric to be removed, but key not found')
|
||||||
|
|
||||||
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
|
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
|
||||||
|
|
||||||
@coroutine
|
@coroutine
|
||||||
|
@ -663,6 +672,24 @@ class EphemeralBuilderManager(BaseManager):
|
||||||
except (KeyError, etcd.EtcdKeyError):
|
except (KeyError, etcd.EtcdKeyError):
|
||||||
raise Return(False)
|
raise Return(False)
|
||||||
|
|
||||||
|
@coroutine
|
||||||
|
def _write_duration_metric(self, metric, realm):
|
||||||
|
""" Returns true if the metric was written and and false otherwise.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
metric_data = yield From(self._etcd_client.read(self._etcd_metric_key(realm)))
|
||||||
|
parsed_metric_data = json.loads(metric_data.value)
|
||||||
|
start_time = parsed_metric_data['start_time']
|
||||||
|
metric.Observe(time.time() - start_time, labelvalues=[parsed_metric_data.get('executor_name', 'unknown')])
|
||||||
|
raise Return(True)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("Could not write metric for realm %s", realm)
|
||||||
|
raise Return(False)
|
||||||
|
|
||||||
|
def _etcd_metric_key(self, realm):
|
||||||
|
""" Create a key which is used to track a job in etcd.
|
||||||
|
"""
|
||||||
|
return os.path.join(self._etcd_metric_prefix, realm)
|
||||||
|
|
||||||
def _etcd_job_key(self, build_job):
|
def _etcd_job_key(self, build_job):
|
||||||
""" Create a key which is used to track a job in etcd.
|
""" Create a key which is used to track a job in etcd.
|
||||||
|
|
|
@ -6,7 +6,7 @@ import uuid
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from trollius import coroutine, get_event_loop, From, Future, Return
|
from trollius import coroutine, get_event_loop, From, Future, Return
|
||||||
from mock import Mock, ANY
|
from mock import Mock, ANY, call
|
||||||
|
|
||||||
from buildman.manager.executor import BuilderExecutor, ExecutorException
|
from buildman.manager.executor import BuilderExecutor, ExecutorException
|
||||||
from buildman.manager.ephemeral import (EphemeralBuilderManager, EtcdAction,
|
from buildman.manager.ephemeral import (EphemeralBuilderManager, EtcdAction,
|
||||||
|
@ -179,10 +179,11 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
|
||||||
self.assertTrue(is_scheduled)
|
self.assertTrue(is_scheduled)
|
||||||
self.assertEqual(self.test_executor.start_builder.call_count, 1)
|
self.assertEqual(self.test_executor.start_builder.call_count, 1)
|
||||||
|
|
||||||
# Ensure the job and realm were added to etcd.
|
# Ensure the job and realm and metric were added to etcd.
|
||||||
self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
|
self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key)
|
||||||
self.assertTrue(self.etcd_client_mock.write.call_args_list[1][0][0].find('realm/') == 0)
|
self.assertTrue(self.etcd_client_mock.write.call_args_list[1][0][0].find('metric/') == 0)
|
||||||
realm_data = json.loads(self.etcd_client_mock.write.call_args_list[1][0][1])
|
self.assertTrue(self.etcd_client_mock.write.call_args_list[2][0][0].find('realm/') == 0)
|
||||||
|
realm_data = json.loads(self.etcd_client_mock.write.call_args_list[2][0][1])
|
||||||
realm_data['realm'] = REALM_ID
|
realm_data['realm'] = REALM_ID
|
||||||
|
|
||||||
# Right now the job is not registered with any managers because etcd has not accepted the job
|
# Right now the job is not registered with any managers because etcd has not accepted the job
|
||||||
|
@ -212,9 +213,11 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
|
||||||
|
|
||||||
# Take the job ourselves
|
# Take the job ourselves
|
||||||
yield From(self.manager.build_component_ready(test_component))
|
yield From(self.manager.build_component_ready(test_component))
|
||||||
|
read_calls = [call('building/', recursive=True), call(os.path.join('metric/', REALM_ID))]
|
||||||
|
self.etcd_client_mock.read.assert_has_calls(read_calls)
|
||||||
|
|
||||||
self.etcd_client_mock.read.assert_called_with(os.path.join('realm/', REALM_ID))
|
delete_calls = [call('building/', recursive=True), call(os.path.join('metric/', REALM_ID))]
|
||||||
self.etcd_client_mock.delete.assert_called_once_with(os.path.join('realm/', REALM_ID))
|
self.etcd_client_mock.read.assert_has_calls(delete_calls)
|
||||||
self.etcd_client_mock.delete.reset_mock()
|
self.etcd_client_mock.delete.reset_mock()
|
||||||
|
|
||||||
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
||||||
|
@ -224,7 +227,7 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase):
|
||||||
|
|
||||||
# Ensure that the executor kills the job.
|
# Ensure that the executor kills the job.
|
||||||
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
|
self.assertEqual(self.test_executor.stop_builder.call_count, 1)
|
||||||
self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key)
|
self.etcd_client_mock.delete.assert_has_calls([call(self.mock_job_key)])
|
||||||
|
|
||||||
# Ensure the build information is cleaned up.
|
# Ensure the build information is cleaned up.
|
||||||
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
|
||||||
|
|
Reference in a new issue