diff --git a/buildman/manager/ephemeral.py b/buildman/manager/ephemeral.py index a8ab3e26a..d797eefa7 100644 --- a/buildman/manager/ephemeral.py +++ b/buildman/manager/ephemeral.py @@ -75,6 +75,7 @@ class EphemeralBuilderManager(BaseManager): self._etcd_realm_prefix = None self._etcd_job_prefix = None self._etcd_lock_prefix = None + self._etcd_metric_prefix = None self._ephemeral_api_timeout = DEFAULT_EPHEMERAL_API_TIMEOUT self._ephemeral_setup_timeout = DEFAULT_EPHEMERAL_SETUP_TIMEOUT @@ -356,6 +357,7 @@ class EphemeralBuilderManager(BaseManager): restarter=self._register_existing_realms) self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'lock/') + self._etcd_metric_prefix = self._manager_config.get('ETCD_METRIC_PREFIX', 'metric/') self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT', DEFAULT_EPHEMERAL_API_TIMEOUT) @@ -494,6 +496,21 @@ class EphemeralBuilderManager(BaseManager): # Job was started! logger.debug('Started execution with ID %s for job: %s with executor: %s', execution_id, build_uuid, started_with_executor.name) + + # Store metric data + metric_spec = json.dumps({ + 'executor_name': started_with_executor.name, + 'start_time': time.time(), + }) + try: + yield From(self._etcd_client.write(self._etcd_metric_key(realm), metric_spec, prevExist=False, + ttl=setup_time)) + except (KeyError, etcd.EtcdKeyError): + logger.error('Realm %s already exists in etcd for job %s ' + + 'UUID collision or something is very very wrong.', realm, build_uuid) + except etcd.EtcdException: + logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid) + # Store the realm spec which will allow any manager to accept this builder when it connects realm_spec = json.dumps({ @@ -502,7 +519,6 @@ class EphemeralBuilderManager(BaseManager): 'execution_id': execution_id, 'executor_name': started_with_executor.name, 'job_queue_item': build_job.job_item, - 'start_time': time.time(), }) try: @@ -539,14 +555,7 @@ class EphemeralBuilderManager(BaseManager): job.build_uuid, build_component.builder_realm) yield From(build_component.start_build(job)) - try: - # log start time to prometheus - realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm))) - parsed_realm_data = json.loads(realm_data.value) - start_time = parsed_realm_data['start_time'] - metric_queue.builder_time_to_build.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')]) - except Exception: - logger.exception("Could not write start metric for realm key %s", build_component.builder_realm) + yield From(self._write_duration_metric(metric_queue.builder_time_to_build, build_component.builder_realm)) try: # Clean up the bookkeeping for allowing any manager to take the job. @@ -563,6 +572,8 @@ class EphemeralBuilderManager(BaseManager): logger.debug('Calling job_completed for job %s with status: %s', build_job.build_uuid, job_status) + yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm)) + # Mark the job as completed. Since this is being invoked from the component, we don't need # to ask for the phase to be updated as well. build_info = self._build_uuid_to_info.get(build_job.build_uuid, None) @@ -571,15 +582,6 @@ class EphemeralBuilderManager(BaseManager): # Kill the ephemeral builder. yield From(self.kill_builder_executor(build_job.build_uuid)) - - try: - # log build time to prometheus - realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm))) - parsed_realm_data = json.loads(realm_data.value) - start_time = parsed_realm_data['start_time'] - metric_queue.build_time.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')]) - except Exception: - logger.exception("Could not write build_time metric for realm key %s", build_component.builder_realm) # Delete the build job from etcd. job_key = self._etcd_job_key(build_job) @@ -587,6 +589,13 @@ class EphemeralBuilderManager(BaseManager): yield From(self._etcd_client.delete(job_key)) except (KeyError, etcd.EtcdKeyError): logger.debug('Builder is asking for job to be removed, but work already completed') + + # Delete the metric from etcd + metric_key = self._etcd_metric_key(build_component.builder_realm) + try: + yield From(self._etcd_client.delete(metric_key)) + except (KeyError, etcd.EtcdKeyError): + logger.debug('Builder is asking for metric to be removed, but key not found') logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status) @@ -662,7 +671,25 @@ class EphemeralBuilderManager(BaseManager): raise Return(True) except (KeyError, etcd.EtcdKeyError): raise Return(False) + + @coroutine + def _write_duration_metric(self, metric, realm): + """ Returns true if the metric was written and and false otherwise. + """ + try: + metric_data = yield From(self._etcd_client.read(self._etcd_metric_key(realm))) + parsed_metric_data = json.loads(metric_data.value) + start_time = parsed_metric_data['start_time'] + metric.Observe(time.time() - start_time, labelvalues=[parsed_metric_data.get('executor_name', 'unknown')]) + raise Return(True) + except Exception: + logger.exception("Could not write metric for realm %s", realm) + raise Return(False) + def _etcd_metric_key(self, realm): + """ Create a key which is used to track a job in etcd. + """ + return os.path.join(self._etcd_metric_prefix, realm) def _etcd_job_key(self, build_job): """ Create a key which is used to track a job in etcd. diff --git a/test/test_buildman.py b/test/test_buildman.py index a96056bbc..a34746e3d 100644 --- a/test/test_buildman.py +++ b/test/test_buildman.py @@ -6,7 +6,7 @@ import uuid import os from trollius import coroutine, get_event_loop, From, Future, Return -from mock import Mock, ANY +from mock import Mock, ANY, call from buildman.manager.executor import BuilderExecutor, ExecutorException from buildman.manager.ephemeral import (EphemeralBuilderManager, EtcdAction, @@ -179,10 +179,11 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase): self.assertTrue(is_scheduled) self.assertEqual(self.test_executor.start_builder.call_count, 1) - # Ensure the job and realm were added to etcd. + # Ensure the job and realm and metric were added to etcd. self.assertEqual(self.etcd_client_mock.write.call_args_list[0][0][0], self.mock_job_key) - self.assertTrue(self.etcd_client_mock.write.call_args_list[1][0][0].find('realm/') == 0) - realm_data = json.loads(self.etcd_client_mock.write.call_args_list[1][0][1]) + self.assertTrue(self.etcd_client_mock.write.call_args_list[1][0][0].find('metric/') == 0) + self.assertTrue(self.etcd_client_mock.write.call_args_list[2][0][0].find('realm/') == 0) + realm_data = json.loads(self.etcd_client_mock.write.call_args_list[2][0][1]) realm_data['realm'] = REALM_ID # Right now the job is not registered with any managers because etcd has not accepted the job @@ -212,9 +213,11 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase): # Take the job ourselves yield From(self.manager.build_component_ready(test_component)) + read_calls = [call('building/', recursive=True), call(os.path.join('metric/', REALM_ID))] + self.etcd_client_mock.read.assert_has_calls(read_calls) - self.etcd_client_mock.read.assert_called_with(os.path.join('realm/', REALM_ID)) - self.etcd_client_mock.delete.assert_called_once_with(os.path.join('realm/', REALM_ID)) + delete_calls = [call('building/', recursive=True), call(os.path.join('metric/', REALM_ID))] + self.etcd_client_mock.read.assert_has_calls(delete_calls) self.etcd_client_mock.delete.reset_mock() self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID)) @@ -224,7 +227,7 @@ class TestEphemeralLifecycle(EphemeralBuilderTestCase): # Ensure that the executor kills the job. self.assertEqual(self.test_executor.stop_builder.call_count, 1) - self.etcd_client_mock.delete.assert_called_once_with(self.mock_job_key) + self.etcd_client_mock.delete.assert_has_calls([call(self.mock_job_key)]) # Ensure the build information is cleaned up. self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))