Record metrics in a separate etcd record

This commit is contained in:
Evan Cordell 2016-10-03 12:53:36 -04:00
parent 56c5bab017
commit 42ebb0a6c3
2 changed files with 55 additions and 25 deletions

View file

@ -75,6 +75,7 @@ class EphemeralBuilderManager(BaseManager):
self._etcd_realm_prefix = None
self._etcd_job_prefix = None
self._etcd_lock_prefix = None
self._etcd_metric_prefix = None
self._ephemeral_api_timeout = DEFAULT_EPHEMERAL_API_TIMEOUT
self._ephemeral_setup_timeout = DEFAULT_EPHEMERAL_SETUP_TIMEOUT
@ -356,6 +357,7 @@ class EphemeralBuilderManager(BaseManager):
restarter=self._register_existing_realms)
self._etcd_lock_prefix = self._manager_config.get('ETCD_LOCK_PREFIX', 'lock/')
self._etcd_metric_prefix = self._manager_config.get('ETCD_METRIC_PREFIX', 'metric/')
self._ephemeral_api_timeout = self._manager_config.get('API_TIMEOUT',
DEFAULT_EPHEMERAL_API_TIMEOUT)
@ -494,6 +496,21 @@ class EphemeralBuilderManager(BaseManager):
# Job was started!
logger.debug('Started execution with ID %s for job: %s with executor: %s',
execution_id, build_uuid, started_with_executor.name)
# Store metric data
metric_spec = json.dumps({
'executor_name': started_with_executor.name,
'start_time': time.time(),
})
try:
yield From(self._etcd_client.write(self._etcd_metric_key(realm), metric_spec, prevExist=False,
ttl=setup_time))
except (KeyError, etcd.EtcdKeyError):
logger.error('Realm %s already exists in etcd for job %s ' +
'UUID collision or something is very very wrong.', realm, build_uuid)
except etcd.EtcdException:
logger.exception('Exception when writing realm %s to etcd for job %s', realm, build_uuid)
# Store the realm spec which will allow any manager to accept this builder when it connects
realm_spec = json.dumps({
@ -502,7 +519,6 @@ class EphemeralBuilderManager(BaseManager):
'execution_id': execution_id,
'executor_name': started_with_executor.name,
'job_queue_item': build_job.job_item,
'start_time': time.time(),
})
try:
@ -539,14 +555,7 @@ class EphemeralBuilderManager(BaseManager):
job.build_uuid, build_component.builder_realm)
yield From(build_component.start_build(job))
try:
# log start time to prometheus
realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
parsed_realm_data = json.loads(realm_data.value)
start_time = parsed_realm_data['start_time']
metric_queue.builder_time_to_build.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')])
except Exception:
logger.exception("Could not write start metric for realm key %s", build_component.builder_realm)
yield From(self._write_duration_metric(metric_queue.builder_time_to_build, build_component.builder_realm))
try:
# Clean up the bookkeeping for allowing any manager to take the job.
@ -563,6 +572,8 @@ class EphemeralBuilderManager(BaseManager):
logger.debug('Calling job_completed for job %s with status: %s',
build_job.build_uuid, job_status)
yield From(self._write_duration_metric(metric_queue.build_time, build_component.builder_realm))
# Mark the job as completed. Since this is being invoked from the component, we don't need
# to ask for the phase to be updated as well.
build_info = self._build_uuid_to_info.get(build_job.build_uuid, None)
@ -571,15 +582,6 @@ class EphemeralBuilderManager(BaseManager):
# Kill the ephemeral builder.
yield From(self.kill_builder_executor(build_job.build_uuid))
try:
# log build time to prometheus
realm_data = yield From(self._etcd_client.read(self._etcd_realm_key(build_component.builder_realm)))
parsed_realm_data = json.loads(realm_data.value)
start_time = parsed_realm_data['start_time']
metric_queue.build_time.Observe(time.time() - start_time, labelvalues=[parsed_realm_data.get('executor_name', 'unknown')])
except Exception:
logger.exception("Could not write build_time metric for realm key %s", build_component.builder_realm)
# Delete the build job from etcd.
job_key = self._etcd_job_key(build_job)
@ -587,6 +589,13 @@ class EphemeralBuilderManager(BaseManager):
yield From(self._etcd_client.delete(job_key))
except (KeyError, etcd.EtcdKeyError):
logger.debug('Builder is asking for job to be removed, but work already completed')
# Delete the metric from etcd
metric_key = self._etcd_metric_key(build_component.builder_realm)
try:
yield From(self._etcd_client.delete(metric_key))
except (KeyError, etcd.EtcdKeyError):
logger.debug('Builder is asking for metric to be removed, but key not found')
logger.debug('job_completed for job %s with status: %s', build_job.build_uuid, job_status)
@ -662,7 +671,25 @@ class EphemeralBuilderManager(BaseManager):
raise Return(True)
except (KeyError, etcd.EtcdKeyError):
raise Return(False)
@coroutine
def _write_duration_metric(self, metric, realm):
""" Returns true if the metric was written and and false otherwise.
"""
try:
metric_data = yield From(self._etcd_client.read(self._etcd_metric_key(realm)))
parsed_metric_data = json.loads(metric_data.value)
start_time = parsed_metric_data['start_time']
metric.Observe(time.time() - start_time, labelvalues=[parsed_metric_data.get('executor_name', 'unknown')])
raise Return(True)
except Exception:
logger.exception("Could not write metric for realm %s", realm)
raise Return(False)
def _etcd_metric_key(self, realm):
""" Create a key which is used to track a job in etcd.
"""
return os.path.join(self._etcd_metric_prefix, realm)
def _etcd_job_key(self, build_job):
""" Create a key which is used to track a job in etcd.