From bbb578b09d1adc4e59739a2e5daeb5b531a8758f Mon Sep 17 00:00:00 2001
From: kalomaze <66376113+kalomaze@users.noreply.github.com>
Date: Mon, 22 Jan 2024 05:24:54 -0600
Subject: [PATCH] Capture softmax operations for sampler profiling

---
 llama.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index e128cea6f..4e912e84c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8005,10 +8005,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();
 
+    llama_sample_softmax(ctx, candidates);
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -8036,11 +8036,11 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
     if (p <= 0.0f) {
         return;
     }
+
+    const int64_t t_start_sample_us = ggml_time_us();
 
     llama_sample_softmax(ctx, candidates);
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     float scale = candidates->data[0].p; // scale by max prob
     size_t i = 1; // first token always matches
 
@@ -8063,9 +8063,10 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }
 
-    llama_sample_softmax(nullptr, candidates);
     const int64_t t_start_sample_us = ggml_time_us();
 
+    llama_sample_softmax(nullptr, candidates);
+
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
     std::vector<float> second_derivatives(candidates->size - 2);
@@ -8124,11 +8125,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);