From bbb578b09d1adc4e59739a2e5daeb5b531a8758f Mon Sep 17 00:00:00 2001
From: kalomaze <66376113+kalomaze@users.noreply.github.com>
Date: Mon, 22 Jan 2024 05:24:54 -0600
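Subject: [PATCH] Capture softmax operations for sampler profiling

Move the `t_start_sample_us = ggml_time_us()` timestamp ahead of the
llama_sample_softmax() call in llama_sample_top_p, llama_sample_min_p,
llama_sample_tail_free and llama_sample_typical, so that the interval
measured from t_start_sample_us in each sampler also covers the softmax
step instead of starting after it.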

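---
Note for review (not part of the commit message): llama_sample_top_p and
llama_sample_min_p pass `ctx` to llama_sample_softmax(), whereas
llama_sample_tail_free and llama_sample_typical pass `nullptr`. The
definition of llama_sample_softmax() is not part of this patch; if it
records its own elapsed time on a non-null ctx, the softmax time in
top_p/min_p may now be counted twice. Please confirm before merging.
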
 llama.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index e128cea6f..4e912e84c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8005,10 +8005,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();
 
+    llama_sample_softmax(ctx, candidates);
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -8036,11 +8036,11 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
     if (p <= 0.0f) {
         return;
     }
+
+    const int64_t t_start_sample_us = ggml_time_us();
 
     llama_sample_softmax(ctx, candidates);
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     float scale = candidates->data[0].p; // scale by max prob
     size_t i = 1; // first token always matches
 
@@ -8063,9 +8063,10 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }
 
-    llama_sample_softmax(nullptr, candidates);
     const int64_t t_start_sample_us = ggml_time_us();
 
+    llama_sample_softmax(nullptr, candidates);
+
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
     std::vector<float> second_derivatives(candidates->size - 2);
@@ -8124,11 +8125,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);