Capture softmax operations for sampler profiling

This commit is contained in:
kalomaze 2024-01-22 05:24:54 -06:00
parent feea528add
commit bbb578b09d

View file

@@ -8005,10 +8005,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }

-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();

+    llama_sample_softmax(ctx, candidates);
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -8037,10 +8037,10 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }

-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();

+    llama_sample_softmax(ctx, candidates);
+
     float scale = candidates->data[0].p; // scale by max prob
     size_t i = 1; // first token always matches
@@ -8063,9 +8063,10 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }

-    llama_sample_softmax(nullptr, candidates);
     const int64_t t_start_sample_us = ggml_time_us();

+    llama_sample_softmax(nullptr, candidates);
+
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
     std::vector<float> second_derivatives(candidates->size - 2);
@@ -8124,11 +8125,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }

+    const int64_t t_start_sample_us = ggml_time_us();
+
    // Compute the softmax of logits and calculate entropy
    llama_sample_softmax(nullptr, candidates);

-    const int64_t t_start_sample_us = ggml_time_us();
-
    float entropy = 0.0f;
    for (size_t i = 0; i < candidates->size; ++i) {
        entropy += -candidates->data[i].p * logf(candidates->data[i].p);