Capture softmax operations for sampler profiling

This commit is contained in:
kalomaze 2024-01-22 05:24:54 -06:00
parent feea528add
commit bbb578b09d

View file

@@ -8005,10 +8005,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }

-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();

+    llama_sample_softmax(ctx, candidates);
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -8037,10 +8037,10 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }

-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();

+    llama_sample_softmax(ctx, candidates);
+
     float scale = candidates->data[0].p; // scale by max prob
     size_t i = 1; // first token always matches
@@ -8063,9 +8063,10 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }

-    llama_sample_softmax(nullptr, candidates);
     const int64_t t_start_sample_us = ggml_time_us();

+    llama_sample_softmax(nullptr, candidates);
+
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
     std::vector<float> second_derivatives(candidates->size - 2);
@@ -8124,11 +8125,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }

+    const int64_t t_start_sample_us = ggml_time_us();
+
    // Compute the softmax of logits and calculate entropy
    llama_sample_softmax(nullptr, candidates);

-    const int64_t t_start_sample_us = ggml_time_us();
-
    float entropy = 0.0f;
    for (size_t i = 0; i < candidates->size; ++i) {
        entropy += -candidates->data[i].p * logf(candidates->data[i].p);