diff --git a/examples/common.h b/examples/common.h
index 6393c8563..5ceac53c5 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -31,7 +31,7 @@ struct gpt_params {
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
     bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance
-    int32_t n_probs = 0; // if greater than 1, output the probabilities of top n_probs tokens. Max 5
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 16f5bac0a..4cad658d3 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -354,6 +354,7 @@ struct llama_server_context {
                 result.tok = llama_sample_token(ctx, &candidates_p);
             }
         }
+        // Add maximum of 5 most probable tokens to the result
         for (size_t i = 0; i < std::min(candidates_p.size, std::min((size_t) n_probs, size_t(5))); ++i) {
             result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
         }