From b9c60dec98baf7b6f512c521c6e43be3a8b3ea7b Mon Sep 17 00:00:00 2001
From: David Friehs <david@friehs.info>
Date: Mon, 8 Jan 2024 08:54:39 +0100
Subject: [PATCH] llama : always reserve n_vocab * n_batch for logits

llama_context de-serialization breaks if the contexts have differing
capacity for logits and llama_decode will at maximum resize to
n_vocab * n_batch.
---
 llama.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index dfbdcdf75..089533a60 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9795,12 +9795,8 @@ struct llama_context * llama_new_context_with_model(
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }
 
-        // resized during inference
-        if (params.logits_all) {
-            ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
-        } else {
-            ctx->logits.reserve(hparams.n_vocab);
-        }
+        // resized during inference, reserve maximum
+        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
 
         if (params.embedding){
             ctx->embedding.resize(hparams.n_embd);