diff --git a/llama.cpp b/llama.cpp index 075b1937a..faf65e339 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9195,7 +9195,7 @@ static void llama_output_reserve(llama_context & lctx, int32_t n_outputs) { // alloc only when more than the current capacity is required // TODO: also consider shrinking the buffer - if (prev_size < new_size) { + if (!lctx.buf_output || prev_size < new_size) { if (lctx.buf_output) { #ifndef NDEBUG // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)