From 0093dea953251fd39c91492211c354b7f4e9920b Mon Sep 17 00:00:00 2001
From: David Friehs
Date: Mon, 8 Jan 2024 08:54:13 +0100
Subject: [PATCH] llama : only reserve n_vocab * n_batch at most for logits

llama_decode asserts that only n_batch tokens are passed each call, and
n_ctx is expected to be bigger than n_batch.

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 91aa3f8e7..dfbdcdf75 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9797,7 +9797,7 @@ struct llama_context * llama_new_context_with_model(
             // resized during inference
             if (params.logits_all) {
-                ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
+                ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
             } else {
                 ctx->logits.reserve(hparams.n_vocab);
             }
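A minimal standalone sketch of the sizing argument, assuming illustrative
values for n_vocab, n_ctx and n_batch (these figures are assumptions, not
taken from the patch): each llama_decode call produces at most n_batch rows
of n_vocab logits, so reserving n_vocab * n_batch covers the per-call worst
case, while n_ctx * n_vocab over-reserves.

// Sketch only: identifiers mirror the patch, values below are assumed.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const std::size_t n_vocab = 32000; // assumed vocabulary size
    const std::size_t n_ctx   = 4096;  // assumed context size
    const std::size_t n_batch = 512;   // assumed max tokens per llama_decode call

    std::vector<float> logits;

    // Old behaviour: reserve capacity for logits of every context position.
    const std::size_t old_reserve = n_ctx * n_vocab;

    // Patched behaviour: reserve only what one batch can produce, since
    // llama_decode never receives more than n_batch tokens per call.
    const std::size_t new_reserve = n_vocab * n_batch;
    logits.reserve(new_reserve);

    std::printf("old: %zu floats (%zu MiB)\n", old_reserve, old_reserve * sizeof(float) >> 20);
    std::printf("new: %zu floats (%zu MiB)\n", new_reserve, new_reserve * sizeof(float) >> 20);
    return 0;
}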