From 9129e937f92172264ed99065a1ac2a97c1e3a1be Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 23 Apr 2023 15:57:06 +0800
Subject: [PATCH] only llama can use batch sizes above 256 to prevent
 unacceptably high memory usage

---
 gpttype_adapter.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 7e27390f9..036c44520 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -358,8 +358,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     int original_threads = params.n_threads;
     if (blasmode)
     {
-        //for gpttype, GPT2 crashes above 256.
-        int bbs = blasbatchsize; //(blasbatchsize>256?256:blasbatchsize);
+        //for non llama, limit to 256
+        int bbs = blasbatchsize;
+        if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT)
+        {
+            bbs = (blasbatchsize > 256 ? 256 : blasbatchsize);
+        }
+
         params.n_batch = bbs; //received reports of 1024 and above crashing on some models
         params.n_threads = 1;
     }
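
For reference, the new logic reduces to a pure clamping rule: llama-family file formats (GGML/GGHF/GGJT) keep the user-requested BLAS batch size, while every other architecture is capped at 256. Below is a minimal standalone sketch of that rule in C++; the trimmed-down FileFormat enum and the helper name clamp_blas_batch_size are hypothetical stand-ins (the real adapter uses koboldcpp's full FileFormat enum and writes the result directly into params.n_batch, as in the diff above).

#include <algorithm>

// Trimmed-down stand-in for koboldcpp's FileFormat enum; only the three
// llama formats matter for the clamp, OTHER is a placeholder for the rest.
enum class FileFormat { GGML, GGHF, GGJT, OTHER };

// Hypothetical helper restating the patch: llama formats keep the requested
// BLAS batch size, everything else is capped at 256, since larger batches
// were reported to crash or use unacceptably high memory on those models.
static int clamp_blas_batch_size(int blasbatchsize, FileFormat file_format)
{
    bool is_llama = (file_format == FileFormat::GGML ||
                     file_format == FileFormat::GGHF ||
                     file_format == FileFormat::GGJT);
    return is_llama ? blasbatchsize : std::min(blasbatchsize, 256);
}

// e.g. clamp_blas_batch_size(512, FileFormat::OTHER) == 256, while
//      clamp_blas_batch_size(512, FileFormat::GGJT)  == 512.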