From ed960fa1ab91e0b90e57eb72fa4cabadcac405de Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 18 Jul 2023 19:19:59 +0300
Subject: [PATCH] llama : separate compute buffer for metal

---
 llama.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index c234cdf3f..867b3e59f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1616,7 +1616,6 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT(lctx.graph_logits != nullptr);
 
-
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
@@ -2719,11 +2718,17 @@ struct llama_context * llama_new_context_with_model(
 
         // TODO: size the buffers more accurately - depends on improved memory management
         ctx->buf_compute_cpu = ggml_backend_alloc_buffer(&model->backend_cpu, MEM_REQ_EVAL().at(ctx->model.type), 2048);
+
 #ifdef GGML_USE_CUDA
         if (params.n_gpu_layers > 0) {
             ctx->buf_compute_cuda = ggml_backend_alloc_buffer(&model->backend_cuda, MEM_REQ_EVAL().at(ctx->model.type), 2048);
         }
 #endif
+#ifdef GGML_USE_METAL
+        if (params.n_gpu_layers > 0) {
+            ctx->buf_compute_metal = ggml_backend_alloc_buffer(&model->backend_metal, MEM_REQ_EVAL().at(ctx->model.type), 2048);
+        }
+#endif
 
         // initialize the graph input/output buffers
         // input buffer
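
Note (not part of the patch): the change above gives each backend its own compute buffer (buf_compute_cpu, buf_compute_cuda, buf_compute_metal), guarded by GGML_USE_CUDA / GGML_USE_METAL and by whether any layers are offloaded. Below is a minimal standalone C++ sketch of that selection pattern. All types and function names here (backend_buffer, context_buffers, pick_compute_buffer) are hypothetical stand-ins for illustration only, not the real ggml/llama.cpp API; only the field names and the preprocessor guards are taken from the patch.

// Illustrative sketch only: per-backend compute buffers with compile-time
// guards, mirroring the #ifdef structure introduced by the patch.
#include <cstdio>

struct backend_buffer { const char * name; };   // stand-in for a backend compute buffer

struct context_buffers {
    backend_buffer buf_compute_cpu   {"cpu"};
    backend_buffer buf_compute_cuda  {"cuda"};
    backend_buffer buf_compute_metal {"metal"};
};

// Pick the compute buffer for the backend that will run the graph.
// CUDA and Metal are only considered when compiled in and when layers
// are offloaded (n_gpu_layers > 0); otherwise the CPU buffer is used.
static backend_buffer * pick_compute_buffer(context_buffers & ctx, int n_gpu_layers) {
    (void) n_gpu_layers; // unused when no GPU backend is compiled in
#ifdef GGML_USE_CUDA
    if (n_gpu_layers > 0) {
        return &ctx.buf_compute_cuda;
    }
#endif
#ifdef GGML_USE_METAL
    if (n_gpu_layers > 0) {
        return &ctx.buf_compute_metal;
    }
#endif
    return &ctx.buf_compute_cpu;
}

int main() {
    context_buffers ctx;
    backend_buffer * buf = pick_compute_buffer(ctx, /*n_gpu_layers=*/1);
    std::printf("graph would be built in the '%s' compute buffer\n", buf->name);
    return 0;
}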