From ed960fa1ab91e0b90e57eb72fa4cabadcac405de Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 18 Jul 2023 19:19:59 +0300
Subject: [PATCH] llama : separate compute buffer for metal

---
 llama.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index c234cdf3f..867b3e59f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1616,7 +1616,6 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT(lctx.graph_logits != nullptr);
 
-
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
@@ -2719,11 +2718,17 @@ struct llama_context * llama_new_context_with_model(
 
         // TODO: size the buffers more accurately - depends on improved memory management
         ctx->buf_compute_cpu = ggml_backend_alloc_buffer(&model->backend_cpu, MEM_REQ_EVAL().at(ctx->model.type), 2048);
+
 #ifdef GGML_USE_CUDA
         if (params.n_gpu_layers > 0) {
             ctx->buf_compute_cuda = ggml_backend_alloc_buffer(&model->backend_cuda, MEM_REQ_EVAL().at(ctx->model.type), 2048);
         }
 #endif
+#ifdef GGML_USE_METAL
+        if (params.n_gpu_layers > 0) {
+            ctx->buf_compute_metal = ggml_backend_alloc_buffer(&model->backend_metal, MEM_REQ_EVAL().at(ctx->model.type), 2048);
+        }
+#endif
 
         // initialize the graph input/output buffers
         // input buffer
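
Note (not part of the patch): the change above gives each backend its own compute buffer (buf_compute_cpu, buf_compute_cuda, buf_compute_metal), guarded by GGML_USE_CUDA / GGML_USE_METAL and by whether any layers are offloaded. Below is a minimal standalone C++ sketch of that selection pattern. All types and function names here (backend_buffer, context_buffers, pick_compute_buffer) are hypothetical stand-ins for illustration only, not the real ggml/llama.cpp API; only the field names and the preprocessor guards are taken from the patch.

// Illustrative sketch only: per-backend compute buffers with compile-time
// guards, mirroring the #ifdef structure introduced by the patch.
#include <cstdio>

struct backend_buffer { const char * name; };   // stand-in for a backend compute buffer

struct context_buffers {
    backend_buffer buf_compute_cpu   {"cpu"};
    backend_buffer buf_compute_cuda  {"cuda"};
    backend_buffer buf_compute_metal {"metal"};
};

// Pick the compute buffer for the backend that will run the graph.
// CUDA and Metal are only considered when compiled in and when layers
// are offloaded (n_gpu_layers > 0); otherwise the CPU buffer is used.
static backend_buffer * pick_compute_buffer(context_buffers & ctx, int n_gpu_layers) {
    (void) n_gpu_layers; // unused when no GPU backend is compiled in
#ifdef GGML_USE_CUDA
    if (n_gpu_layers > 0) {
        return &ctx.buf_compute_cuda;
    }
#endif
#ifdef GGML_USE_METAL
    if (n_gpu_layers > 0) {
        return &ctx.buf_compute_metal;
    }
#endif
    return &ctx.buf_compute_cpu;
}

int main() {
    context_buffers ctx;
    backend_buffer * buf = pick_compute_buffer(ctx, /*n_gpu_layers=*/1);
    std::printf("graph would be built in the '%s' compute buffer\n", buf->name);
    return 0;
}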