From da9fc775a3651994f8feb83449aaf9b14f71c3c9 Mon Sep 17 00:00:00 2001
From: slaren
Date: Tue, 26 Dec 2023 01:28:39 +0100
Subject: [PATCH] fix scratch buffer size, re-enable vmm pool for all devices

---
 ggml-cuda.cu | 4 ++--
 llama.cpp    | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index dac4ab201..b19e7ad62 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6741,7 +6741,7 @@ static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
 }
 
 static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
-    if (device == g_main_device && g_device_caps[device].vmm) {
+    if (g_device_caps[device].vmm) {
         return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
     } else {
         return ggml_cuda_pool_malloc_leg(device, size, actual_size);
@@ -6749,7 +6749,7 @@ static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_siz
 }
 
 static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
-    if (device == g_main_device && g_device_caps[device].vmm) {
+    if (g_device_caps[device].vmm) {
         ggml_cuda_pool_free_vmm(device, ptr, size);
     } else {
         ggml_cuda_pool_free_leg(device, ptr, size);
diff --git a/llama.cpp b/llama.cpp
index 0b99f1e03..4aa59c4c0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9519,7 +9519,8 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
 #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
         if (model->n_gpu_layers > 0) {
-            ggml_cuda_set_scratch_size(alloc_size);
+            // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
+            ggml_cuda_set_scratch_size(alloc_size + 64);
             LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
             // calculate total VRAM usage