diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index f9d8bcc57..8584a2033 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -461,7 +462,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -7075,10 +7076,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }
 
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+    // it still won't always work as expected, but it's better than nothing
     if (scratch_size > g_scratch_size) {
         ggml_cuda_free_scratch();
     }
-    g_scratch_size = scratch_size;
+    g_scratch_size = std::max(g_scratch_size, scratch_size);
 }
 
 void ggml_cuda_free_scratch() {
diff --git a/llama.cpp b/llama.cpp
index 64bb20479..435102a74 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1149,15 +1149,6 @@ struct llama_context {
     // key + value cache for the self attention
     struct llama_kv_cache kv_self;
 
-    size_t kv_size() const {
-        size_t result = 2ull;
-        result *= (size_t) model.hparams.n_embd_gqa();
-        result *= (size_t) cparams.n_ctx;
-        result *= (size_t) model.hparams.n_layer;
-        result *= sizeof(ggml_fp16_t);
-        return result;
-    }
-
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
@@ -1235,11 +1226,20 @@ static bool llama_kv_cache_init(
     (void) n_gpu_layers;
 
 #ifdef GGML_USE_CUBLAS
+    size_t vram_kv_cache = 0;
+
     if (n_gpu_layers > n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
+        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.v);
     }
     if (n_gpu_layers > n_layer + 2) {
         ggml_cuda_assign_buffers_no_scratch(cache.k);
+        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.k);
+    }
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv cache = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
     }
 #endif // GGML_USE_CUBLAS
 
@@ -1567,7 +1567,7 @@ struct llama_model_loader {
                         lmlock->grow_to(size_lock);
                     }
                     break;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
                 case GGML_BACKEND_GPU:
                 case GGML_BACKEND_GPU_SPLIT:
                     // old code:
@@ -1968,7 +1968,7 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
@@ -2329,7 +2329,7 @@ static void llm_load_tensors(
     }
 
     (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     {
         ggml_cuda_set_tensor_split(tensor_split);
     }
@@ -6330,30 +6330,6 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
-#ifdef GGML_USE_CUBLAS
-    {
-        size_t vram_kv_cache = 0;
-        if (model->n_gpu_layers > (int) hparams.n_layer + 1) {
-
-            if (params.low_vram) {
-                LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += ctx->kv_size() / 2;
-            }
-        }
-        if (model->n_gpu_layers > (int) hparams.n_layer + 2) {
-            if (params.low_vram) {
-                LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += ctx->kv_size() / 2;
-            }
-        }
-        LLAMA_LOG_INFO("%s: VRAM kv cache = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
-    }
-#endif
-
     // resized during inference
     if (params.logits_all) {
         ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
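
For reference, the new ggml_cuda_set_scratch_size logic is grow-only: a request smaller than the current scratch size leaves it unchanged, while a larger request frees the old buffer so it can be reallocated at the bigger size on next use. Below is a minimal standalone C++ sketch of that policy only; set_scratch_size and free_scratch are hypothetical stand-ins for the CUDA-backed ggml_cuda_set_scratch_size and ggml_cuda_free_scratch, and the print replaces the actual buffer release.

// Minimal sketch of the grow-only scratch size policy from the patch above.
// set_scratch_size() / free_scratch() are hypothetical stand-ins for the
// CUDA-backed ggml_cuda_set_scratch_size() / ggml_cuda_free_scratch().
#include <algorithm>
#include <cstdio>

static size_t g_scratch_size = 0; // disabled by default, as in the patch

static void free_scratch() {
    std::printf("freeing scratch buffer so it can be reallocated larger\n");
}

static void set_scratch_size(size_t scratch_size) {
    if (scratch_size > g_scratch_size) {
        free_scratch(); // current buffer is too small, drop it
    }
    g_scratch_size = std::max(g_scratch_size, scratch_size); // never shrink
}

int main() {
    set_scratch_size(512u*1024*1024); // first context: grows 0 -> 512 MB
    set_scratch_size(256u*1024*1024); // second, smaller context: stays at 512 MB
    std::printf("scratch size = %zu MB\n", g_scratch_size / (1024*1024));
    return 0;
}

This is why the in-code comment calls it a hack: with several models or contexts alive at once, the scratch buffer is simply sized for the largest request seen so far rather than tracked per context.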
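
The removed llama_context::kv_size() helper and the new ggml_nbytes-based accounting in llama_kv_cache_init measure the same quantity: the combined byte size of the K and V caches, 2 * n_embd_gqa * n_ctx * n_layer * sizeof(ggml_fp16_t). The following is a rough standalone sketch of that arithmetic; the hyperparameter values are illustrative 7B-style assumptions, not numbers read from a real model.

// Rough sketch of the kv cache size arithmetic that the removed kv_size()
// helper performed and that the new code derives from ggml_nbytes().
// All hyperparameter values below are illustrative assumptions.
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    const size_t n_embd_gqa = 4096;             // embedding width per kv group (assumed)
    const size_t n_ctx      = 4096;             // context length (assumed)
    const size_t n_layer    = 32;               // transformer layers (assumed)
    const size_t fp16_size  = sizeof(uint16_t); // ggml_fp16_t is a 16-bit half type

    // one cache tensor (k or v) stores n_embd_gqa * n_ctx elements per layer
    const size_t bytes_per_cache = n_embd_gqa * n_ctx * n_layer * fp16_size;

    // factor 2 = k cache + v cache, matching the leading 2ull in kv_size()
    const size_t kv_bytes = 2 * bytes_per_cache;

    std::printf("k or v cache:   %.2f MB\n", bytes_per_cache / 1024.0 / 1024.0);
    std::printf("kv cache total: %.2f MB\n", kv_bytes / 1024.0 / 1024.0);
    return 0;
}

With these assumed numbers each cache is 1024.00 MB, so offloading both would show up as a 2048.00 MB total in the new "VRAM kv cache" log line.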