diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index de0fd447d..5bb8e1332 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2347,6 +2347,15 @@ void ggml_cuda_set_scratch_size(size_t scratch_size) {
     g_scratch_size = scratch_size;
 }
 
+void ggml_cuda_free_scratch() {
+    if (g_scratch_buffer == nullptr) {
+        return;
+    }
+
+    CUDA_CHECK(cudaFree(g_scratch_buffer));
+    g_scratch_buffer = nullptr;
+}
+
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
diff --git a/ggml-cuda.h b/ggml-cuda.h
index e964f9a48..d32b44842 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -31,6 +31,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
+void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
diff --git a/llama.cpp b/llama.cpp
index 36a5facc4..412005286 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -215,6 +215,7 @@ struct llama_model {
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
            ggml_cuda_free_data(tensors_by_name[i].second);
         }
+        ggml_cuda_free_scratch();
 #elif defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
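
For context: the guard-then-null pattern in ggml_cuda_free_scratch() is what makes this teardown path safe to call more than once, and safe when no scratch buffer was ever allocated. cudaFree is only reached when g_scratch_buffer is non-null, and the pointer is nulled immediately afterwards, so a repeated call is a harmless no-op. Below is a minimal standalone sketch of the same pattern; the CUDA_CHECK macro here is a simplified stand-in for llama.cpp's real one, and main() is only illustrative.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Mirrors the global scratch-buffer pointer in ggml-cuda.cu.
    static void * g_scratch_buffer = nullptr;

    // Simplified stand-in for the CUDA_CHECK macro used in ggml-cuda.cu.
    #define CUDA_CHECK(call)                                            \
        do {                                                            \
            cudaError_t err_ = (call);                                  \
            if (err_ != cudaSuccess) {                                  \
                fprintf(stderr, "CUDA error at %s:%d: %s\n",            \
                        __FILE__, __LINE__, cudaGetErrorString(err_));  \
                exit(1);                                                \
            }                                                           \
        } while (0)

    void ggml_cuda_free_scratch(void) {
        if (g_scratch_buffer == nullptr) {
            return; // nothing allocated, or already freed: no-op
        }
        CUDA_CHECK(cudaFree(g_scratch_buffer));
        g_scratch_buffer = nullptr; // prevents a double cudaFree on repeat calls
    }

    int main() {
        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, 1u << 20)); // stand-in scratch alloc
        ggml_cuda_free_scratch(); // frees the buffer
        ggml_cuda_free_scratch(); // second call returns early, no double-free
        return 0;
    }

One detail worth noting: the definition in ggml-cuda.cu uses an empty parameter list while the header declares ggml_cuda_free_scratch(void). In C++ the two are equivalent; the explicit (void) in ggml-cuda.h keeps the prototype unambiguous for C callers, since the declarations there sit inside the extern "C" block closed by the #ifdef __cplusplus visible at the end of that hunk.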