diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index de0fd447d..5bb8e1332 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2347,6 +2347,15 @@ void ggml_cuda_set_scratch_size(size_t scratch_size) {
     g_scratch_size = scratch_size;
 }
 
+void ggml_cuda_free_scratch() {
+    if (g_scratch_buffer == nullptr) {
+        return;
+    }
+
+    CUDA_CHECK(cudaFree(g_scratch_buffer));
+    g_scratch_buffer = nullptr;
+}
+
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
diff --git a/ggml-cuda.h b/ggml-cuda.h
index e964f9a48..d32b44842 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -31,6 +31,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
+void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
diff --git a/llama.cpp b/llama.cpp
index 36a5facc4..412005286 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -215,6 +215,7 @@ struct llama_model {
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
            ggml_cuda_free_data(tensors_by_name[i].second);
         }
+        ggml_cuda_free_scratch();
 #elif defined(GGML_USE_CLBLAST)
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cl_free_data(tensors_by_name[i].second);
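
For context: the guard-then-null pattern in ggml_cuda_free_scratch() is what makes this teardown path safe to call more than once, and safe when no scratch buffer was ever allocated. cudaFree is only reached when g_scratch_buffer is non-null, and the pointer is nulled immediately afterwards, so a repeated call is a harmless no-op. Below is a minimal standalone sketch of the same pattern; the CUDA_CHECK macro here is a simplified stand-in for llama.cpp's real one, and main() is only illustrative.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    // Mirrors the global scratch-buffer pointer in ggml-cuda.cu.
    static void * g_scratch_buffer = nullptr;

    // Simplified stand-in for the CUDA_CHECK macro used in ggml-cuda.cu.
    #define CUDA_CHECK(call)                                            \
        do {                                                            \
            cudaError_t err_ = (call);                                  \
            if (err_ != cudaSuccess) {                                  \
                fprintf(stderr, "CUDA error at %s:%d: %s\n",            \
                        __FILE__, __LINE__, cudaGetErrorString(err_));  \
                exit(1);                                                \
            }                                                           \
        } while (0)

    void ggml_cuda_free_scratch(void) {
        if (g_scratch_buffer == nullptr) {
            return; // nothing allocated, or already freed: no-op
        }
        CUDA_CHECK(cudaFree(g_scratch_buffer));
        g_scratch_buffer = nullptr; // prevents a double cudaFree on repeat calls
    }

    int main() {
        CUDA_CHECK(cudaMalloc(&g_scratch_buffer, 1u << 20)); // stand-in scratch alloc
        ggml_cuda_free_scratch(); // frees the buffer
        ggml_cuda_free_scratch(); // second call returns early, no double-free
        return 0;
    }

One detail worth noting: the definition in ggml-cuda.cu uses an empty parameter list while the header declares ggml_cuda_free_scratch(void). In C++ the two are equivalent; the explicit (void) in ggml-cuda.h keeps the prototype unambiguous for C callers, since the declarations there sit inside the extern "C" block closed by the #ifdef __cplusplus visible at the end of that hunk.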