From b723fe7028d417046f38919dd8bea9a9698f0d5c Mon Sep 17 00:00:00 2001
From: Bach Le
Date: Wed, 12 Jul 2023 22:59:04 +0800
Subject: [PATCH] Track and free temporary ggml_tensor_extra_gpu struct created
 during eval

---
 ggml-cuda.cu | 43 +++++++++++++++++++++++++++++++++++++++++--
 ggml-cuda.h  |  2 ++
 llama.cpp    |  6 ++++++
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 89e69bdc1..2a9aeaafa 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -239,6 +239,11 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
+struct ggml_tensor_extra_gpu_tmp {
+    struct ggml_tensor_extra_gpu main;
+    struct ggml_tensor_extra_gpu_tmp * next;
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -2080,6 +2085,9 @@ struct cuda_buffer {
     size_t size = 0;
 };
 
+// TODO: Make this TLS?
+static struct ggml_tensor_extra_gpu_tmp * g_eval_extras = NULL;
+static bool g_evaluating = false;
 static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
@@ -3260,6 +3268,38 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
+void ggml_cuda_begin_eval() {
+    GGML_ASSERT(!g_evaluating);
+    g_evaluating = true;
+}
+
+struct ggml_tensor_extra_gpu * ggml_cuda_alloc_extra() {
+    if (g_evaluating) {
+        struct ggml_tensor_extra_gpu_tmp * extra = new ggml_tensor_extra_gpu_tmp;
+        memset(extra, 0, sizeof(*extra));
+        extra->next = g_eval_extras;
+        g_eval_extras = extra;
+        return &extra->main;
+    } else {
+        struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
+        return extra;
+    }
+}
+
+void ggml_cuda_end_eval() {
+    GGML_ASSERT(g_evaluating);
+
+    for (struct ggml_tensor_extra_gpu_tmp * i = g_eval_extras; i != nullptr;) {
+        struct ggml_tensor_extra_gpu_tmp * next = i->next;
+        delete i;
+        i = next;
+    }
+
+    g_eval_extras = nullptr;
+    g_evaluating = false;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
@@ -3277,8 +3317,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }
 
     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_extra();
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 3c1e8deb6..ccdf9c73d 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -30,6 +30,8 @@ void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+void ggml_cuda_begin_eval(void);
+void ggml_cuda_end_eval(void);
 
 #ifdef __cplusplus
 }
diff --git a/llama.cpp b/llama.cpp
index 2d09d6ce7..86750fbf6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1379,6 +1379,8 @@ static bool llama_eval_internal(
     offload_func_t offload_func_v = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
+    ggml_cuda_begin_eval();
+
     if (n_gpu_layers > n_layer) {
         offload_func_nr = ggml_cuda_assign_buffers;
     }
@@ -1721,6 +1723,10 @@ static bool llama_eval_internal(
             lctx.get_buf_max_mem(1)/1024.0/1024.0);
 #endif
 
+#ifdef GGML_USE_CUBLAS
+    ggml_cuda_end_eval();
+#endif
+
     ggml_free(ctx0);
 
     // measure the performance only for the single-token evals
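
Usage note (not part of the patch): a minimal sketch of the pairing the new API expects, assuming ggml-cuda.h is on the include path. The wrapper name with_tracked_cuda_extras and the run_eval callback are hypothetical stand-ins for the graph build and compute work that llama_eval_internal performs between the two calls.

// Illustrative sketch only; not part of the diff above.
#include "ggml-cuda.h"

template <typename F>
static void with_tracked_cuda_extras(F && run_eval) {
    ggml_cuda_begin_eval();  // asserts no eval is already in flight; extras allocated
                             // from here on are recorded in the g_eval_extras list
    run_eval();              // graph build + compute; ggml_cuda_assign_buffers() obtains
                             // its extras through ggml_cuda_alloc_extra()
    ggml_cuda_end_eval();    // walks g_eval_extras, deletes every tracked extra,
                             // and clears the tracking state for the next eval
}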