From 17e841ac2209dfae4f61c04f8a2b86515af63638 Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 28 Sep 2023 20:43:04 +0200
Subject: [PATCH] cuda : print total VRAM used

---
 llama.cpp | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index f7e2b8d0c..30c8d0f93 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6773,6 +6773,29 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
             LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+
+            // calculate total VRAM usage
+            auto add_tensor = [](const ggml_tensor * t, size_t & size) {
+                if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
+                    size += ggml_nbytes(t);
+                }
+            };
+            size_t model_vram_size = 0;
+            for (const auto & kv : model->tensors_by_name) {
+                add_tensor(kv.second, model_vram_size);
+            }
+
+            size_t kv_vram_size = 0;
+            add_tensor(ctx->kv_self.k, kv_vram_size);
+            add_tensor(ctx->kv_self.v, kv_vram_size);
+
+            size_t ctx_vram_size = alloc_size + kv_vram_size;
+            size_t total_vram_size = model_vram_size + ctx_vram_size;
+
+            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+                    total_vram_size / 1024.0 / 1024.0,
+                    model_vram_size / 1024.0 / 1024.0,
+                    ctx_vram_size / 1024.0 / 1024.0);
 #endif
         }