diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index f9d8bcc57..8584a2033 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -461,7 +462,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 static bool g_mul_mat_q = true;
 
 static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -7075,10 +7076,12 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }
 
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+    // it still won't always work as expected, but it's better than nothing
     if (scratch_size > g_scratch_size) {
         ggml_cuda_free_scratch();
     }
-    g_scratch_size = scratch_size;
+    g_scratch_size = std::max(g_scratch_size, scratch_size);
 }
 
 void ggml_cuda_free_scratch() {
diff --git a/llama.cpp b/llama.cpp
index 64bb20479..435102a74 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1149,15 +1149,6 @@ struct llama_context {
     // key + value cache for the self attention
     struct llama_kv_cache kv_self;
 
-    size_t kv_size() const {
-        size_t result = 2ull;
-        result *= (size_t) model.hparams.n_embd_gqa();
-        result *= (size_t) cparams.n_ctx;
-        result *= (size_t) model.hparams.n_layer;
-        result *= sizeof(ggml_fp16_t);
-        return result;
-    }
-
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
@@ -1235,11 +1226,20 @@ static bool llama_kv_cache_init(
     (void) n_gpu_layers;
 
 #ifdef GGML_USE_CUBLAS
+    size_t vram_kv_cache = 0;
+
     if (n_gpu_layers > n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
+        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.v);
     }
     if (n_gpu_layers > n_layer + 2) {
         ggml_cuda_assign_buffers_no_scratch(cache.k);
+        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+        vram_kv_cache += ggml_nbytes(cache.k);
+    }
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv cache = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
     }
 #endif // GGML_USE_CUBLAS
 
@@ -1567,7 +1567,7 @@ struct llama_model_loader {
                         lmlock->grow_to(size_lock);
                     }
                     break;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
                 case GGML_BACKEND_GPU:
                 case GGML_BACKEND_GPU_SPLIT:
                     // old code:
@@ -1968,7 +1968,7 @@ static void llm_load_tensors(
     }
 
     (void) main_gpu;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
@@ -2329,7 +2329,7 @@ static void llm_load_tensors(
     }
 
     (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#ifdef GGML_USE_CUBLAS
     {
         ggml_cuda_set_tensor_split(tensor_split);
     }
@@ -6330,30 +6330,6 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
-#ifdef GGML_USE_CUBLAS
-    {
-        size_t vram_kv_cache = 0;
-        if (model->n_gpu_layers > (int) hparams.n_layer + 1) {
-
-            if (params.low_vram) {
-                LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += ctx->kv_size() / 2;
-            }
-        }
-        if (model->n_gpu_layers > (int) hparams.n_layer + 2) {
-            if (params.low_vram) {
-                LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
-            } else {
-                LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += ctx->kv_size() / 2;
-            }
-        }
-        LLAMA_LOG_INFO("%s: VRAM kv cache = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
-    }
-#endif
-
     // resized during inference
     if (params.logits_all) {
         ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
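
For reference, the new ggml_cuda_set_scratch_size logic is grow-only: a request smaller than the current scratch size leaves it unchanged, while a larger request frees the old buffer so it can be reallocated at the bigger size on next use. Below is a minimal standalone C++ sketch of that policy only; set_scratch_size and free_scratch are hypothetical stand-ins for the CUDA-backed ggml_cuda_set_scratch_size and ggml_cuda_free_scratch, and the print replaces the actual buffer release.

// Minimal sketch of the grow-only scratch size policy from the patch above.
// set_scratch_size() / free_scratch() are hypothetical stand-ins for the
// CUDA-backed ggml_cuda_set_scratch_size() / ggml_cuda_free_scratch().
#include <algorithm>
#include <cstdio>

static size_t g_scratch_size = 0; // disabled by default, as in the patch

static void free_scratch() {
    std::printf("freeing scratch buffer so it can be reallocated larger\n");
}

static void set_scratch_size(size_t scratch_size) {
    if (scratch_size > g_scratch_size) {
        free_scratch(); // current buffer is too small, drop it
    }
    g_scratch_size = std::max(g_scratch_size, scratch_size); // never shrink
}

int main() {
    set_scratch_size(512u*1024*1024); // first context: grows 0 -> 512 MB
    set_scratch_size(256u*1024*1024); // second, smaller context: stays at 512 MB
    std::printf("scratch size = %zu MB\n", g_scratch_size / (1024*1024));
    return 0;
}

This is why the in-code comment calls it a hack: with several models or contexts alive at once, the scratch buffer is simply sized for the largest request seen so far rather than tracked per context.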
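
The removed llama_context::kv_size() helper and the new ggml_nbytes-based accounting in llama_kv_cache_init measure the same quantity: the combined byte size of the K and V caches, 2 * n_embd_gqa * n_ctx * n_layer * sizeof(ggml_fp16_t). The following is a rough standalone sketch of that arithmetic; the hyperparameter values are illustrative 7B-style assumptions, not numbers read from a real model.

// Rough sketch of the kv cache size arithmetic that the removed kv_size()
// helper performed and that the new code derives from ggml_nbytes().
// All hyperparameter values below are illustrative assumptions.
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    const size_t n_embd_gqa = 4096;             // embedding width per kv group (assumed)
    const size_t n_ctx      = 4096;             // context length (assumed)
    const size_t n_layer    = 32;               // transformer layers (assumed)
    const size_t fp16_size  = sizeof(uint16_t); // ggml_fp16_t is a 16-bit half type

    // one cache tensor (k or v) stores n_embd_gqa * n_ctx elements per layer
    const size_t bytes_per_cache = n_embd_gqa * n_ctx * n_layer * fp16_size;

    // factor 2 = k cache + v cache, matching the leading 2ull in kv_size()
    const size_t kv_bytes = 2 * bytes_per_cache;

    std::printf("k or v cache:   %.2f MB\n", bytes_per_cache / 1024.0 / 1024.0);
    std::printf("kv cache total: %.2f MB\n", kv_bytes / 1024.0 / 1024.0);
    return 0;
}

With these assumed numbers each cache is 1024.00 MB, so offloading both would show up as a 2048.00 MB total in the new "VRAM kv cache" log line.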