From 695c15e1741c5446d8b94a3d5a7137854ff9bdc7 Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 19 Jun 2023 23:41:59 +0200
Subject: [PATCH] Bugfix for --ngl on low vram not working correctly

---
 libfalcon.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libfalcon.cpp b/libfalcon.cpp
index 72ac5b3c8..8e3a4206d 100644
--- a/libfalcon.cpp
+++ b/libfalcon.cpp
@@ -1251,7 +1251,7 @@ if (n_gpu_layers > 0)
         const int i_gpu_start = n_layer - n_gpu_layers;
         int i_gpu_last = n_layer; // allows to terminate the offloading earlier. TODO: instead do a proper calculation run and determine the start before the loop
         model.i_gpu_start = i_gpu_start;
-        model.i_gpu_last = i_gpu_last;
+        model.i_gpu_last = i_gpu_last; // if VRAM doesn't run out i_gpu_last is always the last layer

         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
@@ -1287,15 +1287,14 @@ if (n_gpu_layers > 0)
                 vram_layer = calculate_layer_vram_bytes(layer);
                 vram_weights += vram_layer;
                 vram_free = (vram_layer > vram_free) ? 0 : vram_free - vram_layer; // simulate the layer being loaded in VRAM
-                // test if we have enough VRAM to load the next layer
+                // test if we have enough VRAM to offload the next layer
                 if (i < n_layer && vram_free <= (vram_overhead+vram_scratch+vram_reserved+vram_layer))
                 {
-                    // this needs some polishing (instead of fiddling with --ngl I'd like the option to auto-fill the vram with as many layers as possible as an alternative)
                     fprintf(stderr, "INFO: Not enough VRAM to load all requested layers - at layer %d of %d: skipping\n", i, n_layer);
-                    n_gpu_layers = i+1;
                     model.n_gpu_layers = n_gpu_layers;
                     i_gpu_last = i;
                     model.i_gpu_last = i_gpu_last;
+                    n_gpu_layers = i_gpu_last - i_gpu_start;
                 }
             }

@@ -1482,6 +1481,7 @@ static bool falcon_eval_internal(
     offload_func_t offload_func_kqv = llama_nop;

 #ifdef GGML_USE_CUBLAS
+    // todo: use either a flag in model/params or a backend test to determine if norm/output are on GPU
     if (n_gpu_layers > n_layer) {
         offload_func_nr = ggml_cuda_assign_buffers;
     }
@@ -1700,7 +1700,7 @@ static bool falcon_eval_internal(
         offload_func_t offload_func = llama_nop;

 #ifdef GGML_USE_CUBLAS
-        if (n_gpu_layers > n_layer) {
+        if (n_gpu_layers > 0 && n_layer >= i_gpu_start && n_layer <= i_gpu_last) {
             offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
         }
 #endif // GGML_USE_CUBLAS
@@ -2726,8 +2726,8 @@ struct falcon_context * falcon_init_from_file(
     }
     // model_load_internal() may change this if VRAM runs out
     params.n_gpu_layers = ctx->model.n_gpu_layers;
-    params.i_gpu_start = ctx->model.i_gpu_start;
-    params.i_gpu_last = ctx->model.i_gpu_last;
+    params.i_gpu_start = ctx->model.i_gpu_start; // first layer that's GPU accelerated
+    params.i_gpu_last = ctx->model.i_gpu_last; // last layer that's GPU accelerated

     // reserve memory for context buffers
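
For reference, a minimal standalone sketch of the corrected offload accounting. This is not part of the patch: plan_offload(), vram_costs and reserve are hypothetical stand-ins for the real loader's per-layer calculate_layer_vram_bytes(layer) cost and the vram_overhead + vram_scratch + vram_reserved budget; it only mirrors how the patched loop derives n_gpu_layers from the i_gpu_start/i_gpu_last range instead of the raw loop index.

// Standalone sketch (not from the patch) of the corrected VRAM-fill accounting.
#include <cstddef>
#include <cstdio>
#include <vector>

struct gpu_split {
    int i_gpu_start;  // first layer that's GPU accelerated
    int i_gpu_last;   // layer index where offloading stopped (n_layer if VRAM never ran out)
    int n_gpu_layers; // number of layers actually offloaded
};

static gpu_split plan_offload(const std::vector<size_t> & vram_costs, // per-layer VRAM cost
                              size_t vram_free,                       // free VRAM at load time
                              size_t reserve,                         // overhead + scratch + reserved
                              int    n_gpu_layers) {                  // requested via --ngl
    const int n_layer     = (int) vram_costs.size();
    const int i_gpu_start = n_layer - n_gpu_layers;
    int       i_gpu_last  = n_layer; // stays at n_layer if VRAM doesn't run out

    for (int i = 0; i < n_layer; ++i) {
        if (i < i_gpu_start || i > i_gpu_last) {
            continue; // layer stays on the CPU
        }
        const size_t vram_layer = vram_costs[i];
        vram_free = (vram_layer > vram_free) ? 0 : vram_free - vram_layer; // simulate loading the layer

        // same test as the patch: is there room left to offload the next layer?
        if (vram_free <= reserve + vram_layer) {
            fprintf(stderr, "INFO: not enough VRAM for all requested layers - stopping at layer %d of %d\n", i, n_layer);
            i_gpu_last   = i;
            n_gpu_layers = i_gpu_last - i_gpu_start; // the fix: count from the offload range,
                                                     // not the old "i + 1" loop index
        }
    }
    return { i_gpu_start, i_gpu_last, n_gpu_layers };
}

With the old "i + 1" formula, a run that filled up at loop index i could report more GPU layers than were actually offloaded; the range-based count keeps n_gpu_layers consistent with i_gpu_start/i_gpu_last, which the new range check in falcon_eval_internal() relies on.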