diff --git a/libfalcon.cpp b/libfalcon.cpp
index 2d3039136..2c62d65f2 100644
--- a/libfalcon.cpp
+++ b/libfalcon.cpp
@@ -868,8 +868,10 @@ static bool kv_cache_init(
         const struct falcon_hparams & hparams,
              struct falcon_kv_cache & cache,
                             ggml_type   wtype,
-                                  int   n_ctx) {
+                                  int   n_ctx,
+                                  int   n_gpu_layers) {
+    const int64_t n_layer    = hparams.n_layer;
     const int64_t head_dim   = hparams.n_embd / hparams.n_head;
     const int64_t n_elements = hparams.n_layer * n_ctx * head_dim * hparams.n_head_kv;
@@ -893,6 +895,14 @@ static bool kv_cache_init(
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");
 
+    (void) n_gpu_layers;
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer + 1) {
+        ggml_cuda_assign_buffers_no_scratch(cache.k);
+        ggml_cuda_assign_buffers_no_scratch(cache.v);
+    }
+#endif // GGML_USE_CUBLAS
+
     return true;
 }
 
@@ -1391,18 +1401,14 @@ static bool falcon_eval_internal(
     // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
     // in that case ggml_cuda_assign_buffers has no effect
     offload_func_t offload_func_nr  = llama_nop; // nr = non-repeating
-    offload_func_t offload_func_kq  = llama_nop;
-    offload_func_t offload_func_v   = llama_nop;
+    offload_func_t offload_func_kqv = llama_nop;
 #ifdef GGML_USE_CUBLAS
     if (n_gpu_layers > n_layer) {
         offload_func_nr = ggml_cuda_assign_buffers;
     }
     if (n_gpu_layers > n_layer + 1) {
-        offload_func_v  = ggml_cuda_assign_buffers;
-    }
-    if (n_gpu_layers > n_layer + 2) {
-        offload_func_kq = ggml_cuda_assign_buffers;
+        offload_func_kqv = ggml_cuda_assign_buffers;
     }
 #endif // GGML_USE_CUBLAS
@@ -2622,7 +2628,7 @@ struct falcon_context * falcon_init_from_file(
 
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
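
For reference, the conditionals in the patch treat `n_gpu_layers` values beyond `n_layer` as requests to offload extra non-layer tensors: one extra layer (`> n_layer`) enables the non-repeating output tensors, and two extra (`> n_layer + 1`) additionally enables the KV cache and the combined KQV offload function. The following is a minimal standalone sketch of that threshold logic, not code from the repository; `pick_offload_funcs`, `cuda_assign_buffers_stub`, and the `main` driver are placeholders standing in for the real ggml/falcon symbols.

```cpp
#include <cstdio>
#include <initializer_list>

// Placeholder types/functions standing in for the real ggml/falcon API.
struct ggml_tensor;
typedef void (*offload_func_t)(struct ggml_tensor *);
static void llama_nop(struct ggml_tensor *) {}                 // no offload: tensor stays on the host
static void cuda_assign_buffers_stub(struct ggml_tensor *) {}  // stand-in for ggml_cuda_assign_buffers

// Mirrors the thresholds in the patched falcon_eval_internal:
//   n_gpu_layers > n_layer      -> also offload the non-repeating tensors
//   n_gpu_layers > n_layer + 1  -> also offload the KV cache and KQV tensors
static void pick_offload_funcs(int n_gpu_layers, int n_layer,
                               offload_func_t & offload_func_nr,
                               offload_func_t & offload_func_kqv) {
    offload_func_nr  = llama_nop;
    offload_func_kqv = llama_nop;
    if (n_gpu_layers > n_layer) {
        offload_func_nr  = cuda_assign_buffers_stub;  // non-repeating tensors
    }
    if (n_gpu_layers > n_layer + 1) {
        offload_func_kqv = cuda_assign_buffers_stub;  // KV cache + KQV tensors
    }
}

int main() {
    const int n_layer = 60; // e.g. Falcon-40B has 60 transformer layers
    for (int ngl : {60, 61, 62}) {
        offload_func_t nr  = nullptr;
        offload_func_t kqv = nullptr;
        pick_offload_funcs(ngl, n_layer, nr, kqv);
        std::printf("-ngl %2d: non-repeating on GPU: %s, KV cache/KQV on GPU: %s\n",
                    ngl,
                    nr  != llama_nop ? "yes" : "no",
                    kqv != llama_nop ? "yes" : "no");
    }
    return 0;
}
```

This is also why `kv_cache_init` now takes `n_gpu_layers`: the cache tensors are only assigned CUDA buffers at the same `n_layer + 1` threshold that activates the KQV offload path.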