diff --git a/llama.cpp b/llama.cpp
index c79005265..b2a6369e6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1155,19 +1155,26 @@ static void llama_model_load_internal(
             ggml_backend backend_norm;
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
-                backend_norm = LLAMA_BACKEND_OFFLOAD;
+                // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
                 backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
                 backend_norm = GGML_BACKEND_CPU;
                 backend_output = GGML_BACKEND_CPU;
             }
 
-            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying:
             model.norm   = ml->get_tensor("norm.weight",   {n_embd},          backend_norm);
             model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
-            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
-                LLAMA_ASSERT(backend_norm == GGML_BACKEND_GPU);
+            if (backend_norm == GGML_BACKEND_GPU) {
                 vram_weights += ggml_nbytes(model.norm);
+            }
+            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
                 vram_weights += ggml_nbytes(model.output);
             }
         }
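
For reference, here is a minimal standalone sketch of the norm-backend decision introduced above. The enum values and the `pick_norm_backend` helper are hypothetical stand-ins for illustration only (not the real `ggml_backend` / `LLAMA_BACKEND_OFFLOAD` types); the `low_vram` flag and the `n_layer + 2` threshold are taken from the diff.

```cpp
// Hypothetical, self-contained sketch -- not llama.cpp code.
#include <cstdio>

enum backend_sketch { BACKEND_CPU, BACKEND_GPU };

// Mirrors the branch added in the diff: the norm stays in VRAM only when all
// repeating layers are offloaded, low_vram is off, and (on Windows) the extra
// non-repeating tensors are offloaded as well, i.e. n_gpu_layers > n_layer + 2.
static backend_sketch pick_norm_backend(int n_gpu_layers, int n_layer,
                                        bool low_vram, bool is_windows) {
    if (n_gpu_layers <= n_layer) {
        return BACKEND_CPU;               // output block is not offloaded at all
    }
    if (low_vram) {
        return BACKEND_CPU;               // user asked to save VRAM
    }
    if (is_windows && n_gpu_layers <= n_layer + 2) {
        return BACKEND_CPU;               // on Windows: only if everything is on the GPU
    }
    return BACKEND_GPU;
}

int main() {
    const int n_layer = 32;
    const int tests[] = {32, 33, 34, 35};
    for (int n_gpu_layers : tests) {
        std::printf("n_gpu_layers=%2d  linux=%s  windows=%s\n", n_gpu_layers,
                    pick_norm_backend(n_gpu_layers, n_layer, false, false) == BACKEND_GPU ? "GPU" : "CPU",
                    pick_norm_backend(n_gpu_layers, n_layer, false, true)  == BACKEND_GPU ? "GPU" : "CPU");
    }
    return 0;
}
```

With a hypothetical 32-layer model, the sketch keeps the norm on the CPU on Windows for 33 or 34 offloaded layers and only moves it to VRAM at 35, whereas on other platforms 33 layers are already enough, which is the asymmetry the `#ifndef _WIN32` branch encodes.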