add overlooked offload code ggml-ci

2023-10-06 12:39:27 -07:00 · 2023-10-06 12:39:27 -07:00 · 485a471e93
commit 485a471e93
parent 0c1a8f67a5
1 changed files with 33 additions and 6 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -2587,9 +2587,38 @@ static void llm_load_tensors(
            case LLM_ARCH_PERSIMMON:
                {
                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"),  {n_embd, n_vocab}, GGML_BACKEND_CPU);
-                    model.output_norm    = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          GGML_BACKEND_CPU);
+
-                    model.output_norm_b  = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          GGML_BACKEND_CPU);
+                    {
-                    model.output         = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                        ggml_backend backend_norm;
                        ggml_backend backend_output;
                        if (n_gpu_layers > int(n_layer)) {
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
                            backend_norm = LLAMA_BACKEND_OFFLOAD;
 #else
                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
                        }
                        model.output_norm    = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
                        model.output_norm_b  = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
                        model.output         = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
                        if (backend_norm == GGML_BACKEND_GPU) {
                            vram_weights += ggml_nbytes(model.output_norm);
                            vram_weights += ggml_nbytes(model.output_norm_b);
                        }
                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
                            vram_weights += ggml_nbytes(model.output);
                        }
                    }
                    const uint32_t n_ff = hparams.n_ff;
                    const int i_gpu_start = n_layer - n_gpu_layers;
@ -3549,11 +3578,9 @@ static struct ggml_cgraph * llm_build_refact(
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_compute.size,
        /*.mem_buffer =*/ buf_compute.data,
-        /*.no_alloc   =*/ false,
+        /*.no_alloc   =*/ true,
    };
    params.no_alloc = true;
    struct ggml_context * ctx0 = ggml_init(params);
    ggml_cgraph * gf = ggml_new_graph(ctx0);