llama.cpp: Include the changes from #6122 to skip computing the outputs of unused tokens in the last layer.

root 2024-03-27 04:22:09 +00:00
parent 3c0b830808
commit e4a16f2493


@@ -6525,6 +6525,13 @@ struct llm_build_context {
                 cb(cur, "kqv_out", il);
             }
 
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
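For context, the added block is the mechanism from #6122: on the final layer, build_inp_out_ids() supplies an I32 tensor holding the indices of the tokens whose outputs will actually be read (often just the last token during generation), and ggml_get_rows shrinks both the attention output and the residual stream to those rows before the FFN, so the rest of the layer does proportionally less work. Below is a minimal standalone sketch of that row-selection primitive, not llama.cpp code; it assumes a local ggml checkout and uses toy sizes (4 "token" rows of width 3, keeping only row 3):

#include <stdint.h>
#include <stdio.h>

#include "ggml.h"

int main(void) {
    // small CPU-side context; no_alloc = false so tensor data is writable
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // stand-in for per-token hidden states: 4 rows (tokens) of width 3
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 4);
    for (int i = 0; i < 3 * 4; ++i) {
        ((float *) cur->data)[i] = (float) i;
    }

    // stand-in for inp_out_ids: keep only the row for token 3
    struct ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
    ((int32_t *) ids->data)[0] = 3;

    // the same call the diff adds: gather only the rows that are needed
    struct ggml_tensor * out = ggml_get_rows(ctx, cur, ids);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    // prints "9 10 11": the one surviving row
    for (int i = 0; i < 3; ++i) {
        printf("%g ", ((float *) out->data)[i]);
    }
    printf("\n");

    ggml_free(ctx);
    return 0;
}

Everything downstream of the ggml_get_rows call sees a 1-row tensor instead of a 4-row one, which is why the diff applies it to both cur and inpSA before ffn_inp is formed: every later matmul in the last layer then runs over only the tokens whose outputs are kept.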