llama: dbrx: no attention output layer

Pierrick HYMBERT 2024-04-06 22:22:57 +02:00
parent 76f266beef
commit 9c7dedb0f3


@@ -7165,7 +7165,7 @@ struct llm_build_context {
     cb(Vcur, "Vcur", il);
     cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-            model.layers[il].wo, model.layers[il].bo,
+            NULL, NULL,
             Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
 }
@@ -7182,6 +7182,7 @@ struct llm_build_context {
     // feed-forward network
     // MoE branch
     {
+        // FIXME REVIEW: I do not see this op in https://huggingface.co/databricks/dbrx-instruct/blob/464e701f50aef4c1b59c81fb5667819a5d08e108/modeling_dbrx.py#L727
         cur = llm_build_norm(ctx0, ffn_inp, hparams,
                 NULL, NULL,
@@ -7212,7 +7213,6 @@ struct llm_build_context {
// compute expert outputs
ggml_tensor * moe_out = nullptr;
for (int i = 0; i < n_expert_used; ++i) {
ggml_tensor * cur_expert;
@@ -7244,6 +7244,7 @@ struct llm_build_context {
}
}
cur = moe_out;
}
// DbrxNormAttentionNorm
{
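
For context, a minimal sketch (not part of the commit) of how the attention call in the first hunk reads after this change. It assumes the surrounding graph-build variables of llm_build_context (ctx0, model, hparams, kv_self, gf, Qcur, Kcur, Vcur, KQ_mask, cb, il, ...) are in scope exactly as shown above; passing NULL in the wo/bo slots means llm_build_kv is handed no attention output projection or bias for this layer:

    // sketch only: the llm_build_kv call after this commit
    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
            NULL, NULL,   // was: model.layers[il].wo, model.layers[il].bo
            Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);

The FIXME added in the second hunk tracks a related open question for review: whether the pre-MoE norm built there has a counterpart in the referenced upstream modeling_dbrx.py.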