llama : change naming to prefer "_enc" suffix

Georgi Gerganov 2024-07-04 11:24:13 +03:00
parent d40c9a1d3a
commit 03ab5dd67c

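The rename below follows one pattern: members specific to the encoder or to cross-attention drop their `enc_`/`cross_` prefix and instead take an `_enc`/`_cross` suffix on the shared base name (e.g. `enc_wq` becomes `wq_enc`, `cross_attn_norm` becomes `attn_norm_cross`); the related graph-input/helper names (`inp_embd_enc`, `inp_KQ_mask_cross`, `llm_build_pos_bucket`, `llm_build_pos_bias`) are adjusted the same way. A minimal sketch of the convention, using a trimmed-down hypothetical struct rather than the real `llama_layer`:

    // illustrative only: the "_enc"/"_cross" suffix convention this commit adopts
    struct ggml_tensor;                         // opaque ggml tensor type

    struct example_layer {
        // decoder self-attention (base names, unchanged)
        struct ggml_tensor * wq;
        struct ggml_tensor * attn_rel_b;

        // encoder counterparts: "_enc" suffix instead of "enc_" prefix
        struct ggml_tensor * wq_enc;            // was: enc_wq
        struct ggml_tensor * attn_rel_b_enc;    // was: enc_rel_attn_b

        // cross-attention counterparts: "_cross" suffix instead of "cross_" prefix
        struct ggml_tensor * wq_cross;          // was: cross_wq
        struct ggml_tensor * attn_rel_b_cross;  // was: cross_rel_attn_b
    };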

@@ -2265,6 +2265,7 @@ struct llama_cparams {
void * cb_eval_user_data;
};
+// TODO: separate into "llama_layer_enc" and "llama_layer_dec"
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm;
@@ -2282,8 +2283,8 @@ struct llama_layer {
struct ggml_tensor * attn_sub_norm;
struct ggml_tensor * attn_post_norm;
struct ggml_tensor * ffn_sub_norm;
-struct ggml_tensor * cross_attn_norm;
-struct ggml_tensor * enc_attn_norm;
+struct ggml_tensor * attn_norm_cross;
+struct ggml_tensor * attn_norm_enc;
// attention
struct ggml_tensor * wq;
@@ -2295,14 +2296,14 @@ struct llama_layer {
struct ggml_tensor * wq_b;
struct ggml_tensor * wkv_a_mqa;
struct ggml_tensor * wkv_b;
-struct ggml_tensor * cross_wq;
-struct ggml_tensor * cross_wk;
-struct ggml_tensor * cross_wv;
-struct ggml_tensor * cross_wo;
-struct ggml_tensor * enc_wq;
-struct ggml_tensor * enc_wk;
-struct ggml_tensor * enc_wv;
-struct ggml_tensor * enc_wo;
+struct ggml_tensor * wq_cross;
+struct ggml_tensor * wk_cross;
+struct ggml_tensor * wv_cross;
+struct ggml_tensor * wo_cross;
+struct ggml_tensor * wq_enc;
+struct ggml_tensor * wk_enc;
+struct ggml_tensor * wv_enc;
+struct ggml_tensor * wo_enc;
// attention bias
struct ggml_tensor * bq;
@@ -2312,9 +2313,9 @@ struct llama_layer {
struct ggml_tensor * bqkv;
// relative position bias
-struct ggml_tensor * rel_attn_b;
-struct ggml_tensor * enc_rel_attn_b;
-struct ggml_tensor * cross_rel_attn_b;
+struct ggml_tensor * attn_rel_b;
+struct ggml_tensor * attn_rel_b_enc;
+struct ggml_tensor * attn_rel_b_cross;
// normalization
struct ggml_tensor * ffn_norm;
@@ -2323,15 +2324,15 @@ struct llama_layer {
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps;
-struct ggml_tensor * enc_ffn_norm;
+struct ggml_tensor * ffn_norm_enc;
// ff
struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * ffn_up; // w3
-struct ggml_tensor * enc_ffn_gate;
-struct ggml_tensor * enc_ffn_down;
-struct ggml_tensor * enc_ffn_up;
+struct ggml_tensor * ffn_gate_enc;
+struct ggml_tensor * ffn_down_enc;
+struct ggml_tensor * ffn_up_enc;
// ff MoE
struct ggml_tensor * ffn_gate_inp;
@@ -2565,7 +2566,7 @@ struct llama_model {
struct ggml_tensor * output_norm_b;
struct ggml_tensor * output;
struct ggml_tensor * output_b;
-struct ggml_tensor * enc_output_norm;
+struct ggml_tensor * output_norm_enc;
std::vector<llama_layer> layers;
@@ -2721,8 +2722,8 @@ struct llama_context {
struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
-struct ggml_tensor * inp_enc_output; // F32 [n_embd, n_enc_outputs]
-struct ggml_tensor * inp_cross_KQ_mask; // F32 [n_enc_outputs, n_batch]
+struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
+struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
// control vectors
struct llama_control_vector cvec;
@@ -7080,8 +7081,9 @@ static bool llm_load_tensors(
// output
{
-model.enc_output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
@@ -7095,35 +7097,35 @@ static bool llm_load_tensors(
auto & layer = model.layers[i];
-layer.enc_attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd});
-layer.enc_rel_attn_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {hparams.n_head, hparams.n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {hparams.n_head, hparams.n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
-layer.enc_wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
-layer.enc_wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
-layer.enc_wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
-layer.enc_wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
-layer.enc_ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
-layer.enc_ffn_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-layer.enc_ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd});
-layer.enc_ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff});
+layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff});
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd});
-layer.rel_attn_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {hparams.n_head, hparams.n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_rel_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {hparams.n_head, hparams.n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
-layer.cross_attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_cross = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd});
// this tensor seems to be unused in HF transformers implementation
-layer.cross_rel_attn_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {hparams.n_head, hparams.n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_rel_b_cross = ml.create_tensor(ctx_input, tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {hparams.n_head, hparams.n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
-layer.cross_wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
-layer.cross_wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
-layer.cross_wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
-layer.cross_wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+layer.wq_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+layer.wk_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+layer.wv_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+layer.wo_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd});
layer.ffn_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -7922,7 +7924,7 @@ struct llm_build_context {
const int32_t n_tokens;
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
const int32_t n_outputs;
-const int32_t n_enc_outputs;
+const int32_t n_outputs_enc;
const int32_t kv_head; // index of where we store new KV data in the cache
const int32_t n_ctx_orig;
@@ -7972,7 +7974,7 @@ struct llm_build_context {
n_tokens (batch.n_tokens),
n_kv (worst_case ? kv_self.size : kv_self.n),
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
-n_enc_outputs (worst_case ? n_tokens : lctx.encoder_output.size() / hparams.n_embd),
+n_outputs_enc (worst_case ? n_tokens : lctx.encoder_output.size() / hparams.n_embd),
kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
n_ctx_orig (cparams.n_ctx_orig_yarn),
flash_attn (cparams.flash_attn),
@@ -8005,8 +8007,8 @@ struct llm_build_context {
lctx.inp_s_mask = nullptr;
lctx.inp_s_seq = nullptr;
lctx.inp_pos_bucket = nullptr;
-lctx.inp_enc_output = nullptr;
-lctx.inp_cross_KQ_mask = nullptr;
+lctx.inp_embd_enc = nullptr;
+lctx.inp_KQ_mask_cross = nullptr;
}
void free() {
@@ -8259,7 +8261,7 @@ struct llm_build_context {
return gf;
}
-struct ggml_tensor * llm_build_inp_rel_pos_bucket(bool causal) {
+struct ggml_tensor * llm_build_pos_bucket(bool causal) {
if (causal) {
lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
} else {
@@ -8272,11 +8274,11 @@ struct llm_build_context {
return lctx.inp_pos_bucket;
}
-struct ggml_tensor * llm_build_rel_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * rel_attn_b) {
+struct ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) {
struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0);
cb(pos_bucket_1d, "pos_bucket_1d", -1);
-struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, rel_attn_b, pos_bucket_1d);
+struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d);
cb(pos_bias, "pos_bias", -1);
pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0);
@@ -8291,19 +8293,19 @@ struct llm_build_context {
return pos_bias;
}
-struct ggml_tensor * llm_build_inp_enc_output() {
+struct ggml_tensor * llm_build_inp_embd_enc() {
const int64_t n_embd = hparams.n_embd;
-lctx.inp_enc_output = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc_outputs);
-ggml_set_input(lctx.inp_enc_output);
-cb(lctx.inp_enc_output, "enc_output", -1);
-return lctx.inp_enc_output;
+lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc);
+ggml_set_input(lctx.inp_embd_enc);
+cb(lctx.inp_embd_enc, "embd_enc", -1);
+return lctx.inp_embd_enc;
}
-struct ggml_tensor * llm_build_inp_cross_KQ_mask() {
-lctx.inp_cross_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_enc_outputs, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-ggml_set_input(lctx.inp_cross_KQ_mask);
-cb(lctx.inp_cross_KQ_mask, "enc_mask", -1);
-return lctx.inp_cross_KQ_mask;
+struct ggml_tensor * llm_build_inp_KQ_mask_cross() {
+lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+ggml_set_input(lctx.inp_KQ_mask_cross);
+cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1);
+return lctx.inp_KQ_mask_cross;
}
struct ggml_cgraph * build_llama() {
@@ -12629,29 +12631,29 @@ struct llm_build_context {
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
if (lctx.is_encoding) {
-struct ggml_tensor * enc_pos_buckets = llm_build_inp_rel_pos_bucket(false);
+struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * enc_KQ_mask = build_inp_KQ_mask(false);
+struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
-model.layers[il].enc_attn_norm, NULL,
+model.layers[il].attn_norm_enc, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].enc_wq, cur);
+struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur);
cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].enc_wk, cur);
+struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur);
cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].enc_wv, cur);
+struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -12663,12 +12665,12 @@ struct llm_build_context {
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
cb(kq, "kq", il);
-struct ggml_tensor * rel_attn_b = model.layers[il].enc_rel_attn_b ? model.layers[il].enc_rel_attn_b : model.layers[0].enc_rel_attn_b;
-struct ggml_tensor * pos_bias = llm_build_rel_pos_bias(enc_pos_buckets, rel_attn_b);
+struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
+struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
cb(kq_b, "kq_b", il);
-kq = ggml_soft_max_ext(ctx0, kq_b, enc_KQ_mask, 1.0f, hparams.f_max_alibi_bias);
+kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -12685,7 +12687,7 @@ struct llm_build_context {
ggml_build_forward_expand(gf, cur);
-cur = ggml_mul_mat(ctx0, model.layers[il].enc_wo, cur);
+cur = ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur);
cb(cur, "kqv_out", il);
}
@@ -12703,18 +12705,18 @@ struct llm_build_context {
// feed-forward network
{
cur = llm_build_norm(ctx0, ffn_inp, hparams,
-model.layers[il].enc_ffn_norm, NULL,
+model.layers[il].ffn_norm_enc, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
// T5 uses relu, flan-T5 uses gelu-gated
cur = llm_build_ffn(ctx0, cur,
-model.layers[il].enc_ffn_up, NULL, NULL,
-model.layers[il].enc_ffn_gate, NULL, NULL,
-model.layers[il].enc_ffn_down, NULL, NULL,
+model.layers[il].ffn_up_enc, NULL, NULL,
+model.layers[il].ffn_gate_enc, NULL, NULL,
+model.layers[il].ffn_down_enc, NULL, NULL,
NULL,
-model.layers[il].enc_ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
-model.layers[il].enc_ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
+model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
cb, il);
cb(cur, "ffn_out", il);
}
@@ -12736,15 +12738,15 @@ struct llm_build_context {
cb(cur, "result_embd", -1);
cur = llm_build_norm(ctx0, cur, hparams,
-model.enc_output_norm, NULL,
+model.output_norm_enc, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
} else {
-struct ggml_tensor * enc_output = llm_build_inp_enc_output();
-struct ggml_tensor * dec_pos_buckets = llm_build_inp_rel_pos_bucket(true);
+struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
+struct ggml_tensor * pos_buckets_dec = llm_build_pos_bucket(true);
-struct ggml_tensor * dec_KQ_mask = build_inp_KQ_mask();
-struct ggml_tensor * cross_KQ_mask = llm_build_inp_cross_KQ_mask();
+struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
+struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -12791,12 +12793,12 @@ struct llm_build_context {
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
cb(kq, "kq", il);
-struct ggml_tensor * rel_attn_b = model.layers[il].rel_attn_b ? model.layers[il].rel_attn_b : model.layers[0].rel_attn_b;
-struct ggml_tensor * pos_bias = llm_build_rel_pos_bias(dec_pos_buckets, rel_attn_b);
+struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
+struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_buckets_dec, attn_rel_b);
struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
cb(kq_b, "kq_b", il);
-kq = ggml_soft_max_ext(ctx0, kq_b, dec_KQ_mask, 1.0f, hparams.f_max_alibi_bias);
+kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
@@ -12821,23 +12823,23 @@ struct llm_build_context {
// norm
cur = llm_build_norm(ctx0, cur, hparams,
-model.layers[il].cross_attn_norm, NULL,
+model.layers[il].attn_norm_cross, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "cross_attn_norm", il);
cb(cur, "attn_norm_cross", il);
// cross-attention
{
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_wq, cur);
+struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
cb(Qcur, "Qcur", il);
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_wk, enc_output);
+struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc);
cb(Kcur, "Kcur", il);
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_wv, enc_output);
+struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_enc_outputs);
+Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
@@ -12845,13 +12847,13 @@ struct llm_build_context {
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
cb(kq, "kq", il);
-kq = ggml_soft_max_ext(ctx0, kq, cross_KQ_mask, 1.0f, hparams.f_max_alibi_bias);
+kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
-struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_enc_outputs)));
+struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
cb(v, "v", il);
-struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_enc_outputs, n_embd_head, n_head_kv), kq);
+struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
cb(kqv, "kqv", il);
struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
@@ -12862,7 +12864,7 @@ struct llm_build_context {
ggml_build_forward_expand(gf, cur);
-cur = ggml_mul_mat(ctx0, model.layers[il].cross_wo, cur);
+cur = ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur);
cb(cur, "kqv_out", il);
}
@@ -12891,8 +12893,8 @@ struct llm_build_context {
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
-model.layers[il].enc_ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
-model.layers[il].enc_ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
+model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
cb, il);
cb(cur, "ffn_out", il);
}
@@ -13630,17 +13632,20 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
}
}
-if (!lctx.is_encoding && lctx.inp_enc_output) {
-ggml_backend_tensor_set(lctx.inp_enc_output, lctx.encoder_output.data(), 0, lctx.encoder_output.size() * ggml_element_size(lctx.inp_enc_output));
+if (!lctx.is_encoding && lctx.inp_embd_enc) {
+assert(lctx.inp_embd_enc->type == GGML_TYPE_F32);
+assert(ggml_nelements(lctx.inp_embd_enc) == lctx.encoder_output.size());
+ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.encoder_output.data(), 0, ggml_nbytes(lctx.inp_embd_enc));
}
-if (!lctx.is_encoding && lctx.inp_cross_KQ_mask) {
+if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) {
const int64_t n_encoder_output = lctx.encoder_output.size() / hparams.n_embd;
const int64_t n_tokens = batch.n_tokens;
-GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cross_KQ_mask->buffer));
+GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer));
-float * data = (float *) lctx.inp_cross_KQ_mask->data;
+float * data = (float *) lctx.inp_KQ_mask_cross->data;
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
@@ -14127,7 +14132,7 @@ static int llama_encode_internal(
lctx.output_ids[i] = i;
}
-lctx.inp_enc_output = NULL;
+lctx.inp_embd_enc = NULL;
for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);