dbrx: minor

Pierrick HYMBERT 2024-04-06 18:59:53 +02:00
parent e4f8ee4f48
commit a7f9a3eafc
3 changed files with 14 additions and 17 deletions

View file

@@ -1456,6 +1456,7 @@ class Qwen2MoeModel(Model):
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
         added_vocab = tokenizer.get_added_vocab()
+        # REVIEW: Not tested yet, need to deep dive this tiktoken
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")

View file

@@ -96,13 +96,13 @@ class TensorNameMap:
             "model.layers.{bid}.norm", # mamba-qbert
             "backbone.layers.{bid}.norm", # mamba
             "transformer.decoder_layer.{bid}.rms_norm", # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight", # DBRX
+            "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight", # dbrx
         ),
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn", # falcon40b
-            "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight", # DBRX
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight", # dbrx
         ),
         # Attention query-key-value
@@ -110,7 +110,7 @@ class TensorNameMap:
             "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
             "transformer.h.{bid}.attn.c_attn", # gpt2 qwen
             "transformer.blocks.{bid}.attn.Wqkv", # mpt
-            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight", # DBRX
+            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight", # dbrx
             "transformer.h.{bid}.self_attention.query_key_value", # falcon
             "h.{bid}.self_attention.query_key_value", # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
@@ -172,7 +172,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wo", # internlm2
             "encoder.layers.{bid}.attn.out_proj", # nomic-bert
             "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # DBRX
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx
         ),
         # Attention output norm
@@ -209,7 +209,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.gate", # mixtral
             "model.layers.{bid}.block_sparse_moe.gate", # mixtral
             "transformer.decoder_layer.{bid}.router", # Grok
-            "transformer.blocks.{bid}.ffn.router.layer.weight", # DBRX
+            "transformer.blocks.{bid}.ffn.router.layer.weight", # dbrx
         ),
         # Feed-forward up
@@ -238,7 +238,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w2", # DBRX
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
         ),
         # AWQ-activation gate
@@ -259,7 +259,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # DBRX
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
         ),
         # Feed-forward down
@@ -287,7 +287,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # DBRX
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
         ),
         MODEL_TENSOR.ATTN_Q_NORM: (
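The entries above are {bid}-parameterized name templates: the converter substitutes the block index to match the checkpoint's tensor names. A rough illustration of that substitution with a hypothetical helper (the real TensorNameMap in gguf-py does more, such as per-architecture filtering and weight/bias suffix handling):

DBRX_ROUTER_TEMPLATE = "transformer.blocks.{bid}.ffn.router.layer.weight"

def router_tensor_name(block_index):
    # Fill the {bid} placeholder for one transformer block, e.g. block 3 ->
    # "transformer.blocks.3.ffn.router.layer.weight"
    return DBRX_ROUTER_TEMPLATE.format(bid=block_index)

assert router_tensor_name(3) == "transformer.blocks.3.ffn.router.layer.weight"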

View file

@@ -7131,18 +7131,14 @@ struct llm_build_context {
                 LLM_NORM, cb, il);
         cb(cur, "attn_norm", il);
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.layers[il].attn_norm_2,
+                NULL,
+                LLM_NORM, cb, il);
+        cb(cur, "attn_norm_2", il);
         // self-attention
         {
-            if (model.layers[il].attn_norm_2) {
-                // DBRX
-                cur = llm_build_norm(ctx0, inpL, hparams,
-                        model.layers[il].attn_norm_2,
-                        NULL,
-                        LLM_NORM, cb, il);
-                cb(cur, "attn_norm_2", il);
-            }
             cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);