diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 5d858a123..4faa04e57 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1456,6 +1456,7 @@ class Qwen2MoeModel(Model):
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
         added_vocab = tokenizer.get_added_vocab()
 
+        # REVIEW: Not tested yet, need to deep dive this tiktoken
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 48c9bd08d..3638e2aea 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -96,13 +96,13 @@ class TensorNameMap:
             "model.layers.{bid}.norm", # mamba-qbert
             "backbone.layers.{bid}.norm", # mamba
             "transformer.decoder_layer.{bid}.rms_norm", # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight", # DBRX
+            "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight", # dbrx
         ),
 
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn", # falcon40b
-            "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight", # DBRX
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight", # dbrx
         ),
 
         # Attention query-key-value
@@ -110,7 +110,7 @@ class TensorNameMap:
             "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
             "transformer.h.{bid}.attn.c_attn", # gpt2 qwen
             "transformer.blocks.{bid}.attn.Wqkv", # mpt
-            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight", # DBRX
+            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight", # dbrx
             "transformer.h.{bid}.self_attention.query_key_value", # falcon
             "h.{bid}.self_attention.query_key_value", # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
@@ -172,7 +172,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wo", # internlm2
             "encoder.layers.{bid}.attn.out_proj", # nomic-bert
             "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # DBRX
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx
         ),
 
         # Attention output norm
@@ -209,7 +209,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.gate", # mixtral
             "model.layers.{bid}.block_sparse_moe.gate", # mixtral
             "transformer.decoder_layer.{bid}.router", # Grok
-            "transformer.blocks.{bid}.ffn.router.layer.weight", # DBRX
+            "transformer.blocks.{bid}.ffn.router.layer.weight", # dbrx
         ),
 
         # Feed-forward up
@@ -238,7 +238,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w2", # DBRX
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
         ),
 
         # AWQ-activation gate
@@ -259,7 +259,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # DBRX
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
         ),
 
         # Feed-forward down
@@ -287,7 +287,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # DBRX
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
diff --git a/llama.cpp b/llama.cpp
index 279b39dfb..016e119cb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7131,18 +7131,14 @@ struct llm_build_context {
                     LLM_NORM, cb, il);
             cb(cur, "attn_norm", il);
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm_2,
+                    NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm_2", il);
 
             // self-attention
             {
-                if (model.layers[il].attn_norm_2) {
-                    // DBRX
-                    cur = llm_build_norm(ctx0, inpL, hparams,
-                            model.layers[il].attn_norm_2,
-                            NULL,
-                            LLM_NORM, cb, il);
-                    cb(cur, "attn_norm_2", il);
-                }
-
                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);