diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index a639e39ef..228b68e14 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1535,7 +1535,7 @@ class DbrxModel(Model):
             if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
                 experts = True
                 data_torch = data_torch.view(n_expert, n_ff, n_embd)
-                if permute_tensor := exp_tensor_names[exp_tensor_name] is not None:
+                if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
                     data_torch = data_torch.permute(*permute_tensor)
                 break
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 886256102..a610acc58 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -649,11 +649,11 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_QKV,
         MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_OUT_NORM,
         MODEL_TENSOR.FFN_GATE_INP,
         MODEL_TENSOR.FFN_GATE_EXP,
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
-        MODEL_TENSOR.LAYER_OUT_NORM,
     ],
     # TODO
 }
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 5872e2b23..510a273c8 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -179,6 +179,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
             "encoder.layers.{bid}.norm1",                      # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
         ),

         # Rotary embeddings
@@ -309,7 +310,6 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.LayerNorm",            # bert
             "encoder.layers.{bid}.norm2",                      # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3",      # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
         ),

         MODEL_TENSOR.SSM_IN: (
diff --git a/llama.cpp b/llama.cpp
index 26dc24ebc..a9437a5b5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -939,11 +939,11 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_ATTN_QKV,       "blk.%d.attn_qkv" },
             { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_OUT_NORM,  "blk.%d.attn_output_norm" },
             { LLM_TENSOR_FFN_GATE_INP,   "blk.%d.ffn_gate_inp" },
             { LLM_TENSOR_FFN_GATE_EXPS,  "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS,  "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS,    "blk.%d.ffn_up_exps" },
-            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
         },
     },
     {
@@ -4692,12 +4692,13 @@ static bool llm_load_tensors(
                     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                     layer.wo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

+                    layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
+
                     layer.ffn_gate_inp  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert});
                     layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
                     layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
                     layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert});

-                    layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
                 }
             } break;
         case LLM_ARCH_BAICHUAN:
@@ -7121,6 +7122,11 @@ struct llm_build_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_out_norm, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_out_norm", il);
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

@@ -7128,12 +7134,6 @@ struct llm_build_context {
             // MoE branch
             cur = build_moe(cur, n_tokens, il);

-            // DBRX norm2
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.layers[il].layer_out_norm, NULL,
-                    LLM_NORM, cb, il);
-            cb(cur, "layer_out_norm", il);
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
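
The convert-hf-to-gguf.py hunk fixes a walrus-operator precedence bug: without parentheses, `:=` captures the result of the whole `... is not None` comparison, so `permute_tensor` ends up as a bool and the later `permute(*permute_tensor)` would raise. A minimal standalone sketch of the failure mode; the table contents and key below are illustrative stand-ins, not the real exp_tensor_names:

import torch

# Illustrative stand-in for exp_tensor_names: maps a tensor-name fragment
# to a permutation tuple, or None when no permute is needed.
exp_tensor_names = {"ffn.experts.mlp.w1": (0, 2, 1), "ffn.experts.mlp.w2": None}
data_torch = torch.zeros(2, 3, 4)

# Buggy form: parses as permute_tensor := (exp_tensor_names[...] is not None),
# so permute_tensor is the bool True and permute(*True) would raise TypeError.
if permute_tensor := exp_tensor_names["ffn.experts.mlp.w1"] is not None:
    print(type(permute_tensor))  # <class 'bool'>

# Fixed form: the parentheses bind the dict value first, then test it for None.
if (permute_tensor := exp_tensor_names["ffn.experts.mlp.w1"]) is not None:
    data_torch = data_torch.permute(*permute_tensor)
    print(data_torch.shape)  # torch.Size([2, 4, 3])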
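
The two llm_build_context hunks move the second per-layer norm from after the MoE output (layer_out_norm) to the attention output, before the residual add (attn_out_norm). A rough Python schematic of the layer order the patched graph builds, with identity stubs standing in for the real ggml ops:

import torch

# Identity stubs so the sketch runs; in llama.cpp these are real graph ops.
attn_norm = attn_out_norm = lambda t: t
attention = moe = lambda t: t

def dbrx_layer(x: torch.Tensor) -> torch.Tensor:
    cur = attention(attn_norm(x))  # attn_norm -> QKV attention -> attn_output
    cur = attn_out_norm(cur)       # "attn_out_norm" (moved here by this patch)
    ffn_inp = cur + x              # residual add -> "ffn_inp"
    cur = moe(cur)                 # MoE branch; no layer_out_norm afterwards
    return cur + ffn_inp           # "ffn_out"

print(dbrx_layer(torch.ones(2, 3)).shape)  # torch.Size([2, 3])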