convert: dbrx: fix mixed up and down expert tensors
llama: dbrx: review graph
commit 0a35f5881b
parent e3c1e8127c
3 changed files with 33 additions and 26 deletions
@@ -646,9 +646,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.ATTN_QKV,
-        MODEL_TENSOR.ATTN_OUT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.ATTN_OUT_NORM,
         MODEL_TENSOR.FFN_GATE_INP,
         MODEL_TENSOR.FFN_GATE_EXP,
         MODEL_TENSOR.FFN_DOWN_EXP,
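For orientation: each MODEL_TENSOR member maps to a GGUF base name, so listing ATTN_OUT_NORM instead of ATTN_OUT changes which blk.{bid}.* tensor the dbrx layers are expected to carry. A minimal, self-contained sketch with illustrative stand-ins (the real enum and name table in gguf-py are much larger):

```python
from enum import Enum, auto

# Illustrative stand-ins for the gguf-py constants; names are assumptions.
class MODEL_TENSOR(Enum):
    ATTN_QKV      = auto()
    ATTN_OUT      = auto()
    ATTN_OUT_NORM = auto()

# Illustrative GGUF base names; "{bid}" is the block index.
TENSOR_NAMES = {
    MODEL_TENSOR.ATTN_QKV:      "blk.{bid}.attn_qkv",
    MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
    MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
}

# Per-architecture allow-list, as in MODEL_TENSORS above: dbrx now lists
# ATTN_OUT_NORM instead of ATTN_OUT.
DBRX_TENSORS = [MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_OUT_NORM]

for t in DBRX_TENSORS:
    print(TENSOR_NAMES[t].format(bid=0))
# blk.0.attn_qkv
# blk.0.attn_output_norm
```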
@@ -155,31 +155,31 @@ class TensorNameMap:

         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
             "gpt_neox.layers.{bid}.attention.dense",                        # gptneox
             "transformer.h.{bid}.attn.c_proj",                              # gpt2 refact qwen
             "transformer.blocks.{bid}.attn.out_proj",                       # mpt
             "transformer.h.{bid}.self_attention.dense",                     # falcon
             "h.{bid}.self_attention.dense",                                 # bloom
             "model.layers.{bid}.self_attn.o_proj",                          # llama-hf
             "layers.{bid}.attention.wo",                                    # llama-pth
             "encoder.layer.{bid}.attention.output.dense",                   # bert
             "transformer.h.{bid}.attn.out_proj",                            # gpt-j
             "language_model.encoder.layers.{bid}.self_attention.dense",     # persimmon
             "model.layers.{bid}.self_attn.dense",                           # persimmon
             "h.{bid}.attn.c_proj",                                          # gpt2
             "transformer.h.{bid}.mixer.out_proj",                           # phi2
             "model.layers.layers.{bid}.self_attn.o_proj",                   # plamo
             "model.layers.{bid}.attention.wo",                              # internlm2
             "encoder.layers.{bid}.attn.out_proj",                           # nomic-bert
             "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx
         ),

         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
             "encoder.layers.{bid}.norm1",                      # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx
         ),

         # Rotary embeddings
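For readers unfamiliar with TensorNameMap: each tuple above lists HF checkpoint name templates, with {bid} standing for the block index, and the template that matches decides which GGUF tensor a checkpoint weight becomes. A hedged sketch of that lookup; the regex approach and identifiers here are illustrative, not the actual gguf-py implementation:

```python
import re

# Illustrative subset of the templates above, keyed by tensor kind.
TENSOR_TEMPLATES = {
    "ATTN_OUT_NORM": (
        "encoder.layer.{bid}.attention.output.LayerNorm",                # bert
        "transformer.decoder_layer.{bid}.rms_norm_1",                    # Grok
        "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight",  # dbrx (this commit)
    ),
}

def resolve(checkpoint_name):
    """Return (tensor kind, block id) for a checkpoint tensor name, if known."""
    for kind, templates in TENSOR_TEMPLATES.items():
        for tmpl in templates:
            # Turn the template into a regex with a capture group for the block id.
            pattern = re.escape(tmpl).replace(re.escape("{bid}"), r"(\d+)")
            m = re.fullmatch(pattern, checkpoint_name)
            if m:
                return kind, int(m.group(1))
    return None

print(resolve("transformer.blocks.3.norm_attn_norm.attn.out_proj.weight"))
# ('ATTN_OUT_NORM', 3): after this change dbrx's out_proj weight is picked up
# by the attention-output-norm slot rather than by ATTN_OUT.
```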
llama.cpp
@@ -4695,7 +4695,6 @@ static bool llm_load_tensors(
                         layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});

                         layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd});
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                         layer.ffn_gate_inp  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert});
                         layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
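The 3-D expert tensors loaded above pack all experts into one tensor, and the down projection is laid out transposed relative to gate/up, which is presumably why the mixed-up up/down expert tensors named in the commit title surface as shape errors. A toy numpy sketch under assumed layouts (dimensions, layout order, and the omitted activation are illustrative only):

```python
import numpy as np

# Toy dimensions; the real dbrx values are much larger.
n_embd, n_ff, n_expert, n_tokens = 8, 32, 4, 3

# Per-expert weights as a converter might lay them out (assumed layout):
w_gate = np.random.randn(n_expert, n_ff, n_embd)   # gate projects n_embd -> n_ff
w_up   = np.random.randn(n_expert, n_ff, n_embd)   # up   projects n_embd -> n_ff
w_down = np.random.randn(n_expert, n_embd, n_ff)   # down projects n_ff -> n_embd

x = np.random.randn(n_tokens, n_embd)
e = 2  # some routed expert

h = (w_gate[e] @ x.T) * (w_up[e] @ x.T)   # (n_ff, n_tokens); activation omitted
y = w_down[e] @ h                         # (n_embd, n_tokens)

# Writing the up tensor into the down slot (or vice versa) during conversion
# turns the second matmul into (n_ff, n_embd) @ (n_ff, n_tokens), which fails
# as soon as n_ff != n_embd:
try:
    _ = w_up[e] @ h
except ValueError as err:
    print("shape mismatch:", err)
```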
@@ -7184,9 +7183,10 @@ struct llm_build_context {

                 // feed-forward network
                 // MoE branch
+                // FIXME REVIEW: I do not see this op in https://huggingface.co/databricks/dbrx-instruct/blob/464e701f50aef4c1b59c81fb5667819a5d08e108/modeling_dbrx.py#L727
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
+                        NULL, NULL,
+                        LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);

                 ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
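On the FIXME above: my reading of the upstream DbrxBlock is that the second norm of norm_attn_norm already produces the router input, so an extra weightless norm on ffn_inp has no obvious counterpart in modeling_dbrx.py. The sketch below is that reading in plain numpy, with attention and the MoE passed in as callables; treat the exact ordering as an assumption, not a transcription of the HF code:

```python
import numpy as np

def layer_norm(x, w, eps=1e-5):
    # Weight-only LayerNorm, matching LLM_NORM (not RMS) in the graph above.
    mu = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * w

def dbrx_block(x, norm_1_w, norm_2_w, attn, moe):
    # norm_attn_norm: pre-norm -> attention -> residual add -> post-norm
    h = x + attn(layer_norm(x, norm_1_w))   # "ffn_inp" in the llama.cpp graph
    ffn_in = layer_norm(h, norm_2_w)        # what the router/experts should see
    return h + moe(ffn_in)                  # final residual add ("ffn_out")

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    x = rng.standard_normal((3, 8))
    w1, w2 = np.ones(8), np.ones(8)
    y = dbrx_block(x, w1, w2, attn=np.tanh, moe=np.tanh)  # toy stand-ins
    print(y.shape)  # (3, 8)
```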
@@ -7244,9 +7244,16 @@ struct llm_build_context {
                         cb(moe_out, "ffn_moe_out", il);
                     }
                 }

                 cur = moe_out;

+                // DbrxNormAttentionNorm
+                {
+                    cur = llm_build_norm(ctx0, cur, hparams,
+                            model.layers[il].layer_out_norm, NULL,
+                            LLM_NORM, cb, il);
+                    cb(cur, "layer_out_norm", il);
+                }

                 cur = ggml_add(ctx0, cur, ffn_inp);
                 cb(cur, "ffn_out", il);
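The moe_out consumed here is produced a few lines earlier by the usual top-k routing over ffn_gate_inp (router logits, softmax, pick n_expert_used experts, weighted sum of expert outputs). A hedged numpy sketch with toy sizes and a stand-in expert FFN, only to make that data flow explicit; whether the selected weights are renormalized varies by model and is omitted:

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

n_tokens, n_embd, n_expert, n_expert_used = 3, 8, 4, 2
cur      = np.random.randn(n_tokens, n_embd)
router_w = np.random.randn(n_expert, n_embd)     # plays the role of ffn_gate_inp

logits   = cur @ router_w.T                      # [n_tokens, n_expert]
probs    = softmax(logits)                       # router probabilities
selected = np.argsort(-probs, axis=-1)[:, :n_expert_used]  # top-k experts per token

moe_out = np.zeros_like(cur)
for t in range(n_tokens):
    for e in selected[t]:
        expert_out = np.tanh(cur[t])             # stand-in for the expert FFN
        moe_out[t] += probs[t, e] * expert_out

# In this commit, moe_out is then passed through layer_out_norm
# (the DbrxNormAttentionNorm block above) before the residual add with ffn_inp.
print(moe_out.shape)  # (3, 8)
```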