support qwen2moe

This commit is contained in:
simonJJJ 2024-03-15 15:12:24 +08:00
parent 306d34be7a
commit 56a38c46f5
6 changed files with 418 additions and 74 deletions

View file

@ -1219,6 +1219,16 @@ class Qwen2Model(Model):
model_arch = gguf.MODEL_ARCH.QWEN2 model_arch = gguf.MODEL_ARCH.QWEN2
@Model.register("Qwen2MoeForCausalLM")
class Qwen2MoeModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN2MOE
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
@Model.register("GPT2LMHeadModel") @Model.register("GPT2LMHeadModel")
class GPT2Model(Model): class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2 model_arch = gguf.MODEL_ARCH.GPT2

View file

@ -941,7 +941,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
#define GGML_MAX_BACKENDS 16 #define GGML_MAX_BACKENDS 16
#define GGML_MAX_SPLITS 256 #define GGML_MAX_SPLITS 256
#define GGML_MAX_SPLIT_INPUTS 16 #define GGML_MAX_SPLIT_INPUTS 256
struct ggml_backend_sched_split { struct ggml_backend_sched_split {
int backend_id; int backend_id;

2
ggml.h
View file

@ -227,7 +227,7 @@
#define GGML_MAX_DIMS 4 #define GGML_MAX_DIMS 4
#define GGML_MAX_PARAMS 2048 #define GGML_MAX_PARAMS 2048
#define GGML_MAX_CONTEXTS 64 #define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 10 #define GGML_MAX_SRC 62
#ifndef GGML_MAX_NAME #ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64 #define GGML_MAX_NAME 64
#endif #endif

View file

@ -111,6 +111,7 @@ class MODEL_ARCH(IntEnum):
STABLELM = auto() STABLELM = auto()
QWEN = auto() QWEN = auto()
QWEN2 = auto() QWEN2 = auto()
QWEN2MOE = auto()
PHI2 = auto() PHI2 = auto()
PLAMO = auto() PLAMO = auto()
CODESHELL = auto() CODESHELL = auto()
@ -123,41 +124,45 @@ class MODEL_ARCH(IntEnum):
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
TOKEN_EMBD = auto() TOKEN_EMBD = auto()
TOKEN_EMBD_NORM = auto() TOKEN_EMBD_NORM = auto()
TOKEN_TYPES = auto() TOKEN_TYPES = auto()
POS_EMBD = auto() POS_EMBD = auto()
OUTPUT = auto() OUTPUT = auto()
OUTPUT_NORM = auto() OUTPUT_NORM = auto()
ROPE_FREQS = auto() ROPE_FREQS = auto()
ATTN_Q = auto() ATTN_Q = auto()
ATTN_K = auto() ATTN_K = auto()
ATTN_V = auto() ATTN_V = auto()
ATTN_QKV = auto() ATTN_QKV = auto()
ATTN_OUT = auto() ATTN_OUT = auto()
ATTN_NORM = auto() ATTN_NORM = auto()
ATTN_NORM_2 = auto() ATTN_NORM_2 = auto()
ATTN_OUT_NORM = auto() ATTN_OUT_NORM = auto()
ATTN_ROT_EMBD = auto() ATTN_ROT_EMBD = auto()
FFN_GATE_INP = auto() FFN_GATE_INP = auto()
FFN_NORM = auto() FFN_GATE_INP_SHARED_EXP = auto()
FFN_GATE = auto() FFN_NORM = auto()
FFN_DOWN = auto() FFN_GATE = auto()
FFN_UP = auto() FFN_DOWN = auto()
FFN_ACT = auto() FFN_UP = auto()
FFN_GATE_EXP = auto() FFN_ACT = auto()
FFN_DOWN_EXP = auto() FFN_GATE_EXP = auto()
FFN_UP_EXP = auto() FFN_DOWN_EXP = auto()
ATTN_Q_NORM = auto() FFN_UP_EXP = auto()
ATTN_K_NORM = auto() FFN_GATE_SHARED_EXP = auto()
LAYER_OUT_NORM = auto() FFN_DOWN_SHARED_EXP = auto()
SSM_IN = auto() FFN_UP_SHARED_EXP = auto()
SSM_CONV1D = auto() ATTN_Q_NORM = auto()
SSM_X = auto() ATTN_K_NORM = auto()
SSM_DT = auto() LAYER_OUT_NORM = auto()
SSM_A = auto() SSM_IN = auto()
SSM_D = auto() SSM_CONV1D = auto()
SSM_OUT = auto() SSM_X = auto()
SSM_DT = auto()
SSM_A = auto()
SSM_D = auto()
SSM_OUT = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -177,6 +182,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.QWEN2MOE: "qwen2moe",
MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.CODESHELL: "codeshell",
@ -189,41 +195,45 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
MODEL_TENSOR.TOKEN_TYPES: "token_types", MODEL_TENSOR.TOKEN_TYPES: "token_types",
MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output", MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs", MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.FFN_GATE_INP_SHARED_EXP: "blk.{bid}.ffn_gate_inp_shared_exp",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", MODEL_TENSOR.FFN_GATE_SHARED_EXP: "blk.{bid}.ffn_gate_shared_exp",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_DOWN_SHARED_EXP: "blk.{bid}.ffn_down_shared_exp",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", MODEL_TENSOR.FFN_UP_SHARED_EXP: "blk.{bid}.ffn_up_shared_exp",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
} }
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -433,6 +443,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.QWEN2MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_INP_SHARED_EXP,
MODEL_TENSOR.FFN_GATE_SHARED_EXP,
MODEL_TENSOR.FFN_DOWN_SHARED_EXP,
MODEL_TENSOR.FFN_UP_SHARED_EXP,
],
MODEL_ARCH.PLAMO: [ MODEL_ARCH.PLAMO: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,

View file

@ -195,6 +195,11 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral "layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral
"model.layers.{bid}.mlp.gate", # qwen2moe
),
MODEL_TENSOR.FFN_GATE_INP_SHARED_EXP: (
"model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
), ),
# Feed-forward up # Feed-forward up
@ -223,6 +228,11 @@ class TensorNameMap:
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
"model.layers.{bid}.mlp.experts.{xid}.up_proj", # qwen2moe
),
MODEL_TENSOR.FFN_UP_SHARED_EXP: (
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
), ),
# AWQ-activation gate # AWQ-activation gate
@ -243,6 +253,11 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_EXP: ( MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
"model.layers.{bid}.mlp.experts.{xid}.gate_proj", # qwen2moe
),
MODEL_TENSOR.FFN_GATE_SHARED_EXP: (
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
), ),
# Feed-forward down # Feed-forward down
@ -270,6 +285,11 @@ class TensorNameMap:
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
"model.layers.{bid}.mlp.experts.{xid}.down_proj", # qwen2moe
),
MODEL_TENSOR.FFN_DOWN_SHARED_EXP: (
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
), ),
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
@ -343,7 +363,7 @@ class TensorNameMap:
if tensor not in MODEL_TENSORS[arch]: if tensor not in MODEL_TENSORS[arch]:
continue continue
# TODO: make this configurable # TODO: make this configurable
n_experts = 8 n_experts = 60
for xid in range(n_experts): for xid in range(n_experts):
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
self.mapping[tensor_name] = (tensor, tensor_name) self.mapping[tensor_name] = (tensor, tensor_name)

287
llama.cpp
View file

@ -102,7 +102,7 @@
#endif #endif
#define LLAMA_MAX_NODES 8192 #define LLAMA_MAX_NODES 8192
#define LLAMA_MAX_EXPERTS 8 #define LLAMA_MAX_EXPERTS 60
// //
@ -205,6 +205,7 @@ enum llm_arch {
LLM_ARCH_STABLELM, LLM_ARCH_STABLELM,
LLM_ARCH_QWEN, LLM_ARCH_QWEN,
LLM_ARCH_QWEN2, LLM_ARCH_QWEN2,
LLM_ARCH_QWEN2MOE,
LLM_ARCH_PHI2, LLM_ARCH_PHI2,
LLM_ARCH_PLAMO, LLM_ARCH_PLAMO,
LLM_ARCH_CODESHELL, LLM_ARCH_CODESHELL,
@ -234,6 +235,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" }, { LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" }, { LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_PHI2, "phi2" }, { LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_CODESHELL, "codeshell" },
@ -400,6 +402,7 @@ enum llm_tensor {
LLM_TENSOR_ATTN_OUT_NORM, LLM_TENSOR_ATTN_OUT_NORM,
LLM_TENSOR_ATTN_ROT_EMBD, LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHARED_EXP,
LLM_TENSOR_FFN_NORM, LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_DOWN,
@ -408,6 +411,9 @@ enum llm_tensor {
LLM_TENSOR_FFN_DOWN_EXP, LLM_TENSOR_FFN_DOWN_EXP,
LLM_TENSOR_FFN_GATE_EXP, LLM_TENSOR_FFN_GATE_EXP,
LLM_TENSOR_FFN_UP_EXP, LLM_TENSOR_FFN_UP_EXP,
LLM_TENSOR_FFN_DOWN_SHARED_EXP,
LLM_TENSOR_FFN_GATE_SHARED_EXP,
LLM_TENSOR_FFN_UP_SHARED_EXP,
LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM, LLM_TENSOR_LAYER_OUT_NORM,
@ -673,6 +679,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_QWEN2MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_INP_SHARED_EXP, "blk.%d.ffn_gate_inp_shared_exp" },
{ LLM_TENSOR_FFN_GATE_SHARED_EXP, "blk.%d.ffn_gate_shared_exp" },
{ LLM_TENSOR_FFN_DOWN_SHARED_EXP, "blk.%d.ffn_down_shared_exp" },
{ LLM_TENSOR_FFN_UP_SHARED_EXP, "blk.%d.ffn_up_shared_exp" },
},
},
{ {
LLM_ARCH_PHI2, LLM_ARCH_PHI2,
{ {
@ -1797,6 +1825,12 @@ struct llama_layer {
struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS]; struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS]; struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
// ff shared expert
struct ggml_tensor * ffn_gate_inp_shared_exp;
struct ggml_tensor * ffn_gate_shared_exp;
struct ggml_tensor * ffn_down_shared_exp;
struct ggml_tensor * ffn_up_shared_exp;
// ff bias // ff bias
struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3 struct ggml_tensor * ffn_up_b; // b3
@ -3522,6 +3556,14 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_QWEN2MOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_1B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_PHI2: case LLM_ARCH_PHI2:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@ -4570,6 +4612,56 @@ static bool llm_load_tensors(
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
} }
} break; } break;
case LLM_ARCH_QWEN2MOE:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
// optional bias tensors
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd});
GGML_ASSERT(hparams.n_expert > 0);
GGML_ASSERT(hparams.n_expert_used > 0);
// MoE branch
auto n_ff_exp = n_ff / hparams.n_expert_used;
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff_exp});
layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {n_ff_exp, n_embd});
layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff_exp});
}
// Shared expert branch
layer.ffn_gate_inp_shared_exp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHARED_EXP, "weight", i), {n_embd});
layer.ffn_gate_shared_exp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHARED_EXP, "weight", i), {n_embd, n_ff});
layer.ffn_down_shared_exp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHARED_EXP, "weight", i), { n_ff, n_embd});
layer.ffn_up_shared_exp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHARED_EXP, "weight", i), {n_embd, n_ff});
}
} break;
case LLM_ARCH_PHI2: case LLM_ARCH_PHI2:
{ {
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@ -7152,6 +7244,194 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
cb(inpL, "inp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
cb(inp_pos, "inp_pos", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
cb(KQ_mask, "KQ_mask", -1);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self_attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(gf, Qcur);
ggml_build_forward_expand(gf, Kcur);
ggml_build_forward_expand(gf, Vcur);
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head,
n_head, n_tokens), inp_pos,
n_rot, rope_type, 0, n_orig_ctx, freq_base,
freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head,
n_head_kv, n_tokens), inp_pos,
n_rot, rope_type, 0, n_orig_ctx, freq_base,
freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx,
n_tokens, kv_head, n_kv, 1.0f/sqrtf(float
(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// MoE branch
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
cb(logits, "ffn_moe_logits", il);
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
cb(probs, "ffn_moe_probs", il);
// select experts
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
cb(selected_experts->src[0], "ffn_moe_argsort", il);
ggml_tensor * weights = ggml_get_rows(ctx0,
ggml_reshape_3d(ctx0, probs, 1, n_expert,
n_tokens), selected_experts);
cb(weights, "ffn_moe_weights", il);
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
// compute expert outputs
ggml_tensor * moe_out = nullptr;
for (int i = 0; i < n_expert_used; ++i) {
ggml_tensor * cur_expert;
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
cb(cur_up, "ffn_moe_up", il);
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
cb(cur_gate, "ffn_moe_gate", il);
cur_gate = ggml_silu(ctx0, cur_gate);
cb(cur_gate, "ffn_moe_silu", il);
cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
cb(cur_expert, "ffn_moe_gate_par", il);
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
cb(cur_expert, "ffn_moe_down", il);
cur_expert = ggml_mul(ctx0, cur_expert,
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
cb(cur_expert, "ffn_moe_weighted", il);
if (i == 0) {
moe_out = cur_expert;
} else {
moe_out = ggml_add(ctx0, moe_out, cur_expert);
cb(moe_out, "ffn_moe_out", il);
}
}
ggml_tensor * gate_shared_exp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shared_exp, cur);
cb(gate_shared_exp, "ffn_moe_gate_inp_shared_exp", il);
// sigmoid
ggml_tensor * logits_shared_exp = ggml_silu(ctx0, gate_shared_exp);
cb(logits_shared_exp, "ffn_moe_logits_shared_exp", il);
ggml_tensor * probs_shared_exp = ggml_div(ctx0, logits_shared_exp, gate_shared_exp);
cb(probs_shared_exp, "ffn_moe_probs_shared_exp", il);
ggml_tensor * ffn_shared_exp = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up_shared_exp, NULL,
model.layers[il].ffn_gate_shared_exp, NULL,
model.layers[il].ffn_down_shared_exp, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(ffn_shared_exp, "ffn_moe_shared_exp", il);
ggml_tensor * ffn_shared_exp_out = ggml_mul(ctx0, ffn_shared_exp, probs_shared_exp);
cb(ffn_shared_exp_out, "ffn_moe_shared_exp_out", il);
moe_out = ggml_add(ctx0, moe_out, ffn_shared_exp_out);
cb(moe_out, "ffn_out", il);
cur = moe_out;
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_phi2() { struct ggml_cgraph * build_phi2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@ -8433,6 +8713,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_qwen2(); result = llm.build_qwen2();
} break; } break;
case LLM_ARCH_QWEN2MOE:
{
result = llm.build_qwen2moe();
} break;
case LLM_ARCH_PHI2: case LLM_ARCH_PHI2:
{ {
result = llm.build_phi2(); result = llm.build_phi2();
@ -13063,6 +13347,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_STABLELM: case LLM_ARCH_STABLELM:
case LLM_ARCH_QWEN: case LLM_ARCH_QWEN:
case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2MOE:
case LLM_ARCH_PHI2: case LLM_ARCH_PHI2:
case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA:
case LLM_ARCH_STARCODER2: case LLM_ARCH_STARCODER2: