move deepseek above deepseek2
parent 5806435526
commit 78ef42665b
2 changed files with 128 additions and 128 deletions
@@ -3430,85 +3430,6 @@ class ArcticModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@Model.register("DeepseekV2ForCausalLM")
-class DeepseekV2Model(Model):
-    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-
-        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
-            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
-        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
-        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(hparams["v_head_dim"])
-        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
-        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
-        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
-        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
-        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
-
-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "yarn":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
-                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # process the experts separately
-        if name.find("mlp.experts") != -1:
-            n_experts = self.hparams["n_routed_experts"]
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
-
-                # merge the experts into a single 3d tensor
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
 @Model.register("DeepseekForCausalLM")
 class DeepseekModel(Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK
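The DeepseekV2Model class deleted above is re-added verbatim after DeepseekModel in the next hunk; only its position changes. For reference, a minimal standalone sketch of what its expert-merging loop in modify_tensors produces (a torch.stack over the per-expert weights); the sizes below are hypothetical, not taken from a real checkpoint:

# Standalone sketch of the expert merge done in modify_tensors (hypothetical sizes).
import torch

n_experts, n_ff, n_embd = 4, 8, 16  # hypothetical model dimensions
bid = 0                             # block (layer) index

# one 2D weight per routed expert, keyed the way they appear in the checkpoint
experts = {
    f"model.layers.{bid}.mlp.experts.{xid}.gate_proj.weight": torch.randn(n_ff, n_embd)
    for xid in range(n_experts)
}

# stack them along a new leading dimension, as the converter does per projection
datas = [experts[f"model.layers.{bid}.mlp.experts.{xid}.gate_proj.weight"]
         for xid in range(n_experts)]
merged = torch.stack(datas, dim=0)

print(merged.shape)  # torch.Size([4, 8, 16]) -> one 3D tensor per projection and layer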
@@ -3600,6 +3521,85 @@ class DeepseekModel(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("DeepseekV2ForCausalLM")
+class DeepseekV2Model(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("T5WithLMHeadModel")
 @Model.register("T5ForConditionalGeneration")
 @Model.register("MT5ForConditionalGeneration")
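For reference, a minimal standalone sketch of the rope-scaling metadata that set_gguf_parameters above derives when a YARN-scaled config is present; the config values below are hypothetical examples, not from a specific checkpoint:

# Standalone sketch of the YARN rope-scaling mapping in set_gguf_parameters
# (hypothetical config values).
hparams = {
    "qk_nope_head_dim": 128,
    "qk_rope_head_dim": 64,
    "v_head_dim": 128,
    "rope_scaling": {
        "type": "yarn",
        "factor": 40,
        "original_max_position_embeddings": 4096,
        "mscale_all_dim": 0.707,
    },
}

# key length written to GGUF combines the non-rotary and rotary parts of the head
key_length = hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]  # 192

rope = hparams["rope_scaling"]
if rope.get("type") == "yarn":
    yarn_log_mul = 0.1 * rope["mscale_all_dim"]  # value passed to add_rope_scaling_yarn_log_mul

print(key_length, rope["factor"], rope["original_max_position_embeddings"], yarn_log_mul)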
@@ -8917,6 +8917,55 @@ static bool llm_load_tensors(
                         layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                     }
                 } break;
+            case LLM_ARCH_DEEPSEEK:
+                {
+
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        } else {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            // MoE branch
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        }
+                    }
+                } break;
             case LLM_ARCH_DEEPSEEK2:
                 {
                     const bool is_lite = (hparams.n_layer == 27);
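For reference, a minimal standalone sketch of how the expert tensor shapes requested above line up with the 3D tensors stacked by the converter, assuming the usual GGML convention that dimensions are listed innermost-first (the reverse of torch/numpy order); the sizes are hypothetical:

# Standalone sketch relating the converter's stacked tensors to the shapes
# requested by create_tensor above (hypothetical sizes; assumes GGML lists
# dimensions innermost-first, i.e. reversed relative to torch/numpy).
n_embd, n_ff_exp, n_expert = 16, 8, 4

# gate/up projections: stacked by the converter as (n_expert, n_ff_exp, n_embd)
torch_shape = (n_expert, n_ff_exp, n_embd)

# requested as {n_embd, n_ff_exp, n_expert} for LLM_TENSOR_FFN_GATE_EXPS / _UP_EXPS
ggml_shape = (n_embd, n_ff_exp, n_expert)

assert tuple(reversed(torch_shape)) == ggml_shape
print("torch order:", torch_shape, "-> ggml order:", ggml_shape)

By the same correspondence, the stacked down projections have torch shape (n_expert, n_embd, n_ff_exp), matching the {n_ff_exp, n_embd, n_expert} shape requested for LLM_TENSOR_FFN_DOWN_EXPS.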
@@ -8959,55 +9008,6 @@ static bool llm_load_tensors(
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-                        if (i < (int) hparams.n_layer_dense_lead) {
-                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-                        } else {
-                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
-                            if (n_expert == 0) {
-                                throw std::runtime_error("n_expert must be > 0");
-                            }
-                            if (n_expert_used == 0) {
-                                throw std::runtime_error("n_expert_used must be > 0");
-                            }
-
-                            // MoE branch
-                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
-                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-
-                            // Shared expert branch
-                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
-                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
-                        }
-                    }
-                } break;
-            case LLM_ARCH_DEEPSEEK:
-                {
-
-                    const int64_t n_ff_exp = hparams.n_ff_exp;
-                    const int64_t n_expert_shared = hparams.n_expert_shared;
-
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    // output
-                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
                         if (i < (int) hparams.n_layer_dense_lead) {
                             layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                             layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);