Fix whitespaces
This commit is contained in:
parent
6052e3b3a7
commit
81ce9df3ee
3 changed files with 38 additions and 44 deletions
|
@ -1068,16 +1068,14 @@ class GrokModel(Model):
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_sentencepiece()
|
self._set_vocab_sentencepiece()
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
super().set_gguf_parameters()
|
super().set_gguf_parameters()
|
||||||
self.gguf_writer.add_name("Grok")
|
self.gguf_writer.add_name("Grok")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Model.register("MiniCPMForCausalLM")
|
@Model.register("MiniCPMForCausalLM")
|
||||||
class MiniCPMModel(Model):
|
class MiniCPMModel(Model):
|
||||||
|
|
|
@ -23,7 +23,7 @@ class TensorNameMap:
|
||||||
"model.embedding", # mamba-qbert
|
"model.embedding", # mamba-qbert
|
||||||
"backbone.embedding", # mamba
|
"backbone.embedding", # mamba
|
||||||
"backbone.embeddings", # mamba-hf
|
"backbone.embeddings", # mamba-hf
|
||||||
"transformer.in_out_embed", # Grok
|
"transformer.in_out_embed", # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
# Token type embeddings
|
# Token type embeddings
|
||||||
|
@ -67,7 +67,7 @@ class TensorNameMap:
|
||||||
"lm_head.ln", # phi2
|
"lm_head.ln", # phi2
|
||||||
"model.norm_f", # mamba-qbert
|
"model.norm_f", # mamba-qbert
|
||||||
"backbone.norm_f", # mamba
|
"backbone.norm_f", # mamba
|
||||||
"transformer.rms_norm", # Grok
|
"transformer.rms_norm", # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
# Rope frequencies
|
# Rope frequencies
|
||||||
|
@ -95,7 +95,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.attention_norm", # internlm2
|
"model.layers.{bid}.attention_norm", # internlm2
|
||||||
"model.layers.{bid}.norm", # mamba-qbert
|
"model.layers.{bid}.norm", # mamba-qbert
|
||||||
"backbone.layers.{bid}.norm", # mamba
|
"backbone.layers.{bid}.norm", # mamba
|
||||||
"transformer.decoder_layer.{bid}.rms_norm", # Grok
|
"transformer.decoder_layer.{bid}.rms_norm", # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention norm 2
|
# Attention norm 2
|
||||||
|
@ -119,34 +119,34 @@ class TensorNameMap:
|
||||||
|
|
||||||
# Attention query
|
# Attention query
|
||||||
MODEL_TENSOR.ATTN_Q: (
|
MODEL_TENSOR.ATTN_Q: (
|
||||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
||||||
"layers.{bid}.attention.wq", # llama-pth
|
"layers.{bid}.attention.wq", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.self.query", # bert
|
"encoder.layer.{bid}.attention.self.query", # bert
|
||||||
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
||||||
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
|
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
|
||||||
"model.layers.{bid}.attention.wq", # internlm2
|
"model.layers.{bid}.attention.wq", # internlm2
|
||||||
"transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
|
"transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention key
|
# Attention key
|
||||||
MODEL_TENSOR.ATTN_K: (
|
MODEL_TENSOR.ATTN_K: (
|
||||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
||||||
"layers.{bid}.attention.wk", # llama-pth
|
"layers.{bid}.attention.wk", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.self.key", # bert
|
"encoder.layer.{bid}.attention.self.key", # bert
|
||||||
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
||||||
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
|
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
|
||||||
"model.layers.{bid}.attention.wk", # internlm2
|
"model.layers.{bid}.attention.wk", # internlm2
|
||||||
"transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
|
"transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention value
|
# Attention value
|
||||||
MODEL_TENSOR.ATTN_V: (
|
MODEL_TENSOR.ATTN_V: (
|
||||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
||||||
"layers.{bid}.attention.wv", # llama-pth
|
"layers.{bid}.attention.wv", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.self.value", # bert
|
"encoder.layer.{bid}.attention.self.value", # bert
|
||||||
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
||||||
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
|
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
|
||||||
"model.layers.{bid}.attention.wv", # internlm2
|
"model.layers.{bid}.attention.wv", # internlm2
|
||||||
"transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
|
"transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -168,14 +168,14 @@ class TensorNameMap:
|
||||||
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||||
"model.layers.{bid}.attention.wo", # internlm2
|
"model.layers.{bid}.attention.wo", # internlm2
|
||||||
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
|
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
|
||||||
"transformer.decoder_layer.{bid}.multi_head_attention.linear" # Grok
|
"transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention output norm
|
# Attention output norm
|
||||||
MODEL_TENSOR.ATTN_OUT_NORM: (
|
MODEL_TENSOR.ATTN_OUT_NORM: (
|
||||||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||||
"encoder.layers.{bid}.norm1", # nomic-bert
|
"encoder.layers.{bid}.norm1", # nomic-bert
|
||||||
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
|
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
# Rotary embeddings
|
# Rotary embeddings
|
||||||
|
@ -198,15 +198,13 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.ln2", # yi
|
"model.layers.{bid}.ln2", # yi
|
||||||
"h.{bid}.ln_2", # gpt2
|
"h.{bid}.ln_2", # gpt2
|
||||||
"model.layers.{bid}.ffn_norm", # internlm2
|
"model.layers.{bid}.ffn_norm", # internlm2
|
||||||
|
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
|
||||||
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
|
|
||||||
|
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_INP: (
|
MODEL_TENSOR.FFN_GATE_INP: (
|
||||||
"layers.{bid}.feed_forward.gate", # mixtral
|
"layers.{bid}.feed_forward.gate", # mixtral
|
||||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
||||||
"transformer.decoder_layer.{bid}.router" # Grok
|
"transformer.decoder_layer.{bid}.router" # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
# Feed-forward up
|
# Feed-forward up
|
||||||
|
@ -234,8 +232,8 @@ class TensorNameMap:
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_UP_EXP: (
|
MODEL_TENSOR.FFN_UP_EXP: (
|
||||||
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
|
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
|
||||||
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
|
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
|
||||||
"transformer.decoder_layer.{bid}.moe.{xid}.linear_v", # Grok
|
"transformer.decoder_layer.{bid}.moe.{xid}.linear_v", # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
# AWQ-activation gate
|
# AWQ-activation gate
|
||||||
|
@ -256,7 +254,7 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||||
"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
|
"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
|
||||||
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
|
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
|
||||||
"transformer.decoder_layer.{bid}.moe.{xid}.linear" # Grok
|
"transformer.decoder_layer.{bid}.moe.{xid}.linear" # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
# Feed-forward down
|
# Feed-forward down
|
||||||
|
@ -284,7 +282,7 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||||
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
|
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
|
||||||
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
|
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
|
||||||
"transformer.decoder_layer.{bid}.moe.{xid}.linear_1", # Grok
|
"transformer.decoder_layer.{bid}.moe.{xid}.linear_1", # Grok
|
||||||
|
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -303,9 +301,9 @@ class TensorNameMap:
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.LAYER_OUT_NORM: (
|
MODEL_TENSOR.LAYER_OUT_NORM: (
|
||||||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||||
"encoder.layers.{bid}.norm2", # nomic-bert
|
"encoder.layers.{bid}.norm2", # nomic-bert
|
||||||
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
|
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.SSM_IN: (
|
MODEL_TENSOR.SSM_IN: (
|
||||||
|
|
|
@ -4330,7 +4330,7 @@ static bool llm_load_tensors(
|
||||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||||
|
|
||||||
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
||||||
|
|
||||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||||
|
|
||||||
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
|
||||||
|
@ -4345,8 +4345,6 @@ static bool llm_load_tensors(
|
||||||
layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
|
layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
|
||||||
layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
|
layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
||||||
}
|
}
|
||||||
|
@ -6480,7 +6478,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = moe_out;
|
cur = moe_out;
|
||||||
|
|
||||||
|
|
||||||
// Grok
|
// Grok
|
||||||
// if layer_out_norm is present then apply it before adding the input
|
// if layer_out_norm is present then apply it before adding the input
|
||||||
|
@ -6515,7 +6513,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
// lm_head
|
// lm_head
|
||||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||||
|
|
||||||
|
|
||||||
// Grok
|
// Grok
|
||||||
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
// multiply logits by output_multiplier_scale of 0.5773502691896257
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue