From 81ce9df3ee03d68f857d53a200b83b7269d695b1 Mon Sep 17 00:00:00 2001
From: Julius Arkenberg
Date: Thu, 21 Mar 2024 19:59:15 +0000
Subject: [PATCH] Fix whitespaces

---
 convert-hf-to-gguf.py          |  8 ++---
 gguf-py/gguf/tensor_mapping.py | 66 +++++++++++++++++-----------------
 llama.cpp                      |  8 ++---
 3 files changed, 38 insertions(+), 44 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c9cca34fa..723ea18e3 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1068,16 +1068,14 @@ class GrokModel(Model):

     def set_vocab(self):
         self._set_vocab_sentencepiece()
-
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_name("Grok")
-
-
-
+

 @Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 7f482dd77..11fd34b8b 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -23,7 +23,7 @@ class TensorNameMap:
             "model.embedding", # mamba-qbert
             "backbone.embedding", # mamba
             "backbone.embeddings", # mamba-hf
-            "transformer.in_out_embed", # Grok
+            "transformer.in_out_embed", # Grok
         ),

         # Token type embeddings
@@ -67,7 +67,7 @@ class TensorNameMap:
             "lm_head.ln", # phi2
             "model.norm_f", # mamba-qbert
             "backbone.norm_f", # mamba
-            "transformer.rms_norm", # Grok
+            "transformer.rms_norm", # Grok
         ),

         # Rope frequencies
@@ -95,7 +95,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention_norm", # internlm2
             "model.layers.{bid}.norm", # mamba-qbert
             "backbone.layers.{bid}.norm", # mamba
-            "transformer.decoder_layer.{bid}.rms_norm", # Grok
+            "transformer.decoder_layer.{bid}.rms_norm", # Grok
         ),

         # Attention norm 2
@@ -119,34 +119,34 @@ class TensorNameMap:

         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj", # llama-hf
-            "layers.{bid}.attention.wq", # llama-pth
-            "encoder.layer.{bid}.attention.self.query", # bert
-            "transformer.h.{bid}.attn.q_proj", # gpt-j
-            "model.layers.layers.{bid}.self_attn.q_proj", # plamo
-            "model.layers.{bid}.attention.wq", # internlm2
+            "model.layers.{bid}.self_attn.q_proj", # llama-hf
+            "layers.{bid}.attention.wq", # llama-pth
+            "encoder.layer.{bid}.attention.self.query", # bert
+            "transformer.h.{bid}.attn.q_proj", # gpt-j
+            "model.layers.layers.{bid}.self_attn.q_proj", # plamo
+            "model.layers.{bid}.attention.wq", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
         ),

         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj", # llama-hf
-            "layers.{bid}.attention.wk", # llama-pth
-            "encoder.layer.{bid}.attention.self.key", # bert
-            "transformer.h.{bid}.attn.k_proj", # gpt-j
-            "model.layers.layers.{bid}.self_attn.k_proj", # plamo
-            "model.layers.{bid}.attention.wk", # internlm2
+            "model.layers.{bid}.self_attn.k_proj", # llama-hf
+            "layers.{bid}.attention.wk", # llama-pth
+            "encoder.layer.{bid}.attention.self.key", # bert
+            "transformer.h.{bid}.attn.k_proj", # gpt-j
+            "model.layers.layers.{bid}.self_attn.k_proj", # plamo
+            "model.layers.{bid}.attention.wk", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
         ),

         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj", # llama-hf
-            "layers.{bid}.attention.wv", # llama-pth
-            "encoder.layer.{bid}.attention.self.value", # bert
-            "transformer.h.{bid}.attn.v_proj", # gpt-j
-            "model.layers.layers.{bid}.self_attn.v_proj", # plamo
-            "model.layers.{bid}.attention.wv", # internlm2
+            "model.layers.{bid}.self_attn.v_proj", # llama-hf
+            "layers.{bid}.attention.wv", # llama-pth
+            "encoder.layer.{bid}.attention.self.value", # bert
+            "transformer.h.{bid}.attn.v_proj", # gpt-j
+            "model.layers.layers.{bid}.self_attn.v_proj", # plamo
+            "model.layers.{bid}.attention.wv", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
         ),

@@ -168,14 +168,14 @@ class TensorNameMap:
             "model.layers.layers.{bid}.self_attn.o_proj", # plamo
             "model.layers.{bid}.attention.wo", # internlm2
             "encoder.layers.{bid}.attn.out_proj", # nomic-bert
-            "transformer.decoder_layer.{bid}.multi_head_attention.linear" # Grok
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok
         ),

         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm", # bert
             "encoder.layers.{bid}.norm1", # nomic-bert
-            "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
+            "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
         ),

         # Rotary embeddings
@@ -198,15 +198,13 @@ class TensorNameMap:
             "model.layers.{bid}.ln2", # yi
             "h.{bid}.ln_2", # gpt2
             "model.layers.{bid}.ffn_norm", # internlm2
-
-            "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
-
+            "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate", # mixtral
             "model.layers.{bid}.block_sparse_moe.gate", # mixtral
-            "transformer.decoder_layer.{bid}.router" # Grok
+            "transformer.decoder_layer.{bid}.router" # Grok
         ),

         # Feed-forward up
@@ -234,8 +232,8 @@ class TensorNameMap:

         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
-            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear_v", # Grok
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear_v", # Grok
         ),

         # AWQ-activation gate
@@ -256,7 +254,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
             "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear" # Grok
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear" # Grok
         ),

         # Feed-forward down
@@ -284,7 +282,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
             "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear_1", # Grok
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear_1", # Grok
         ),

@@ -303,9 +301,9 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.LAYER_OUT_NORM: (
-            "encoder.layer.{bid}.output.LayerNorm", # bert
-            "encoder.layers.{bid}.norm2", # nomic-bert
-            "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
+            "encoder.layer.{bid}.output.LayerNorm", # bert
+            "encoder.layers.{bid}.norm2", # nomic-bert
+            "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
         ),

         MODEL_TENSOR.SSM_IN: (
diff --git a/llama.cpp b/llama.cpp
index a015d67b1..5ad82e69b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4330,7 +4330,7 @@ static bool llm_load_tensors(
                         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                         layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
-
+
                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

                         layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
@@ -4345,8 +4345,6 @@ static bool llm_load_tensors(
                             layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
                             layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
                         }
-
-
                         layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
                     }
@@ -6480,7 +6478,7 @@ struct llm_build_context {
            }

            cur = moe_out;
-
+
            // Grok
            // if layer_out_norm is present then apply it before adding the input
@@ -6515,7 +6513,7 @@ struct llm_build_context {

        // lm_head
        cur = ggml_mul_mat(ctx0, model.output, cur);
-
+
        // Grok
        // multiply logits by output_multiplier_scale of 0.5773502691896257
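
Note on the mappings touched above: each TensorNameMap entry pairs a target tensor with the names it may carry in the source checkpoint, where {bid} stands for the block (layer) index and {xid} for the expert index. The Python sketch below only illustrates that placeholder expansion; it is not the gguf-py implementation. The target names ("blk.{bid}.*") and the helper build_name_map() are assumed/illustrative, while the source patterns are the Grok ones from the hunks above. The final assert checks that the output_multiplier_scale quoted in the llama.cpp hunk, 0.5773502691896257, equals 1/sqrt(3).

import math

# Source patterns are taken from the tensor_mapping.py hunks above; the target
# names ("blk.{bid}....") are illustrative stand-ins, not guaranteed GGUF names.
GROK_PATTERNS = {
    "blk.{bid}.attn_output":  "transformer.decoder_layer.{bid}.multi_head_attention.linear",
    "blk.{bid}.ffn_gate_inp": "transformer.decoder_layer.{bid}.router",
    "blk.{bid}.ffn_up.{xid}": "transformer.decoder_layer.{bid}.moe.{xid}.linear_v",
}


def build_name_map(n_blocks: int, n_experts: int) -> dict[str, str]:
    """Expand {bid}/{xid} placeholders into a flat source-name -> target-name lookup."""
    name_map: dict[str, str] = {}
    for bid in range(n_blocks):
        for xid in range(n_experts):
            # str.format() ignores unused keyword arguments, so patterns without
            # {xid} are simply rewritten to the same key on every expert pass.
            for target, source in GROK_PATTERNS.items():
                name_map[source.format(bid=bid, xid=xid)] = target.format(bid=bid, xid=xid)
    return name_map


if __name__ == "__main__":
    names = build_name_map(n_blocks=2, n_experts=2)
    print(names["transformer.decoder_layer.1.moe.0.linear_v"])  # -> blk.1.ffn_up.0
    print(names["transformer.decoder_layer.0.router"])          # -> blk.0.ffn_gate_inp

    # The Grok output_multiplier_scale quoted in the llama.cpp hunk is 1/sqrt(3).
    assert abs(0.5773502691896257 - 1.0 / math.sqrt(3)) < 1e-15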