Fix whitespace

2024-03-21 19:59:15 +00:00 · 2024-03-21 19:59:15 +00:00 · 81ce9df3ee
commit 81ce9df3ee
parent 6052e3b3a7
3 changed files with 38 additions and 44 deletions
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -1068,16 +1068,14 @@ class GrokModel(Model):

    def set_vocab(self):
        self._set_vocab_sentencepiece()
-        
+
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-    
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_name("Grok")
-        
-        
-        
+

@Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@ -23,7 +23,7 @@ class TensorNameMap:
            "model.embedding",                           # mamba-qbert
            "backbone.embedding",                        # mamba
            "backbone.embeddings",                       # mamba-hf
-            "transformer.in_out_embed",           # Grok
+            "transformer.in_out_embed",                  # Grok
        ),

        # Token type embeddings
@ -67,7 +67,7 @@ class TensorNameMap:
            "lm_head.ln",                              # phi2
            "model.norm_f",                            # mamba-qbert
            "backbone.norm_f",                         # mamba
-            "transformer.rms_norm",             # Grok
+            "transformer.rms_norm",                    # Grok
        ),

        # Rope frequencies
@ -95,7 +95,7 @@ class TensorNameMap:
            "model.layers.{bid}.attention_norm",                    # internlm2
            "model.layers.{bid}.norm",                              # mamba-qbert
            "backbone.layers.{bid}.norm",                           # mamba
-            "transformer.decoder_layer.{bid}.rms_norm",      # Grok
+            "transformer.decoder_layer.{bid}.rms_norm",             # Grok
        ),

        # Attention norm 2
@ -119,34 +119,34 @@ class TensorNameMap:

        # Attention query
        MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",         # llama-hf
-            "layers.{bid}.attention.wq",                   # llama-pth
-            "encoder.layer.{bid}.attention.self.query",    # bert
-            "transformer.h.{bid}.attn.q_proj",             # gpt-j
-            "model.layers.layers.{bid}.self_attn.q_proj",  # plamo
-            "model.layers.{bid}.attention.wq",             # internlm2
+            "model.layers.{bid}.self_attn.q_proj",                       # llama-hf
+            "layers.{bid}.attention.wq",                                 # llama-pth
+            "encoder.layer.{bid}.attention.self.query",                  # bert
+            "transformer.h.{bid}.attn.q_proj",                           # gpt-j
+            "model.layers.layers.{bid}.self_attn.q_proj",                # plamo
+            "model.layers.{bid}.attention.wq",                           # internlm2
            "transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
        ),

        # Attention key
        MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",         # llama-hf
-            "layers.{bid}.attention.wk",                   # llama-pth
-            "encoder.layer.{bid}.attention.self.key",      # bert
-            "transformer.h.{bid}.attn.k_proj",             # gpt-j
-            "model.layers.layers.{bid}.self_attn.k_proj",  # plamo
-            "model.layers.{bid}.attention.wk",             # internlm2
+            "model.layers.{bid}.self_attn.k_proj",                     # llama-hf
+            "layers.{bid}.attention.wk",                               # llama-pth
+            "encoder.layer.{bid}.attention.self.key",                  # bert
+            "transformer.h.{bid}.attn.k_proj",                         # gpt-j
+            "model.layers.layers.{bid}.self_attn.k_proj",              # plamo
+            "model.layers.{bid}.attention.wk",                         # internlm2
            "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
        ),

        # Attention value
        MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",         # llama-hf
-            "layers.{bid}.attention.wv",                   # llama-pth
-            "encoder.layer.{bid}.attention.self.value",    # bert
-            "transformer.h.{bid}.attn.v_proj",             # gpt-j
-            "model.layers.layers.{bid}.self_attn.v_proj",  # plamo
-            "model.layers.{bid}.attention.wv",             # internlm2
+            "model.layers.{bid}.self_attn.v_proj",                       # llama-hf
+            "layers.{bid}.attention.wv",                                 # llama-pth
+            "encoder.layer.{bid}.attention.self.value",                  # bert
+            "transformer.h.{bid}.attn.v_proj",                           # gpt-j
+            "model.layers.layers.{bid}.self_attn.v_proj",                # plamo
+            "model.layers.{bid}.attention.wv",                           # internlm2
            "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
        ),

@ -168,14 +168,14 @@ class TensorNameMap:
            "model.layers.layers.{bid}.self_attn.o_proj",                # plamo
            "model.layers.{bid}.attention.wo",                           # internlm2
            "encoder.layers.{bid}.attn.out_proj",                        # nomic-bert
-            "transformer.decoder_layer.{bid}.multi_head_attention.linear" # Grok
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok
        ),

        # Attention output norm
        MODEL_TENSOR.ATTN_OUT_NORM: (
            "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
            "encoder.layers.{bid}.norm1",                      # nomic-bert
-            "transformer.decoder_layer.{bid}.rms_norm_1",  # Grok
+            "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
        ),

        # Rotary embeddings
@ -198,15 +198,13 @@ class TensorNameMap:
            "model.layers.{bid}.ln2",                                        # yi
            "h.{bid}.ln_2",                                                  # gpt2
            "model.layers.{bid}.ffn_norm",                                   # internlm2
-            
-            "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
-            
+            "transformer.decoder_layer.{bid}.rms_norm_2",                    # Grok
        ),

        MODEL_TENSOR.FFN_GATE_INP: (
            "layers.{bid}.feed_forward.gate",           # mixtral
            "model.layers.{bid}.block_sparse_moe.gate", # mixtral
-            "transformer.decoder_layer.{bid}.router" # Grok
+            "transformer.decoder_layer.{bid}.router"    # Grok
        ),

        # Feed-forward up
@ -234,8 +232,8 @@ class TensorNameMap:

        MODEL_TENSOR.FFN_UP_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w3",           # mixtral
-            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral  
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear_v", # Grok
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear_v",   # Grok
        ),

        # AWQ-activation gate
@ -256,7 +254,7 @@ class TensorNameMap:
        MODEL_TENSOR.FFN_GATE_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w1",           # mixtral
            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear" # Grok
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear"      # Grok
        ),

        # Feed-forward down
@ -284,7 +282,7 @@ class TensorNameMap:
        MODEL_TENSOR.FFN_DOWN_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w2",           # mixtral
            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear_1", # Grok
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear_1",   # Grok

        ),

@ -303,9 +301,9 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.LAYER_OUT_NORM: (
-            "encoder.layer.{bid}.output.LayerNorm",  # bert
-            "encoder.layers.{bid}.norm2",            # nomic-bert
-            "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
+            "encoder.layer.{bid}.output.LayerNorm",         # bert
+            "encoder.layers.{bid}.norm2",                   # nomic-bert
+            "transformer.decoder_layer.{bid}.rms_norm_3",   # Grok
        ),

        MODEL_TENSOR.SSM_IN: (
--- a/llama.cpp
+++ b/llama.cpp
@ -4330,7 +4330,7 @@ static bool llm_load_tensors(
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                        layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
-                        
+
                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
@ -4345,8 +4345,6 @@ static bool llm_load_tensors(
                            layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {  n_ff, n_embd});
                            layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd,   n_ff});
                        }
-                        
-                        

                        layer.layer_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
                    }
@ -6480,7 +6478,7 @@ struct llm_build_context {
            }

            cur = moe_out;
-            
+

            // Grok
            // if layer_out_norm is present then apply it before adding the input
@ -6515,7 +6513,7 @@ struct llm_build_context {

        // lm_head
        cur = ggml_mul_mat(ctx0, model.output, cur);
-        
+

        // Grok
        // multiply logits by output_multiplier_scale of 0.5773502691896257