From 81ce9df3ee03d68f857d53a200b83b7269d695b1 Mon Sep 17 00:00:00 2001
From: Julius Arkenberg
Date: Thu, 21 Mar 2024 19:59:15 +0000
Subject: [PATCH] Fix whitespaces

---
 convert-hf-to-gguf.py          |  8 ++---
 gguf-py/gguf/tensor_mapping.py | 66 +++++++++++++++++-----------------
 llama.cpp                      |  8 ++---
 3 files changed, 38 insertions(+), 44 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c9cca34fa..723ea18e3 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1068,16 +1068,14 @@ class GrokModel(Model):

     def set_vocab(self):
         self._set_vocab_sentencepiece()
-
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_name("Grok")
-
-
-
+

 @Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 7f482dd77..11fd34b8b 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -23,7 +23,7 @@ class TensorNameMap:
             "model.embedding", # mamba-qbert
             "backbone.embedding", # mamba
             "backbone.embeddings", # mamba-hf
-            "transformer.in_out_embed", # Grok
+            "transformer.in_out_embed", # Grok
         ),

         # Token type embeddings
@@ -67,7 +67,7 @@ class TensorNameMap:
             "lm_head.ln", # phi2
             "model.norm_f", # mamba-qbert
             "backbone.norm_f", # mamba
-            "transformer.rms_norm", # Grok
+            "transformer.rms_norm", # Grok
         ),

         # Rope frequencies
@@ -95,7 +95,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention_norm", # internlm2
             "model.layers.{bid}.norm", # mamba-qbert
             "backbone.layers.{bid}.norm", # mamba
-            "transformer.decoder_layer.{bid}.rms_norm", # Grok
+            "transformer.decoder_layer.{bid}.rms_norm", # Grok
         ),

         # Attention norm 2
@@ -119,34 +119,34 @@ class TensorNameMap:

         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj", # llama-hf
-            "layers.{bid}.attention.wq", # llama-pth
-            "encoder.layer.{bid}.attention.self.query", # bert
-            "transformer.h.{bid}.attn.q_proj", # gpt-j
-            "model.layers.layers.{bid}.self_attn.q_proj", # plamo
-            "model.layers.{bid}.attention.wq", # internlm2
+            "model.layers.{bid}.self_attn.q_proj", # llama-hf
+            "layers.{bid}.attention.wq", # llama-pth
+            "encoder.layer.{bid}.attention.self.query", # bert
+            "transformer.h.{bid}.attn.q_proj", # gpt-j
+            "model.layers.layers.{bid}.self_attn.q_proj", # plamo
+            "model.layers.{bid}.attention.wq", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
         ),

         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj", # llama-hf
-            "layers.{bid}.attention.wk", # llama-pth
-            "encoder.layer.{bid}.attention.self.key", # bert
-            "transformer.h.{bid}.attn.k_proj", # gpt-j
-            "model.layers.layers.{bid}.self_attn.k_proj", # plamo
-            "model.layers.{bid}.attention.wk", # internlm2
+            "model.layers.{bid}.self_attn.k_proj", # llama-hf
+            "layers.{bid}.attention.wk", # llama-pth
+            "encoder.layer.{bid}.attention.self.key", # bert
+            "transformer.h.{bid}.attn.k_proj", # gpt-j
+            "model.layers.layers.{bid}.self_attn.k_proj", # plamo
+            "model.layers.{bid}.attention.wk", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
         ),

         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj", # llama-hf
-            "layers.{bid}.attention.wv", # llama-pth
-            "encoder.layer.{bid}.attention.self.value", # bert
-            "transformer.h.{bid}.attn.v_proj", # gpt-j
-            "model.layers.layers.{bid}.self_attn.v_proj", # plamo
-            "model.layers.{bid}.attention.wv", # internlm2
+            "model.layers.{bid}.self_attn.v_proj", # llama-hf
+            "layers.{bid}.attention.wv", # llama-pth
+            "encoder.layer.{bid}.attention.self.value", # bert
+            "transformer.h.{bid}.attn.v_proj", # gpt-j
+            "model.layers.layers.{bid}.self_attn.v_proj", # plamo
+            "model.layers.{bid}.attention.wv", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
         ),

@@ -168,14 +168,14 @@ class TensorNameMap:
             "model.layers.layers.{bid}.self_attn.o_proj", # plamo
             "model.layers.{bid}.attention.wo", # internlm2
             "encoder.layers.{bid}.attn.out_proj", # nomic-bert
-            "transformer.decoder_layer.{bid}.multi_head_attention.linear" # Grok
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok
         ),

         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm", # bert
             "encoder.layers.{bid}.norm1", # nomic-bert
-            "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
+            "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
         ),

         # Rotary embeddings
@@ -198,15 +198,13 @@ class TensorNameMap:
             "model.layers.{bid}.ln2", # yi
             "h.{bid}.ln_2", # gpt2
             "model.layers.{bid}.ffn_norm", # internlm2
-
-            "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
-
+            "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate", # mixtral
             "model.layers.{bid}.block_sparse_moe.gate", # mixtral
-            "transformer.decoder_layer.{bid}.router" # Grok
+            "transformer.decoder_layer.{bid}.router" # Grok
         ),

         # Feed-forward up
@@ -234,8 +232,8 @@ class TensorNameMap:

         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
-            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear_v", # Grok
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear_v", # Grok
         ),

         # AWQ-activation gate
@@ -256,7 +254,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
             "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear" # Grok
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear" # Grok
         ),

         # Feed-forward down
@@ -284,7 +282,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
             "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
-            "transformer.decoder_layer.{bid}.moe.{xid}.linear_1", # Grok
+            "transformer.decoder_layer.{bid}.moe.{xid}.linear_1", # Grok
         ),

@@ -303,9 +301,9 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.LAYER_OUT_NORM: (
-            "encoder.layer.{bid}.output.LayerNorm", # bert
-            "encoder.layers.{bid}.norm2", # nomic-bert
-            "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
+            "encoder.layer.{bid}.output.LayerNorm", # bert
+            "encoder.layers.{bid}.norm2", # nomic-bert
+            "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
         ),

         MODEL_TENSOR.SSM_IN: (
diff --git a/llama.cpp b/llama.cpp
index a015d67b1..5ad82e69b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4330,7 +4330,7 @@ static bool llm_load_tensors(
                         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                         layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
-
+
                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

                         layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
@@ -4345,8 +4345,6 @@ static bool llm_load_tensors(
                             layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
                             layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
                         }
-
-
                         layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
                     }
@@ -6480,7 +6478,7 @@ struct llm_build_context {
            }

            cur = moe_out;
-
+
            // Grok
            // if layer_out_norm is present then apply it before adding the input
@@ -6515,7 +6513,7 @@ struct llm_build_context {

        // lm_head
        cur = ggml_mul_mat(ctx0, model.output, cur);
-
+
        // Grok
        // multiply logits by output_multiplier_scale of 0.5773502691896257
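
Note on the mappings touched above: each TensorNameMap entry pairs a target tensor with the names it may carry in the source checkpoint, where {bid} stands for the block (layer) index and {xid} for the expert index. The Python sketch below only illustrates that placeholder expansion; it is not the gguf-py implementation. The target names ("blk.{bid}.*") and the helper build_name_map() are assumed/illustrative, while the source patterns are the Grok ones from the hunks above. The final assert checks that the output_multiplier_scale quoted in the llama.cpp hunk, 0.5773502691896257, equals 1/sqrt(3).

import math

# Source patterns are taken from the tensor_mapping.py hunks above; the target
# names ("blk.{bid}....") are illustrative stand-ins, not guaranteed GGUF names.
GROK_PATTERNS = {
    "blk.{bid}.attn_output":  "transformer.decoder_layer.{bid}.multi_head_attention.linear",
    "blk.{bid}.ffn_gate_inp": "transformer.decoder_layer.{bid}.router",
    "blk.{bid}.ffn_up.{xid}": "transformer.decoder_layer.{bid}.moe.{xid}.linear_v",
}


def build_name_map(n_blocks: int, n_experts: int) -> dict[str, str]:
    """Expand {bid}/{xid} placeholders into a flat source-name -> target-name lookup."""
    name_map: dict[str, str] = {}
    for bid in range(n_blocks):
        for xid in range(n_experts):
            # str.format() ignores unused keyword arguments, so patterns without
            # {xid} are simply rewritten to the same key on every expert pass.
            for target, source in GROK_PATTERNS.items():
                name_map[source.format(bid=bid, xid=xid)] = target.format(bid=bid, xid=xid)
    return name_map


if __name__ == "__main__":
    names = build_name_map(n_blocks=2, n_experts=2)
    print(names["transformer.decoder_layer.1.moe.0.linear_v"])  # -> blk.1.ffn_up.0
    print(names["transformer.decoder_layer.0.router"])          # -> blk.0.ffn_gate_inp

    # The Grok output_multiplier_scale quoted in the llama.cpp hunk is 1/sqrt(3).
    assert abs(0.5773502691896257 - 1.0 / math.sqrt(3)) < 1e-15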