From b3ba05e5bc62de2154198a8f335b58d54457e259 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 10 Dec 2024 22:37:26 +0200
Subject: [PATCH] tts : use token_embd_norm weight/bias (layer norm) instead of separate scale/shift tensors

---
 convert_hf_to_gguf.py            |  2 +-
 examples/tts/convert_pt_to_hf.py | 10 ++++++++++
 gguf-py/gguf/constants.py        |  3 ---
 gguf-py/gguf/tensor_mapping.py   |  6 +-----
 src/llama.cpp                    |  9 +++++++--
 5 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ebeb3840c..a86490831 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2046,7 +2046,7 @@ class OuteTTSVocoderModel(Model):
             logger.debug(f"Skipping {name!r}")
             return []
 
-        print(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
 
         return [(self.map_tensor_name(name), data_torch)]
 
diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index 389d2de50..4a0d4bcc8 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -88,6 +88,16 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
         if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed":
             new_key = "backbone.embedding.weight"
 
+        # only the first row (index 0) of the norm scale/shift weights is used, so keep just that row
+        # ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100
+        if new_key == "backbone.norm.scale.weight":
+            new_key = "backbone.norm.weight"
+            value = value[0]
+
+        if new_key == "backbone.norm.shift.weight":
+            new_key = "backbone.norm.bias"
+            value = value[0]
+
         size_mb = value.element_size() * value.nelement() / (1024 * 1024)
         print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 14e68cffa..81e434d11 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -267,7 +267,6 @@ class MODEL_ARCH(IntEnum):
 class MODEL_TENSOR(IntEnum):
     TOKEN_EMBD           = auto()
     TOKEN_EMBD_NORM      = auto()
-    TOKEN_EMBD_SHIFT     = auto()
     TOKEN_TYPES          = auto()
     POS_EMBD             = auto()
     OUTPUT               = auto()
@@ -451,7 +450,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.TOKEN_EMBD:                "token_embd",
     MODEL_TENSOR.TOKEN_EMBD_NORM:           "token_embd_norm",
-    MODEL_TENSOR.TOKEN_EMBD_SHIFT:          "token_embd_shift",
     MODEL_TENSOR.TOKEN_TYPES:               "token_types",
     MODEL_TENSOR.POS_EMBD:                  "position_embd",
     MODEL_TENSOR.OUTPUT_NORM:               "output_norm",
@@ -1415,7 +1413,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
     MODEL_ARCH.OUTETTS_VOC: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
-        MODEL_TENSOR.TOKEN_EMBD_SHIFT,
         MODEL_TENSOR.CONV1D,
         MODEL_TENSOR.CONV_NEXT_DW,
         MODEL_TENSOR.CONV_NEXT_NORM,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 4355ccf11..872205e77 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -43,11 +43,7 @@ class TensorNameMap:
             "emb_ln",                     # nomic-bert
             "transformer.norm",           # openelm
             "rwkv.blocks.0.pre_ln",       # rwkv
-            "backbone.norm.scale",        # outetts
-        ),
-
-        MODEL_TENSOR.TOKEN_EMBD_SHIFT: (
-            "backbone.norm.shift",        # outetts
+            "backbone.norm",              # outetts
         ),
 
         # Position embeddings
diff --git a/src/llama.cpp b/src/llama.cpp
index 31ce4b31a..6c38d9315 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -505,7 +505,6 @@ struct LLM_KV {
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
     LLM_TENSOR_TOKEN_EMBD_NORM,
-    LLM_TENSOR_TOKEN_EMBD_SHIFT,
     LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
@@ -1619,7 +1618,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         {
             { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
             { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
-            { LLM_TENSOR_TOKEN_EMBD_SHIFT,  "token_embd_shift" },
             { LLM_TENSOR_CONV1D,            "conv1d" },
             { LLM_TENSOR_CONV_NEXT_DW,      "conv_next.dw" },
             { LLM_TENSOR_CONV_NEXT_NORM,    "conv_next.norm" },
@@ -9519,6 +9517,9 @@ static bool llm_load_tensors(
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
+                    model.tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {768}, 0);
+                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {768}, 0);
+
                     model.conv_1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0);
                     model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {768}, 0);
 
@@ -17337,6 +17338,10 @@ struct llm_build_context {
                     LLM_NORM_GROUP, cb, 0);
         }
 
+        cur = llm_build_norm(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), hparams,
+                model.tok_norm,
+                model.tok_norm_b,
+                LLM_NORM, cb, -1);
 
         printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]);