layer norm

Georgi Gerganov 2024-12-10 22:37:26 +02:00
parent 435cfd788b
commit b3ba05e5bc
5 changed files with 19 additions and 11 deletions

View file

@@ -2046,7 +2046,7 @@ class OuteTTSVocoderModel(Model):
             logger.debug(f"Skipping {name!r}")
             return []

-        print(f"{self.map_tensor_name(name)} -> {data_torch.shape}")
+        logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}")

         return [(self.map_tensor_name(name), data_torch)]

View file

@@ -88,6 +88,16 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
         if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed":
             new_key = "backbone.embedding.weight"

+        # these are the only rows used
+        # ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100
+        if new_key == "backbone.norm.scale.weight":
+            new_key = "backbone.norm.weight"
+            value = value[0]
+
+        if new_key == "backbone.norm.shift.weight":
+            new_key = "backbone.norm.bias"
+            value = value[0]
+
         size_mb = value.element_size() * value.nelement() / (1024 * 1024)
         print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")

View file

@@ -267,7 +267,6 @@ class MODEL_ARCH(IntEnum):
 class MODEL_TENSOR(IntEnum):
     TOKEN_EMBD       = auto()
     TOKEN_EMBD_NORM  = auto()
-    TOKEN_EMBD_SHIFT = auto()
     TOKEN_TYPES      = auto()
     POS_EMBD         = auto()
     OUTPUT           = auto()
@@ -451,7 +450,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.TOKEN_EMBD:       "token_embd",
     MODEL_TENSOR.TOKEN_EMBD_NORM:  "token_embd_norm",
-    MODEL_TENSOR.TOKEN_EMBD_SHIFT: "token_embd_shift",
     MODEL_TENSOR.TOKEN_TYPES:      "token_types",
     MODEL_TENSOR.POS_EMBD:         "position_embd",
     MODEL_TENSOR.OUTPUT_NORM:      "output_norm",
@@ -1415,7 +1413,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
     MODEL_ARCH.OUTETTS_VOC: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
-        MODEL_TENSOR.TOKEN_EMBD_SHIFT,
         MODEL_TENSOR.CONV1D,
         MODEL_TENSOR.CONV_NEXT_DW,
         MODEL_TENSOR.CONV_NEXT_NORM,
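
With the shift folded into the norm's bias, gguf-py no longer needs a TOKEN_EMBD_SHIFT entry; both parameters serialize under the token_embd_norm base name. A quick sketch against this revision's in-tree gguf-py (MODEL_ARCH.OUTETTS_VOC only exists in this tree):

    from gguf.constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES

    # the vocoder's tensor list after this change: no TOKEN_EMBD_SHIFT
    assert MODEL_TENSOR.TOKEN_EMBD_NORM in MODEL_TENSORS[MODEL_ARCH.OUTETTS_VOC]

    base = TENSOR_NAMES[MODEL_TENSOR.TOKEN_EMBD_NORM]
    print(base + ".weight", base + ".bias")  # token_embd_norm.weight token_embd_norm.bias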

View file

@@ -43,11 +43,7 @@ class TensorNameMap:
             "emb_ln",                # nomic-bert
             "transformer.norm",      # openelm
             "rwkv.blocks.0.pre_ln",  # rwkv
-            "backbone.norm.scale",   # outetts
-        ),
-
-        MODEL_TENSOR.TOKEN_EMBD_SHIFT: (
-            "backbone.norm.shift",   # outetts
+            "backbone.norm",         # outetts
         ),

         # Position embeddings
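
A single "backbone.norm" source entry covers both tensors because the converter resolves names with the .weight/.bias suffix stripped. Roughly how the lookup goes, again assuming this revision's gguf-py:

    from gguf.constants import MODEL_ARCH
    from gguf.tensor_mapping import get_tensor_name_map

    tmap = get_tensor_name_map(MODEL_ARCH.OUTETTS_VOC, n_blocks=0)

    # the renamed checkpoint keys both resolve through the one "backbone.norm" entry
    print(tmap.get_name("backbone.norm.weight", try_suffixes=(".weight", ".bias")))  # token_embd_norm.weight
    print(tmap.get_name("backbone.norm.bias",   try_suffixes=(".weight", ".bias")))  # token_embd_norm.bias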

View file

@@ -505,7 +505,6 @@ struct LLM_KV {
 enum llm_tensor {
     LLM_TENSOR_TOKEN_EMBD,
     LLM_TENSOR_TOKEN_EMBD_NORM,
-    LLM_TENSOR_TOKEN_EMBD_SHIFT,
     LLM_TENSOR_TOKEN_TYPES,
     LLM_TENSOR_POS_EMBD,
     LLM_TENSOR_OUTPUT,
@@ -1619,7 +1618,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
     {
         { LLM_TENSOR_TOKEN_EMBD,       "token_embd" },
         { LLM_TENSOR_TOKEN_EMBD_NORM,  "token_embd_norm" },
-        { LLM_TENSOR_TOKEN_EMBD_SHIFT, "token_embd_shift" },
         { LLM_TENSOR_CONV1D,           "conv1d" },
         { LLM_TENSOR_CONV_NEXT_DW,     "conv_next.dw" },
         { LLM_TENSOR_CONV_NEXT_NORM,   "conv_next.norm" },
@@ -9519,6 +9517,9 @@ static bool llm_load_tensors(
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

+                    model.tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {768}, 0);
+                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {768}, 0);
+
                     model.conv_1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0);
                     model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {768}, 0);
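
The loader now expects a {768} weight and bias for the token-embedding norm. One way to sanity-check a converted file before loading it, using gguf-py's GGUFReader (the path is a placeholder):

    from gguf import GGUFReader

    reader = GGUFReader("outetts-vocoder.gguf")  # placeholder path
    for t in reader.tensors:
        if t.name.startswith("token_embd_norm"):
            print(t.name, list(t.shape))  # expect shape [768] for .weight and .bias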
@@ -17337,6 +17338,10 @@ struct llm_build_context {
                         LLM_NORM_GROUP, cb, 0);
             }

+            cur = llm_build_norm(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), hparams,
+                    model.tok_norm,
+                    model.tok_norm_b,
+                    LLM_NORM, cb, -1);

             printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]);