tts : add OuteTTS support (#10784)

* server : add "tokens" output ggml-ci * server : output embeddings for all tokens when pooling = none ggml-ci * server : be explicit about the pooling type in the tests ggml-ci * server : do not normalize embeddings when there is no pooling ggml-ci * llama : add OuteTTS support (wip) * wip * extract features * first conv * group norm * resnet conv * resnet * attn * pos net * layer norm * convnext * head * hann window * fix n_embd + remove llama.cpp hacks * compute hann window * fft * spectrum processing * clean-up * tts : receive input text and generate codes * clip : fix new conv name * tts : minor fix * tts : add header + minor fixes ggml-ci * tts : add matchematical constant ggml-ci * tts : fix sampling + cut initial noise * tts : fixes * tts : update default samplers ggml-ci * tts : text pre-processing * tts : outetts-voc -> wavtokenizer-dec * tts : remove hardcoded constants ggml-ci * tts : fix tensor shapes * llama : refactor wavtokenizer tensors ggml-ci * cont ggml-ci * cont [no ci] * llama : update WavTokenizer to non-causal attn * llama : handle no-vocab detokenization * tts : add Python example for OuteTTS (wip) * tts : extend python example to generate spectrogram ggml-ci * server : fix rebase artifacts * tts : enable "return_tokens" in Python example ggml-ci * tts : minor fixes * common : support HF download for vocoder
2024-12-18 19:27:21 +02:00 · 2024-12-18 19:27:21 +02:00 · 0bf2d10c55
commit 0bf2d10c55
parent 7bbb5acf12
19 changed files with 2509 additions and 532 deletions
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@ -631,6 +631,21 @@ class GGUFWriter:
    def add_embedding_length(self, length: int) -> None:
        self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)

+    def add_features_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
+
+    def add_posnet_embedding_length(self, length: int) -> None:
+        self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_posnet_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_convnext_embedding_length(self, length: int) -> None:
+        self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_convnext_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
+
    def add_block_count(self, length: int) -> None:
        self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)

@ -727,6 +742,12 @@ class GGUFWriter:
    def add_layer_norm_rms_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)

+    def add_group_norm_eps(self, value: float) -> None:
+        self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value)
+
+    def add_group_norm_groups(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value)
+
    def add_causal_attention(self, value: bool) -> None:
        self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)