tts : add OuteTTS support (#10784)
* server : add "tokens" output ggml-ci * server : output embeddings for all tokens when pooling = none ggml-ci * server : be explicit about the pooling type in the tests ggml-ci * server : do not normalize embeddings when there is no pooling ggml-ci * llama : add OuteTTS support (wip) * wip * extract features * first conv * group norm * resnet conv * resnet * attn * pos net * layer norm * convnext * head * hann window * fix n_embd + remove llama.cpp hacks * compute hann window * fft * spectrum processing * clean-up * tts : receive input text and generate codes * clip : fix new conv name * tts : minor fix * tts : add header + minor fixes ggml-ci * tts : add matchematical constant ggml-ci * tts : fix sampling + cut initial noise * tts : fixes * tts : update default samplers ggml-ci * tts : text pre-processing * tts : outetts-voc -> wavtokenizer-dec * tts : remove hardcoded constants ggml-ci * tts : fix tensor shapes * llama : refactor wavtokenizer tensors ggml-ci * cont ggml-ci * cont [no ci] * llama : update WavTokenizer to non-causal attn * llama : handle no-vocab detokenization * tts : add Python example for OuteTTS (wip) * tts : extend python example to generate spectrogram ggml-ci * server : fix rebase artifacts * tts : enable "return_tokens" in Python example ggml-ci * tts : minor fixes * common : support HF download for vocoder
This commit is contained in:
parent
7bbb5acf12
commit
0bf2d10c55
19 changed files with 2509 additions and 532 deletions
|
@ -42,6 +42,7 @@ class TensorNameMap:
|
|||
"emb_ln", # nomic-bert
|
||||
"transformer.norm", # openelm
|
||||
"rwkv.blocks.0.pre_ln", # rwkv
|
||||
"backbone.norm", # wavtokenizer
|
||||
),
|
||||
|
||||
# Position embeddings
|
||||
|
@ -60,6 +61,7 @@ class TensorNameMap:
|
|||
"lm_head.linear", # phi2
|
||||
"output_layer", # chatglm
|
||||
"head", # rwkv
|
||||
"head.out", # wavtokenizer
|
||||
),
|
||||
|
||||
# Output norm
|
||||
|
@ -80,6 +82,7 @@ class TensorNameMap:
|
|||
"transformer.norm", # openelm
|
||||
"model.norm", # nemotron
|
||||
"rwkv.ln_out", # rwkv
|
||||
"backbone.final_layer_norm", # wavtokenizer
|
||||
),
|
||||
|
||||
# Rope frequencies
|
||||
|
@ -90,6 +93,10 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.ROPE_FACTORS_LONG: (),
|
||||
MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
|
||||
|
||||
MODEL_TENSOR.CONV1D: (
|
||||
"backbone.embed", # roberta
|
||||
),
|
||||
}
|
||||
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
|
@ -681,6 +688,8 @@ class TensorNameMap:
|
|||
"encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
|
||||
),
|
||||
|
||||
############################################################################
|
||||
# TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM: (
|
||||
"encoder.final_layer_norm", # t5
|
||||
),
|
||||
|
@ -693,6 +702,67 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.CLS_OUT: (
|
||||
"classifier.out_proj", # roberta
|
||||
),
|
||||
#############################################################################
|
||||
|
||||
MODEL_TENSOR.CONVNEXT_DW: (
|
||||
"backbone.convnext.{bid}.dwconv", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CONVNEXT_NORM: (
|
||||
"backbone.convnext.{bid}.norm", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CONVNEXT_PW1: (
|
||||
"backbone.convnext.{bid}.pwconv1", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CONVNEXT_PW2: (
|
||||
"backbone.convnext.{bid}.pwconv2", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CONVNEXT_GAMMA: (
|
||||
"backbone.convnext.{bid}.gamma", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.POSNET_CONV1: (
|
||||
"backbone.posnet.{bid}.conv1", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.POSNET_CONV2: (
|
||||
"backbone.posnet.{bid}.conv2", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.POSNET_NORM: (
|
||||
"backbone.posnet.{bid}.norm", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.POSNET_NORM1: (
|
||||
"backbone.posnet.{bid}.norm1", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.POSNET_NORM2: (
|
||||
"backbone.posnet.{bid}.norm2", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.POSNET_ATTN_NORM: (
|
||||
"backbone.posnet.{bid}.norm", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.POSNET_ATTN_Q: (
|
||||
"backbone.posnet.{bid}.q", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.POSNET_ATTN_K: (
|
||||
"backbone.posnet.{bid}.k", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.POSNET_ATTN_V: (
|
||||
"backbone.posnet.{bid}.v", # wavtokenizer
|
||||
),
|
||||
|
||||
MODEL_TENSOR.POSNET_ATTN_OUT: (
|
||||
"backbone.posnet.{bid}.proj_out", # wavtokenizer
|
||||
),
|
||||
}
|
||||
|
||||
# architecture-specific block mappings
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue