From 980d6310322859cc957d77dbfcdee92b62260302 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 16 Dec 2024 19:21:50 +0200
Subject: [PATCH] llama : refactor wavtokenizer tensors

ggml-ci
---
 common/arg.cpp                   |   2 +-
 convert_hf_to_gguf.py            |  12 +-
 examples/tts/convert_pt_to_hf.py |  27 +-
 examples/tts/tts.cpp             |   4 +
 gguf-py/gguf/constants.py        | 100 ++---
 gguf-py/gguf/gguf_writer.py      |  14 +-
 gguf-py/gguf/tensor_mapping.py   |  50 +--
 src/llama.cpp                    | 694 ++++++++++++-------------
 8 files changed, 394 insertions(+), 509 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 3d03c676c..93c15ecdc 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -842,7 +842,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--sampling-seq"}, "SEQUENCE",
+        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
             params.sampling.samplers = common_sampler_types_from_chars(value);
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index ad423d0b2..7bf67a268 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -326,8 +326,8 @@ class Model:
                         gguf.MODEL_TENSOR.TIME_MIX_W2,
                         gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
                         gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
-                        gguf.MODEL_TENSOR.POS_NET_NORM1,
-                        gguf.MODEL_TENSOR.POS_NET_NORM2,
+                        gguf.MODEL_TENSOR.POSNET_NORM1,
+                        gguf.MODEL_TENSOR.POSNET_NORM2,
                     )
                 )
                 or not new_name.endswith(".weight")
@@ -2059,12 +2059,16 @@ class WavTokenizerDecModel(Model):
         super().set_gguf_parameters()
         self.gguf_writer.add_vocab_size         (self.hparams["vocab_size"])
         self.gguf_writer.add_features_length    (self.hparams["n_embd_features"])
-        self.gguf_writer.add_posnet_length      (self.hparams["n_embd_posnet"])
-        self.gguf_writer.add_convnext_length    (self.hparams["n_embd_convnext"])
         self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"])
         self.gguf_writer.add_group_norm_eps     (self.hparams["group_norm_epsilon"])
         self.gguf_writer.add_group_norm_groups  (self.hparams["group_norm_groups"])
 
+        self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"])
+        self.gguf_writer.add_posnet_block_count     (self.hparams["posnet"]["n_layer"])
+
+        self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"])
+        self.gguf_writer.add_convnext_block_count     (self.hparams["convnext"]["n_layer"])
+
 
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index 99fef33d2..adba21a36 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -74,12 +74,13 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
         new_key = key
 
         new_key = new_key.replace('state_dict.', '')
+        new_key = new_key.replace('pos_net', 'posnet')
 
-        # check if matches "backbone.pos_net.%d.bias" or "backbone.pos_net.%d.weight"
-        if new_key.startswith("backbone.pos_net."):
-            match = re.match(r"backbone\.pos_net\.(\d+)\.(bias|weight)", new_key)
+        # check if matches "backbone.posnet.%d.bias" or "backbone.posnet.%d.weight"
+        if new_key.startswith("backbone.posnet."):
+            match = re.match(r"backbone\.posnet\.(\d+)\.(bias|weight)", new_key)
             if match:
-                new_key = f"backbone.pos_net.{match.group(1)}.norm.{match.group(2)}"
+                new_key = f"backbone.posnet.{match.group(1)}.norm.{match.group(2)}"
 
         # "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight"
"backbone.embedding.weight" if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed": @@ -99,7 +100,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): new_key = new_key.replace("gamma", "gamma.weight") # convert from 1D [768] to 2D [768, 1] so that ggml_add can broadcast the bias - if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.pos_net") or new_key.startswith("backbone.embed.bias")): + if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.posnet") or new_key.startswith("backbone.embed.bias")): value = value.unsqueeze(1) if new_key.endswith("dwconv.bias"): @@ -155,8 +156,6 @@ config = { ], "hidden_size": 1282, "n_embd_features": 512, - "n_embd_posnet": 768, - "n_embd_convnext": 768, "n_ff": 2304, "vocab_size": 4096, "n_head": 1, @@ -164,7 +163,19 @@ config = { "group_norm_epsilon": 1e-6, "group_norm_groups": 32, "max_position_embeddings": 8192, # ? - "num_hidden_layers": 12 + "n_layer": 12, + "posnet": { + "n_embd": 768, + "n_layer": 6 + }, + "convnext": { + "n_embd": 768, + "n_layer": 12 + }, + #"n_embd_posnet": 768, + #"n_embd_convnext": 768, + #"n_layer_posnet": 6, + #"n_layer_convnext": 12 } with open(path_dst + '/config.json', 'w') as f: diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 57956e5a1..aa5508be3 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -476,6 +476,10 @@ int main(int argc, char ** argv) { smpl[i] = common_sampler_init(model_ttc, params.sampling); } + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl[0])); + LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl[0]).c_str()); + LOG_INF("%s: loading done\n", __func__); const auto t_main_start = ggml_time_us(); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 139d0d15f..a40df974d 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -91,8 +91,6 @@ class Keys: CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" FEATURES_LENGTH = "{arch}.features_length" - POSNET_LENGTH = "{arch}.posnet_length" - CONVNEXT_LENGTH = "{arch}.convnext_length" BLOCK_COUNT = "{arch}.block_count" LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" @@ -160,6 +158,14 @@ class Keys: class WKV: HEAD_SIZE = "{arch}.wkv.head_size" + class PosNet: + EMBEDDING_LENGTH = "{arch}.posnet.embedding_length" + BLOCK_COUNT = "{arch}.posnet.block_count" + + class ConvNext: + EMBEDDING_LENGTH = "{arch}.convnext.embedding_length" + BLOCK_COUNT = "{arch}.convnext.block_count" + class Tokenizer: MODEL = "tokenizer.ggml.model" PRE = "tokenizer.ggml.pre" @@ -377,21 +383,21 @@ class MODEL_TENSOR(IntEnum): CLS = auto() # classifier CLS_OUT = auto() # classifier output projection CONV1D = auto() - CONV_NEXT_DW = auto() - CONV_NEXT_NORM = auto() - CONV_NEXT_PW1 = auto() - CONV_NEXT_PW2 = auto() - CONV_NEXT_GAMMA = auto() - POS_NET_CONV1 = auto() - POS_NET_CONV2 = auto() - POS_NET_NORM = auto() - POS_NET_NORM1 = auto() - POS_NET_NORM2 = auto() - POS_NET_ATTN_NORM = auto() - POS_NET_ATTN_Q = auto() - POS_NET_ATTN_K = auto() - POS_NET_ATTN_V = auto() - POS_NET_ATTN_OUT = auto() + CONVNEXT_DW = auto() + CONVNEXT_NORM = auto() + CONVNEXT_PW1 = auto() + 
+    CONVNEXT_PW2      = auto()
+    CONVNEXT_GAMMA    = auto()
+    POSNET_CONV1      = auto()
+    POSNET_CONV2      = auto()
+    POSNET_NORM       = auto()
+    POSNET_NORM1      = auto()
+    POSNET_NORM2      = auto()
+    POSNET_ATTN_NORM  = auto()
+    POSNET_ATTN_Q     = auto()
+    POSNET_ATTN_K     = auto()
+    POSNET_ATTN_V     = auto()
+    POSNET_ATTN_OUT   = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -558,21 +564,21 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.CLS:                       "cls",
     MODEL_TENSOR.CLS_OUT:                   "cls.output",
     MODEL_TENSOR.CONV1D:                    "conv1d",
-    MODEL_TENSOR.CONV_NEXT_DW:              "conv_next.{bid}.dw",
-    MODEL_TENSOR.CONV_NEXT_NORM:            "conv_next.{bid}.norm",
-    MODEL_TENSOR.CONV_NEXT_PW1:             "conv_next.{bid}.pw1",
-    MODEL_TENSOR.CONV_NEXT_PW2:             "conv_next.{bid}.pw2",
-    MODEL_TENSOR.CONV_NEXT_GAMMA:           "conv_next.{bid}.gamma",
-    MODEL_TENSOR.POS_NET_CONV1:             "pos_net.{bid}.conv1",
-    MODEL_TENSOR.POS_NET_CONV2:             "pos_net.{bid}.conv2",
-    MODEL_TENSOR.POS_NET_NORM:              "pos_net.{bid}.norm",
-    MODEL_TENSOR.POS_NET_NORM1:             "pos_net.{bid}.norm1",
-    MODEL_TENSOR.POS_NET_NORM2:             "pos_net.{bid}.norm2",
-    MODEL_TENSOR.POS_NET_ATTN_NORM:         "pos_net.{bid}.attn_norm",
-    MODEL_TENSOR.POS_NET_ATTN_Q:            "pos_net.{bid}.attn_q",
-    MODEL_TENSOR.POS_NET_ATTN_K:            "pos_net.{bid}.attn_k",
-    MODEL_TENSOR.POS_NET_ATTN_V:            "pos_net.{bid}.attn_v",
-    MODEL_TENSOR.POS_NET_ATTN_OUT:          "pos_net.{bid}.attn_output",
+    MODEL_TENSOR.CONVNEXT_DW:               "convnext.{bid}.dw",
+    MODEL_TENSOR.CONVNEXT_NORM:             "convnext.{bid}.norm",
+    MODEL_TENSOR.CONVNEXT_PW1:              "convnext.{bid}.pw1",
+    MODEL_TENSOR.CONVNEXT_PW2:              "convnext.{bid}.pw2",
+    MODEL_TENSOR.CONVNEXT_GAMMA:            "convnext.{bid}.gamma",
+    MODEL_TENSOR.POSNET_CONV1:              "posnet.{bid}.conv1",
+    MODEL_TENSOR.POSNET_CONV2:              "posnet.{bid}.conv2",
+    MODEL_TENSOR.POSNET_NORM:               "posnet.{bid}.norm",
+    MODEL_TENSOR.POSNET_NORM1:              "posnet.{bid}.norm1",
+    MODEL_TENSOR.POSNET_NORM2:              "posnet.{bid}.norm2",
+    MODEL_TENSOR.POSNET_ATTN_NORM:          "posnet.{bid}.attn_norm",
+    MODEL_TENSOR.POSNET_ATTN_Q:             "posnet.{bid}.attn_q",
+    MODEL_TENSOR.POSNET_ATTN_K:             "posnet.{bid}.attn_k",
+    MODEL_TENSOR.POSNET_ATTN_V:             "posnet.{bid}.attn_v",
+    MODEL_TENSOR.POSNET_ATTN_OUT:           "posnet.{bid}.attn_output",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1415,23 +1421,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
         MODEL_TENSOR.CONV1D,
-        MODEL_TENSOR.CONV_NEXT_DW,
-        MODEL_TENSOR.CONV_NEXT_NORM,
-        MODEL_TENSOR.CONV_NEXT_PW1,
-        MODEL_TENSOR.CONV_NEXT_PW2,
-        MODEL_TENSOR.CONV_NEXT_GAMMA,
+        MODEL_TENSOR.CONVNEXT_DW,
+        MODEL_TENSOR.CONVNEXT_NORM,
+        MODEL_TENSOR.CONVNEXT_PW1,
+        MODEL_TENSOR.CONVNEXT_PW2,
+        MODEL_TENSOR.CONVNEXT_GAMMA,
         MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.POS_NET_CONV1,
-        MODEL_TENSOR.POS_NET_CONV2,
-        MODEL_TENSOR.POS_NET_NORM,
-        MODEL_TENSOR.POS_NET_NORM1,
-        MODEL_TENSOR.POS_NET_NORM2,
-        MODEL_TENSOR.POS_NET_ATTN_NORM,
-        MODEL_TENSOR.POS_NET_ATTN_Q,
-        MODEL_TENSOR.POS_NET_ATTN_K,
-        MODEL_TENSOR.POS_NET_ATTN_V,
-        MODEL_TENSOR.POS_NET_ATTN_OUT,
+        MODEL_TENSOR.POSNET_CONV1,
+        MODEL_TENSOR.POSNET_CONV2,
+        MODEL_TENSOR.POSNET_NORM,
+        MODEL_TENSOR.POSNET_NORM1,
+        MODEL_TENSOR.POSNET_NORM2,
+        MODEL_TENSOR.POSNET_ATTN_NORM,
+        MODEL_TENSOR.POSNET_ATTN_Q,
+        MODEL_TENSOR.POSNET_ATTN_K,
+        MODEL_TENSOR.POSNET_ATTN_V,
+        MODEL_TENSOR.POSNET_ATTN_OUT,
     ],
    # TODO
 }
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 5bb2a8af8..3023b539a 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -634,11 +634,17 @@ class GGUFWriter:
     def add_features_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
 
-    def add_posnet_length(self, length: int) -> None:
-        self.add_uint32(Keys.LLM.POSNET_LENGTH.format(arch=self.arch), length)
+    def add_posnet_embedding_length(self, length: int) -> None:
+        self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length)
 
-    def add_convnext_length(self, length: int) -> None:
-        self.add_uint32(Keys.LLM.CONVNEXT_LENGTH.format(arch=self.arch), length)
+    def add_posnet_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_convnext_embedding_length(self, length: int) -> None:
+        self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_convnext_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
 
     def add_block_count(self, length: int) -> None:
         self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 296f1ca05..82cdb121a 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -704,64 +704,64 @@ class TensorNameMap:
         ),
         #############################################################################
 
-        MODEL_TENSOR.CONV_NEXT_DW: (
+        MODEL_TENSOR.CONVNEXT_DW: (
             "backbone.convnext.{bid}.dwconv", # wavtokenizer
         ),
 
-        MODEL_TENSOR.CONV_NEXT_NORM: (
+        MODEL_TENSOR.CONVNEXT_NORM: (
             "backbone.convnext.{bid}.norm", # wavtokenizer
         ),
 
-        MODEL_TENSOR.CONV_NEXT_PW1: (
+        MODEL_TENSOR.CONVNEXT_PW1: (
             "backbone.convnext.{bid}.pwconv1", # wavtokenizer
         ),
 
-        MODEL_TENSOR.CONV_NEXT_PW2: (
+        MODEL_TENSOR.CONVNEXT_PW2: (
             "backbone.convnext.{bid}.pwconv2", # wavtokenizer
         ),
 
-        MODEL_TENSOR.CONV_NEXT_GAMMA: (
+        MODEL_TENSOR.CONVNEXT_GAMMA: (
            "backbone.convnext.{bid}.gamma", # wavtokenizer
         ),
 
-        MODEL_TENSOR.POS_NET_CONV1: (
-            "backbone.pos_net.{bid}.conv1", # wavtokenizer
+        MODEL_TENSOR.POSNET_CONV1: (
+            "backbone.posnet.{bid}.conv1", # wavtokenizer
         ),
 
-        MODEL_TENSOR.POS_NET_CONV2: (
-            "backbone.pos_net.{bid}.conv2", # wavtokenizer
+        MODEL_TENSOR.POSNET_CONV2: (
+            "backbone.posnet.{bid}.conv2", # wavtokenizer
         ),
 
-        MODEL_TENSOR.POS_NET_NORM: (
-            "backbone.pos_net.{bid}.norm", # wavtokenizer
+        MODEL_TENSOR.POSNET_NORM: (
+            "backbone.posnet.{bid}.norm", # wavtokenizer
         ),
 
-        MODEL_TENSOR.POS_NET_NORM1: (
-            "backbone.pos_net.{bid}.norm1", # wavtokenizer
+        MODEL_TENSOR.POSNET_NORM1: (
+            "backbone.posnet.{bid}.norm1", # wavtokenizer
         ),
 
-        MODEL_TENSOR.POS_NET_NORM2: (
-            "backbone.pos_net.{bid}.norm2", # wavtokenizer
+        MODEL_TENSOR.POSNET_NORM2: (
+            "backbone.posnet.{bid}.norm2", # wavtokenizer
         ),
 
-        MODEL_TENSOR.POS_NET_ATTN_NORM: (
-            "backbone.pos_net.{bid}.norm", # wavtokenizer
+        MODEL_TENSOR.POSNET_ATTN_NORM: (
+            "backbone.posnet.{bid}.norm", # wavtokenizer
         ),
 
-        MODEL_TENSOR.POS_NET_ATTN_Q: (
-            "backbone.pos_net.{bid}.q", # wavtokenizer
+        MODEL_TENSOR.POSNET_ATTN_Q: (
+            "backbone.posnet.{bid}.q", # wavtokenizer
         ),
 
-        MODEL_TENSOR.POS_NET_ATTN_K: (
-            "backbone.pos_net.{bid}.k", # wavtokenizer
+        MODEL_TENSOR.POSNET_ATTN_K: (
+            "backbone.posnet.{bid}.k", # wavtokenizer
         ),
 
-        MODEL_TENSOR.POS_NET_ATTN_V: (
-            "backbone.pos_net.{bid}.v", # wavtokenizer
+        MODEL_TENSOR.POSNET_ATTN_V: (
+            "backbone.posnet.{bid}.v", # wavtokenizer
         ),
 
-        MODEL_TENSOR.POS_NET_ATTN_OUT: (
-            "backbone.pos_net.{bid}.proj_out", # wavtokenizer
+        MODEL_TENSOR.POSNET_ATTN_OUT: (
+            "backbone.posnet.{bid}.proj_out", # wavtokenizer
         ),
     }
diff --git a/src/llama.cpp b/src/llama.cpp
index 4c298d406..2d16ce72b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -277,8 +277,6 @@ enum llm_kv {
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
     LLM_KV_FEATURES_LENGTH,
-    LLM_KV_POSNET_LENGTH,
-    LLM_KV_CONVNEXT_LENGTH,
     LLM_KV_BLOCK_COUNT,
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
@@ -375,6 +373,12 @@ enum llm_kv {
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
 
+    LLM_KV_POSNET_EMBEDDING_LENGTH,
+    LLM_KV_POSNET_BLOCK_COUNT,
+
+    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
+    LLM_KV_CONVNEXT_BLOCK_COUNT,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -399,8 +403,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_CONTEXT_LENGTH,                "%s.context_length"            },
     { LLM_KV_EMBEDDING_LENGTH,              "%s.embedding_length"          },
     { LLM_KV_FEATURES_LENGTH,               "%s.features_length"           },
-    { LLM_KV_POSNET_LENGTH,                 "%s.posnet_length"             },
-    { LLM_KV_CONVNEXT_LENGTH,               "%s.convnext_length"           },
     { LLM_KV_BLOCK_COUNT,                   "%s.block_count"               },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT,     "%s.leading_dense_block_count" },
     { LLM_KV_FEED_FORWARD_LENGTH,           "%s.feed_forward_length"       },
@@ -464,6 +466,12 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_WKV_HEAD_SIZE,                 "%s.wkv.head_size"             },
 
+    { LLM_KV_POSNET_EMBEDDING_LENGTH,       "%s.posnet.embedding_length"   },
+    { LLM_KV_POSNET_BLOCK_COUNT,            "%s.posnet.block_count"        },
+
+    { LLM_KV_CONVNEXT_EMBEDDING_LENGTH,     "%s.convnext.embedding_length" },
+    { LLM_KV_CONVNEXT_BLOCK_COUNT,          "%s.convnext.block_count"      },
+
     { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model"         },
     { LLM_KV_TOKENIZER_PRE,                 "tokenizer.ggml.pre"           },
     { LLM_KV_TOKENIZER_LIST,                "tokenizer.ggml.tokens"        },
@@ -623,11 +631,11 @@ enum llm_tensor {
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
     LLM_TENSOR_CONV1D,
-    LLM_TENSOR_CONV_NEXT_DW,
-    LLM_TENSOR_CONV_NEXT_NORM,
-    LLM_TENSOR_CONV_NEXT_PW1,
-    LLM_TENSOR_CONV_NEXT_PW2,
-    LLM_TENSOR_CONV_NEXT_GAMMA,
+    LLM_TENSOR_CONVNEXT_DW,
+    LLM_TENSOR_CONVNEXT_NORM,
+    LLM_TENSOR_CONVNEXT_PW1,
+    LLM_TENSOR_CONVNEXT_PW2,
+    LLM_TENSOR_CONVNEXT_GAMMA,
     LLM_TENSOR_POS_NET_CONV1,
     LLM_TENSOR_POS_NET_CONV2,
     LLM_TENSOR_POS_NET_NORM,
@@ -1628,23 +1636,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_TOKEN_EMBD,        "token_embd" },
             { LLM_TENSOR_TOKEN_EMBD_NORM,   "token_embd_norm" },
             { LLM_TENSOR_CONV1D,            "conv1d" },
-            { LLM_TENSOR_CONV_NEXT_DW,      "conv_next.%d.dw" },
-            { LLM_TENSOR_CONV_NEXT_NORM,    "conv_next.%d.norm" },
-            { LLM_TENSOR_CONV_NEXT_PW1,     "conv_next.%d.pw1" },
-            { LLM_TENSOR_CONV_NEXT_PW2,     "conv_next.%d.pw2" },
-            { LLM_TENSOR_CONV_NEXT_GAMMA,   "conv_next.%d.gamma" },
+            { LLM_TENSOR_CONVNEXT_DW,       "convnext.%d.dw" },
+            { LLM_TENSOR_CONVNEXT_NORM,     "convnext.%d.norm" },
+            { LLM_TENSOR_CONVNEXT_PW1,      "convnext.%d.pw1" },
+            { LLM_TENSOR_CONVNEXT_PW2,      "convnext.%d.pw2" },
+            { LLM_TENSOR_CONVNEXT_GAMMA,    "convnext.%d.gamma" },
             { LLM_TENSOR_OUTPUT_NORM,       "output_norm" },
             { LLM_TENSOR_OUTPUT,            "output" },
-            { LLM_TENSOR_POS_NET_CONV1,     "pos_net.%d.conv1" },
-            { LLM_TENSOR_POS_NET_CONV2,     "pos_net.%d.conv2" },
-            { LLM_TENSOR_POS_NET_NORM,      "pos_net.%d.norm" },
-            { LLM_TENSOR_POS_NET_NORM1,     "pos_net.%d.norm1" },
-            { LLM_TENSOR_POS_NET_NORM2,     "pos_net.%d.norm2" },
-            { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.%d.attn_norm" },
-            { LLM_TENSOR_POS_NET_ATTN_Q,    "pos_net.%d.attn_q" },
-            { LLM_TENSOR_POS_NET_ATTN_K,    "pos_net.%d.attn_k" },
-            { LLM_TENSOR_POS_NET_ATTN_V,    "pos_net.%d.attn_v" },
-            { LLM_TENSOR_POS_NET_ATTN_OUT,  "pos_net.%d.attn_output" },
+            { LLM_TENSOR_POS_NET_CONV1,     "posnet.%d.conv1" },
+            { LLM_TENSOR_POS_NET_CONV2,     "posnet.%d.conv2" },
+            { LLM_TENSOR_POS_NET_NORM,      "posnet.%d.norm" },
+            { LLM_TENSOR_POS_NET_NORM1,     "posnet.%d.norm1" },
+            { LLM_TENSOR_POS_NET_NORM2,     "posnet.%d.norm2" },
+            { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
+            { LLM_TENSOR_POS_NET_ATTN_Q,    "posnet.%d.attn_q" },
+            { LLM_TENSOR_POS_NET_ATTN_K,    "posnet.%d.attn_k" },
+            { LLM_TENSOR_POS_NET_ATTN_V,    "posnet.%d.attn_v" },
+            { LLM_TENSOR_POS_NET_ATTN_OUT,  "posnet.%d.attn_output" },
         },
     },
     {
@@ -2537,6 +2545,16 @@ static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
 
+struct llama_hparams_posnet {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
+struct llama_hparams_convnext {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
@@ -2546,6 +2564,7 @@ struct llama_hparams {
     uint32_t n_vocab = 0;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
+    uint32_t n_embd_features = 0;
     uint32_t n_layer;
     uint32_t n_rot;
     uint32_t n_swa = 0; // sliding window attention (SWA)
@@ -2557,9 +2576,8 @@ struct llama_hparams {
     uint32_t n_rel_attn_bkts = 0;
 
     // for WavTokenizer
-    uint32_t n_embd_features = 0;
-    uint32_t n_embd_posnet   = 0;
-    uint32_t n_embd_convnext = 0;
+    struct llama_hparams_posnet   posnet;
+    struct llama_hparams_convnext convnext;
 
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
@@ -2623,66 +2641,6 @@ struct llama_hparams {
     enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
-    bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only    != other.vocab_only)    return true;
-        if (this->n_vocab       != other.n_vocab)       return true;
-        if (this->n_ctx_train   != other.n_ctx_train)   return true;
-        if (this->n_embd        != other.n_embd)        return true;
-        if (this->n_layer       != other.n_layer)       return true;
-        if (this->n_rot         != other.n_rot)         return true;
-        if (this->n_swa         != other.n_swa)         return true;
-        if (this->n_embd_head_k != other.n_embd_head_k) return true;
-        if (this->n_embd_head_v != other.n_embd_head_v) return true;
-        if (this->n_expert      != other.n_expert)      return true;
-        if (this->n_expert_used != other.n_expert_used) return true;
-
-        if (this->n_head_arr    != other.n_head_arr)    return true;
-        if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
-        if (this->n_ff_arr      != other.n_ff_arr)      return true;
-
-        if (this->n_rel_attn_bkts    != other.n_rel_attn_bkts)    return true;
-        if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-        if (this->n_lora_q           != other.n_lora_q)           return true;
-        if (this->n_lora_kv          != other.n_lora_kv)          return true;
-        if (this->n_ff_exp           != other.n_ff_exp)           return true;
-        if (this->n_ff_shexp         != other.n_ff_shexp)         return true;
-        if (this->n_expert_shared    != other.n_expert_shared)    return true;
-
-        if (this->rope_finetuned  != other.rope_finetuned)  return true;
-        if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
-        if (std::equal(std::begin(this->rope_sections),
-                       std::end(this->rope_sections),
-                       std::begin(other.rope_sections))) return true;
-
-        if (this->ssm_d_conv     != other.ssm_d_conv)     return true;
-        if (this->ssm_d_inner    != other.ssm_d_inner)    return true;
-        if (this->ssm_d_state    != other.ssm_d_state)    return true;
-        if (this->ssm_dt_rank    != other.ssm_dt_rank)    return true;
-        if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
-
-        if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
-        if (this->time_mix_extra_dim     != other.time_mix_extra_dim)     return true;
-        if (this->time_decay_extra_dim   != other.time_decay_extra_dim)   return true;
-        if (this->wkv_head_size          != other.wkv_head_size)          return true;
-
-        if (this->dec_start_token_id != other.dec_start_token_id) return true;
-
-        const float EPSILON = 1e-9f;
-
-        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor,      other.rope_attn_factor,      EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale,  other.expert_weights_scale,  EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul,     other.rope_yarn_log_mul,     EPSILON)) return true;
-        if (!is_float_close(this->f_residual_scale,      other.f_residual_scale,      EPSILON)) return true;
-        if (!is_float_close(this->f_embedding_scale,     other.f_embedding_scale,     EPSILON)) return true;
-        if (!is_float_close(this->f_attention_scale,     other.f_attention_scale,     EPSILON)) return true;
-
-        return false;
-    }
-
     uint32_t n_head(uint32_t il = 0) const {
         if (il < n_layer) {
             return n_head_arr[il];
@@ -2735,21 +2693,21 @@ struct llama_hparams {
         if (wkv_head_size != 0) {
             // for RWKV models
             return 2 * n_embd;
-        } else {
-            // TODO: maybe support other convolution strides than 1
-            // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
-            return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
         }
+
+        // TODO: maybe support other convolution strides than 1
+        // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+        return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
     }
 
     uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
         if (wkv_head_size != 0) {
             // corresponds to RWKV's wkv_states size
            return n_embd * wkv_head_size;
-        } else {
-            // corresponds to Mamba's ssm_states size
-            return ssm_d_state * ssm_d_inner;
         }
+
+        // corresponds to Mamba's ssm_states size
+        return ssm_d_state * ssm_d_inner;
     }
 };
@@ -2787,6 +2745,57 @@ struct llama_cparams {
     void * cb_eval_user_data;
 };
 
+struct llama_layer_posnet {
+    // resnet
+    struct ggml_tensor * norm1   = nullptr;
+    struct ggml_tensor * norm1_b = nullptr;
+
+    struct ggml_tensor * conv1   = nullptr;
+    struct ggml_tensor * conv1_b = nullptr;
+
+    struct ggml_tensor * norm2   = nullptr;
+    struct ggml_tensor * norm2_b = nullptr;
+
+    struct ggml_tensor * conv2   = nullptr;
+    struct ggml_tensor * conv2_b = nullptr;
+
+    // attention
+    struct ggml_tensor * attn_norm   = nullptr;
+    struct ggml_tensor * attn_norm_b = nullptr;
+
+    struct ggml_tensor * attn_q   = nullptr;
+    struct ggml_tensor * attn_q_b = nullptr;
+
+    struct ggml_tensor * attn_k   = nullptr;
+    struct ggml_tensor * attn_k_b = nullptr;
+
+    struct ggml_tensor * attn_v   = nullptr;
+    struct ggml_tensor * attn_v_b = nullptr;
+
+    struct ggml_tensor * attn_o   = nullptr;
+    struct ggml_tensor * attn_o_b = nullptr;
+
+    // normalize
+    struct ggml_tensor * norm   = nullptr;
+    struct ggml_tensor * norm_b = nullptr;
+};
+
+struct llama_layer_convnext {
+    struct ggml_tensor * dw;
+    struct ggml_tensor * dw_b;
+
+    struct ggml_tensor * norm;
+    struct ggml_tensor * norm_b;
+
+    struct ggml_tensor * pw1;
+    struct ggml_tensor * pw1_b;
+
+    struct ggml_tensor * pw2;
+    struct ggml_tensor * pw2_b;
+
+    struct ggml_tensor * gamma;
+};
+
 // TODO: separate into "llama_layer_enc" and "llama_layer_dec"
 struct llama_layer {
     llama_layer() {
@@ -2938,20 +2947,9 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_scale;
     struct ggml_tensor * ffn_down_scale;
 
-    // convnext
-    struct ggml_tensor * convnext_dw;
-    struct ggml_tensor * convnext_dw_b;
+    struct llama_layer_posnet posnet;
 
-    struct ggml_tensor * convnext_norm;
-    struct ggml_tensor * convnext_norm_b;
-
-    struct ggml_tensor * convnext_pw1;
-    struct ggml_tensor * convnext_pw1_b;
-
-    struct ggml_tensor * convnext_pw2;
-    struct ggml_tensor * convnext_pw2_b;
-
-    struct ggml_tensor * convnext_gamma;
+    struct llama_layer_convnext convnext;
 };
 
 // very similar to llama_batch,
@@ -3082,85 +3080,9 @@ struct llama_model {
     struct ggml_tensor * cls_out   = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;
 
-    // wavtokenizer decoder
-    // TODO: dedup
     struct ggml_tensor * conv_1d   = nullptr;
     struct ggml_tensor * conv_1d_b = nullptr;
 
-    struct ggml_tensor * hann_window = nullptr;
-
-    // resnet 0
-    struct ggml_tensor * posnet_0_norm1   = nullptr;
-    struct ggml_tensor * posnet_0_norm1_b = nullptr;
-
-    struct ggml_tensor * posnet_0_conv1   = nullptr;
-    struct ggml_tensor * posnet_0_conv1_b = nullptr;
-
-    struct ggml_tensor * posnet_0_norm2   = nullptr;
-    struct ggml_tensor * posnet_0_norm2_b = nullptr;
-
-    struct ggml_tensor * posnet_0_conv2   = nullptr;
-    struct ggml_tensor * posnet_0_conv2_b = nullptr;
-
-    // resnet 1
-    struct ggml_tensor * posnet_1_norm1   = nullptr;
-    struct ggml_tensor * posnet_1_norm1_b = nullptr;
-
-    struct ggml_tensor * posnet_1_conv1   = nullptr;
-    struct ggml_tensor * posnet_1_conv1_b = nullptr;
-
-    struct ggml_tensor * posnet_1_norm2   = nullptr;
-    struct ggml_tensor * posnet_1_norm2_b = nullptr;
-
-    struct ggml_tensor * posnet_1_conv2   = nullptr;
-    struct ggml_tensor * posnet_1_conv2_b = nullptr;
-
-    // attn 2
-    struct ggml_tensor * posnet_2_attn_norm   = nullptr;
-    struct ggml_tensor * posnet_2_attn_norm_b = nullptr;
-
-    struct ggml_tensor * posnet_2_attn_q   = nullptr;
-    struct ggml_tensor * posnet_2_attn_q_b = nullptr;
-
-    struct ggml_tensor * posnet_2_attn_k   = nullptr;
-    struct ggml_tensor * posnet_2_attn_k_b = nullptr;
-
-    struct ggml_tensor * posnet_2_attn_v   = nullptr;
-    struct ggml_tensor * posnet_2_attn_v_b = nullptr;
-
-    struct ggml_tensor * posnet_2_attn_o   = nullptr;
-    struct ggml_tensor * posnet_2_attn_o_b = nullptr;
-
-    // resnet 3
-    struct ggml_tensor * posnet_3_norm1   = nullptr;
-    struct ggml_tensor * posnet_3_norm1_b = nullptr;
-
-    struct ggml_tensor * posnet_3_conv1   = nullptr;
-    struct ggml_tensor * posnet_3_conv1_b = nullptr;
-
-    struct ggml_tensor * posnet_3_norm2   = nullptr;
-    struct ggml_tensor * posnet_3_norm2_b = nullptr;
-
-    struct ggml_tensor * posnet_3_conv2   = nullptr;
-    struct ggml_tensor * posnet_3_conv2_b = nullptr;
-
-    // resnet 4
-    struct ggml_tensor * posnet_4_norm1   = nullptr;
-    struct ggml_tensor * posnet_4_norm1_b = nullptr;
-
-    struct ggml_tensor * posnet_4_conv1   = nullptr;
-    struct ggml_tensor * posnet_4_conv1_b = nullptr;
-
-    struct ggml_tensor * posnet_4_norm2   = nullptr;
-    struct ggml_tensor * posnet_4_norm2_b = nullptr;
-
-    struct ggml_tensor * posnet_4_conv2   = nullptr;
-    struct ggml_tensor * posnet_4_conv2_b = nullptr;
-
-    // resnet 5
-    struct ggml_tensor * posnet_5_norm   = nullptr;
-    struct ggml_tensor * posnet_5_norm_b = nullptr;
-
     std::vector<llama_layer> layers;
 
     // gguf metadata
@@ -5705,8 +5627,12 @@ static void llm_load_hparams(
 
     if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
-        ml.get_key(LLM_KV_POSNET_LENGTH,   hparams.n_embd_posnet);
-        ml.get_key(LLM_KV_CONVNEXT_LENGTH, hparams.n_embd_convnext);
+
+        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
+        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);
+
+        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
+        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
     }
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
@@ -7493,11 +7419,11 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_POS_NET_ATTN_K,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_POS_NET_ATTN_V,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_POS_NET_ATTN_OUT,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CONV_NEXT_DW,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
-    {LLM_TENSOR_CONV_NEXT_NORM,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_CONV_NEXT_PW1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CONV_NEXT_PW2,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_CONV_NEXT_GAMMA,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CONVNEXT_DW,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+    {LLM_TENSOR_CONVNEXT_NORM,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CONVNEXT_PW1,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_PW2,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_GAMMA,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 };
 
 // checks if the weight tensor can be used with the specified buffer type and device
@@ -7738,7 +7664,8 @@ static bool llm_load_tensors(
     model.main_gpu     = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
 
-    const int n_layer = hparams.n_layer;
+    const int n_layer     = hparams.n_layer;
+
     bool use_mmap_buffer = true;
 
     // build a list of buffer types for the CPU and GPU devices
@@ -9574,107 +9501,105 @@ static bool llm_load_tensors(
                 } break;
             case LLM_ARCH_WAVTOKENIZER_DEC:
                 {
-                    const int64_t n_embd_features = hparams.n_embd_features;
-                    const int64_t n_embd_posnet   = hparams.n_embd_posnet;
-                    const int64_t n_embd_convnext = hparams.n_embd_convnext;
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
 
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd_features, n_vocab}, 0);
+                    model.conv_1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
+                    model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {1, hparams.posnet.n_embd}, 0);
 
-                    model.tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd_posnet}, 0);
-                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd_posnet}, 0);
+                    // posnet
+                    {
+                        const int64_t n_embd = hparams.posnet.n_embd;
 
-                    model.conv_1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd_features, n_embd_posnet}, 0);
-                    model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {1, n_embd_posnet}, 0);
+                        for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
+                            auto & layer = model.layers[i].posnet;
 
-                    model.posnet_0_norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {1, n_embd_posnet}, 0);
-                    model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   0), {1, n_embd_posnet}, 0);
+                            // posnet:
+                            //
+                            //  - resnet
+                            //  - resnet
+                            //  - attn
+                            //  - resnet
+                            //  - resnet
+                            //  - norm
+                            //
+                            switch (i) {
+                                case 0:
+                                case 1:
+                                case 3:
+                                case 4:
+                                    {
+                                        layer.norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
+                                        layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   i), {1, n_embd}, 0);
 
-                    model.posnet_0_conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   0), {1, n_embd_posnet}, 0);
+                                        layer.conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
+                                        layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   i), {1, n_embd}, 0);
 
-                    model.posnet_0_norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {1, n_embd_posnet}, 0);
-                    model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   0), {1, n_embd_posnet}, 0);
+                                        layer.norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
+                                        layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   i), {1, n_embd}, 0);
 
-                    model.posnet_0_conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   0), {1, n_embd_posnet}, 0);
+                                        layer.conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
+                                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   i), {1, n_embd}, 0);
+                                    } break;
+                                case 2:
+                                    {
+                                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
 
-                    model.posnet_1_norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {1, n_embd_posnet}, 0);
-                    model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   1), {1, n_embd_posnet}, 0);
+                                        layer.attn_q   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias",   i), {1, n_embd}, 0);
 
-                    model.posnet_1_conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   1), {1, n_embd_posnet}, 0);
+                                        layer.attn_k   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias",   i), {1, n_embd}, 0);
 
-                    model.posnet_1_norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {1, n_embd_posnet}, 0);
-                    model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   1), {1, n_embd_posnet}, 0);
+                                        layer.attn_v   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias",   i), {1, n_embd}, 0);
 
-                    model.posnet_1_conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   1), {1, n_embd_posnet}, 0);
-
-                    model.posnet_2_attn_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {1, n_embd_posnet}, 0);
-                    model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   2), {1, n_embd_posnet}, 0);
-
-                    model.posnet_2_attn_q   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias",   2), {1, n_embd_posnet}, 0);
-
-                    model.posnet_2_attn_k   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias",   2), {1, n_embd_posnet}, 0);
-
-                    model.posnet_2_attn_v   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias",   2), {1, n_embd_posnet}, 0);
-
-                    model.posnet_2_attn_o   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias",   2), {1, n_embd_posnet}, 0);
-
-                    model.posnet_3_norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {1, n_embd_posnet}, 0);
-                    model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   3), {1, n_embd_posnet}, 0);
-
-                    model.posnet_3_conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   3), {1, n_embd_posnet}, 0);
-
-                    model.posnet_3_norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {1, n_embd_posnet}, 0);
-                    model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   3), {1, n_embd_posnet}, 0);
-
-                    model.posnet_3_conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   3), {1, n_embd_posnet}, 0);
-
-                    model.posnet_4_norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {1, n_embd_posnet}, 0);
-                    model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   4), {1, n_embd_posnet}, 0);
-
-                    model.posnet_4_conv1   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias",   4), {1, n_embd_posnet}, 0);
-
-                    model.posnet_4_norm2   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {1, n_embd_posnet}, 0);
-                    model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias",   4), {1, n_embd_posnet}, 0);
-
-                    model.posnet_4_conv2   = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0);
-                    model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias",   4), {1, n_embd_posnet}, 0);
-
-                    model.posnet_5_norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {1, n_embd_posnet}, 0);
-                    model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   5), {1, n_embd_posnet}, 0);
-
-                    for (int i = 0; i < n_layer; ++i) {
-                        auto & layer = model.layers[i];
-
-                        layer.convnext_dw   = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, n_embd_convnext}, 0);
-                        layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias",   i), {1, n_embd_convnext}, 0);
-
-                        layer.convnext_norm   = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {n_embd_convnext}, 0);
-                        layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias",   i), {n_embd_convnext}, 0);
-
-                        layer.convnext_pw1   = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "weight", i), {n_embd_convnext, n_ff}, 0);
-                        layer.convnext_pw1_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "bias",   i), {n_ff}, 0);
-
-                        layer.convnext_pw2   = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "weight", i), {n_ff, n_embd_convnext}, 0);
-                        layer.convnext_pw2_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "bias",   i), {n_embd_convnext}, 0);
-
-                        layer.convnext_gamma = create_tensor(tn(LLM_TENSOR_CONV_NEXT_GAMMA, "weight", i), {n_embd_convnext}, 0);
+                                        layer.attn_o   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias",   i), {1, n_embd}, 0);
+                                    } break;
+                                case 5:
+                                    {
+                                        layer.norm   = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias",   i), {1, n_embd}, 0);
+                                    } break;
+                                default: GGML_ABORT("unknown posnet layer");
+                            };
+                        }
                     }
 
-                    // output
-                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_convnext}, 0);
-                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd_convnext}, 0);
+                    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
 
-                    model.output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd_convnext, n_embd}, 0);
+                    model.tok_norm   = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
+                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {hparams.posnet.n_embd}, 0);
+
+                    // convnext
+                    {
+                        const int64_t n_embd = hparams.convnext.n_embd;
+
+                        for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
+                            auto & layer = model.layers[i].convnext;
+
+                            layer.dw   = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
+                            layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias",   i), {1, n_embd}, 0);
+
+                            layer.norm   = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
+                            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias",   i), {n_embd}, 0);
+
+                            layer.pw1   = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
+                            layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias",   i), {n_ff}, 0);
+
+                            layer.pw2   = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
+                            layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias",   i), {n_embd}, 0);
+
+                            layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+                        }
+
+                        // output
+                        model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    }
+
+                    model.output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
                     model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
                 } break;
             default:
@@ -17266,156 +17191,82 @@ struct llm_build_context {
         cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1);
         cur = ggml_add(ctx0, cur, model.conv_1d_b);
 
-        inpL = cur;
+        // posnet
+        for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
+            const auto & layer = model.layers[il].posnet;
 
-        // resnet block 0
-        {
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.posnet_0_norm1,
-                    model.posnet_0_norm1_b,
-                    LLM_NORM_GROUP, cb, 0);
+            inpL = cur;
 
-            cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
+            switch (il) {
+                case 0:
+                case 1:
+                case 3:
+                case 4:
+                    {
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.norm1,
+                                layer.norm1_b,
+                                LLM_NORM_GROUP, cb, 0);
 
-            cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1);
-            cur = ggml_add(ctx0, cur, model.posnet_0_conv1_b);
+                        cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
 
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.posnet_0_norm2,
-                    model.posnet_0_norm2_b,
-                    LLM_NORM_GROUP, cb, 0);
+                        cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
+                        cur = ggml_add(ctx0, cur, layer.conv1_b);
 
-            cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.norm2,
+                                layer.norm2_b,
+                                LLM_NORM_GROUP, cb, 0);
 
-            cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur, 1, 1);
-            cur = ggml_add(ctx0, cur, model.posnet_0_conv2_b);
+                        cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
 
-            cur = ggml_add(ctx0, cur, inpL);
-        }
+                        cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
+                        cur = ggml_add(ctx0, cur, layer.conv2_b);
 
-        inpL = cur;
+                        cur = ggml_add(ctx0, cur, inpL);
+                    } break;
+                case 2:
+                    {
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.attn_norm,
+                                layer.attn_norm_b,
+                                LLM_NORM_GROUP, cb, 0);
 
-        // resnet block 1
-        {
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.posnet_1_norm1,
-                    model.posnet_1_norm1_b,
-                    LLM_NORM_GROUP, cb, 0);
+                        struct ggml_tensor * q;
+                        struct ggml_tensor * k;
+                        struct ggml_tensor * v;
 
-            cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
+                        q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
+                        k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
+                        v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
 
-            cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur, 1, 1);
-            cur = ggml_add(ctx0, cur, model.posnet_1_conv1_b);
+                        q = ggml_add(ctx0, q, layer.attn_q_b);
+                        k = ggml_add(ctx0, k, layer.attn_k_b);
+                        v = ggml_add(ctx0, v, layer.attn_v_b);
 
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.posnet_1_norm2,
-                    model.posnet_1_norm2_b,
-                    LLM_NORM_GROUP, cb, 0);
+                        q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
+                        k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
 
-            cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
+                        struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
 
-            cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur, 1, 1);
-            cur = ggml_add(ctx0, cur, model.posnet_1_conv2_b);
+                        kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
 
-            cur = ggml_add(ctx0, cur, inpL);
-        }
+                        cur = ggml_mul_mat(ctx0, kq, v);
 
-        inpL = cur;
+                        cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
+                        cur = ggml_add(ctx0, cur, layer.attn_o_b);
 
-        // attention block
-        {
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.posnet_2_attn_norm,
-                    model.posnet_2_attn_norm_b,
-                    LLM_NORM_GROUP, cb, 0);
-
-            struct ggml_tensor * q;
-            struct ggml_tensor * k;
-            struct ggml_tensor * v;
-
-            q = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_q, cur, 1, 1);
-            k = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_k, cur, 1, 1);
-            v = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_v, cur, 1, 1);
-
-            q = ggml_add(ctx0, q, model.posnet_2_attn_q_b);
-            k = ggml_add(ctx0, k, model.posnet_2_attn_k_b);
-            v = ggml_add(ctx0, v, model.posnet_2_attn_v_b);
-
-            q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
-            k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
-
-            struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
-
-            kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(model.hparams.n_embd_posnet)), 0.0f);
-
-            cur = ggml_mul_mat(ctx0, kq, v);
-
-            cur = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_o, cur, 1, 1);
-            cur = ggml_add(ctx0, cur, model.posnet_2_attn_o_b);
-
-            cur = ggml_add(ctx0, cur, inpL);
-        }
-
-        inpL = cur;
-
-        // resnet block 3
-        {
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.posnet_3_norm1,
-                    model.posnet_3_norm1_b,
-                    LLM_NORM_GROUP, cb, 0);
-
-            cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
-
-            cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv1, cur, 1, 1);
-            cur = ggml_add(ctx0, cur, model.posnet_3_conv1_b);
-
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.posnet_3_norm2,
-                    model.posnet_3_norm2_b,
-                    LLM_NORM_GROUP, cb, 0);
-
-            cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
-
-            cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv2, cur, 1, 1);
-            cur = ggml_add(ctx0, cur, model.posnet_3_conv2_b);
-
-            cur = ggml_add(ctx0, cur, inpL);
-        }
-
-        inpL = cur;
-
-        // resnet block 4
-        {
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.posnet_4_norm1,
-                    model.posnet_4_norm1_b,
-                    LLM_NORM_GROUP, cb, 0);
-
-            cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
-
-            cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv1, cur, 1, 1);
-            cur = ggml_add(ctx0, cur, model.posnet_4_conv1_b);
-
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.posnet_4_norm2,
-                    model.posnet_4_norm2_b,
-                    LLM_NORM_GROUP, cb, 0);
-
-            cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
-
-            cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv2, cur, 1, 1);
-            cur = ggml_add(ctx0, cur, model.posnet_4_conv2_b);
-
-            cur = ggml_add(ctx0, cur, inpL);
-        }
-
-        // normalize block 5
-        {
-            cur = llm_build_norm(ctx0, cur, hparams,
-                    model.posnet_5_norm,
-                    model.posnet_5_norm_b,
-                    LLM_NORM_GROUP, cb, 0);
+                        cur = ggml_add(ctx0, cur, inpL);
+                    } break;
+                case 5:
+                    {
+                        cur = llm_build_norm(ctx0, cur, hparams,
+                                layer.norm,
+                                layer.norm_b,
+                                LLM_NORM_GROUP, cb, 0);
+                    } break;
+                default: GGML_ABORT("unknown posnet layer");
+            };
         }
 
         cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
@@ -17429,27 +17280,30 @@ struct llm_build_context {
 
         inpL = cur;
 
-        for (int il = 0; il < n_layer; ++il) {
+        // convnext
+        for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
+            const auto & layer = model.layers[il].convnext;
+
             cur = inpL;
 
-            cur = ggml_conv_1d_dw_ph(ctx0, model.layers[il].convnext_dw, cur, 1, 1);
-            cur = ggml_add(ctx0, cur, model.layers[il].convnext_dw_b);
+            cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
+            cur = ggml_add(ctx0, cur, layer.dw_b);
 
             cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
 
             cur = llm_build_norm(ctx0, cur, hparams,
-                    model.layers[il].convnext_norm,
-                    model.layers[il].convnext_norm_b,
+                    layer.norm,
+                    layer.norm_b,
                     LLM_NORM, cb, -1);
 
             cur = llm_build_ffn(ctx0, lctx, cur,
-                    model.layers[il].convnext_pw1, model.layers[il].convnext_pw1_b, NULL,
-                    NULL,                          NULL,                            NULL,
-                    model.layers[il].convnext_pw2, model.layers[il].convnext_pw2_b, NULL,
+                    layer.pw1, layer.pw1_b, NULL,
+                    NULL,      NULL,        NULL,
+                    layer.pw2, layer.pw2_b, NULL,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
 
-            cur = ggml_mul(ctx0, cur, model.layers[il].convnext_gamma);
+            cur = ggml_mul(ctx0, cur, layer.gamma);
 
             cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));