diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 93d046e8d..48a4e0e21 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3480,7 +3480,7 @@ class RWKV6Qwen2Model(Rwkv6Model):
             yield (new_name, data)
 
 
-@Model.register("Rwkv7ForCausalLM")
+@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM")
 class Rwkv7Model(Rwkv6Model):
     model_arch = gguf.MODEL_ARCH.RWKV7
 
@@ -3489,16 +3489,26 @@ class Rwkv7Model(Rwkv6Model):
 
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
-        head_size = self.hparams["head_size"]
+        try:
+            head_size = self.hparams["head_size"]
+            layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        except KeyError:
+            head_size = self.hparams["head_dim"]
+            layer_norm_eps = self.hparams["norm_eps"]
         hidden_size = self.hparams["hidden_size"]
-        layer_norm_eps = self.hparams["layer_norm_epsilon"]
         intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4)
 
         # ICLR: In-Context-Learning-Rate
-        lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
-        lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
-        lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
-        lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
+        try:
+            lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
+            lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
+        except KeyError:
+            lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8)
+            lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3)
+            lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6)
 
         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
@@ -3517,17 +3527,43 @@ class Rwkv7Model(Rwkv6Model):
         self.gguf_writer.add_head_count(0)
 
     lerp_weights: dict[int, dict[str, Tensor]] = {}
+    lora_needs_transpose: bool = True
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # unify tensor names here to make life easier
+        name = name.replace("blocks", "layers").replace("ffn", "feed_forward")
+        name = name.replace("self_attn", "attention").replace("attn", "attention")
+        name = name.replace("time_mixer.", "")
+        # lora layer names in fla-hub's impl
+        if "_lora.lora" in name:
+            self.lora_needs_transpose = False
+            name = name.replace("_lora.lora.0.weight", "1.weight")
+            name = name.replace("_lora.lora.2.weight", "2.weight")
+            name = name.replace("_lora.lora.2.bias", "0.weight")
+
+        name = name.replace("feed_forward_norm", "ln2")
+        name = name.replace("g_norm", "ln_x")
+
+        if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0:
+            # some models have dummy v0/v1/v2 on first layer while others don't
+            # ignore them all since they are not used
+            return
+
         if bid is not None and "attention.x_" in name:
-            try:
-                self.lerp_weights[bid][name] = data_torch
-            except KeyError:
-                self.lerp_weights[bid] = {name: data_torch}
-            if all(f"model.blocks.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in ["r", "w", "k", "v", "a", "g"]):
+            if "attention.x_x" in name:
+                # already concatenated
                 new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
-                data = torch.stack([self.lerp_weights[bid][f"model.blocks.{bid}.attention.x_{i}"].squeeze(0) for i in ["r", "w", "k", "v", "a", "g"]], dim=0)
+                data = data_torch.reshape(6, 1, -1)
                 yield (new_name, data)
+            else:
+                try:
+                    self.lerp_weights[bid][name] = data_torch
+                except KeyError:
+                    self.lerp_weights[bid] = {name: data_torch}
+                if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in ["r", "w", "k", "v", "a", "g"]):
+                    new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
+                    data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"].squeeze(0) for i in ["r", "w", "k", "v", "a", "g"]], dim=0)
+                    yield (new_name, data)
             return
         else:
             data_torch = data_torch.squeeze()
@@ -3536,7 +3572,7 @@ class Rwkv7Model(Rwkv6Model):
         if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
             new_name += ".weight"
 
-        if any(
+        if self.lora_needs_transpose and any(
            new_name.endswith(t) for t in [
                 "time_mix_w1.weight", "time_mix_w2.weight",
                 "time_mix_a1.weight", "time_mix_a2.weight",
@@ -3558,7 +3594,7 @@ class Rwkv7Model(Rwkv6Model):
 
 
 @Model.register("RwkvHybridForCausalLM")
-class ARwkv7Model(Model):
+class ARwkv7Model(Rwkv7Model):
     model_arch = gguf.MODEL_ARCH.ARWKV7
 
     def set_vocab(self):
@@ -3599,41 +3635,6 @@ class ARwkv7Model(Model):
         # required by llama.cpp, unused
         self.gguf_writer.add_head_count(0)
 
-    lerp_weights: dict[int, dict[str, Tensor]] = {}
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if bid is not None and "self_attn.time_mixer.x_" in name:
-            try:
-                self.lerp_weights[bid][name] = data_torch
-            except KeyError:
-                self.lerp_weights[bid] = {name: data_torch}
-            if all(f"model.layers.{bid}.self_attn.time_mixer.x_{i}" in self.lerp_weights[bid].keys() for i in ["r", "w", "k", "v", "a", "g"]):
-                new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
-                data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.self_attn.time_mixer.x_{i}"].squeeze(0) for i in ["r", "w", "k", "v", "a", "g"]], dim=0)
-                yield (new_name, data)
-            return
-        else:
-            data_torch = data_torch.squeeze()
-
-        new_name = self.map_tensor_name(name)
-
-        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
-            new_name += ".weight"
-
-        if any(
-            new_name.endswith(t) for t in [
-                "time_mix_w1.weight", "time_mix_w2.weight",
-                "time_mix_a1.weight", "time_mix_a2.weight",
-                "time_mix_v1.weight", "time_mix_v2.weight",
-                "time_mix_g1.weight", "time_mix_g2.weight",
-            ]
-        ):
-            data_torch = data_torch.transpose(0, 1)
-
-        if 'r_k' in new_name:
-            data_torch = data_torch.flatten()
-
-        yield (new_name, data_torch)
-
 
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 77dc62256..cb041c89c 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -44,6 +44,7 @@ class TensorNameMap:
             "transformer.norm",           # openelm
             "rwkv.blocks.0.pre_ln",       # rwkv6
             "model.pre_ln",               # rwkv7
+            "model.layers.0.pre_norm",    # rwkv7
             "backbone.norm",              # wavtokenizer
         ),
 
@@ -126,7 +127,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.input_layernorm",    # chatglm
             "transformer.layers.{bid}.attn_norm",      # openelm
             "rwkv.blocks.{bid}.ln1",                   # rwkv6
-            "model.blocks.{bid}.ln1",                  # rwkv7
+            "model.layers.{bid}.ln1",                  # rwkv7
         ),
 
         # Attention norm 2
@@ -134,7 +135,7 @@ class TensorNameMap:
             "transformer.h.{bid}.ln_attn",        # falcon40b
             "encoder.layer.{bid}.layer_norm_1",   # jina-v2-code
             "rwkv.blocks.{bid}.ln2",              # rwkv6
-            "model.blocks.{bid}.ln2",             # rwkv7
+            "model.layers.{bid}.ln2",             # rwkv7
         ),
 
         # Attention query-key-value
@@ -468,77 +469,63 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.TIME_MIX_W0: (
-            "model.blocks.{bid}.attention.w0",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.w0",  # arwkv7
+            "model.layers.{bid}.attention.w0",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_W1: (
             "rwkv.blocks.{bid}.attention.time_maa_w1",     # rwkv6
             "model.layers.{bid}.self_attn.time_maa_w1",    # rwkv6qwen2
-            "model.blocks.{bid}.attention.w1",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.w1",  # arwkv7
+            "model.layers.{bid}.attention.w1",             # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_W2: (
             "rwkv.blocks.{bid}.attention.time_maa_w2",     # rwkv6
             "model.layers.{bid}.self_attn.time_maa_w2",    # rwkv6qwen2
-            "model.blocks.{bid}.attention.w2",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.w2",  # arwkv7
+            "model.layers.{bid}.attention.w2",             # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_A0: (
-            "model.blocks.{bid}.attention.a0",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.a0",  # arwkv7
+            "model.layers.{bid}.attention.a0",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_A1: (
-            "model.blocks.{bid}.attention.a1",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.a1",  # arwkv7
+            "model.layers.{bid}.attention.a1",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_A2: (
-            "model.blocks.{bid}.attention.a2",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.a2",  # arwkv7
+            "model.layers.{bid}.attention.a2",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_V0: (
-            "model.blocks.{bid}.attention.v0",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.v0",  # arwkv7
+            "model.layers.{bid}.attention.v0",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_V1: (
-            "model.blocks.{bid}.attention.v1",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.v1",  # arwkv7
+            "model.layers.{bid}.attention.v1",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_V2: (
-            "model.blocks.{bid}.attention.v2",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.v2",  # arwkv7
+            "model.layers.{bid}.attention.v2",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_G1: (
-            "model.blocks.{bid}.attention.g1",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.g1",  # arwkv7
+            "model.layers.{bid}.attention.g1",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_G2: (
-            "model.blocks.{bid}.attention.g2",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.g2",  # arwkv7
+            "model.layers.{bid}.attention.g2",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_K_K: (
-            "model.blocks.{bid}.attention.k_k",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.k_k",  # arwkv7
+            "model.layers.{bid}.attention.k_k",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_K_A: (
-            "model.blocks.{bid}.attention.k_a",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.k_a",  # arwkv7
+            "model.layers.{bid}.attention.k_a",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_R_K: (
-            "model.blocks.{bid}.attention.r_k",             # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.r_k",  # arwkv7
+            "model.layers.{bid}.attention.r_k",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_LERP_X: (
@@ -591,47 +578,46 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.TIME_MIX_KEY: (
-            "rwkv.blocks.{bid}.attention.key",                     # rwkv6
-            "model.layers.{bid}.self_attn.k_proj",                 # rwkv6qwen2
-            "model.blocks.{bid}.attention.key",                    # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.key.weight",  # arwkv7
+            "rwkv.blocks.{bid}.attention.key",      # rwkv6
+            "model.layers.{bid}.self_attn.k_proj",  # rwkv6qwen2
+            "model.layers.{bid}.attention.key",     # rwkv7
+            "model.layers.{bid}.attention.k_proj",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_VALUE: (
-            "rwkv.blocks.{bid}.attention.value",                     # rwkv6
-            "model.layers.{bid}.self_attn.v_proj",                   # rwkv6qwen2
-            "model.blocks.{bid}.attention.value",                    # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.value.weight",  # arwkv7
+            "rwkv.blocks.{bid}.attention.value",    # rwkv6
+            "model.layers.{bid}.self_attn.v_proj",  # rwkv6qwen2
+            "model.layers.{bid}.attention.value",   # rwkv7
+            "model.layers.{bid}.attention.v_proj",  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
-            "rwkv.blocks.{bid}.attention.receptance",                     # rwkv6
-            "model.layers.{bid}.self_attn.q_proj",                        # rwkv6qwen2
-            "model.blocks.{bid}.attention.receptance",                    # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.receptance.weight",  # arwkv7
+            "rwkv.blocks.{bid}.attention.receptance",   # rwkv6
+            "model.layers.{bid}.self_attn.q_proj",      # rwkv6qwen2
+            "model.layers.{bid}.attention.receptance",  # rwkv7
+            "model.layers.{bid}.attention.r_proj",      # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_GATE: (
             "rwkv.blocks.{bid}.attention.gate",                     # rwkv6
             "model.layers.{bid}.self_attn.gate",                    # rwkv6qwen2
-            "model.layers.{bid}.self_attn.time_mixer.gate.weight",  # arwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_LN: (
             "rwkv.blocks.{bid}.attention.ln_x",  # rwkv6
-            "model.blocks.{bid}.attention.ln_x"  # rwkv7
+            "model.layers.{bid}.attention.ln_x"  # rwkv7
         ),
 
         MODEL_TENSOR.TIME_MIX_OUTPUT: (
-            "rwkv.blocks.{bid}.attention.output",                     # rwkv
-            "model.layers.{bid}.self_attn.o_proj",                    # rwkv6qwen2
-            "model.blocks.{bid}.attention.output",                    # rwkv7
-            "model.layers.{bid}.self_attn.time_mixer.output.weight",  # arwkv7
+            "rwkv.blocks.{bid}.attention.output",   # rwkv
+            "model.layers.{bid}.self_attn.o_proj",  # rwkv6qwen2
+            "model.layers.{bid}.attention.output",  # rwkv7
+            "model.layers.{bid}.attention.o_proj",  # rwkv7
         ),
 
         MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
             "rwkv.blocks.{bid}.feed_forward.time_maa_k",  # rwkv6
-            "model.blocks.{bid}.feed_forward.x_k",        # rwkv7
+            "model.layers.{bid}.feed_forward.x_k",        # rwkv7
         ),
 
         MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
@@ -640,7 +626,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.CHANNEL_MIX_KEY: (
             "rwkv.blocks.{bid}.feed_forward.key",  # rwkv6
-            "model.blocks.{bid}.feed_forward.key", # rwkv7
+            "model.layers.{bid}.feed_forward.key", # rwkv7
         ),
 
         MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
@@ -649,7 +635,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.CHANNEL_MIX_VALUE: (
             "rwkv.blocks.{bid}.feed_forward.value",  # rwkv6
-            "model.blocks.{bid}.feed_forward.value", # rwkv7
+            "model.layers.{bid}.feed_forward.value", # rwkv7
         ),
 
         MODEL_TENSOR.ATTN_Q_A: (
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 4f28c5b59..16e404c0e 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3396,9 +3396,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0);
                     layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0);
 
-                    layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
-                    layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
-                    layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
+                    if (i == 0) {
+                        // actually not used
+                        layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+                        layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0);
+                        layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0);
+                    } else {
+                        layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0);
+                        layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0);
+                        layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
+                    }
 
                     layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
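Note (not part of the patch): below is a minimal standalone sketch of the tensor-name unification done at the top of the new Rwkv7Model.modify_tensors, to show why the arwkv7-specific "model.layers.{bid}.self_attn.time_mixer.*" entries can be dropped from tensor_mapping.py. The replace chain is copied from the hunk above; the example checkpoint tensor names are assumptions based on the rwkv7, RwkvHybrid and fla-hub naming referenced in this diff.

def unify(name: str) -> str:
    # same renaming steps as Rwkv7Model.modify_tensors above
    name = name.replace("blocks", "layers").replace("ffn", "feed_forward")
    name = name.replace("self_attn", "attention").replace("attn", "attention")
    name = name.replace("time_mixer.", "")
    if "_lora.lora" in name:
        # fla-hub stores the low-rank projections as <x>_lora.lora.{0,2}.*
        name = name.replace("_lora.lora.0.weight", "1.weight")
        name = name.replace("_lora.lora.2.weight", "2.weight")
        name = name.replace("_lora.lora.2.bias", "0.weight")
    name = name.replace("feed_forward_norm", "ln2")
    name = name.replace("g_norm", "ln_x")
    return name

# rwkv7 pth-style name: "blocks" becomes "layers"
print(unify("model.blocks.7.attention.w0"))               # model.layers.7.attention.w0
# arwkv7 (RwkvHybrid) name: the "self_attn.time_mixer." prefix is stripped
print(unify("model.layers.7.self_attn.time_mixer.k_k"))   # model.layers.7.attention.k_k
# assumed fla-hub-style name: the decay LoRA's first linear becomes w1
print(unify("model.layers.7.attn.w_lora.lora.0.weight"))  # model.layers.7.attention.w1.weight

After this normalization every source layout lands on the single set of "model.layers.{bid}.attention.*" / "model.layers.{bid}.feed_forward.*" patterns kept in tensor_mapping.py.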