diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 941697d6d..a39f2dc73 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2758,6 +2758,8 @@ class Rwkv6Model(Model):
         layer_norm_eps = self.hparams["layer_norm_epsilon"]
         rescale_every_n_layers = self.hparams["rescale_every"]
         intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
 
         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
@@ -2766,6 +2768,8 @@ class Rwkv6Model(Model):
         self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
         self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
         self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
         self.gguf_writer.add_feed_forward_length(intermediate_size)
         self.gguf_writer.add_file_type(self.ftype)
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index ebeb200aa..a48c4fb67 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -95,6 +95,8 @@ class Keys:
         ATTN_LOGIT_SOFTCAPPING   = "{arch}.attn_logit_softcapping"
         FINAL_LOGIT_SOFTCAPPING  = "{arch}.final_logit_softcapping"
         RESCALE_EVERY_N_LAYERS   = "{arch}.rescale_every_n_layers"
+        TIME_MIX_EXTRA_DIM       = "{arch}.time_mix_extra_dim"
+        TIME_DECAY_EXTRA_DIM     = "{arch}.time_decay_extra_dim"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 0388db567..3c95c2673 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -673,6 +673,12 @@ class GGUFWriter:
     def add_rescale_every_n_layers(self, count: int) -> None:
         self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
 
+    def add_time_mix_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
+
+    def add_time_decay_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
+
     def add_wkv_head_size(self, size: int) -> None:
         self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
 
diff --git a/src/llama.cpp b/src/llama.cpp
index a0b9dbc87..62e8c0c34 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -298,6 +298,8 @@ enum llm_kv {
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
     LLM_KV_RESCALE_EVERY_N_LAYERS,
+    LLM_KV_TIME_MIX_EXTRA_DIM,
+    LLM_KV_TIME_DECAY_EXTRA_DIM,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -400,6 +402,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING,   "%s.attn_logit_softcapping"   },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING,  "%s.final_logit_softcapping"  },
     { LLM_KV_RESCALE_EVERY_N_LAYERS,   "%s.rescale_every_n_layers"   },
+    { LLM_KV_TIME_MIX_EXTRA_DIM,       "%s.time_mix_extra_dim"       },
+    { LLM_KV_TIME_DECAY_EXTRA_DIM,     "%s.time_decay_extra_dim"     },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,     "%s.attention.head_count"     },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,  "%s.attention.head_count_kv"  },
@@ -2296,6 +2300,8 @@ struct llama_hparams {
 
     // for RWKV
     uint32_t rescale_every_n_layers = 0;
+    uint32_t time_mix_extra_dim = 0;
+    uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;
 
     float rope_attn_factor = 1.0f;
@@ -2362,6 +2368,8 @@ struct llama_hparams {
         if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
 
         if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
+        if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
+        if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
         if (this->wkv_head_size != other.wkv_head_size) return true;
 
         if (this->dec_start_token_id != other.dec_start_token_id) return true;
@@ -5909,6 +5917,8 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
                 ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
 
                 switch (hparams.n_layer) {
@@ -8365,8 +8375,8 @@
                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                 model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
 
-                const int time_mix_extra_dim = (n_embd == 4096) ? 64 : 32;
-                const int time_decay_extra_dim = (n_embd == 4096) ? 128 : 64;
+                const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+                const int time_decay_extra_dim = hparams.time_decay_extra_dim;
                 const int head_size = hparams.wkv_head_size;
                 const int attn_hidden_size = n_embd;
                 const int ffn_size = hparams.n_ff_arr[0];
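
Not part of the patch: a minimal sketch of how the two new metadata keys could be checked in a converted file with the gguf Python package. The architecture prefix "rwkv6" and the file name are assumptions here, not confirmed by the patch.

# check_rwkv6_keys.py -- hypothetical helper, assuming a model converted with
# the patched script and a "rwkv6" architecture prefix in the GGUF metadata.
from gguf import GGUFReader

reader = GGUFReader("rwkv6-model.gguf")  # placeholder path

for key in ("rwkv6.time_mix_extra_dim", "rwkv6.time_decay_extra_dim"):
    field = reader.fields.get(key)
    if field is None:
        print(f"{key}: missing (file converted before this change?)")
    else:
        # scalar uint32 metadata is stored in the last part of the field
        print(f"{key} = {field.parts[-1][0]}")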