From 8d498c7075f6e304b2f3e688f3a706ec11d533dc Mon Sep 17 00:00:00 2001
From: Molly Sophia
Date: Tue, 6 Aug 2024 18:53:27 +0800
Subject: [PATCH] Add ``rescale_every_n_layers`` parameter

Signed-off-by: Molly Sophia
---
 convert_hf_to_gguf.py       |  4 ++--
 gguf-py/gguf/constants.py   |  1 +
 gguf-py/gguf/gguf_writer.py |  3 +++
 src/llama.cpp               | 11 ++++++++++-
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d109857a2..65165b764 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2754,6 +2754,7 @@ class RwkvModel(Model):
         head_size = self.hparams["head_size"]
         hidden_size = self.hparams["hidden_size"]
         layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
 
         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
@@ -2762,14 +2763,13 @@ class RwkvModel(Model):
         self.gguf_writer.add_head_count(0)
         self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
         self.gguf_writer.add_feed_forward_length(0) # required by llama.cpp
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
         # temporarlily reuse mamba hparams
         self.gguf_writer.add_ssm_inner_size(hidden_size)
         self.gguf_writer.add_ssm_conv_kernel(3)
         self.gguf_writer.add_ssm_state_size(head_size)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
         new_name = self.map_tensor_name(name)
 
         if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index b6f29ba9e..a6883b392 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -94,6 +94,7 @@ class Keys:
        DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
        ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
        FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
+        RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index af3b98c67..6bc3782c3 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -670,6 +670,9 @@ class GGUFWriter:
     def add_expert_weights_scale(self, value: float) -> None:
         self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
 
+    def add_rescale_every_n_layers(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
+
     def add_layer_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
 
diff --git a/src/llama.cpp b/src/llama.cpp
index c43776acd..bfc292f59 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -297,6 +297,7 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+    LLM_KV_RESCALE_EVERY_N_LAYERS,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -391,11 +392,12 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
-    { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+    { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+    { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -2287,6 +2289,9 @@ struct llama_hparams {
     float f_attn_logit_softcapping = 50.0f;
     float f_final_logit_softcapping = 30.0f;
 
+    // for RWKV
+    uint32_t rescale_every_n_layers = 0;
+
     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
     float rope_freq_scale_train;
@@ -5883,6 +5888,7 @@ static void llm_load_hparams(
         case LLM_ARCH_RWKV:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
 
                 // TODO: Re-using mamba keys right now, but RWKV isn't state-space
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
@@ -15130,6 +15136,9 @@ struct llm_build_context {
                     ffn_shift
                 )
             );
+            // guard against modulo-by-zero when the optional key is absent (default 0)
+            if (hparams.rescale_every_n_layers > 0 && (layer_i + 1) % hparams.rescale_every_n_layers == 0) {
+                x = ggml_scale_inplace(ctx0, x, 0.5F);
+            }
         }
 
         // Something related to skipping tokens, specifics unclear