llama: rwkv6: Add kv `time_mix_extra_dim` and `time_decay_extra_dim`

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>

parent c414a24a5a
commit 6d69fd77b1

4 changed files with 24 additions and 2 deletions

@@ -2758,6 +2758,8 @@ class Rwkv6Model(Model):
         layer_norm_eps = self.hparams["layer_norm_epsilon"]
         rescale_every_n_layers = self.hparams["rescale_every"]
         intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
 
         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
@@ -2766,6 +2768,8 @@ class Rwkv6Model(Model):
         self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
         self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
         self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
         self.gguf_writer.add_feed_forward_length(intermediate_size)
         self.gguf_writer.add_file_type(self.ftype)
 
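Taken together, these two hunks have the converter pick the size-dependent defaults that were previously hard-coded in llm_load_tensors (see the last hunk below): 64 and 128 extra dims when hidden_size is 4096, otherwise 32 and 64, and record them explicitly as GGUF metadata. A minimal Python sketch of that selection logic, pulled out into a standalone helper purely for illustration (the helper name is mine, not part of the diff):

    # Sketch only: mirrors the defaults added to Rwkv6Model.set_gguf_parameters above.
    def rwkv6_extra_dims(hidden_size: int) -> tuple[int, int]:
        """Return (time_mix_extra_dim, time_decay_extra_dim) for a given hidden size."""
        time_mix_extra_dim = 64 if hidden_size == 4096 else 32     # inner width of the low-rank time-mix projection
        time_decay_extra_dim = 128 if hidden_size == 4096 else 64  # inner width of the low-rank time-decay projection
        return time_mix_extra_dim, time_decay_extra_dim

    assert rwkv6_extra_dims(2048) == (32, 64)
    assert rwkv6_extra_dims(4096) == (64, 128)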
@@ -95,6 +95,8 @@ class Keys:
         ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
         FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
         RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
+        TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
+        TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
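The new Keys.LLM entries are plain format-string templates; the converter fills in the architecture name when writing. A small illustration of how they expand (the "rwkv6" arch string is an assumption, matching the key names that appear in the C++ hunks below):

    # Illustration only; the "rwkv6" arch string is assumed.
    TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
    TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
    print(TIME_MIX_EXTRA_DIM.format(arch="rwkv6"))    # rwkv6.time_mix_extra_dim
    print(TIME_DECAY_EXTRA_DIM.format(arch="rwkv6"))  # rwkv6.time_decay_extra_dim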
@@ -673,6 +673,12 @@ class GGUFWriter:
     def add_rescale_every_n_layers(self, count: int) -> None:
         self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
 
+    def add_time_mix_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
+
+    def add_time_decay_extra_dim(self, dim: int) -> None:
+        self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
+
     def add_wkv_head_size(self, size: int) -> None:
         self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
 
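The two new writer methods are thin wrappers over add_uint32, so the dims land in the GGUF header as ordinary per-architecture uint32 KV pairs. A hedged usage sketch outside the converter (the output path, arch string, and dim values are placeholders; the surrounding GGUFWriter calls follow gguf-py's writer example):

    from gguf import GGUFWriter  # gguf-py

    writer = GGUFWriter("rwkv6-test.gguf", "rwkv6")  # illustrative path and arch
    writer.add_time_mix_extra_dim(32)    # writes uint32 KV "rwkv6.time_mix_extra_dim"
    writer.add_time_decay_extra_dim(64)  # writes uint32 KV "rwkv6.time_decay_extra_dim"
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()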
@@ -298,6 +298,8 @@ enum llm_kv {
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
     LLM_KV_RESCALE_EVERY_N_LAYERS,
+    LLM_KV_TIME_MIX_EXTRA_DIM,
+    LLM_KV_TIME_DECAY_EXTRA_DIM,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -400,6 +402,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
+    { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
+    { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
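These C-format names must stay byte-for-byte identical to what gguf-py writes, since the loader looks the keys up by string. A trivial Python check of that correspondence (arch string assumed):

    # The "%s"-style names above, with the arch substituted the C way, should equal
    # the "{arch}"-style templates from gguf-py's Keys.LLM.
    arch = "rwkv6"  # assumed arch string
    assert "%s.time_mix_extra_dim" % arch == "{arch}.time_mix_extra_dim".format(arch=arch)
    assert "%s.time_decay_extra_dim" % arch == "{arch}.time_decay_extra_dim".format(arch=arch)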
@@ -2296,6 +2300,8 @@ struct llama_hparams {
 
     // for RWKV
     uint32_t rescale_every_n_layers = 0;
+    uint32_t time_mix_extra_dim = 0;
+    uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;
 
     float rope_attn_factor = 1.0f;
@@ -2362,6 +2368,8 @@ struct llama_hparams {
         if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
 
         if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
+        if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
+        if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
         if (this->wkv_head_size != other.wkv_head_size) return true;
 
         if (this->dec_start_token_id != other.dec_start_token_id) return true;
@@ -5909,6 +5917,8 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
                 ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
 
                 switch (hparams.n_layer) {
@@ -8365,8 +8375,8 @@ static bool llm_load_tensors(
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                     model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
 
-                    const int time_mix_extra_dim = (n_embd == 4096) ? 64 : 32;
-                    const int time_decay_extra_dim = (n_embd == 4096) ? 128 : 64;
+                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
                     const int head_size = hparams.wkv_head_size;
                     const int attn_hidden_size = n_embd;
                     const int ffn_size = hparams.n_ff_arr[0];
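With the dims now read from the header, the hard-coded n_embd == 4096 special case disappears from llm_load_tensors, and the two new keys are read without the trailing `false`, so they are presumably required at load time (unlike rescale_every_n_layers). A hedged reader-side check that a converted file actually carries them (the path is a placeholder, and the field-access pattern follows gguf-py's reader examples, which may differ between gguf-py versions):

    from gguf import GGUFReader

    reader = GGUFReader("rwkv6-model.gguf")  # placeholder path
    for key in ("rwkv6.time_mix_extra_dim", "rwkv6.time_decay_extra_dim"):
        field = reader.get_field(key)
        assert field is not None, f"missing {key}; was the file converted before this change?"
        value = int(field.parts[field.data[0]][0])  # scalar uint32
        print(key, "=", value)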