llama: rwkv6: Make use of key `feed_forward_length`

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
Author: Molly Sophia
Date: 2024-08-25 16:16:29 +08:00
Commit: c414a24a5a (parent 87a29014a4)

2 changed files with 3 additions and 3 deletions

@@ -2757,6 +2757,7 @@ class Rwkv6Model(Model):
         hidden_size = self.hparams["hidden_size"]
         layer_norm_eps = self.hparams["layer_norm_epsilon"]
         rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)

         # RWKV isn't context limited
         self.gguf_writer.add_context_length(1048576)
@@ -2765,11 +2766,11 @@ class Rwkv6Model(Model):
         self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
         self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
         self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
         self.gguf_writer.add_file_type(self.ftype)

         # required by llama.cpp, unused
         self.gguf_writer.add_head_count(0)
-        self.gguf_writer.add_feed_forward_length(0)

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)

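For configs that leave `intermediate_size` unset, the converter falls back to 3.5x the hidden size rounded down to a multiple of 32, which is the same value the C++ loader used to hard-code, so existing RWKV6 conversions end up with an unchanged FFN size. A small illustration of that fallback; the helper name and the hidden sizes are made up for the example:

# Illustrative sketch only: mirrors the fallback expression added to Rwkv6Model above.
def default_rwkv6_ffn_size(hidden_size: int) -> int:
    # 3.5x the hidden size, rounded down to a multiple of 32
    return int((hidden_size * 3.5) // 32 * 32)

for hidden in (1024, 2048, 4096):  # hypothetical hidden sizes
    print(hidden, default_rwkv6_ffn_size(hidden))
# 1024 -> 3584, 2048 -> 7168, 4096 -> 14336
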
@@ -8369,7 +8369,7 @@ static bool llm_load_tensors(
                 const int time_decay_extra_dim = (n_embd == 4096) ? 128 : 64;
                 const int head_size = hparams.wkv_head_size;
                 const int attn_hidden_size = n_embd;
-                const int ffn_size = (int)(n_embd * 3.5 / 32) * 32;
+                const int ffn_size = hparams.n_ff_arr[0];

                 for (int i = 0; i < n_layer; ++i) {
                     ggml_context * ctx_layer = ctx_for_layer(i);
@@ -8392,7 +8392,6 @@ static bool llm_load_tensors(
                     layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
                     layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});

-                    // TODO: Parametrize hardcoded dimensions for first & decay
                     layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
                     layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
                     layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
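
On the loader side, the value written by the converter populates `hparams.n_ff_arr`, which the tensor-loading code above now reads instead of recomputing the size. A minimal sketch for checking that a converted file actually carries the key, using the gguf Python package that ships with llama.cpp; the model path and the exact key name matched here are assumptions for illustration:

# Sketch: print the feed_forward_length metadata from a converted GGUF file.
from gguf import GGUFReader  # gguf-py package shipped with llama.cpp

reader = GGUFReader("rwkv6-model.gguf")  # hypothetical file path
for name, field in reader.fields.items():
    if name.endswith("feed_forward_length"):
        # Scalar fields store their single value in parts[data[0]].
        print(name, int(field.parts[field.data[0]][0]))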