llama: rwkv6: Keep `time_mix_w1/w2` as F32

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
Author: Molly Sophia
Date: 2024-08-26 09:32:16 +08:00
Parent: 601b5920c6
Commit: e0ea51144e
2 changed files with 4 additions and 0 deletions

@@ -300,6 +300,8 @@ class Model:
                 gguf.MODEL_TENSOR.TOKEN_TYPES,
                 gguf.MODEL_TENSOR.SSM_CONV1D,
                 gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                gguf.MODEL_TENSOR.TIME_MIX_W1,
+                gguf.MODEL_TENSOR.TIME_MIX_W2,
             )
         )
         or not new_name.endswith(".weight")
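For context: this converter-side check forces any tensor whose name matches one of the listed gguf.MODEL_TENSOR keys to be written as F32 regardless of the requested output type, and the change adds TIME_MIX_W1/TIME_MIX_W2 to that list. Below is a minimal, self-contained Python sketch of the same pattern; the suffix strings and helper name are hypothetical stand-ins, not the converter's actual matching code.

# Minimal sketch (hypothetical names): mimic the converter-side rule that some
# tensor kinds are always stored as F32, whatever the requested file type.
# The real converter matches gguf.MODEL_TENSOR keys; plain name suffixes
# stand in for that matching here.
F32_ONLY_SUFFIXES = (
    "time_mix_first.weight",
    "time_mix_w1.weight",
    "time_mix_w2.weight",
)

def choose_qtype(tensor_name: str, requested: str) -> str:
    # Tensors on the F32-only list keep full precision; everything else
    # gets the requested type (e.g. "F16" or "Q8_0").
    if tensor_name.endswith(F32_ONLY_SUFFIXES):
        return "F32"
    return requested

print(choose_qtype("blk.0.time_mix_w1.weight", "F16"))  # -> F32
print(choose_qtype("blk.0.ffn_down.weight", "F16"))     # -> F16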

@@ -17472,6 +17472,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize RWKV's time_mix_first tensors
         quantize &= name.find("time_mix_first.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
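On the quantization side, the guard is a chain of quantize &= name.find(...) == std::string::npos checks, i.e. a substring blacklist over tensor names: any tensor whose name contains one of the listed substrings is copied through at its original precision instead of being quantized. The following is a rough Python restatement of that filter under that reading; the helper name is hypothetical and this is not code from llama.cpp.

# Rough restatement (hypothetical helper): the C++ hunk above keeps a tensor
# unquantized whenever its name contains one of these substrings.
NEVER_QUANTIZE = (
    "time_mix_first.weight",
    "time_mix_w1.weight",
    "time_mix_w2.weight",
    "attn_rel_b.weight",
)

def should_quantize(name: str) -> bool:
    # Equivalent to chaining `quantize &= name.find(sub) == std::string::npos`
    # for each blacklisted substring.
    return not any(sub in name for sub in NEVER_QUANTIZE)

print(should_quantize("blk.3.time_mix_w2.weight"))  # -> False (kept as-is)
print(should_quantize("blk.3.attn_q.weight"))       # -> True  (quantized)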