Remove custom pre attention scaling and use computed value instead.

This commit is contained in:
Andrei Betlen 2024-06-29 23:02:50 -04:00
parent a89427908d
commit 51f0bd50a1
4 changed files with 2 additions and 13 deletions

View file

@@ -52,7 +52,6 @@ class Keys:
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
QUERY_PRE_ATTN_SCALAR = "{arch}.query_pre_attn_scalar"
class Attention:
HEAD_COUNT = "{arch}.attention.head_count"

View file

@@ -522,9 +522,6 @@ class GGUFWriter:
def add_final_logit_softcapping(self, value: float) -> None:
self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
def add_query_pre_attn_scalar(self, value: float) -> None:
self.add_float32(Keys.LLM.QUERY_PRE_ATTN_SCALAR.format(arch=self.arch), value)
def add_expert_count(self, count: int) -> None:
self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)