Add attention and final logit softcapping.

2024-06-28 15:42:19 -04:00 · 2024-06-28 15:42:19 -04:00 · 4d3f17b4ac
commit 4d3f17b4ac
parent 8748d8ac6f
4 changed files with 38 additions and 2 deletions
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2363,6 +2363,14 @@ class Gemma2Model(Model):
        self.gguf_writer.add_key_length(hparams["head_dim"])
        self.gguf_writer.add_value_length(hparams["head_dim"])
        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_float32(
+            gguf.Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.model_arch),
+            self.hparams["attn_logit_softcapping"]
+        )
+        self.gguf_writer.add_float32(
+            gguf.Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.model_arch),
+            self.hparams["final_logit_softcapping"]
+        )

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -50,6 +50,8 @@ class Keys:
        POOLING_TYPE                      = "{arch}.pooling_type"
        LOGIT_SCALE                       = "{arch}.logit_scale"
        DECODER_START_TOKEN_ID            = "{arch}.decoder_start_token_id"
+        ATTN_LOGIT_SOFTCAPPING            = "{arch}.attn_logit_softcapping"
+        FINAL_LOGIT_SOFTCAPPING           = "{arch}.final_logit_softcapping"

    class Attention:
        HEAD_COUNT        = "{arch}.attention.head_count"
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -627,6 +627,9 @@ class GGUFWriter:
    def add_mask_token_id(self, id: int) -> None:
        self.add_uint32(Keys.Tokenizer.MASK_ID, id)

+    def add_eot_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOT_ID, id)
+
    def add_add_bos_token(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_BOS, value)

--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -302,6 +302,8 @@ enum llm_kv {
    LLM_KV_POOLING_TYPE,
    LLM_KV_LOGIT_SCALE,
    LLM_KV_DECODER_START_TOKEN_ID,
+    LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+    LLM_KV_FINAL_LOGIT_SOFTCAPPING,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -392,6 +394,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_POOLING_TYPE ,                     "%s.pooling_type"                      },
    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
    { LLM_KV_DECODER_START_TOKEN_ID,            "%s.decoder_start_token_id"            },
+    { LLM_KV_ATTN_LOGIT_SOFTCAPPING,            "%s.attn_logit_softcapping"            },
+    { LLM_KV_FINAL_LOGIT_SOFTCAPPING,           "%s.final_logit_softcapping"           },

    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
@@ -2099,6 +2103,9 @@ struct llama_hparams {
    float f_norm_eps;
    float f_norm_rms_eps;

+    float f_attn_logit_softcapping;
+    float f_final_logit_softcapping;
+
    float    rope_attn_factor = 1.0f;
    float    rope_freq_base_train;
    float    rope_freq_scale_train;
@@ -2115,8 +2122,9 @@ struct llama_hparams {
    float f_max_alibi_bias = 0.0f;
    float f_logit_scale    = 0.0f;

-    bool causal_attn = true;
-    bool use_alibi   = false;
+    bool causal_attn   = true;
+    bool use_alibi     = false;
+    bool attn_soft_cap = false;

    enum llama_pooling_type      pooling_type            = LLAMA_POOLING_TYPE_NONE;
    enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
@@ -4702,6 +4710,9 @@ static void llm_load_hparams(
        case LLM_ARCH_GEMMA2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
+                hparams.attn_soft_cap = true;

                switch (hparams.n_layer) {
                    case 42: model.type = e_model::MODEL_9B; break;
@@ -7579,6 +7590,12 @@ static struct ggml_tensor * llm_build_kqv(
            kq = ggml_scale(ctx, kq, 30);
        }

+        if (hparams.attn_soft_cap) {
+            kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping);
+            kq = ggml_tanh(ctx, kq);
+            kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping);
+        }
+
        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
        cb(kq, "kq_soft_max_ext", il);

@@ -11106,6 +11123,12 @@ struct llm_build_context {

        // lm_head
        cur = ggml_mul_mat(ctx0, model.output, cur);
+
+        // final logit soft-capping
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+
        cb(cur, "result_output", -1);

        ggml_build_forward_expand(gf, cur);