better naming

parent ab2c3de9b3
commit 46b56e6768

4 changed files with 12 additions and 7 deletions
```diff
@@ -2369,6 +2369,7 @@ class Gemma2Model(Model):
         self.gguf_writer.add_final_logit_softcapping(
             self.hparams["final_logit_softcapping"]
         )
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
```
```diff
@@ -66,6 +66,7 @@ class Keys:
         Q_LORA_RANK       = "{arch}.attention.q_lora_rank"
         KV_LORA_RANK      = "{arch}.attention.kv_lora_rank"
         REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
+        SLIDING_WINDOW    = "{arch}.attention.sliding_window"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
```
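For reference, the `{arch}` placeholder in these key templates is substituted with the model's architecture string when the metadata is written, so for Gemma 2 (whose GGUF architecture name is `gemma2`) the new constant resolves to a plain namespaced key. A quick illustrative snippet:

```python
# The GGUF metadata key is a template; the writer formats it per architecture.
SLIDING_WINDOW = "{arch}.attention.sliding_window"

print(SLIDING_WINDOW.format(arch="gemma2"))
# -> gemma2.attention.sliding_window
```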
```diff
@@ -552,6 +552,9 @@ class GGUFWriter:
     def add_relative_attn_buckets_count(self, value: int) -> None:
         self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
 
+    def add_sliding_window(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
```
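The new `add_sliding_window()` helper is what the Gemma2Model conversion code above calls to carry the window size from the Hugging Face config into the GGUF header. A minimal sketch of that round trip, assuming a `gguf` Python package recent enough to include the method added here; the output path and the `hparams` dict are placeholders, with 4096 being the window size Gemma 2 configs ship:

```python
# Sketch: forward the sliding-window size from an HF-style config into GGUF metadata.
from gguf import GGUFWriter

hparams = {"sliding_window": 4096}  # illustrative; Gemma 2 configs carry this field

writer = GGUFWriter("gemma2-example.gguf", arch="gemma2")
writer.add_sliding_window(hparams["sliding_window"])  # -> gemma2.attention.sliding_window (uint32)

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
```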
```diff
@@ -287,7 +287,6 @@ enum llm_kv {
     LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
-    LLM_KV_CONTEXT_LENGTH_SWA,
     LLM_KV_EMBEDDING_LENGTH,
     LLM_KV_BLOCK_COUNT,
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
```
```diff
@@ -318,6 +317,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
+    LLM_KV_ATTENTION_SLIDING_WINDOW,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
```
```diff
@@ -380,7 +380,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VOCAB_SIZE,                "%s.vocab_size"                },
     { LLM_KV_CONTEXT_LENGTH,            "%s.context_length"            },
-    { LLM_KV_CONTEXT_LENGTH_SWA,        "%s.context_length_swa"        },
     { LLM_KV_EMBEDDING_LENGTH,          "%s.embedding_length"          },
     { LLM_KV_BLOCK_COUNT,               "%s.block_count"               },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
```
```diff
@@ -411,6 +410,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
     { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,             "%s.rope.dimension_count"             },
     { LLM_KV_ROPE_FREQ_BASE,                   "%s.rope.freq_base"                   },
```
```diff
@@ -2082,7 +2082,6 @@ struct llama_hparams {
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
-    int32_t  n_ctx_swa = -1; // context size for sliding window attention (SWA)
     uint32_t n_embd;
     uint32_t n_head;
     uint32_t n_head_kv;
```
```diff
@@ -2102,6 +2101,7 @@ struct llama_hparams {
     uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float    expert_weights_scale = 0.0;
+    uint32_t n_sliding = 0; // sliding window attention (SWA)
 
     float f_norm_eps;
     float f_norm_rms_eps;
```
```diff
@@ -4715,8 +4715,8 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_GEMMA2:
             {
-                hparams.n_ctx_swa = 4096; // default value
-                ml.get_key(LLM_KV_CONTEXT_LENGTH_SWA, hparams.n_ctx_swa, false);
+                hparams.n_sliding = 4096; // default value of gemma 2
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_sliding, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
                 ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
```
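The trailing `false` argument makes the metadata read optional, so files converted before this key existed keep the hard-coded Gemma 2 default of 4096. A small Python sketch of that "preset default, optional override" pattern (the `metadata` dict is a stand-in for a loaded model's GGUF key/value store, not the actual loader API):

```python
# Sketch of the "preset default, then optional metadata override" pattern above.
def load_sliding_window(metadata: dict, arch: str = "gemma2") -> int:
    n_sliding = 4096  # Gemma 2 default, used when the key is absent (required=False)
    return int(metadata.get(f"{arch}.attention.sliding_window", n_sliding))

print(load_sliding_window({}))                                         # 4096
print(load_sliding_window({"gemma2.attention.sliding_window": 8192}))  # 8192
```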
```diff
@@ -12687,11 +12687,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         float * data     = (float *) lctx.inp_KQ_mask->data;
         float * data_swa = nullptr;
-        const llama_pos n_keep_swa = hparams.n_ctx_swa - batch.n_tokens;
+        const llama_pos n_keep_swa = hparams.n_sliding - batch.n_tokens;
 
         if (lctx.model.arch == LLM_ARCH_GEMMA2) {
             GGML_ASSERT(!lctx.inp_KQ_mask_l.empty() && "gemma 2 requires different KQ mask per layer");
-            GGML_ASSERT(hparams.n_ctx_swa > 0);
+            GGML_ASSERT(hparams.n_sliding > 0);
             data_swa = (float *) lctx.inp_KQ_mask_l[0]->data;
             data     = (float *) lctx.inp_KQ_mask_l[1]->data;
             // because layer masks are alternate for gemma 2, we only need to take first 2 layers
```
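The renamed `n_sliding` parameterizes the per-layer KQ masks: Gemma 2 alternates full-attention and sliding-window layers, which is why only the first two layer masks are needed. As a hedged illustration of the rule the parameter encodes, rather than the exact loop in `llama_set_inputs`: token `i` may attend to token `j` only when `j <= i` and `i - j < n_sliding`:

```python
# Illustrative only: build causal and sliding-window attention masks for a tiny sequence.
# Not the code from llama_set_inputs; just the rule that n_sliding parameterizes.
def build_masks(n_tokens: int, n_sliding: int):
    causal = [[0.0 if j <= i else float("-inf") for j in range(n_tokens)]
              for i in range(n_tokens)]
    swa = [[0.0 if (j <= i and i - j < n_sliding) else float("-inf")
            for j in range(n_tokens)]
           for i in range(n_tokens)]
    return causal, swa

causal, swa = build_masks(n_tokens=6, n_sliding=3)
for row in swa:
    print(["x" if v == 0.0 else "." for v in row])
# each token attends only to itself and the 2 previous tokens
```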