llama : implement YaRN RoPE scaling (#2268)

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com> Co-authored-by: Jeffrey Quesnelle <jquesnelle@gmail.com>
2023-11-01 18:04:33 -04:00 · 2023-11-01 18:04:33 -04:00 · 898aeca90a
commit 898aeca90a
parent c43c2da8af
15 changed files with 763 additions and 257 deletions
--- a/llama.h
+++ b/llama.h
@ -106,6 +106,14 @@ extern "C" {
        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };

+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE        = 0,
+        LLAMA_ROPE_SCALING_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_YARN        = 2,
+        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+    };
+
    typedef struct llama_token_data {
        llama_token id; // token id
        float logit;    // log-odds of the token
@ -172,10 +180,16 @@ extern "C" {
        uint32_t n_batch;         // prompt processing maximum batch size
        uint32_t n_threads;       // number of threads to use for generation
        uint32_t n_threads_batch; // number of threads to use for batch processing
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency, 0 = from model
-        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+        float    rope_freq_base;   // RoPE base frequency, 0 = from model
+        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_attn_factor; // YaRN magnitude scaling factor
+        float    yarn_beta_fast;   // YaRN low correction dim
+        float    yarn_beta_slow;   // YaRN high correction dim
+        uint32_t yarn_orig_ctx;    // YaRN original context size

        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)