From e17c8497cff7841932e47df572a4cfffa7f8ff3c Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 2 Jul 2023 17:25:08 +0800
Subject: [PATCH] switched to NTK aware scaling

---
 ggml-cuda.cu |  4 ++--
 ggml.c       | 37 +++++++++++++++++++++----------------
 ggml.h       |  8 ++------
 llama.cpp    |  2 +-
 4 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index ef6d28f4e..6bc489ce4 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2223,10 +2223,10 @@ inline void ggml_cuda_op_rope(
     const int n_ctx = ((int32_t *) src1->data)[3];
     GGML_ASSERT(mode == 0);
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
     const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);
 
-    const float p = n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx;
+    const float p = p0;
 
     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
diff --git a/ggml.c b/ggml.c
index 1bd43ff4b..ca87aea02 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4242,6 +4242,22 @@ static inline int ggml_up(int n, int m) {
 #define ggml_assert_aligned(ptr) \
     GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
 
+float get_theta_scale(int n_dims,int n_past,int n_ctx)
+{
+    if(n_ctx<=2048) //normie mode
+    {
+        return powf(10000.0, -2.0f/n_dims);
+    }
+    else
+    {
+        //using scaled NTK aware ctx
+        float a = (n_ctx<=4096?4.0:8.0);
+        float m = powf(a, n_dims / (n_dims - 2.0));
+        float s = powf(10000.0 * m, -2.0f/n_dims);
+        return s;
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_context * ggml_init(struct ggml_init_params params) {
@@ -12531,7 +12547,7 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
 
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
@@ -12571,9 +12587,7 @@ static void ggml_compute_forward_rope_f32(
                         dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
                     }
                 } else if (!is_neox) {
-                    if (n_ctx > GGML_TRAINING_CTX) {
-                        theta = theta * GGML_TRAINING_CTX / n_ctx;
-                    }
+
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
@@ -12674,7 +12688,7 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
 
     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
@@ -12714,9 +12728,6 @@ static void ggml_compute_forward_rope_f16(
                         dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
                     }
                 } if (!is_neox) {
-                    if (n_ctx > GGML_TRAINING_CTX) {
-                        theta = theta * GGML_TRAINING_CTX / n_ctx;
-                    }
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
@@ -12842,7 +12853,7 @@ static void ggml_compute_forward_rope_back_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
 
     const bool is_neox = mode & 2;
 
@@ -12856,9 +12867,6 @@ static void ggml_compute_forward_rope_back_f32(
                 float theta = (float)p;
 
                 if (!is_neox) {
-                    if (n_ctx > GGML_TRAINING_CTX) {
-                        theta = theta * GGML_TRAINING_CTX / n_ctx;
-                    }
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
@@ -12959,7 +12967,7 @@ static void ggml_compute_forward_rope_back_f16(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
 
     const bool is_neox = mode & 2;
 
@@ -12973,9 +12981,6 @@ static void ggml_compute_forward_rope_back_f16(
                 float theta = (float)p;
 
                 if (!is_neox) {
-                    if (n_ctx > GGML_TRAINING_CTX) {
-                        theta = theta * GGML_TRAINING_CTX / n_ctx;
-                    }
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                         const float cos_theta = cosf(theta);
                         const float sin_theta = sinf(theta);
diff --git a/ggml.h b/ggml.h
index e05de3eb5..d4d0330d1 100644
--- a/ggml.h
+++ b/ggml.h
@@ -201,12 +201,6 @@
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4
 
-// Maximum training context of the model in use
-// For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
-#ifndef GGML_TRAINING_CTX
-#define GGML_TRAINING_CTX 2176
-#endif
-
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -510,6 +504,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
diff --git a/llama.cpp b/llama.cpp
index f0cb84139..af182a2f9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2633,7 +2633,7 @@ struct llama_context * llama_new_context_with_model(
 
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
-        const size_t bigctxmul = (hparams.n_ctx>2048?2:1);
+        const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1));
         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)*bigctxmul);
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)*bigctxmul);
     }
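
Note (not part of the patch): the sketch below is a minimal standalone C program that mirrors the NTK-aware base scaling added in get_theta_scale() and prints the resulting theta_scale for a few context sizes. It assumes n_dims = 128 (the per-head dimension of the LLaMA models); any other value can be substituted.

/* Standalone sketch, not part of the patch: reproduces the NTK-aware
 * scaling from get_theta_scale() above. n_dims = 128 is an assumption
 * (LLaMA per-head dimension). Build with: cc sketch.c -lm */
#include <math.h>
#include <stdio.h>

static float theta_scale_sketch(int n_dims, int n_ctx) {
    if (n_ctx <= 2048) {
        return powf(10000.0f, -2.0f / n_dims);           // stock RoPE base
    }
    const float a = (n_ctx <= 4096) ? 4.0f : 8.0f;       // same thresholds as the patch
    const float m = powf(a, n_dims / (n_dims - 2.0f));   // NTK-aware base multiplier
    return powf(10000.0f * m, -2.0f / n_dims);           // scaled base 10000*m
}

int main(void) {
    const int n_dims = 128;                              // assumed head dimension
    const int ctxs[] = { 2048, 4096, 8192 };
    for (int i = 0; i < 3; i++) {
        printf("n_ctx=%5d  theta_scale=%f\n", ctxs[i], theta_scale_sketch(n_dims, ctxs[i]));
    }
    return 0;
}

Raising the base to 10000 * a^(n_dims/(n_dims-2)) stretches the long-wavelength RoPE dimensions instead of linearly rescaling every position, which is why the GGML_TRAINING_CTX position rescaling is removed in the same patch.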