diff --git a/expose.h b/expose.h
index 2c8cc98a2..71648d292 100644
--- a/expose.h
+++ b/expose.h
@@ -36,6 +36,7 @@ struct load_model_inputs
     const int debugmode = 0;
     const int forceversion = 0;
     const int gpulayers = 0;
+    const bool linear_rope;
     const char * banned_tokens[ban_token_max];
 };
 struct generation_inputs
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 75db6e9f3..e2400457d 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2548,6 +2548,10 @@ inline void ggml_cuda_op_rope(
 
     const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);
-    const float p = p0;
+    float p = p0;
+    if(!get_ntk_rope_scale_mode())
+    {
+        p = n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx;
+    }
 
     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
diff --git a/ggml.c b/ggml.c
index 6616c9e69..1b253f081 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4284,20 +4284,33 @@ static inline int ggml_up(int n, int m) {
 #define ggml_assert_aligned(ptr) \
     GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
 
+static bool useNtkRope = true; //uses linear rope if not NTK
+void set_ntk_rope_scale_mode(bool useNtk)
+{
+    useNtkRope = useNtk;
+}
+bool get_ntk_rope_scale_mode()
+{
+    return useNtkRope;
+}
 float get_theta_scale(int n_dims,int n_past,int n_ctx)
 {
-    if(n_ctx<=2048) //normie mode
-    {
-        return powf(10000.0, -2.0f/n_dims);
-    }
-    else
-    {
-        //using scaled NTK aware ctx
-        float a = (n_ctx<=4096?4.0:8.0);
-        float m = powf(a, n_dims / (n_dims - 2.0));
-        float s = powf(10000.0 * m, -2.0f/n_dims);
-        return s;
-    }
+    if (!get_ntk_rope_scale_mode())
+    {
+        return powf(10000.0, -2.0f / n_dims);
+    }
+    if (n_ctx <= 2048) //normie mode
+    {
+        return powf(10000.0, -2.0f / n_dims);
+    }
+    else
+    {
+        //using scaled NTK aware ctx
+        float a = (n_ctx <= 4096 ? 4.0 : 8.0);
+        float m = powf(a, n_dims / (n_dims - 2.0));
+        float s = powf(10000.0 * m, -2.0f / n_dims);
+        return s;
+    }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -12044,7 +12057,9 @@ static void ggml_compute_forward_rope_f32(
                     dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
                 }
             } else if (!is_neox) {
-
+                if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
+                    theta = theta * GGML_TRAINING_CTX / n_ctx;
+                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
@@ -12172,6 +12187,9 @@ static void ggml_compute_forward_rope_f16(
                     dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
                 }
             } if (!is_neox) {
+                if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
+                    theta = theta * GGML_TRAINING_CTX / n_ctx;
+                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
@@ -12297,6 +12315,9 @@ static void ggml_compute_forward_rope_back_f32(
             float theta = (float)p;
 
             if (!is_neox) {
+                if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
+                    theta = theta * GGML_TRAINING_CTX / n_ctx;
+                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
@@ -12397,6 +12418,9 @@ static void ggml_compute_forward_rope_back_f16(
             float theta = (float)p;
 
             if (!is_neox) {
+                if (!get_ntk_rope_scale_mode() && n_ctx > GGML_TRAINING_CTX) {
+                    theta = theta * GGML_TRAINING_CTX / n_ctx;
+                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
diff --git a/ggml.h b/ggml.h
index 484b4ecb6..6bb12f3e7 100644
--- a/ggml.h
+++ b/ggml.h
@@ -203,6 +203,12 @@
 #define GGML_UNUSED(x) (void)(x)
 
+// Maximum training context of the model in use
+// For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
+#ifndef GGML_TRAINING_CTX
+#define GGML_TRAINING_CTX 2048
+#endif
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -537,6 +543,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
+    GGML_API void set_ntk_rope_scale_mode(bool useNtk);
+    GGML_API bool get_ntk_rope_scale_mode();
     GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
 
     // main
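
Taken together, the ggml changes above reduce to two alternative bits of arithmetic: NTK-aware mode enlarges the RoPE frequency base once the context exceeds 2048, while linear mode keeps the base at 10000 and instead compresses the rotation angle by GGML_TRAINING_CTX / n_ctx. The standalone sketch below is not part of the patch; the helper names theta_scale and linear_position are illustrative only, but the formulas mirror get_theta_scale() and the theta rescaling added to the rope kernels.

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

#define GGML_TRAINING_CTX 2048   /* same default as the ggml.h addition above */

/* Mirrors get_theta_scale(): NTK-aware mode enlarges the base past 2048 ctx,
   linear mode (and short contexts) keep the stock base of 10000. */
static float theta_scale(int n_dims, int n_ctx, bool ntk_aware)
{
    if (!ntk_aware || n_ctx <= 2048) {
        return powf(10000.0f, -2.0f / n_dims);
    }
    const float a = (n_ctx <= 4096 ? 4.0f : 8.0f);
    const float m = powf(a, n_dims / (n_dims - 2.0f));
    return powf(10000.0f * m, -2.0f / n_dims);
}

/* Mirrors the linear path added to the rope kernels: the starting angle (the
   token position) is compressed so n_ctx positions span the trained range. */
static float linear_position(float p, int n_ctx)
{
    return n_ctx <= GGML_TRAINING_CTX ? p : p * GGML_TRAINING_CTX / n_ctx;
}

int main(void)
{
    const int n_dims = 128; /* per-head dimension of the 7B/13B LLaMA models */
    for (int n_ctx = 2048; n_ctx <= 8192; n_ctx *= 2) {
        printf("n_ctx=%5d  ntk_scale=%.6f  linear_scale=%.6f  pos 4095 -> %7.1f\n",
               n_ctx,
               theta_scale(n_dims, n_ctx, true),
               theta_scale(n_dims, n_ctx, false),
               linear_position(4095.0f, n_ctx));
    }
    return 0;
}

At n_ctx = 4096, for example, NTK-aware mode works out to an effective base of roughly 41000, while linear mode leaves the base alone and compresses position 4095 down to 2047.5.
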
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 5996911a1..6dcc2cfe7 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -346,6 +346,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
 
+    //handle linear rope
+    if(inputs.linear_rope)
+    {
+        printf("Using Linear RoPE scaling instead of NTK-Aware scaling.\n");
+    }
+    set_ntk_rope_scale_mode(!inputs.linear_rope);
+
     //handle custom token bans
     banned_tokens.clear();
     for(int x=0;x<ban_token_max;++x)
diff --git a/koboldcpp.py b/koboldcpp.py
--- a/koboldcpp.py
+++ b/koboldcpp.py
+        if inputs.sampler_len>0 and (inputs.sampler_order[0]!=6 or inputs.sampler_order[inputs.sampler_len-1]!=5):
+            print("\n(Warning!!! Poor sampler_order detected! You will have reduced quality. Recommended values are [6,0,1,3,4,2,5])")
     except TypeError as e:
         print("ERROR: sampler_order must be a list of integers: " + str(e))
     inputs.seed = seed
@@ -606,14 +610,13 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):
 
 # note: customtkinter-5.2.0
 def show_new_gui():
-    import customtkinter as ctk
     from tkinter.filedialog import askopenfilename
     from tkinter.filedialog import asksaveasfile
 
     # if args received, launch
     if len(sys.argv) != 1:
-        root = ctk.CTk()
-        #we dont want the useless window to be visible, but we want it in taskbar
+        import tkinter as tk
+        root = tk.Tk() #we dont want the useless window to be visible, but we want it in taskbar
         root.attributes("-alpha", 0)
         args.model_param = askopenfilename(title="Select ggml model .bin files")
         root.destroy()
@@ -623,6 +626,8 @@ def show_new_gui():
             sys.exit(2)
         return
 
+    import customtkinter as ctk
+
     nextstate = 0 #0=exit, 1=launch, 2=oldgui
     windowwidth = 520
     windowheight = 500
@@ -1413,6 +1418,7 @@ if __name__ == '__main__':
     parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
     parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192], default=2048)
    parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
+    parser.add_argument("--linearrope", help="If set, uses linear RoPE scaling. Otherwise, uses NTK-Aware scaling.", action='store_true')
     parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
     parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
     parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents the EOS token from being generated. This flag unbans it.", action='store_true')
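
End to end, the new option is a single process-wide switch: --linearrope sets load_model_inputs.linear_rope, gpttype_load_model() inverts it into set_ntk_rope_scale_mode(), and the RoPE code paths consult get_ntk_rope_scale_mode() to choose between the two formulas. The self-contained sketch below illustrates that toggle pattern; the setter and getter mirror the ggml.c additions, while configure_rope() and main() are hypothetical drivers added only to show the inversion.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the process-wide flag added to ggml.c: NTK-aware scaling stays the default. */
static bool useNtkRope = true;
void set_ntk_rope_scale_mode(bool useNtk) { useNtkRope = useNtk; }
bool get_ntk_rope_scale_mode(void) { return useNtkRope; }

/* Hypothetical helper standing in for the gpttype_adapter.cpp logic. */
static void configure_rope(bool linear_rope)
{
    if (linear_rope) {
        printf("Using Linear RoPE scaling instead of NTK-Aware scaling.\n");
    }
    /* Note the inversion: requesting linear RoPE means turning NTK mode off. */
    set_ntk_rope_scale_mode(!linear_rope);
}

int main(void)
{
    configure_rope(true); /* e.g. the user passed --linearrope */
    printf("NTK-aware mode is now %s\n", get_ntk_rope_scale_mode() ? "on" : "off");
    return 0;
}
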