switched to NTK aware scaling
parent e19483ca0f, commit e17c8497cf
4 changed files with 26 additions and 25 deletions
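In rough terms, the previous code handled long contexts by compressing every RoPE position against GGML_TRAINING_CTX (p * GGML_TRAINING_CTX / n_ctx); this commit leaves positions untouched and instead enlarges the RoPE frequency base through the new get_theta_scale() helper ("NTK-aware" scaling). Below is a minimal standalone sketch contrasting the two approaches. It is not part of the commit, and the constants (n_dims = 128, n_ctx = 8192, the old 2176 training-context value) are only illustrative.

// Standalone sketch (not part of the diff) contrasting linear position
// compression with NTK-aware frequency-base scaling.
#include <math.h>
#include <stdio.h>

int main(void) {
    const int   n_dims = 128;      // rotary dimensions per head (illustrative)
    const int   n_ctx  = 8192;     // requested context length (illustrative)
    const float p      = 4096.0f;  // some token position

    // Old approach: keep base 10000, compress the position linearly
    // (only applied when n_ctx exceeded the training context).
    const float training_ctx = 2176.0f;  // was GGML_TRAINING_CTX
    const float p_old        = p * training_ctx / n_ctx;
    const float scale_old    = powf(10000.0f, -2.0f / n_dims);

    // New approach: keep the position, enlarge the frequency base instead.
    const float alpha     = (n_ctx <= 4096 ? 4.0f : 8.0f);
    const float base      = 10000.0f * powf(alpha, n_dims / (n_dims - 2.0f));
    const float scale_new = powf(base, -2.0f / n_dims);

    printf("old: p=%.1f theta_scale=%f\n", p_old, scale_old);
    printf("new: p=%.1f theta_scale=%f\n", p,     scale_new);
    return 0;
}

Compile with something like cc ntk_sketch.c -lm to print the two per-dimension scale factors.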
@@ -2223,10 +2223,10 @@ inline void ggml_cuda_op_rope(
     const int n_ctx = ((int32_t *) src1->data)[3];
     GGML_ASSERT(mode == 0);

-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
     const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);

-    const float p = n_ctx <= GGML_TRAINING_CTX ? p0 : p0 * GGML_TRAINING_CTX / n_ctx;
+    const float p = p0;

     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
ggml.c (37 changes)
@@ -4242,6 +4242,22 @@ static inline int ggml_up(int n, int m) {
 #define ggml_assert_aligned(ptr) \
     GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)

+float get_theta_scale(int n_dims,int n_past,int n_ctx)
+{
+    if(n_ctx<=2048) //normie mode
+    {
+        return powf(10000.0, -2.0f/n_dims);
+    }
+    else
+    {
+        //using scaled NTK aware ctx
+        float a = (n_ctx<=4096?4.0:8.0);
+        float m = powf(a, n_dims / (n_dims - 2.0));
+        float s = powf(10000.0 * m, -2.0f/n_dims);
+        return s;
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////

 struct ggml_context * ggml_init(struct ggml_init_params params) {
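For reference on what the helper above does: writing d = n_dims and alpha for the factor a chosen from n_ctx, the per-pair rotation angles become

    theta_i = (10000 * alpha^(d/(d-2)))^(-2i/d) = 10000^(-2i/d) * alpha^(-2i/(d-2)),   i = 0 .. d/2-1

so the highest-frequency pair (i = 0) is left unchanged while the lowest-frequency pair (i = d/2-1) is slowed down by a factor of exactly alpha. That is the NTK-aware idea: stretch only the low-frequency dimensions to cover the longer context, instead of compressing every position uniformly as the removed GGML_TRAINING_CTX path did.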
@@ -12531,7 +12547,7 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);

     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
@@ -12571,9 +12587,7 @@ static void ggml_compute_forward_rope_f32(
                     dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
                 }
             } else if (!is_neox) {
-                if (n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
@@ -12674,7 +12688,7 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);

     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
@@ -12714,9 +12728,6 @@ static void ggml_compute_forward_rope_f16(
                     dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
                 }
             } if (!is_neox) {
-                if (n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                 for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
@@ -12842,7 +12853,7 @@ static void ggml_compute_forward_rope_back_f32(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);

     const bool is_neox = mode & 2;

@@ -12856,9 +12867,6 @@ static void ggml_compute_forward_rope_back_f32(
            float theta = (float)p;

            if (!is_neox) {
-                if (n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                    const float cos_theta = cosf(theta);
                    const float sin_theta = sinf(theta);
@@ -12959,7 +12967,7 @@ static void ggml_compute_forward_rope_back_f16(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);

     const bool is_neox = mode & 2;

@@ -12973,9 +12981,6 @@ static void ggml_compute_forward_rope_back_f16(
            float theta = (float)p;

            if (!is_neox) {
-                if (n_ctx > GGML_TRAINING_CTX) {
-                    theta = theta * GGML_TRAINING_CTX / n_ctx;
-                }
                for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
                    const float cos_theta = cosf(theta);
                    const float sin_theta = sinf(theta);
ggml.h (8 changes)
@@ -201,12 +201,6 @@
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4

-// Maximum training context of the model in use
-// For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
-#ifndef GGML_TRAINING_CTX
-#define GGML_TRAINING_CTX 2176
-#endif
-
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -510,6 +504,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);

+    GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
+
     // main
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);

@@ -2633,7 +2633,7 @@ struct llama_context * llama_new_context_with_model(

        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));

-        const size_t bigctxmul = (hparams.n_ctx>2048?2:1);
+        const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1));
        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)*bigctxmul);
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)*bigctxmul);
    }
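The last hunk grows the eval scratch buffers along with the requested context, presumably because longer contexts produce proportionally larger intermediate tensors. A minimal restatement of the new multiplier, using a hypothetical helper name that is not in the commit:

// Hypothetical helper mirroring the bigctxmul expression above (illustrative only).
static size_t scratch_multiplier(size_t n_ctx) {
    return n_ctx > 4096 ? 3 : (n_ctx > 2048 ? 2 : 1);
}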