cuda : add TODOs for RoPE NeoX implementation

This commit is contained in:
Georgi Gerganov 2023-08-23 21:32:12 +03:00
parent f8ee54bd2c
commit 8c6d3939c7
2 changed files with 30 additions and 3 deletions

View file

@ -3907,6 +3907,29 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
dst[i + 1] = x0*sin_theta + x1*cos_theta; dst[i + 1] = x0*sin_theta + x1*cos_theta;
} }
// TODO: this implementation is wrong!
//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
// const float p_delta, const int p_delta_rows, const float theta_scale) {
// const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
//
// if (col >= ncols) {
// return;
// }
//
// const int row = blockDim.x*blockIdx.x + threadIdx.x;
// const int i = row*ncols + col/2;
//
// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
// const float sin_theta = sinf(theta);
// const float cos_theta = cosf(theta);
//
// const float x0 = x[i + 0];
// const float x1 = x[i + ncols/2];
//
// dst[i + 0] = x0*cos_theta - x1*sin_theta;
// dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
//}
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) { static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
const int col = blockDim.x*blockIdx.x + threadIdx.x; const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int half_n_dims = ncols/4; const int half_n_dims = ncols/4;
@ -5515,7 +5538,8 @@ inline void ggml_cuda_op_rope(
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
const bool is_glm = mode & 4; const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
// compute // compute
if (is_glm) { if (is_glm) {
@ -5523,6 +5547,9 @@ inline void ggml_cuda_op_rope(
const float id_p = min(p, n_ctx - 2.f); const float id_p = min(p, n_ctx - 2.f);
const float block_p = max(p - (n_ctx - 2.f), 0.f); const float block_p = max(p - (n_ctx - 2.f), 0.f);
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
} else if (is_neox) {
GGML_ASSERT(false && "RoPE NeoX not implemented yet");
#pragma message("TODO: implement RoPE NeoX for CUDA")
} else { } else {
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);

4
ggml.c
View file

@ -12537,7 +12537,7 @@ static void ggml_compute_forward_rope_f32(
dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
} }
} else { } else {
// TODO: this is probably wrong, but I can't figure it out .. // TODO: this might be wrong for ne0 != n_dims - need double check
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) { for (int64_t ic = 0; ic < n_dims; ic += 2) {
@ -12666,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
} }
} else { } else {
// TODO: this is probably wrong, but I can't figure it out .. // TODO: this might be wrong for ne0 != n_dims - need double check
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) { for (int64_t ic = 0; ic < n_dims; ic += 2) {