cuda : add TODOs for RoPE NeoX implementation

commit 8c6d3939c7 (parent f8ee54bd2c)

 ggml-cuda.cu | 29 ++++++++++++++++++++++++++---
 ggml.c       |  4 ++--
 2 files changed, 30 insertions(+), 3 deletions(-)

ggml-cuda.cu
```diff
@@ -3907,6 +3907,29 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }
 
+// TODO: this implementation is wrong!
+//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+//                                     const float p_delta, const int p_delta_rows, const float theta_scale) {
+//    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+//
+//    if (col >= ncols) {
+//        return;
+//    }
+//
+//    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+//    const int i = row*ncols + col/2;
+//
+//    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+//    const float sin_theta = sinf(theta);
+//    const float cos_theta = cosf(theta);
+//
+//    const float x0 = x[i + 0];
+//    const float x1 = x[i + ncols/2];
+//
+//    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+//    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+//}
+
 static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
```
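The reads `x[i + 0]` and `x[i + ncols/2]` in the disabled kernel reflect the NeoX rotation convention, which pairs feature `j` with feature `j + n_dims/2`, unlike the adjacent pairing `(x[i], x[i + 1])` used by the existing `rope_f32` kernel above. A minimal host-side sketch contrasting the two conventions; illustrative only, not part of this commit, and the function names are made up:

```cuda
// Illustrative contrast of the two RoPE pairing conventions (not from the commit).
#include <math.h>

static void rotate_pair(float * a, float * b, float theta) {
    const float x0 = *a;
    const float x1 = *b;
    *a = x0*cosf(theta) - x1*sinf(theta);
    *b = x0*sinf(theta) + x1*cosf(theta);
}

// rope_f32 convention: rotate adjacent pairs (x[i], x[i + 1])
static void rope_interleaved_ref(float * x, int n_dims, float theta, float theta_scale) {
    for (int i = 0; i < n_dims; i += 2, theta *= theta_scale) {
        rotate_pair(&x[i], &x[i + 1], theta);
    }
}

// NeoX convention: rotate split-half pairs (x[j], x[j + n_dims/2]) -
// this is why the disabled kernel reads x[i + 0] and x[i + ncols/2]
static void rope_neox_ref(float * x, int n_dims, float theta, float theta_scale) {
    for (int j = 0; j < n_dims/2; ++j, theta *= theta_scale) {
        rotate_pair(&x[j], &x[j + n_dims/2], theta);
    }
}
```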
```diff
@@ -5515,7 +5538,8 @@ inline void ggml_cuda_op_rope(
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
 
-    const bool is_glm = mode & 4;
+    const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
 
     // compute
     if (is_glm) {
```
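The dispatch keys off individual bits of `mode`. For reference, a sketch of the bit meanings as this function tests them; the enum names are illustrative, since ggml passes `mode` as a plain int:

```cuda
// Illustrative names only - ggml passes mode as a raw int.
enum rope_mode_bits {
    ROPE_MODE_NO_NPAST = 1, // (mode & 1) == 0 folds n_past into p0 below
    ROPE_MODE_NEOX     = 2, // mode & 2 selects the NeoX-style rotation
    ROPE_MODE_GLM      = 4, // mode & 4 selects the ChatGLM-style rotation
};
```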
```diff
@@ -5523,6 +5547,9 @@ inline void ggml_cuda_op_rope(
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else if (is_neox) {
+        GGML_ASSERT(false && "RoPE NeoX not implemented yet");
+#pragma message("TODO: implement RoPE NeoX for CUDA")
     } else {
         const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
```
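Once the kernel is fixed and re-enabled, the `is_neox` branch would presumably call a launcher analogous to `rope_f32_cuda`. A hypothetical sketch, assuming the same `CUDA_ROPE_BLOCK_SIZE` launch layout as the existing f32 path, with the x grid dimension covering rows and the y dimension covering column pairs to match the index math in the commented-out kernel; the name `rope_neox_f32_cuda` is not from this commit:

```cuda
// Hypothetical launcher - mirrors the rope_f32_cuda pattern; assumes
// CUDA_ROPE_BLOCK_SIZE and the commented-out rope_neox_f32 kernel above.
static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
                               const float p0, const float p_delta, const int p_delta_rows,
                               const float theta_scale, cudaStream_t stream) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);  // threadIdx.y indexes column pairs
    const int num_blocks_y = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_y, 1);      // blockIdx.x indexes rows
    rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
}
```

The `is_neox` branch would then call this in place of the assert.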
ggml.c
```diff
@@ -12537,7 +12537,7 @@ static void ggml_compute_forward_rope_f32(
                 dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
             }
         } else {
-            // TODO: this is probably wrong, but I can't figure it out ..
+            // TODO: this might be wrong for ne0 != n_dims - need double check
             // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
             for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                 for (int64_t ic = 0; ic < n_dims; ic += 2) {
```
```diff
@@ -12666,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
                 dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
             }
         } else {
-            // TODO: this is probably wrong, but I can't figure it out ..
+            // TODO: this might be wrong for ne0 != n_dims - need double check
             // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
             for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                 for (int64_t ic = 0; ic < n_dims; ic += 2) {
```
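The concern behind the updated TODO: when `ne0 > n_dims`, these loops apply the rotation to every block of `n_dims` features, whereas the referenced HF `gpt_neox` implementation rotates only the first `rotary_ndims` features of each head and passes the remainder through unchanged. A hedged sketch of that reference behavior for a single row; this is an assumption for illustration, not the committed fix, and strides plus the outer loops over rows are omitted:

```cuda
// Sketch of the HF gpt_neox reference semantics for one row of ne0 features
// (assumption for illustration - not the committed fix).
#include <math.h>

static void rope_neox_row_ref(const float * src, float * dst, int ne0, int n_dims,
                              float theta, float theta_scale) {
    // rotate only the first n_dims features, pairing j with j + n_dims/2
    for (int ic = 0; ic < n_dims; ic += 2, theta *= theta_scale) {
        const float cos_theta = cosf(theta);
        const float sin_theta = sinf(theta);

        const float x0 = src[ic/2];
        const float x1 = src[ic/2 + n_dims/2];

        dst[ic/2]            = x0*cos_theta - x1*sin_theta;
        dst[ic/2 + n_dims/2] = x0*sin_theta + x1*cos_theta;
    }
    // pass the remaining ne0 - n_dims features through unrotated
    for (int i0 = n_dims; i0 < ne0; ++i0) {
        dst[i0] = src[i0];
    }
}
```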