fix rope shift
commit d30ab79b18 (parent 2e92aefef3)
2 changed files with 17 additions and 16 deletions
ggml-cuda.cu (15 changes)
@@ -4370,7 +4370,7 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
     const int i2 = row/p_delta_rows;

     const int p = has_pos ? pos[i2] : 0;
-    const float p0 = p * freq_scale;
+    const float p0 = p*freq_scale;
     const float theta = p0*powf(theta_scale, col/2);
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);

@@ -4396,7 +4396,7 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
     const int i2 = row/p_delta_rows;

     const int p = has_pos ? pos[i2] : 0;
-    const float p0 = p * freq_scale;
+    const float p0 = p*freq_scale;
     const float theta = p0*powf(theta_scale, col/2);
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);
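Note: the two kernel hunks above only normalize the spacing of p * freq_scale to p*freq_scale; the computation is unchanged. For orientation, a minimal host-side sketch of the rotation these kernels apply to each element pair (illustrative only, not the kernel code itself):

    // theta = p0 * powf(theta_scale, col/2), as in the context lines above
    static void rope_rotate_pair(float x0, float x1, float theta, float * y0, float * y1) {
        const float sin_theta = sinf(theta);
        const float cos_theta = cosf(theta);
        *y0 = x0*cos_theta - x1*sin_theta;
        *y1 = x0*sin_theta + x1*cos_theta;
    }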
@@ -6106,15 +6106,11 @@ inline void ggml_cuda_op_rope(

     const float theta_scale = powf(freq_base, -2.0f/n_dims);

-    int32_t * pos = nullptr;
+    const int32_t * pos = nullptr;
     if ((mode & 1) == 0) {
         GGML_ASSERT(src1->type == GGML_TYPE_I32);
         GGML_ASSERT(src1->ne[0] == ne2);
-        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
-        struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-        int id;
-        CUDA_CHECK(cudaGetDevice(&id));
-        pos = (int32_t *) src1_extra->data_device[id];
+        pos = (const int32_t *) src1_dd;
     }

     const bool is_neox = mode & 2;
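This hunk is the CUDA side of the fix: instead of requiring the positions tensor src1 to already be resident on the GPU and fetching its device pointer from ggml_tensor_extra_gpu, ggml_cuda_op_rope now reads src1_dd, the device copy of src1 that the CUDA op dispatcher passes to the ggml_cuda_op_* functions. A hedged sketch of the surrounding signature (parameter names assumed from the src1_dd convention, not shown in this diff):

    inline void ggml_cuda_op_rope(
        const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
        const float * src0_dd, const float * src1_dd, float * dst_dd,
        const cudaStream_t & main_stream);

Because the dispatcher takes care of getting src1 onto the device, the K-shift also works when the positions are filled in on the host.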
@@ -7092,8 +7088,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {

     struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-    CUDA_CHECK(cudaMemcpyAsync(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice, main_stream));
+    CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
 }

 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
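Switching from cudaMemcpyAsync on the main stream to a plain cudaMemcpy makes ggml_cuda_copy_to_device blocking: when it returns, the copy has completed and the caller may immediately reuse or free tensor->data. The async version only gives that guarantee with an explicit synchronization, e.g. (illustrative alternative, not part of this diff):

    CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, main_stream));
    CUDA_CHECK(cudaStreamSynchronize(main_stream));

For a one-off upload such as the K-shift positions, the synchronous copy is the simpler choice.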
llama.cpp (18 changes)
@@ -2747,14 +2747,16 @@ static struct ggml_cgraph * llm_build_llama(
         }

         for (int il = 0; il < n_layer; ++il) {
-            ggml_build_forward_expand(gf,
+            struct ggml_tensor * tmp =
                     ggml_rope_custom_inplace(ctx0,
                         ggml_view_3d(ctx0, kv_self.k,
                             n_embd_head, n_head_kv, n_ctx,
                             ggml_element_size(kv_self.k)*n_embd_head,
                             ggml_element_size(kv_self.k)*n_embd_gqa,
                             ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
-                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale));
+                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
         }
     }

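This is the graph-builder side of the fix: the roped view of the K cache is held in a temporary so offload_func_kq can mark the node for GPU offload before it is recorded in the graph. Previously the node went straight into ggml_build_forward_expand and was never marked, so the shift ran on the CPU even when the K cache lived on the GPU. The pattern, sketched generically (some_op is a placeholder, not a real ggml call):

    struct ggml_tensor * tmp = some_op(ctx0, /* ... */);
    offload_func_kq(tmp);                 // mark the node for GPU offload
    ggml_build_forward_expand(gf, tmp);   // then add it to the graph

The same restructuring is applied to llm_build_baichaun and llm_build_falcon below.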
@@ -3137,14 +3139,16 @@ static struct ggml_cgraph * llm_build_baichaun(
         }

         for (int il = 0; il < n_layer; ++il) {
-            ggml_build_forward_expand(gf,
+            struct ggml_tensor * tmp =
                     ggml_rope_custom_inplace(ctx0,
                         ggml_view_3d(ctx0, kv_self.k,
                             n_embd_head, n_head_kv, n_ctx,
                             ggml_element_size(kv_self.k)*n_embd_head,
                             ggml_element_size(kv_self.k)*n_embd_gqa,
                             ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
-                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale));
+                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
         }
     }

@@ -3547,14 +3551,16 @@ static struct ggml_cgraph * llm_build_falcon(
         }

         for (int il = 0; il < n_layer; ++il) {
-            ggml_build_forward_expand(gf,
+            struct ggml_tensor * tmp =
                     ggml_rope_custom_inplace(ctx0,
                         ggml_view_3d(ctx0, kv_self.k,
                             n_embd_head, n_head_kv, n_ctx,
                             ggml_element_size(kv_self.k)*n_embd_head,
                             ggml_element_size(kv_self.k)*n_embd_gqa,
                             ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
-                        K_shift, n_embd_head, 2, 0, freq_base, freq_scale));
+                        K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
+            offload_func_kq(tmp);
+            ggml_build_forward_expand(gf, tmp);
         }
     }

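The Falcon hunk is identical except for the rope mode argument: 2 instead of 0, which selects the GPT-NeoX-style rope that Falcon uses, per the flag check visible in the CUDA hunk above:

    const bool is_neox = mode & 2;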