ggml_cuda_compute_forward

2023-05-24 12:55:50 +02:00 · 2023-05-24 12:55:50 +02:00 · 971920e935
commit 971920e935
parent 071dcd351b
4 changed files with 65 additions and 52 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -862,6 +862,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
    }
 }

+bool ggml_cuda_can_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // reports whether an element-wise MUL should be routed to ggml_cuda_mul
+    return src1->backend == GGML_BACKEND_CUDA; // decided by src1's backend alone; src0 and dst are accepted but never inspected
+} // NOTE(review): no prototype for this function is added to ggml-cuda.h in this change - consider `static` if it is meant to stay file-local
+
 void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
    ggml_cuda_op<GGML_CUDA_OP_TYPE_FFF, ggml_cuda_op_mul>(src0, src1, dst);
@ -968,3 +972,34 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
    free(buf_host);
    fclose(fp);
 }
+
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ // tries to run tensor->op on CUDA; returns true when CUDA owns the op (caller must skip its own path), false when the caller must compute it
+    switch (tensor->op) {
+        case GGML_OP_MUL:
+            if (!ggml_cuda_can_mul(tensor->src0, tensor->src1, tensor)) { // not eligible: src1 is not on the CUDA backend
+                return false; // hand the op back to the caller
+            }
+            if (params->ith != 0) { // only the call with ith == 0 issues the CUDA work
+                return true; // still report "handled" so the other callers do not run the fallback path
+            }
+            if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { // no CUDA work is done outside the compute phase
+                return true;
+            }
+            ggml_cuda_mul(tensor->src0, tensor->src1, tensor);
+            return true;
+        case GGML_OP_MUL_MAT: // same gating as GGML_OP_MUL, using the mul_mat eligibility check
+            if (!ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
+                return false;
+            }
+            if (params->ith != 0) {
+                return true;
+            }
+            if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+                return true;
+            }
+            ggml_cuda_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize); // the work buffer from params is forwarded to the CUDA mul_mat
+            return true;
+        default: // every other op is left to the caller
+            return false;
+    }
+}
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -16,6 +16,7 @@ void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);

 void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

 #ifdef  __cplusplus
 }
--- a/ggml.c
+++ b/ggml.c
@ -3647,26 +3647,6 @@ struct ggml_context_container {
    struct ggml_context context;
 };

-//
-// compute types
-//
-
-enum ggml_task_type {
-    GGML_TASK_INIT = 0,
-    GGML_TASK_COMPUTE,
-    GGML_TASK_FINALIZE,
-};
-
-struct ggml_compute_params {
-    enum ggml_task_type type;
-
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-};
-
 //
 // ggml state
 //
@ -8166,14 +8146,7 @@ static void ggml_compute_forward_mul_f32(
    const int ith = params->ith;
    const int nth = params->nth;

-#ifdef GGML_USE_CUBLAS
-    if (src1->backend == GGML_BACKEND_CUDA) {
-        if (ith == 0) {
-            ggml_cuda_mul(src0, src1, dst);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#ifdef GGML_USE_CLBLAST
    if (src1->backend == GGML_BACKEND_CL) {
        if (ith == 0) {
            ggml_cl_mul(src0, src1, dst);
@ -9614,14 +9587,7 @@ static void ggml_compute_forward_mul_mat_f32(
    // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows

-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@ -9786,14 +9752,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
    // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows

-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@ -9998,14 +9957,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
    // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows

-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@ -12931,6 +12883,13 @@ static void ggml_compute_forward_map_binary(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    GGML_ASSERT(params);

+#ifdef GGML_USE_CUBLAS
+    bool used_cuda = ggml_cuda_compute_forward(params, tensor);
+    if (used_cuda) {
+        return;
+    }
+#endif // GGML_USE_CUBLAS
+
    switch (tensor->op) {
        case GGML_OP_DUP:
            {
--- a/ggml.h
+++ b/ggml.h
@ -413,6 +413,24 @@ extern "C" {
        bool   no_alloc;   // don't allocate memory for the tensor data
    };

+
+    // compute types: the task phase and per-call parameters handed to the compute-forward functions
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0, // ggml_cuda_compute_forward returns without launching work in this phase
+        GGML_TASK_COMPUTE, // the phase in which ggml_cuda_compute_forward actually launches the op
+        GGML_TASK_FINALIZE, // ggml_cuda_compute_forward returns without launching work in this phase
+    };
+
+    struct ggml_compute_params { // per-call parameters passed to ggml_compute_forward and ggml_cuda_compute_forward
+        enum ggml_task_type type; // which phase this call is for
+
+        int ith, nth; // NOTE(review): presumably this thread's index and the total thread count - confirm; single-shot backends act only when ith == 0
+
+        // work buffer for all threads
+        size_t wsize; // passed alongside wdata to the mul_mat backends; presumably the buffer size in bytes - confirm
+        void * wdata;
+    };
+
    // misc

    GGML_API void    ggml_time_init(void); // call this once at the beginning of the program