diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index e6e83b050..cec2da5e7 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -862,6 +862,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     }
 }
 
+bool ggml_cuda_can_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    return src1->backend == GGML_BACKEND_CUDA;
+}
+
 void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_cuda_op(src0, src1, dst);
@@ -968,3 +972,34 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
     free(buf_host);
     fclose(fp);
 }
+
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+    switch (tensor->op) {
+        case GGML_OP_MUL:
+            if (!ggml_cuda_can_mul(tensor->src0, tensor->src1, tensor)) {
+                return false;
+            }
+            if (params->ith != 0) {
+                return true;
+            }
+            if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+                return true;
+            }
+            ggml_cuda_mul(tensor->src0, tensor->src1, tensor);
+            return true;
+        case GGML_OP_MUL_MAT:
+            if (!ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
+                return false;
+            }
+            if (params->ith != 0) {
+                return true;
+            }
+            if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+                return true;
+            }
+            ggml_cuda_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize);
+            return true;
+        default:
+            return false;
+    }
+}
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 2ef636593..f71701ce8 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -16,6 +16,7 @@ void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
 
 void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
 }
diff --git a/ggml.c b/ggml.c
index 00bbee503..4a05df1e3 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3647,26 +3647,6 @@ struct ggml_context_container {
     struct ggml_context context;
 };
 
-//
-// compute types
-//
-
-enum ggml_task_type {
-    GGML_TASK_INIT = 0,
-    GGML_TASK_COMPUTE,
-    GGML_TASK_FINALIZE,
-};
-
-struct ggml_compute_params {
-    enum ggml_task_type type;
-
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-};
-
 //
 // ggml state
 //
@@ -8166,14 +8146,7 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
-#ifdef GGML_USE_CUBLAS
-    if (src1->backend == GGML_BACKEND_CUDA) {
-        if (ith == 0) {
-            ggml_cuda_mul(src0, src1, dst);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#ifdef GGML_USE_CLBLAST
     if (src1->backend == GGML_BACKEND_CL) {
         if (ith == 0) {
             ggml_cl_mul(src0, src1, dst);
@@ -9614,14 +9587,7 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9786,14 +9752,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9998,14 +9957,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -12931,6 +12883,13 @@ static void ggml_compute_forward_map_binary(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
+#ifdef GGML_USE_CUBLAS
+    bool used_cuda = ggml_cuda_compute_forward(params, tensor);
+    if (used_cuda) {
+        return;
+    }
+#endif // GGML_USE_CUBLAS
+
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
diff --git a/ggml.h b/ggml.h
index 2ea87ce9a..5dbe0f2ff 100644
--- a/ggml.h
+++ b/ggml.h
@@ -413,6 +413,24 @@ extern "C" {
         bool   no_alloc;   // don't allocate memory for the tensor data
     };
 
+
+    // compute types
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0,
+        GGML_TASK_COMPUTE,
+        GGML_TASK_FINALIZE,
+    };
+
+    struct ggml_compute_params {
+        enum ggml_task_type type;
+
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+    };
+
     // misc
 
     GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
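
Usage note (not part of the patch): after this refactor, callers build and compute graphs exactly as before; the CUDA dispatch is decided centrally in ggml_compute_forward, which hands the tensor to ggml_cuda_compute_forward when GGML_USE_CUBLAS is defined and falls through to the CPU kernels when that returns false. The sketch below is a minimal, hypothetical driver, assuming a cuBLAS build and that the second operand of the MUL has already been placed on the GPU (backend == GGML_BACKEND_CUDA, e.g. by the model loader); the context size and tensor shapes are arbitrary.

// minimal sketch: the GGML_OP_MUL node is routed to ggml_cuda_mul by
// ggml_cuda_compute_forward when src1->backend == GGML_BACKEND_CUDA
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,   // arbitrary scratch size
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    // assumption: b->backend == GGML_BACKEND_CUDA was set up elsewhere

    struct ggml_tensor * c = ggml_mul(ctx, a, b);  // GGML_OP_MUL node

    struct ggml_cgraph gf = ggml_build_forward(c);
    ggml_graph_compute(ctx, &gf);  // thread 0 runs ggml_cuda_mul; the other
                                   // threads see ggml_cuda_compute_forward
                                   // return true and skip the CPU kernel

    ggml_free(ctx);
    return 0;
}

One consequence of this design worth noting: ggml_cuda_compute_forward returns true for every thread and every task phase of an op it claims, not just the one call that does the work, so the CPU implementation is bypassed entirely. That is why ggml_task_type and ggml_compute_params had to move from ggml.c into the public ggml.h: the CUDA backend now inspects params->ith and params->type itself instead of each ggml_compute_forward_* kernel doing the backend check inline.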