From 1b2ec1aa720474c05c975b8b0471a389c42ffb29 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Tue, 25 Jul 2023 19:01:28 +0200
Subject: [PATCH] Move to graph function similar to CUDA implementation

---
 ggml-vulkan.cpp | 59 +++++++++++++++++++++++++++++++++++++------------
 ggml-vulkan.h   |  5 +----
 ggml.c          | 21 ++++++------------
 3 files changed, 53 insertions(+), 32 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 957da19cf..6b3a923ab 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -44,6 +44,8 @@
 #define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
 #define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
 
+typedef void (*ggml_vk_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+
 struct vk_buffer {
     vk::Buffer buffer;
     vk::DeviceMemory device_memory;
@@ -1889,7 +1891,7 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
 }
 
-bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+static bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     const int64_t ne10 = src1->ne[0];
 
     const int64_t ne0 = dst->ne[0];
@@ -1906,7 +1908,7 @@ bool ggml_vk_can_mul_mat(const struct ggml_tens
     return false;
 }
 
-bool ggml_vk_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+static bool ggml_vk_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_mul_mat_use_f16(" << src0 << ", " << src1 << ")" << std::endl;
 #endif
@@ -1929,7 +1931,7 @@ bool ggml_vk_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_
     return mul_mat_f16_transfer < mul_mat_q_transfer;
 }
 
-void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+static void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
 #endif
@@ -1954,16 +1956,6 @@ void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
     }
 }
 
-size_t ggml_vk_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-#ifdef VK_DEBUG
-    std::cerr << "ggml_vk_mul_mat_get_wsize(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
-#endif
-    if (ggml_vk_mul_mat_use_f16(src0, src1, dst)) {
-        return ggml_nelements(src1) * sizeof(ggml_fp16_t);
-    }
-    return 0;
-}
-
 static void ggml_vk_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_mul_f32((type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3];
@@ -2062,7 +2054,7 @@ static void ggml_vk_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_vk_pool_free(d_D);
 }
 
-void ggml_vk_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+static void ggml_vk_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     ggml_vk_mul_f32(src0, src1, dst);
 }
@@ -2097,6 +2089,45 @@ void ggml_vk_transform_tensor(void * data, ggml_tensor * tensor) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
 
+bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+    ggml_vk_func_t func;
+    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+
+    switch (tensor->op) {
+        case GGML_OP_MUL:
+            if (!any_on_device) {
+                return false;
+            }
+
+            func = ggml_vk_mul;
+
+            break;
+        case GGML_OP_MUL_MAT:
+            if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
+                return false;
+            }
+
+            func = ggml_vk_mul_mat;
+
+            break;
+        default:
+            return false;
+    }
+
+    if (params->ith != 0) {
+        return true;
+    }
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return true;
+    }
+
+    func(tensor->src[0], tensor->src[1], tensor);
+
+    return true;
+}
+
 #ifdef VK_CHK_KERNEL
 void ggml_vk_test_transfer(size_t ne) {
 #ifdef VK_DEBUG
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index e5880f448..79baea6de 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -8,10 +8,7 @@ extern "C" {
 
 void ggml_vk_init(void);
 
-void ggml_vk_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-size_t ggml_vk_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
 void * ggml_vk_host_malloc(size_t size);
 void ggml_vk_host_free(void * ptr);
diff --git a/ggml.c b/ggml.c
index 413425135..d0a65295c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9197,13 +9197,6 @@ static void ggml_compute_forward_mul_f32(
         }
         return;
     }
-#elif defined(GGML_USE_VULKAN)
-    if (src1->backend == GGML_BACKEND_GPU) {
-        if (ith == 0) {
-            ggml_vk_mul(src0, src1, dst);
-        }
-        return;
-    }
 #endif
 
     const int64_t nr = ggml_nrows(src0);
@@ -10749,13 +10742,6 @@ static void ggml_compute_forward_mul_mat(
         }
         return;
     }
-#elif defined(GGML_USE_VULKAN)
-    if (ggml_vk_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_vk_mul_mat(src0, src1, dst);
-        }
-        return;
-    }
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -14887,6 +14873,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     }
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
+#elif defined(GGML_USE_VULKAN)
+    bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+    if (skip_cpu) {
+        return;
+    }
+    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
     switch (tensor->op) {
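
For orientation, the shape this patch adopts mirrors ggml_cuda_compute_forward: ggml_compute_forward offers each graph node to a single backend hook, which either claims the op and returns true (so the CPU path is skipped) or returns false to fall through. Below is a minimal standalone sketch of that dispatch pattern, using mocked-up fake_* types and functions rather than the real ggml API; it is only an illustration of the shape, not the patch's code.

#include <stdbool.h>
#include <stdio.h>

/* Mocked-up stand-ins for the real ggml types and ops -- illustration only. */
enum fake_op { FAKE_OP_MUL, FAKE_OP_MUL_MAT, FAKE_OP_ADD };
struct fake_tensor { enum fake_op op; bool on_gpu; };

typedef void (*fake_backend_func_t)(struct fake_tensor * t);

static void fake_backend_mul(struct fake_tensor * t)     { (void) t; puts("GPU mul"); }
static void fake_backend_mul_mat(struct fake_tensor * t) { (void) t; puts("GPU mul_mat"); }

/* Per-node hook: claim the op for the backend (return true) or decline (false). */
static bool fake_backend_compute_forward(struct fake_tensor * t) {
    fake_backend_func_t func;
    switch (t->op) {
        case FAKE_OP_MUL:     func = fake_backend_mul;     break;
        case FAKE_OP_MUL_MAT: func = fake_backend_mul_mat; break;
        default:              return false; /* op not supported by this backend */
    }
    if (!t->on_gpu) {
        return false; /* data not on the GPU: let the CPU path handle it */
    }
    func(t);
    return true;
}

/* Graph-side caller: one hook call per node, CPU fallback only if it declines. */
static void fake_compute_forward(struct fake_tensor * t) {
    if (fake_backend_compute_forward(t)) {
        return;
    }
    puts("CPU fallback");
}

int main(void) {
    struct fake_tensor mul_mat_node = { FAKE_OP_MUL_MAT, true };
    struct fake_tensor add_node     = { FAKE_OP_ADD,     true };
    fake_compute_forward(&mul_mat_node); /* prints "GPU mul_mat" */
    fake_compute_forward(&add_node);     /* prints "CPU fallback" */
    return 0;
}

Keeping the op-to-function mapping inside the backend hook is what lets the patch delete the per-op #elif blocks from ggml_compute_forward_mul_f32 and ggml_compute_forward_mul_mat and leave a single call site in ggml_compute_forward.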