From 0645ed5c97d0dc7c77c038fa966316ebc6c4bd65 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Mon, 5 Aug 2024 09:19:01 +0200
Subject: [PATCH] Optimize Vulkan REPEAT performance

---
 ggml/src/ggml-vulkan.cpp                      | 130 +++++-------------
 ggml/src/vulkan-shaders/repeat.comp           |  24 ++++
 .../src/vulkan-shaders/vulkan-shaders-gen.cpp |   4 +
 3 files changed, 59 insertions(+), 99 deletions(-)
 create mode 100644 ggml/src/vulkan-shaders/repeat.comp

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index d7fea78d0..32d66a0c3 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -186,6 +186,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_sqr_f32;
     vk_pipeline pipeline_clamp_f32;
     vk_pipeline pipeline_pad_f32;
+    vk_pipeline pipeline_repeat_f32;
     vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
     vk_pipeline pipeline_norm_f32;
     vk_pipeline pipeline_group_norm_f32;
@@ -1659,6 +1660,8 @@ static void ggml_vk_load_shaders(vk_device& device) {

     ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

+    ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_gelu_quick_f32, "gelu_quick_f32", gelu_quick_f32_len, gelu_quick_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
@@ -3847,76 +3850,6 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
     }
 }

-static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    VK_LOG_DEBUG("ggml_vk_op_repeat(" << src0 << ", " << src1 << ", " << dst << ")");
-    const uint64_t ne0 = dst->ne[0];
-    const uint64_t ne1 = dst->ne[1];
-    const uint64_t ne2 = dst->ne[2];
-    const uint64_t ne3 = dst->ne[3];
-
-    const uint64_t ne00 = src0->ne[0];
-    const uint64_t ne01 = src0->ne[1];
-    const uint64_t ne02 = src0->ne[2];
-    const uint64_t ne03 = src0->ne[3];
-
-    const uint64_t nb0 = dst->nb[0];
-    const uint64_t nb1 = dst->nb[1];
-    const uint64_t nb2 = dst->nb[2];
-    const uint64_t nb3 = dst->nb[3];
-
-    const uint64_t nb00 = src0->nb[0];
-    const uint64_t nb01 = src0->nb[1];
-    const uint64_t nb02 = src0->nb[2];
-    const uint64_t nb03 = src0->nb[3];
-
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const uint64_t nr0 = ne0/ne00;
-    const uint64_t nr1 = ne1/ne01;
-    const uint64_t nr2 = ne2/ne02;
-    const uint64_t nr3 = ne3/ne03;
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-
-    const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
-    vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset + dst->view_offs;
-
-    std::vector<vk::BufferCopy> copies;
-
-    for (uint64_t i3 = 0; i3 < nr3; i3++) {
-        for (uint64_t k3 = 0; k3 < ne03; k3++) {
-            for (uint64_t i2 = 0; i2 < nr2; i2++) {
-                for (uint64_t k2 = 0; k2 < ne02; k2++) {
-                    for (uint64_t i1 = 0; i1 < nr1; i1++) {
-                        for (uint64_t k1 = 0; k1 < ne01; k1++) {
-                            for (uint64_t i0 = 0; i0 < nr0; i0++) {
-                                copies.push_back({
-                                    src_offset + (k3)*nb03 + (k2)*nb02 + (k1)*nb01,
-                                    dst_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
-                                    ne00*nb0,
-                                });
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    ggml_vk_sync_buffers(subctx);
-    subctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies);
-
-    GGML_UNUSED(ctx);
-    GGML_UNUSED(src1);
-}
-
-
 static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
     switch (op) {
     case GGML_OP_GET_ROWS:
@@ -3982,6 +3915,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_pad_f32;
         }
         return nullptr;
+    case GGML_OP_REPEAT:
+        if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
+            return ctx->device->pipeline_repeat_f32;
+        }
+        return nullptr;
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
@@ -4104,15 +4042,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     GGML_UNUSED(src2);
 }

-static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
-    switch(op) {
-    case GGML_OP_REPEAT:
-        return ggml_vk_op_repeat;
-    default:
-        return nullptr;
-    }
-}
-
 static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     switch (op) {
     case GGML_OP_CPY:
@@ -4126,6 +4055,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_SQR:
     case GGML_OP_CLAMP:
     case GGML_OP_PAD:
+    case GGML_OP_REPEAT:
         return true;
     default:
         return false;
@@ -4173,21 +4103,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     const uint64_t ned = ned0 * ned1;

     vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
-    ggml_vk_func_t op_func;

     if (pipeline == nullptr) {
-        op_func = ggml_vk_op_get_func(op);
-        if (op_func == nullptr) {
-            std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type);
-            if (src1 != nullptr) {
-                std::cerr << " and " << ggml_type_name(src1->type);
-            }
-            std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
-            GGML_ABORT("fatal error");
+        std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(op) << " for " << ggml_type_name(src0->type);
+        if (src1 != nullptr) {
+            std::cerr << " and " << ggml_type_name(src1->type);
         }
-
-        op_func(ctx, subctx, src0, src1, dst);
-        return;
+        std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
+        GGML_ABORT("fatal error");
     }

     const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);
@@ -4337,6 +4260,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     case GGML_OP_SQR:
     case GGML_OP_CLAMP:
     case GGML_OP_PAD:
+    case GGML_OP_REPEAT:
     case GGML_OP_CPY:
     case GGML_OP_CONCAT:
     case GGML_OP_UPSCALE:
@@ -4452,10 +4376,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     }
 }

-static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
-    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {});
-}
-
 static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t src1_type_size = ggml_type_size(src1->type);
@@ -4603,6 +4523,19 @@ static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const
     });
 }

+static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {
+        (uint32_t)ggml_nelements(dst),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+        0,
+        0.0f, 0.0f,
+    });
+}
+
 static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
@@ -6637,10 +6570,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 return false;
             } break;
         case GGML_OP_REPEAT:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
-            } break;
+            return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
         case GGML_OP_ROPE:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_NONE:
@@ -7104,6 +7034,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
     } else if (tensor->op == GGML_OP_PAD) {
         tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]);
+    } else if (tensor->op == GGML_OP_REPEAT) {
+        tensor_clone = ggml_repeat(ggml_ctx, src0_clone, src1_clone);
     } else if (tensor->op == GGML_OP_ADD) {
         tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
     } else if (tensor->op == GGML_OP_NORM) {
diff --git a/ggml/src/vulkan-shaders/repeat.comp b/ggml/src/vulkan-shaders/repeat.comp
new file mode 100644
index 000000000..a86af87e7
--- /dev/null
+++ b/ggml/src/vulkan-shaders/repeat.comp
@@ -0,0 +1,24 @@
+#version 450
+
+#include "types.comp"
+#include "generic_unary_head.comp"
+
+uint src0_idx_mod(uint idx) {
+    const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
+    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
+    const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10);
+    const uint i12_offset = i12*p.ne11*p.ne10;
+    const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
+    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
+    return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00;
+}
+
+void main() {
+    const uint idx = get_idx();
+
+    if (idx >= p.ne) {
+        return;
+    }
+
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]);
+}
diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
index a792e203b..53ceb13d3 100644
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -380,6 +380,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
         string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     }));

+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    }));
+
     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); 
     }));
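Note: repeat.comp assigns one destination element per shader invocation and finds its source element by wrapping each destination coordinate with the corresponding source extent. The host-side C++ sketch below mirrors that index math for reference only; the struct and function names (repeat_params_ref, src0_idx_mod_ref) and the example shape are illustrative assumptions, not symbols from the repository or part of the patch.

// Host-side sketch of repeat.comp's src0_idx_mod(): decompose the flat
// destination index into 4D coordinates, wrap each coordinate by the source
// extent, then apply the source strides (in elements) to locate the source
// element.
#include <cstdint>
#include <cstdio>

struct repeat_params_ref {
    uint32_t ne10, ne11, ne12, ne13;   // destination extents  (dst->ne[0..3])
    uint32_t ne00, ne01, ne02, ne03;   // source extents       (src0->ne[0..3])
    uint32_t nb00, nb01, nb02, nb03;   // source strides in elements (src0->nb[0..3] / type size)
};

static uint32_t src0_idx_mod_ref(uint32_t idx, const repeat_params_ref & p) {
    const uint32_t i13 = idx / (p.ne12*p.ne11*p.ne10);
    const uint32_t i13_offset = i13*p.ne12*p.ne11*p.ne10;
    const uint32_t i12 = (idx - i13_offset) / (p.ne11*p.ne10);
    const uint32_t i12_offset = i12*p.ne11*p.ne10;
    const uint32_t i11 = (idx - i13_offset - i12_offset) / p.ne10;
    const uint32_t i10 = idx - i13_offset - i12_offset - i11*p.ne10;
    return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00;
}

int main() {
    // Example: repeat a contiguous 2x3 source into a 4x6 destination (2x along both dims).
    const repeat_params_ref p = { 4, 6, 1, 1,   2, 3, 1, 1,   1, 2, 6, 6 };
    for (uint32_t idx = 0; idx < 4*6; idx++) {
        printf("dst %2u <- src %u\n", (unsigned) idx, (unsigned) src0_idx_mod_ref(idx, p));
    }
    return 0;
}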