diff --git a/ggml-vulkan-matmul.comp b/ggml-vulkan-matmul.comp
new file mode 100644
index 000000000..4a34e950e
--- /dev/null
+++ b/ggml-vulkan-matmul.comp
@@ -0,0 +1,33 @@
+#version 450
+
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A { float A_data[]; };
+layout (binding = 1) readonly buffer B { float B_data[]; };
+layout (binding = 2) writeonly buffer D { float D_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int M;
+    int N;
+    int K;
+    int stride_a;
+    int stride_b;
+    int stride_d;
+} p;
+
+void main()
+{
+    int i01 = int(gl_GlobalInvocationID.x);
+    int i11 = int(gl_GlobalInvocationID.y);
+
+    if (i01 < p.M && i11 < p.N) {
+        float sum = 0.0f;
+
+        for (int i = 0; i < p.K; i++) {
+            sum += A_data[i01 * p.stride_a + i] * B_data[i11 * p.stride_b + i];
+        }
+
+        D_data[i11 * p.stride_d + i01] = sum;
+    }
+}
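For reference only, this note and sketch are not part of the diff: each invocation of the new shader computes a single dot product, reading row i01 of A (stride stride_a) and row i11 of B (stride stride_b) and writing the result to D[i11 * stride_d + i01], so the output is laid out with src1 rows as rows and src0 rows as columns. A plain C++ sketch of the same arithmetic, handy for checking the buffer layout on small inputs, could look like the following; the function name and signature are illustrative and do not exist in ggml-vulkan.cpp.

    // Illustrative CPU reference for ggml-vulkan-matmul.comp:
    // D[n * stride_d + m] = sum over k of A[m * stride_a + k] * B[n * stride_b + k]
    static void matmul_reference(const float * A, const float * B, float * D,
                                 int M, int N, int K,
                                 int stride_a, int stride_b, int stride_d) {
        for (int n = 0; n < N; n++) {        // plays the role of gl_GlobalInvocationID.y (i11)
            for (int m = 0; m < M; m++) {    // plays the role of gl_GlobalInvocationID.x (i01)
                float sum = 0.0f;
                for (int k = 0; k < K; k++) {
                    sum += A[m * stride_a + k] * B[n * stride_b + k];
                }
                D[n * stride_d + m] = sum;
            }
        }
    }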
diff --git a/ggml-vulkan-matmul.glsl b/ggml-vulkan-matmul.glsl
deleted file mode 100644
index e26fbbc09..000000000
--- a/ggml-vulkan-matmul.glsl
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2023 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Original at https://github.com/google/uVkCompute/blob/f3180c7e72ae639c0a7bc8cff7ed615b63ced27c/benchmarks/mmt/mmt_i8.glsl
-// Modified by 0cc4m for FP32
-
-#version 450 core
-#pragma use_vulkan_memory_model
-
-#extension GL_EXT_scalar_block_layout : enable
-#extension GL_EXT_control_flow_attributes : enable
-
-#extension GL_KHR_shader_subgroup_basic : enable
-
-#define WG_X 32
-#define WG_Y 2
-#define M0 32
-#define N0 256
-#define K0 16
-
-layout(binding = 0) buffer InputA { vec4 x[]; } inputA;
-layout(binding = 1) buffer InputB { vec4 x[]; } inputB;
-layout(binding = 2) buffer Output { float x[]; } outputO;
-
-layout(local_size_x = WG_X, local_size_y = WG_Y, local_size_z = 1) in;
-
-layout(constant_id = 0) const uint M = 1;
-layout(constant_id = 1) const uint N = 1;
-layout(constant_id = 2) const uint K = 1;
-
-const uint VECTORIZE_K = 4;
-const uint K_VEC = K / VECTORIZE_K;
-const uint K0_VEC = K0 / VECTORIZE_K;
-
-const uint VECTORIZE_N = 4;
-const uint N_VEC = N / VECTORIZE_N;
-const uint N0_VEC = N0 / VECTORIZE_N;
-
-const uint strideA = K_VEC; // Stride of the `inputA` matrix.
-const uint strideB = K_VEC; // Stride of the `inputB` matrix.
-const uint strideC = N; // Stride of the `outputO` matrix.
-
-// Each workgroup processes an output tile of size [M0 x N0], therefore
-// each thread processes a [M0/WG_Y x N0/WG_X] subview.
-const uint C_ROWS = M0 / WG_Y;
-const uint C_COLS = N0 / WG_X;
-
-/// Returns the index of `X[i, j]`, where `X` is a 2D matrix of stride |stride|.
-uint coordToOffset(uint i, uint j, uint stride) { return stride * i + j; }
-
-float sdot(vec4 lhs, vec4 rhs) {
-  vec4 mul = vec4(lhs) * vec4(rhs);
-  return float(mul.x) + float(mul.y) + float(mul.z) + float(mul.w);
-}
-
-void main() {
-  const uvec2 wgID = gl_WorkGroupID.xy;
-  const uvec2 localID = gl_LocalInvocationID.xy;
-  const uint threadID = gl_SubgroupInvocationID;
-  const uint subgroupID = gl_SubgroupID;
-  const uint subgroupSz = gl_SubgroupSize;
-  const uint numSubgroups = gl_NumSubgroups;
-
-  // The start offsets of the tile processed by this thread in this workgroup.
-  const uint x_offset = wgID.x * N0 + C_COLS * localID.x;
-  const uint y_offset = wgID.y * M0 + C_ROWS * localID.y;
-
-  float C[C_ROWS][C_COLS]; // Local data for the output.
-
-  // Initialize result to zero.
-  [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
-    [[unroll]] for (uint j = 0; j < C_COLS; ++j) {
-      C[i][j] = 0;
-    }
-  }
-
-  for (uint k = 0; k < K_VEC; k += K0_VEC) {
-    [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
-      [[unroll]] for (uint kk = 0; kk < K0_VEC; ++kk) {
-        uint y = y_offset + i;
-        uint gk = k + (kk + threadID) % K0_VEC;
-        vec4 lhs = inputA.x[coordToOffset(y, gk, strideA)];
-        [[unroll]] for (uint j = 0; j < C_COLS; ++j) {
-          // Calculate the inner product `C[i, j] := sum(A[i, ..] * B[j, ..])`.
-          uint x = x_offset + j;
-          vec4 rhs = inputB.x[coordToOffset(x, gk, strideB)];
-          C[i][j] += sdot(lhs, rhs);
-        }
-      }
-    }
-  }
-
-  // Store the accumulated results in `outputO`.
-  [[unroll]] for (uint i = 0; i < C_ROWS; ++i) {
-    uint y = y_offset + i;
-    [[unroll]] for (uint j = 0; j < C_COLS; ++j) {
-      uint x = x_offset + j;
-      outputO.x[coordToOffset(y, x, strideC)] = C[i][j];
-    }
-  }
-}
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 34b2617e1..f42254ecd 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -107,7 +107,13 @@ void ggml_vk_init(void) {
         descriptor_set_layout_binding);
     vk_pipeline_matmul_dsl = vk_device.createDescriptorSetLayout(descriptor_set_layout_create_info);
 
-    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), vk_pipeline_matmul_dsl);
+    vk::PushConstantRange push_constant_range(
+        vk::ShaderStageFlagBits::eCompute,
+        0,
+        6 * sizeof(int)
+    );
+
+    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), vk_pipeline_matmul_dsl, push_constant_range);
     vk_pipeline_matmul_layout = vk_device.createPipelineLayout(pipeline_layout_create_info);
 
     vk::PipelineCache pipeline_cache = vk_device.createPipelineCache(vk::PipelineCacheCreateInfo());
@@ -186,7 +192,7 @@ static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf) {
     vk::BufferCreateInfo buffer_create_info{
         vk::BufferCreateFlags(),
         size,
-        vk::BufferUsageFlagBits::eStorageBuffer,
+        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
         vk::SharingMode::eExclusive,
         1,
         &vk_compute_queue_family_index
@@ -205,10 +211,6 @@ static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf) {
 
     VkMemoryPropertyFlags mem_prop_flags;
     vmaGetAllocationMemoryProperties(vk_allocator, buf->allocation, &mem_prop_flags);
-
-    if(!(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
-        printf("Nope\n");
-    }
 }
 
 static void ggml_vk_pool_free(vk_buffer* buffer) {
@@ -226,12 +228,27 @@ static void ggml_vk_pool_free(vk_buffer* buffer) {
     vmaDestroyBuffer(vk_allocator, buffer->buffer, buffer->allocation);
 }
 
-static void ggml_vk_buffer_write(VkCommandBuffer cmd_buf, vk_buffer* dst, size_t offset, const void * src, size_t size) {
+static vk::CommandBuffer ggml_vk_cmd_buffer_create() {
+    vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(), vk_compute_queue_family_index);
+    vk::CommandPool command_pool = vk_device.createCommandPool(command_pool_create_info);
+
+    vk::CommandBufferAllocateInfo command_buffer_alloc_info(
+        command_pool,
+        vk::CommandBufferLevel::ePrimary,
+        1);
+    const std::vector<vk::CommandBuffer> cmd_buffers = vk_device.allocateCommandBuffers(command_buffer_alloc_info);
+    return cmd_buffers.front();
+}
+
+static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size) {
     VkMemoryPropertyFlags mem_prop_flags;
     vmaGetAllocationMemoryProperties(vk_allocator, dst->allocation, &mem_prop_flags);
 
     if(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
         memcpy(dst->info.pMappedData, src, size);
+        if (!(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+            vmaFlushAllocation(vk_allocator, dst->allocation, 0, VK_WHOLE_SIZE);
+        }
     } else {
         // Allocation ended up in a non-mappable memory - need to transfer.
         VkBufferCreateInfo staging_buf_create_info = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
@@ -261,16 +278,39 @@ static void ggml_vk_buffer_write(VkCommandBuffer cmd_buf, vk_buffer* dst, size_t
             0, // srcOffset
             0, // dstOffset,
             size}; // size
-        vkCmdCopyBuffer(cmd_buf, staging_buf, dst->buffer, 1, &buf_copy);
+
+        vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create();
+        vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
+        cmd_buffer.begin(cmd_buffer_begin_info);
+        vkCmdCopyBuffer(cmd_buffer, staging_buf, dst->buffer, 1, &buf_copy);
+        cmd_buffer.end();
+
+        vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
+        vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
+
+        vk::SubmitInfo submit_info(0,
+                                   nullptr,
+                                   nullptr,
+                                   1,
+                                   &cmd_buffer);
+        queue.submit({ submit_info }, fence);
+        vk_device.waitForFences({ fence },
+                                true,
+                                uint64_t(-1));
 
         vmaDestroyBuffer(vk_allocator, staging_buf, staging_alloc);
     }
 }
 
-static void ggml_vk_buffer_read(VkCommandBuffer cmd_buf, vk_buffer* src, size_t offset, void * dst, size_t size) {
+static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size) {
+    vk::CommandBuffer cmd_buf = ggml_vk_cmd_buffer_create();
+
     VkMemoryPropertyFlags mem_prop_flags;
     vmaGetAllocationMemoryProperties(vk_allocator, src->allocation, &mem_prop_flags);
     if(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+        if (!(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
+            vmaInvalidateAllocation(vk_allocator, src->allocation, 0, VK_WHOLE_SIZE);
+        }
         memcpy(dst, src->info.pMappedData, size);
     } else {
         // Allocation ended up in a non-mappable memory - need to transfer.
@@ -298,15 +338,32 @@ static void ggml_vk_buffer_read(VkCommandBuffer cmd_buf, vk_buffer* src, size_t
             offset, // srcOffset
             0, // dstOffset,
             size}; // size
-        vkCmdCopyBuffer(cmd_buf, src->buffer, staging_buf, 1, &buf_copy);
         vmaInvalidateAllocation(vk_allocator, staging_alloc, 0, VK_WHOLE_SIZE);
-        // [Executed in runtime]:
+
+        vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create();
+        vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
+        cmd_buffer.begin(cmd_buffer_begin_info);
+        vkCmdCopyBuffer(cmd_buffer, src->buffer, staging_buf, 1, &buf_copy);
+        cmd_buffer.end();
+
+        vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
+        vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
+
+        vk::SubmitInfo submit_info(0,
+                                   nullptr,
+                                   nullptr,
+                                   1,
+                                   &cmd_buffer);
+        queue.submit({ submit_info }, fence);
+        vk_device.waitForFences({ fence },
+                                true,
+                                uint64_t(-1));
         memcpy(dst, staging_alloc_info.pMappedData, size);
         vmaDestroyBuffer(vk_allocator, staging_buf, staging_alloc);
     }
 }
 
-static void ggml_vk_h2d_tensor_2d(VkCommandBuffer cmd_buf, vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2) {
+static void ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2) {
     const uint64_t ne0 = src->ne[0];
     const uint64_t ne1 = src->ne[1];
     const uint64_t nb0 = src->nb[0];
@@ -319,13 +376,12 @@ static void ggml_vk_h2d_tensor_2d(VkCommandBuffer cmd_buf, vk_buffer* dst, size_
 
     const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
     if (nb0 == ts && nb1 == ts*ne0/bs) {
-        ggml_vk_buffer_write(cmd_buf, dst, offset, x, ne1*nb1);
+        ggml_vk_buffer_write(dst, offset, x, ne1*nb1);
         return;
     }
     if (nb0 == ts) {
-        // Might be better to use vkCmdCopyBuffer here
         for (uint64_t i1 = 0; i1 < ne1; i1++) {
-            ggml_vk_buffer_write(cmd_buf, dst, offset + ne0 * i1, x + ts*ne0/bs, ne0*nb0);
+            ggml_vk_buffer_write(dst, offset + ne0 * i1, (uint8_t *)x + ts*ne0/bs, ne0*nb0);
         }
         return;
     }
@@ -336,7 +392,6 @@ static void ggml_vk_h2d_tensor_2d(VkCommandBuffer cmd_buf, vk_buffer* dst, size_
             dst_ptr[offset + i1 * ts*ne0/bs + i0 * ts] = xc[i1 * nb1 + i0 * nb0];
         }
     }
-    return;
 }
 
 static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
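For orientation, and not part of the diff: in the ggml_vk_mul_mat_f32 hunk below, the six push constants are pushed in the order M, N, K, stride_a, stride_b, stride_d (here ne01, ne11, ne10, ne00, ne10, ne01), matching the shader's push_constant block, and the dispatch is a ceiling division of the output dimensions by the shader's 32x32 workgroup. A small illustrative C++ sketch of that correspondence follows; the struct and helper names are hypothetical, not identifiers from the patch, and assume <cstdint> is available.

    // Mirrors the shader's push_constant block: six tightly packed 32-bit ints.
    struct vk_matmul_push_constants {
        int M, N, K;
        int stride_a, stride_b, stride_d;
    };
    static_assert(sizeof(vk_matmul_push_constants) == 6 * sizeof(int),
                  "must stay in sync with the 6 * sizeof(int) range declared in ggml_vk_init");

    // One workgroup covers a 32x32 output tile, so round the grid size up.
    static uint32_t ggml_vk_group_count(int64_t n, uint32_t local_size) {
        return (uint32_t) ((n + local_size - 1) / local_size);
    }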
@@ -386,36 +441,30 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     };
     vk_device.updateDescriptorSets(write_descriptor_sets, {});
 
-    vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(), vk_compute_queue_family_index);
-    vk::CommandPool command_pool = vk_device.createCommandPool(command_pool_create_info);
+    std::array<int, 6> push_constants = { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 };
+    assert( ( sizeof( push_constants ) <= vk_physical_device.getProperties().limits.maxPushConstantsSize ) && "Too many push constants" );
 
-    vk::CommandBufferAllocateInfo command_buffer_alloc_info(
-        command_pool,
-        vk::CommandBufferLevel::ePrimary,
-        1);
-    const std::vector<vk::CommandBuffer> cmd_buffers = vk_device.allocateCommandBuffers(command_buffer_alloc_info);
-    vk::CommandBuffer cmd_buffer = cmd_buffers.front();
+    vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create();
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy data to device
             if (src0->backend != GGML_BACKEND_GPU) {
-                ggml_vk_h2d_tensor_2d(cmd_buffer, &d_X, 0, src0, i03, i02);
+                ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02);
             }
-            ggml_vk_h2d_tensor_2d(cmd_buffer, &d_Y, 0, src1, i03, i02);
-
-            printf("Beginning Vulkan kernel call\n");
+            ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02);
 
             // compute
             vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
             cmd_buffer.begin(cmd_buffer_begin_info);
 
+            cmd_buffer.pushConstants(vk_pipeline_matmul_layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
             cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul);
             cmd_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul_layout, 0, { descriptor_set }, {});
-            cmd_buffer.dispatch(d_ne, 1, 1);
+            cmd_buffer.dispatch((ne01 + 31) / 32, (ne11 + 31) / 32, 1);
             cmd_buffer.end();
 
             vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
@@ -431,11 +480,10 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                                     true,
                                     uint64_t(-1));
 
-            printf("Vulkan kernel call done\n");
-
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            ggml_vk_buffer_read(cmd_buffer, &d_D, 0, d, sizeof(float) * d_ne);
+            float * d_blas = (float *) malloc(sizeof(float) * d_ne);
+            ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne);
         }
     }
 
@@ -447,6 +495,7 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 }
 
 static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    assert(false);
     // const int64_t ne00 = src0->ne[0];
     // const int64_t ne01 = src0->ne[1];
     // const int64_t ne02 = src0->ne[2];
@@ -577,7 +626,7 @@ bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     const int64_t ne1 = dst->ne[1];
 
     // TODO: find the optimal values for these
-    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+    if ((src0->type == GGML_TYPE_F32 /*|| src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)*/) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
         ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
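One possible follow-up, sketched here only and not part of this diff: ggml_vk_buffer_write, ggml_vk_buffer_read and ggml_vk_mul_mat_f32 now each repeat the same record, submit and wait-on-fence sequence. Assuming the globals already used in ggml-vulkan.cpp (vk_device, vk_compute_queue_family_index) and the headers it already includes (vulkan.hpp, cstdio), that pattern could be factored into a helper; the name ggml_vk_submit_and_wait is illustrative.

    // Sketch only: submit a recorded command buffer on the compute queue and
    // block until the GPU has finished, then release the fence.
    static void ggml_vk_submit_and_wait(vk::CommandBuffer cmd_buffer) {
        vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
        vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());

        vk::SubmitInfo submit_info(0, nullptr, nullptr, 1, &cmd_buffer);
        queue.submit({ submit_info }, fence);

        if (vk_device.waitForFences({ fence }, true, uint64_t(-1)) != vk::Result::eSuccess) {
            fprintf(stderr, "%s: fence wait failed\n", __func__);
        }
        vk_device.destroyFence(fence);
    }

The staging branches of ggml_vk_buffer_write and ggml_vk_buffer_read would then shrink to recording the copy and making one call to this helper.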