diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 4542f33c5..0de498ec3 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -71,9 +71,11 @@ struct vk_buffer {
     // Staging buffers
     vk_buffer * sb_write;
     vk_buffer * sb_read;
+    uint32_t qf_owner;
 };
 
 struct vk_pipeline {
+    std::string name;
     vk::DescriptorSetLayout dsl;
     vk::DescriptorPool descriptor_pool;
     vk::DescriptorSet descriptor_set;
@@ -121,7 +123,7 @@ uint32_t vk_device_vendor_id;
 vk_queue vk_compute_queue;
 vk_queue vk_transfer_queues[VK_TRANSFER_QUEUE_COUNT];
 VmaAllocator vk_allocator;
-vk::PipelineStageFlags vk_stage_flags[8] = { vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands };
+vk::PipelineStageFlags vk_stage_flags[8] = { vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader };
 vk_pipeline vk_pipeline_matmul_f32_l, vk_pipeline_matmul_f32_m, vk_pipeline_matmul_f32_s, vk_pipeline_matmul_f16_l, vk_pipeline_matmul_f16_m, vk_pipeline_matmul_f16_s;
 vk_pipeline vk_pipeline_matmul_split_k_reduce;
 vk_pipeline vk_pipeline_f16_to_f32, vk_pipeline_dequant_q4_0;
@@ -133,11 +135,15 @@ bool vk_fp16_support = false;
 
 static std::vector<std::tuple<void*, size_t, vk_buffer>> vk_buf_list;
 
 static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_count, std::array<uint32_t, 3> wg_denoms, std::vector<int>&& specialization_constants) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_pipeline(" << path << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_count << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants)" << std::endl;
+#endif
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0);
 
     vk_pipeline pipeline;
+    pipeline.name = path;
     pipeline.parameter_count = parameter_count;
     pipeline.push_constant_size = push_constant_count * sizeof(int);
     pipeline.wg_denoms = wg_denoms;
@@ -227,6 +233,9 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::s
 }
 
 static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
+#endif
     if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
         // Reuse command buffer
         return q.cmd_buffers[q.cmd_buffer_idx++];
@@ -246,6 +255,9 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) {
 }
 
 static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_submission()" << std::endl;
+#endif
     vk_submission s;
     s.buffer = ggml_vk_create_cmd_buffer(q);
     s.wait_semaphores = wait_semaphores;
@@ -254,10 +266,16 @@ static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk::Sem
 }
 
 static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_sequence_1()" << std::endl;
+#endif
     return { ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores)) };
 }
 
 static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk::Fence fence) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_submit()" << std::endl;
+#endif
     if (sequences.empty()) {
         return;
     }
@@ -285,39 +303,10 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
     sequences.clear();
 }
 
-static vk_submission ggml_vk_submit_pipeline(vk_pipeline& pipeline, std::vector<vk_buffer *> buffers, const std::vector<int>&& push_constants, std::array<uint32_t, 3> elements, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
-    std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
-    std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
-    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
-        descriptor_buffer_infos.push_back({buffers[i]->buffer, 0, buffers[i]->size});
-    }
-    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
-        write_descriptor_sets.push_back({pipeline.descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
-    }
-
-    vk_device.updateDescriptorSets(write_descriptor_sets, {});
-
-    vk_submission s;
-    s.buffer = ggml_vk_create_cmd_buffer(q);
-
-    s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
-    s.buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
-    s.buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline);
-    s.buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                                pipeline.layout,
-                                0,
-                                { pipeline.descriptor_set },
-                                {});
-    s.buffer.dispatch(CEIL_DIV(elements[0], pipeline.wg_denoms[0]), CEIL_DIV(elements[1], pipeline.wg_denoms[1]), CEIL_DIV(elements[2], pipeline.wg_denoms[2]));
-    s.buffer.end();
-
-    s.wait_semaphores = wait_semaphores;
-    s.signal_semaphores = signal_semaphores;
-
-    return s;
-}
-
 static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
+#endif
     const uint32_t qfsize = queue_family_props.size();
 
     // Try with avoid preferences first
@@ -341,6 +330,13 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProp
+static void ggml_vk_sync_buffers(vk::CommandBuffer& cmd_buffer, std::vector<vk_buffer>&& buffers, vk_queue& q, vk::AccessFlags src_mask, vk::AccessFlags dst_mask) {
+    std::vector<vk::BufferMemoryBarrier> bmem_barriers;
+
+    uint32_t sfi;
+    uint32_t dfi;
+
+    for (auto& buf : buffers) {
+        if (buf.qf_owner != vk::QueueFamilyIgnored && buf.qf_owner != q.queue_family_index) {
+            sfi = buf.qf_owner;
+            dfi = q.queue_family_index;
+            buf.qf_owner = dfi;
+        } else {
+            sfi = vk::QueueFamilyIgnored;
+            dfi = vk::QueueFamilyIgnored;
+        }
+        bmem_barriers.push_back({ src_mask, dst_mask, sfi, dfi, buf.buffer, 0, VK_WHOLE_SIZE });
+    }
+
+    if (bmem_barriers.empty()) {
+        return;
+    }
+
+    cmd_buffer.pipelineBarrier(
+        vk::PipelineStageFlagBits::eComputeShader,
+        vk::PipelineStageFlagBits::eComputeShader,
+        {},
+        {},
+        bmem_barriers,
+        {}
+    );
+}
+
 static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl;
+#endif
     buf.size = 0;
     PROFILE("ggml_vk_destroy_buffer",
     vmaDestroyBuffer(vk_allocator, buf.buffer, buf.allocation);
@@ -439,12 +484,17 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
 void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
 
 void ggml_vk_init(void) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_init()" << std::endl;
+#endif
     char* GGML_VULKAN_DEVICE = getenv("GGML_VULKAN_DEVICE");
     int dev_num = (GGML_VULKAN_DEVICE == NULL ? 0 : atoi(GGML_VULKAN_DEVICE));
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
     const std::vector<const char*> layers = {
-        // "VK_LAYER_KHRONOS_validation",
+#ifdef VK_VALIDATE
+        "VK_LAYER_KHRONOS_validation",
+#endif
     };
     vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers.size(), layers.data());
     vk_instance = vk::createInstance(instance_create_info);
@@ -560,7 +610,7 @@ void ggml_vk_init(void) {
     vk_pipeline_matmul_split_k_reduce = ggml_vk_create_pipeline("vk_shaders/matmul_split_k_reduce.spv", "main", 1, 3, {32, 32, 1}, {});
 
     vk_pipeline_f16_to_f32 = ggml_vk_create_pipeline("vk_shaders/f16_to_f32.spv", "main", 2, 1, {64, 1, 1}, {});
-    vk_pipeline_dequant_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_q4_0.spv", "main", 2, 1, {32, 1, 1}, {});
+    vk_pipeline_dequant_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_q4_0.spv", "main", 2, 1, {256*32, 1, 1}, {}); // Group size * values per quant group
 
     // Queues
     vk_compute_queue = ggml_vk_create_queue(compute_queue_family_index, 0);
@@ -612,6 +662,9 @@ void ggml_vk_init(void) {
 }
 
 static vk_pipeline* ggml_get_to_fp32_vk(ggml_type type) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_get_to_fp32_vk()" << std::endl;
+#endif
     switch (type) {
         case GGML_TYPE_Q4_0:
             return &vk_pipeline_dequant_q4_0;
@@ -651,6 +704,9 @@ static vk_buffer g_vk_buffer_pool[MAX_VK_BUFFERS];
 static std::atomic_flag g_vk_pool_lock = ATOMIC_FLAG_INIT;
 
 static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf, VmaAllocationCreateFlags alloc_flags) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
+#endif
     PROFILE("ggml_vk_pool_malloc",
     scoped_spin_lock lock(g_vk_pool_lock);
@@ -687,6 +743,9 @@ static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf, VmaAllocationCreate
 }
 
 static void ggml_vk_pool_free(vk_buffer& buffer) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_pool_free(" << buffer.size << ")" << std::endl;
+#endif
     PROFILE("ggml_vk_pool_free",
     scoped_spin_lock lock(g_vk_pool_lock);
@@ -694,6 +753,8 @@ static void ggml_vk_pool_free(vk_buffer& buffer) {
         vk_buffer& b = g_vk_buffer_pool[i];
         if (b.size == 0) {
             b = buffer;
+            // Set owning queue family index to ignored to avoid synchronization on next use
+            b.qf_owner = VK_QUEUE_FAMILY_IGNORED;
             return;
         }
     }
@@ -703,6 +764,9 @@ static void ggml_vk_pool_free(vk_buffer& buffer) {
 }
 
 void* ggml_vk_host_malloc(size_t size) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
+#endif
     if (getenv("GGML_VK_NO_PINNED") != nullptr) {
         return nullptr;
     }
@@ -726,6 +790,9 @@ void* ggml_vk_host_malloc(size_t size) {
 }
 
 void ggml_vk_host_free(void* ptr) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_host_free()" << std::endl;
+#endif
     vk_buffer* buf = nullptr;
     for (size_t i = 0; i < vk_buf_list.size(); i++) {
         const uint8_t* addr = (const uint8_t*) std::get<0>(vk_buf_list[i]);
@@ -743,7 +810,55 @@ void ggml_vk_host_free(void* ptr) {
     ggml_vk_destroy_buffer(*buf);
 }
 
+static vk_submission ggml_vk_begin_submission(vk_queue& q) {
+    vk_submission s;
+    s.buffer = ggml_vk_create_cmd_buffer(q);
+    s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+
+    return s;
+}
+
+static void ggml_vk_dispatch_pipeline(vk_submission& s, vk_pipeline& pipeline, std::vector<vk_buffer>&& buffers, const std::vector<int>&& push_constants, std::array<uint32_t, 3> elements, vk_queue& q) {
+    uint32_t wg0 = CEIL_DIV(elements[0], pipeline.wg_denoms[0]);
+    uint32_t wg1 = CEIL_DIV(elements[1], pipeline.wg_denoms[1]);
+    uint32_t wg2 = CEIL_DIV(elements[2], pipeline.wg_denoms[2]);
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline.name << ", (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
+#endif
+    std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
+    std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
+    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
+        descriptor_buffer_infos.push_back({buffers[i].buffer, 0, buffers[i].size});
+    }
+    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
+        write_descriptor_sets.push_back({pipeline.descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
+    }
+
+    vk_device.updateDescriptorSets(write_descriptor_sets, {});
+
+    ggml_vk_sync_buffers(s.buffer, std::move(buffers), q, vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryRead);
+
+    s.buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
+    s.buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline);
+    s.buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+                                pipeline.layout,
+                                0,
+                                { pipeline.descriptor_set },
+                                {});
+    s.buffer.dispatch(wg0, wg1, wg2);
+}
+
+static void ggml_vk_end_submission(vk_submission& s, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+    s.buffer.end();
+
+    s.wait_semaphores = wait_semaphores;
+    s.signal_semaphores = signal_semaphores;
+}
+
 static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
+#endif
     VkMemoryPropertyFlags mem_prop_flags;
     vmaGetAllocationMemoryProperties(vk_allocator, dst->allocation, &mem_prop_flags);
 
@@ -778,6 +893,7 @@ static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset,
         }
 
         s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+        ggml_vk_sync_buffers(s.buffer, { *dst }, q, vk::AccessFlagBits::eMemoryRead, vk::AccessFlagBits::eMemoryWrite);
         vkCmdCopyBuffer(s.buffer, buf->buffer, dst->buffer, height, slices.data());
         s.buffer.end();
         return { s };
@@ -799,6 +915,7 @@ static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset,
             width * height};
 
         s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+        ggml_vk_sync_buffers(s.buffer, { *dst }, q, vk::AccessFlagBits::eMemoryRead, vk::AccessFlagBits::eMemoryWrite);
         vkCmdCopyBuffer(s.buffer, dst->sb_write->buffer, dst->buffer, 1, &buf_copy);
         s.buffer.end();
 
@@ -810,6 +927,9 @@ static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset,
 }
 
 static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
+#endif
     VkMemoryPropertyFlags mem_prop_flags;
     vmaGetAllocationMemoryProperties(vk_allocator, dst->allocation, &mem_prop_flags);
 
@@ -831,14 +951,23 @@ static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void *
 }
 
 static vk_sequence ggml_vk_buffer_write_async(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
+#endif
     return ggml_vk_buffer_write_2d_async(dst, offset, src, 0, size, 1, q, std::move(wait_semaphores), std::move(signal_semaphores));
 }
 
 static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
+#endif
     ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1, q);
 }
 
 static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_read_async(" << size << ")" << std::endl;
+#endif
     // Check if dst is pinned memory
    vk_buffer* buf = nullptr;
     size_t buf_offset = 0;
@@ -864,6 +993,7 @@ static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void
     vk_submission s = ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores));
 
     s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+    ggml_vk_sync_buffers(s.buffer, { *src }, q, vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryRead);
     vkCmdCopyBuffer(s.buffer, src->buffer, buf->buffer, 1, &buf_copy);
     s.buffer.end();
 
@@ -871,6 +1001,9 @@ static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void
 }
 
 static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_read(" << size << ")" << std::endl;
+#endif
     VkMemoryPropertyFlags mem_prop_flags;
     vmaGetAllocationMemoryProperties(vk_allocator, src->allocation, &mem_prop_flags);
 
@@ -902,6 +1035,7 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
         std::vector<vk_sequence> s = { ggml_vk_create_sequence_1(q, {}, {}) };
 
         s[0][0].buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+        ggml_vk_sync_buffers(s[0][0].buffer, { *src }, q, vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryRead);
         vkCmdCopyBuffer(s[0][0].buffer, src->buffer, buf->buffer, 1, &buf_copy);
         s[0][0].buffer.end();
         ggml_vk_submit(q, s, fence);
@@ -926,6 +1060,7 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
         vk::CommandBuffer cmd_buffer = ggml_vk_create_cmd_buffer(q);
         vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
         cmd_buffer.begin(cmd_buffer_begin_info);
+        ggml_vk_sync_buffers(cmd_buffer, { *src }, q, vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryRead);
         vkCmdCopyBuffer(cmd_buffer, src->buffer, src->sb_read->buffer, 1, &buf_copy);
         cmd_buffer.end();
 
@@ -945,6 +1080,9 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
 }
 
 static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_h2d_tensor_2d()" << std::endl;
+#endif
     const uint64_t ne0 = src->ne[0];
     const uint64_t ne1 = src->ne[1];
     const uint64_t nb0 = src->nb[0];
@@ -975,6 +1113,9 @@ static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const st
 }
 
 static int ggml_vk_guess_split_k(int m, int n, int k) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_guess_split_k()" << std::endl;
+#endif
     if (k > 128 && (m < 128 || n < 128)) {
         return 4;
     }
@@ -983,6 +1124,9 @@ static int ggml_vk_guess_split_k(int m, int n, int k) {
 }
 
 static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16, int m, int n) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_guess_matmul_pipeline()" << std::endl;
+#endif
     if (bit16) {
         if (m <= 32 || n <= 32) {
             return &vk_pipeline_matmul_f16_s;
@@ -1003,20 +1147,37 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16, int m, int n) {
 }
 
 static vk_sequence ggml_vk_matmul(vk_pipeline& pipeline, vk_buffer& a, vk_buffer& b, vk_buffer& d, int m, int n, int k, int split_k, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_matmul(" << m << ", " << n << ", " << k << ")" << std::endl;
+#endif
+    vk_submission s = ggml_vk_begin_submission(q);
     if (split_k == 1) {
-        return { ggml_vk_submit_pipeline(pipeline, { &a, &b, &d }, { m, n, k, k, k, m, k }, { (uint32_t)m, (uint32_t)n, 1 }, q, std::move(wait_semaphores), std::move(signal_semaphores)) };
+        ggml_vk_dispatch_pipeline(s, pipeline, { a, b, d }, { m, n, k, k, k, m, k }, { (uint32_t)m, (uint32_t)n, 1 }, q);
+        ggml_vk_end_submission(s, std::move(wait_semaphores), std::move(signal_semaphores));
+        return { s };
     }
 
     // Synchronize the two submissions
-    vk::Semaphore semaphore = ggml_vk_create_semaphore(q);
+    ggml_vk_dispatch_pipeline(s, pipeline, { a, b, d }, { m, n, k, k, k, m, CEIL_DIV(k, split_k) }, { (uint32_t)m * split_k, (uint32_t)n, 1 }, q);
+    s.buffer.pipelineBarrier(
+        vk::PipelineStageFlagBits::eComputeShader,
+        vk::PipelineStageFlagBits::eComputeShader,
+        {},
+        {},
+        {
+            { vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, vk::QueueFamilyIgnored, vk::QueueFamilyIgnored, d.buffer, 0, VK_WHOLE_SIZE } },
+        {}
+    );
+    ggml_vk_dispatch_pipeline(s, vk_pipeline_matmul_split_k_reduce, { d }, { m, n, split_k }, { (uint32_t)m, (uint32_t)n, 1 }, q);
+    ggml_vk_end_submission(s, std::move(wait_semaphores), std::move(signal_semaphores));
 
-    vk_submission s1 = ggml_vk_submit_pipeline(pipeline, { &a, &b, &d }, { m, n, k, k, k, m, CEIL_DIV(k, split_k) }, { (uint32_t)m * split_k, (uint32_t)n, 1 }, q, std::move(wait_semaphores), { semaphore });
-    vk_submission s2 = ggml_vk_submit_pipeline(vk_pipeline_matmul_split_k_reduce, { &d }, { m, n, split_k }, { (uint32_t)m, (uint32_t)n, 1 }, q, { semaphore }, std::move(signal_semaphores));
-
-    return { s1, s2 };
+    return { s };
 }
 
 static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_mul_mat_f32()" << std::endl;
+#endif
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1050,26 +1211,55 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     std::vector<vk_sequence> transfer_0_seqs;
     std::vector<vk_sequence> transfer_1_seqs;
 
+    vk::Semaphore s_it_x;
+    vk::Semaphore s_it_y;
+
+    const bool load_x = src0->backend != GGML_BACKEND_CPU;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const bool first = i03 == 0 && i02 == 0;
+            const bool last = i03 == ne03 - 1 && i02 == ne02 - 1;
+
             vk::Semaphore s_x;
             vk::Semaphore s_y = ggml_vk_create_semaphore(vk_compute_queue);
             std::vector<vk::Semaphore> semaphores = { s_y };
 
             // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (load_x) {
                 s_x = ggml_vk_create_semaphore(vk_compute_queue);
                 semaphores.push_back(s_x);
-                transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
+                if (first) {
+                    transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
+                } else {
+                    // Wait for previous matmul to be done before writing to the input buffers again
+                    transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], { s_it_x }, { s_x }));
+                }
             }
 
             ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
 
-            transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], {}, { s_y }));
+            if (first) {
+                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], {}, { s_y }));
+            } else {
+                // Wait for previous matmul to be done before writing to the input buffers again
+                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], { s_it_y }, { s_y }));
+            }
 
             // compute
             vk::Semaphore s_mm = ggml_vk_create_semaphore(vk_compute_queue);
-            compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
+            if (!last) {
+                if (load_x) {
+                    s_it_x = ggml_vk_create_semaphore(vk_compute_queue);
+                    s_it_y = ggml_vk_create_semaphore(vk_compute_queue);
+                    compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm, s_it_x, s_it_y }));
+                } else {
+                    s_it_y = ggml_vk_create_semaphore(vk_compute_queue);
+                    compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm, s_it_y }));
+                }
+            } else {
+                compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
+            }
 
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
@@ -1095,6 +1285,9 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 }
 
 static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_mul_mat_f16()" << std::endl;
+#endif
     GGML_ASSERT(vk_fp16_support);
 
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -1141,17 +1334,30 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     std::vector<vk_sequence> transfer_0_seqs;
     std::vector<vk_sequence> transfer_1_seqs;
 
+    vk::Semaphore s_it_x;
+    vk::Semaphore s_it_y;
+
+    const bool load_x = src1->backend != GGML_BACKEND_GPU;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const bool first = i03 == 0 && i02 == 0;
+            const bool last = i03 == ne03 - 1 && i02 == ne02 - 1;
+
             vk::Semaphore s_x;
             vk::Semaphore s_y = ggml_vk_create_semaphore(vk_compute_queue);
            std::vector<vk::Semaphore> semaphores = { s_y };
 
             // copy data to device
-            if (src1->backend != GGML_BACKEND_GPU) {
+            if (load_x) {
                 s_x = ggml_vk_create_semaphore(vk_compute_queue);
                 semaphores.push_back(s_x);
-                transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
+                if (first) {
+                    transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
+                } else {
+                    // Wait for previous matmul to be done before writing to the input buffers again
+                    transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], { s_it_x }, { s_x }));
+                }
             }
 
             ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
@@ -1179,11 +1385,27 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                 }
             }
 
-            transfer_1_seqs.push_back(ggml_vk_buffer_write_async(&d_Y, 0, tmp, sizeof(ggml_fp16_t) * y_ne, vk_transfer_queues[1], {}, { s_y }));
+            if (first) {
+                transfer_1_seqs.push_back(ggml_vk_buffer_write_async(&d_Y, 0, tmp, sizeof(ggml_fp16_t) * y_ne, vk_transfer_queues[1], {}, { s_y }));
+            } else {
+                // Wait for previous matmul to be done before writing to the input buffers again
+                transfer_1_seqs.push_back(ggml_vk_buffer_write_async(&d_Y, 0, tmp, sizeof(ggml_fp16_t) * y_ne, vk_transfer_queues[1], { s_it_y }, { s_y }));
+            }
 
             // compute
             vk::Semaphore s_mm = ggml_vk_create_semaphore(vk_compute_queue);
-            compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
+            if (!last) {
+                if (load_x) {
+                    s_it_x = ggml_vk_create_semaphore(vk_compute_queue);
+                    s_it_y = ggml_vk_create_semaphore(vk_compute_queue);
+                    compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm, s_it_x, s_it_y }));
+                } else {
+                    s_it_y = ggml_vk_create_semaphore(vk_compute_queue);
+                    compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm, s_it_y }));
+                }
+            } else {
+                compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
+            }
 
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
@@ -1195,6 +1417,7 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     }
 
     ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
+    // vk_transfer_queues[0].queue.waitIdle();
 
     // cleanup waits for the queue to be done
     ggml_vk_queue_cleanup(vk_transfer_queues[0]);
@@ -1209,6 +1432,9 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 }
 
 static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_mul_mat_q_f32()" << std::endl;
+#endif
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1251,8 +1477,16 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     std::vector<vk_sequence> transfer_0_seqs;
     std::vector<vk_sequence> transfer_1_seqs;
 
+    vk::Semaphore s_it_x;
+    vk::Semaphore s_it_y;
+
+    const bool load_x = src0->backend != GGML_BACKEND_GPU;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const bool first = i03 == 0 && i02 == 0;
+            const bool last = i03 == ne03 - 1 && i02 == ne02 - 1;
+
             vk::Semaphore s_x;
             vk::Semaphore s_y = ggml_vk_create_semaphore(vk_compute_queue);
             vk::Semaphore s_q = ggml_vk_create_semaphore(vk_compute_queue);
@@ -1263,10 +1497,15 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             vk::Semaphore s_mm = ggml_vk_create_semaphore(vk_compute_queue);
 
             // copy src0 to device if necessary
-            if (src0->backend == GGML_BACKEND_CPU) {
+            if (load_x) {
                 s_x = ggml_vk_create_semaphore(vk_compute_queue);
-                transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Q, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
                 q_semaphores.push_back(s_x);
+                if (first) {
+                    transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Q, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
+                } else {
+                    // Wait for previous dequant to be done before writing to the input buffers again
+                    transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Q, 0, src0, i03, i02, vk_transfer_queues[0], { s_it_x }, { s_x }));
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = *(vk_buffer *) src0->data;
             } else {
@@ -1276,7 +1515,12 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
 
             // copy src1 to device
-            transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], {}, { s_y }));
+            if (first) {
+                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], {}, { s_y }));
+            } else {
+                // Wait for previous matmul to be done before writing to the input buffers again
+                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], { s_it_y }, { s_y }));
+            }
 
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 GGML_ASSERT(false);
@@ -1298,10 +1542,23 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             } else { // general dequantization kernel + VK matrix matrix multiplication
                 // convert src0 to fp32 on device
-                compute_seqs.push_back({ ggml_vk_submit_pipeline(*to_fp32_vk, {&d_Q, &d_X}, { (int)x_ne }, { (uint32_t)x_ne, 1, 1}, vk_compute_queue, std::move(q_semaphores), { s_q }) });
+                vk_submission s = ggml_vk_begin_submission(vk_compute_queue);
+                ggml_vk_dispatch_pipeline(s, *to_fp32_vk, {d_Q, d_X}, { (int)x_ne }, { (uint32_t)x_ne, 1, 1}, vk_compute_queue);
+                if (load_x && !last) {
+                    s_it_x = ggml_vk_create_semaphore(vk_compute_queue);
+                    ggml_vk_end_submission(s, std::move(q_semaphores), { s_q, s_it_x });
+                } else {
+                    ggml_vk_end_submission(s, std::move(q_semaphores), { s_q });
+                }
+                compute_seqs.push_back({ s });
 
                 // compute
-                compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
+                if (!last) {
+                    s_it_y = ggml_vk_create_semaphore(vk_compute_queue);
+                    compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm, s_it_y }));
+                } else {
+                    compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
+                }
             }
 
             // copy dst to host
diff --git a/vk_shaders/dequant_q4_0.glsl b/vk_shaders/dequant_q4_0.glsl
index 29d24b597..dd9fed031 100644
--- a/vk_shaders/dequant_q4_0.glsl
+++ b/vk_shaders/dequant_q4_0.glsl
@@ -1,5 +1,6 @@
 #version 450
 
+#extension GL_EXT_control_flow_attributes : require
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
 #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
@@ -7,7 +8,7 @@
 #define QUANT_K 32
 #define QUANT_R 2
 
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
 
 struct block_q4_0
 {
@@ -24,34 +25,21 @@ layout (push_constant) uniform parameter
 } p;
 
 void main() {
-    const int idx = int(gl_GlobalInvocationID.x);
+    const int i = int(gl_GlobalInvocationID.x);
 
-    const int i = int(gl_WorkGroupID.x * gl_WorkGroupSize.x + gl_LocalInvocationID.x*2);
-
-    if (idx >= p.N) {
+    if (i >= p.N) {
         return;
     }
 
-    const int qk = QUANT_K;
-    const int qr = QUANT_R;
+    const block_q4_0 blk = x[i];
 
-    const int ib = i/qk; // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
+    const float d = float(blk.d);
 
-    // dequantize
-    float v0, v1;
-    const float d = float(x[ib].d);
+    [[unroll]] for (int j = 0; j < QUANT_K/2; ++j) {
+        const int x0 = (blk.qs[j] & 0x0F) - 8;
+        const int x1 = (blk.qs[j] >> 4) - 8;
 
-    const uint8_t vui = x[ib].qs[iqs];
-
-    const int8_t vi0 = int8_t(vui & 0xF);
-    const int8_t vi1 = int8_t(vui >> 4);
-
-    v0 = (vi0 - 8)*d;
-    v1 = (vi1 - 8)*d;
-
-    y[iybs + iqs + 0] = v0;
-    y[iybs + iqs + y_offset] = v1;
+        y[i*QUANT_K + j + 0        ] = x0*d;
+        y[i*QUANT_K + j + QUANT_K/2] = x1*d;
+    }
 }