diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 510d8fef2..f2749087e 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -37,6 +37,7 @@ inline static void* ggml_aligned_malloc(size_t size, size_t alignment) { #include #include #include +#include <mutex> #include "ggml.h" @@ -44,6 +45,8 @@ inline static void* ggml_aligned_malloc(size_t size, size_t alignment) { #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) +#define VK_TRANSFER_QUEUE_COUNT 2 + struct vk_buffer { vk::Buffer buffer; VmaAllocation allocation; @@ -56,6 +59,8 @@ struct vk_buffer { struct vk_pipeline { vk::DescriptorSetLayout dsl; + vk::DescriptorPool descriptor_pool; + vk::DescriptorSet descriptor_set; vk::PipelineLayout layout; vk::Pipeline pipeline; uint32_t push_constant_size; @@ -63,12 +68,31 @@ struct vk_pipeline { std::array<uint32_t, 3> wg_denoms; }; +struct vk_queue { + vk_queue() {}; + // Copying intentionally skips cmd_buffers and the non-copyable std::mutex + vk_queue(const vk_queue& b) : queue_family_index(b.queue_family_index), queue(b.queue), pool(b.pool) {} + + vk_queue& operator=(const vk_queue& b) { + if (this != &b) { + queue_family_index = b.queue_family_index; + queue = b.queue; + pool = b.pool; + } + return *this; + } + + uint32_t queue_family_index; + vk::Queue queue; + vk::CommandPool pool; + std::vector<vk::CommandBuffer> cmd_buffers; + std::mutex mutex; +}; + vk::Instance vk_instance; -uint32_t vk_compute_queue_family_index; -uint32_t vk_transfer_queue_family_index; vk::PhysicalDevice vk_physical_device; vk::Device vk_device; -vk::CommandPool vk_command_pool_compute, vk_command_pool_transfer; +vk_queue vk_compute_queue; +vk_queue vk_transfer_queues[VK_TRANSFER_QUEUE_COUNT]; VmaAllocator vk_allocator; vk_pipeline vk_pipeline_matmul_f32, vk_pipeline_matmul_f16; vk_pipeline vk_pipeline_f16_to_f32, vk_pipeline_dequant_q4_0; @@ -79,21 +103,12 @@ bool vk_fp16_support = false; static std::vector> vk_buf_list; -static vk::CommandBuffer ggml_vk_cmd_buffer_create(vk::CommandPool& pool) { - vk::CommandBufferAllocateInfo command_buffer_alloc_info( - pool, - vk::CommandBufferLevel::ePrimary, - 1); - const std::vector<vk::CommandBuffer> cmd_buffers = vk_device.allocateCommandBuffers(command_buffer_alloc_info); - return cmd_buffers.front(); -} - static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_count, std::array<uint32_t, 3> wg_denoms) { - vk_pipeline pipeline; - GGML_ASSERT(parameter_count > 0); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); + vk_pipeline pipeline; + pipeline.parameter_count = parameter_count; pipeline.push_constant_size = push_constant_count * sizeof(int); pipeline.wg_denoms = wg_denoms; @@ -132,6 +147,14 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::s dsl_binding); pipeline.dsl = vk_device.createDescriptorSetLayout(descriptor_set_layout_create_info); + vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); + vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT), 1, descriptor_pool_size); + pipeline.descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info); + + vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pool, 1, &pipeline.dsl); + const std::vector<vk::DescriptorSet> descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info); + pipeline.descriptor_set = descriptor_sets.front(); + vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline.dsl, pcr);
pipeline.layout = vk_device.createPipelineLayout(pipeline_layout_create_info); vk::PipelineCache pipeline_cache = vk_device.createPipelineCache(vk::PipelineCacheCreateInfo()); @@ -150,28 +173,18 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::s return pipeline; } -static void ggml_vk_dispatch_pipeline(vk_pipeline& pipeline, std::vector<vk_buffer *> buffers, std::vector<int> push_constants, std::array<uint32_t, 3> elements, vk::Fence& fence) { - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); - vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size); - vk::DescriptorPool descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info); - - vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &pipeline.dsl); - const std::vector<vk::DescriptorSet> descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info); - vk::DescriptorSet descriptor_set = descriptor_sets.front(); - +static void ggml_vk_dispatch_pipeline(vk_pipeline& pipeline, std::vector<vk_buffer *> buffers, const std::vector<int>&& push_constants, std::array<uint32_t, 3> elements, vk::CommandBuffer& cmd_buffer, vk::Fence& fence) { std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos; std::vector<vk::WriteDescriptorSet> write_descriptor_sets; for (uint32_t i = 0; i < pipeline.parameter_count; i++) { descriptor_buffer_infos.push_back({buffers[i]->buffer, 0, buffers[i]->size}); } for (uint32_t i = 0; i < pipeline.parameter_count; i++) { - write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]}); + write_descriptor_sets.push_back({pipeline.descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]}); } vk_device.updateDescriptorSets(write_descriptor_sets, {}); - vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_compute); - vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); cmd_buffer.begin(cmd_buffer_begin_info); cmd_buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants); @@ -179,13 +192,13 @@ static void ggml_vk_dispatch_pipeline(vk_pipeline& pipeline, std::vector + std::lock_guard<std::mutex> guard(vk_compute_queue.mutex); vk::SubmitInfo submit_info(0, nullptr, @@ -193,15 +206,139 @@ static void ggml_vk_dispatch_pipeline(vk_pipeline& pipeline, std::vector +static void ggml_vk_find_queue_family_index(std::vector<uint32_t>& indices, uint32_t num_indices, std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index) { + const uint32_t qfsize = queue_family_props.size(); + + bool done; + + for (uint32_t idx = 0; idx < num_indices; idx++) { + done = false; + // Try with avoid preferences first + for (uint32_t i = 0; i < qfsize; i++) { + if ((compute_index < 0 || i != compute_index) && std::find(indices.begin(), indices.end(), i) == indices.end() && queue_family_props[i].queueFlags & required && !(queue_family_props[i].queueFlags & avoid)) { + indices.push_back(i); + done = true; + break; + } + } + + if (!done) { + // Fall back to only required + for (size_t i = 0; i < qfsize; i++) { + if ((compute_index < 0 || i != compute_index) && std::find(indices.begin(), indices.end(), i) == indices.end() && queue_family_props[i].queueFlags & required) { + indices.push_back(i); + done = true; + break; + } + } + } + + if (!done) { + // Fall back to reusing compute queue + for (size_t i = 0; i < qfsize; i++) { + if (std::find(indices.begin(), indices.end(), i) == indices.end() &&
queue_family_props[i].queueFlags & required) { + indices.push_back(i); + done = true; + break; + } + } + } + + if (!done) { + std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl; + for (uint32_t i = 0; i < qfsize; i++) { + std::cerr << i << ": " << "compute=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eCompute) << " transfer=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eTransfer) << " graphics=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eGraphics) << " protected=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eProtected) << " optical_flow_nv=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eOpticalFlowNV) << " sparse binding=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eSparseBinding) << " video decode=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eVideoDecodeKHR) << std::endl; + } + abort(); + } + } +} + +static vk_queue ggml_vk_create_queue(uint32_t queue_family_index) { + vk_queue q; + q.queue_family_index = queue_family_index; + + vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT), queue_family_index); + q.pool = vk_device.createCommandPool(command_pool_create_info_compute); + + q.queue = vk_device.getQueue(queue_family_index, 0); + + return q; +} + +static vk::CommandBuffer ggml_vk_cmd_buffer_create(vk_queue& q) { + vk::CommandBufferAllocateInfo command_buffer_alloc_info( + q.pool, + vk::CommandBufferLevel::ePrimary, + 1); + const std::vector cmd_buffers = vk_device.allocateCommandBuffers(command_buffer_alloc_info); + auto buf = cmd_buffers.front(); + + q.cmd_buffers.push_back(buf); + + return buf; +} + +static void ggml_vk_queue_cleanup(vk_queue& q) { + q.queue.waitIdle(); + vk_device.freeCommandBuffers(q.pool, q.cmd_buffers); + q.cmd_buffers.clear(); +} + +static vk_buffer ggml_vk_create_buffer(size_t size, VmaAllocationCreateFlags alloc_flags, VmaMemoryUsage vma_usage, VkMemoryPropertyFlags req_flags = 0) { + vk_buffer buf; + + buf.size = size; + vk::BufferCreateInfo buffer_create_info{ + vk::BufferCreateFlags(), + size, + vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst, + vk::SharingMode::eExclusive, + 1, + nullptr, + }; + + VmaAllocationCreateInfo allocation_info = {}; + allocation_info.requiredFlags = req_flags; + allocation_info.flags = alloc_flags; + allocation_info.usage = vma_usage; + + vmaCreateBuffer(vk_allocator, + (VkBufferCreateInfo*)&buffer_create_info, + &allocation_info, + (VkBuffer*)&buf.buffer, + &buf.allocation, + &buf.info); + + buf.sb_write = nullptr; + buf.sb_read = nullptr; + + return buf; +} + +static void ggml_vk_destroy_buffer(vk_buffer& buf) { + buf.size = 0; + vmaDestroyBuffer(vk_allocator, buf.buffer, buf.allocation); + + // Cleanup staging buffers + if (buf.sb_write != nullptr) { + vmaDestroyBuffer(vk_allocator, buf.sb_write->buffer, buf.sb_write->allocation); + delete buf.sb_write; + buf.sb_write = nullptr; + } + if (buf.sb_read != nullptr) { + vmaDestroyBuffer(vk_allocator, buf.sb_read->buffer, buf.sb_read->allocation); + delete buf.sb_read; + buf.sb_read = nullptr; + } +} + +void ggml_vk_test_transfer(size_t ne); void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k); void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k); -void ggml_vk_test_f16_to_f32(size_t m); void ggml_vk_init(void) { char* GGML_VULKAN_DEVICE = 
getenv("GGML_VULKAN_DEVICE"); @@ -235,44 +372,20 @@ void ggml_vk_init(void) { std::vector queue_family_props = vk_physical_device.getQueueFamilyProperties(); - const size_t qfsize = queue_family_props.size(); - - // Try to find a non-graphics compute queue and a transfer-focused queue - vk_compute_queue_family_index = qfsize; - vk_transfer_queue_family_index = qfsize; - for (size_t i = 0; i < qfsize; i++) { - // std::cout << i << ": " << "compute=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eCompute) << " transfer=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eTransfer) << " graphics=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eGraphics) << " protected=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eProtected) << " optical_flow_nv=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eOpticalFlowNV) << " sparse binding=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eSparseBinding) << " video decode=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eVideoDecodeKHR) << std::endl; - if (vk_compute_queue_family_index >= qfsize && !(queue_family_props[i].queueFlags & vk::QueueFlagBits::eGraphics) && queue_family_props[i].queueFlags & vk::QueueFlagBits::eCompute) { - vk_compute_queue_family_index = i; - } - if (vk_transfer_queue_family_index >= qfsize && !(queue_family_props[i].queueFlags & (vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics | vk::QueueFlagBits::eVideoDecodeKHR | vk::QueueFlagBits::eProtected | vk::QueueFlagBits::eOpticalFlowNV)) && queue_family_props[i].queueFlags & vk::QueueFlagBits::eTransfer) { - vk_transfer_queue_family_index = i; - } - } - - // Fall back to graphics and compute queue if not yet found - if (vk_compute_queue_family_index == qfsize) { - for (size_t i = 0; i < qfsize; i++) { - if (vk_compute_queue_family_index >= qfsize && queue_family_props[i].queueFlags & vk::QueueFlagBits::eCompute) { - vk_compute_queue_family_index = i; - } - } - } - - if (vk_compute_queue_family_index == qfsize) { - std::cerr << "ggml_vulkan: vk_compute_queue_family_index invalid" << std::endl; - abort(); - } - if (vk_transfer_queue_family_index == qfsize) { - std::cerr << "ggml_vulkan: vk_transfer_queue_family_index invalid" << std::endl; - abort(); - } + // Try to find a non-graphics compute queue and transfer-focused queues + std::vector compute_queue_family_index_vec; + ggml_vk_find_queue_family_index(compute_queue_family_index_vec, 1, queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1); + uint32_t compute_queue_family_index = compute_queue_family_index_vec[0]; + std::vector transfer_queue_family_index; + ggml_vk_find_queue_family_index(transfer_queue_family_index, VK_TRANSFER_QUEUE_COUNT, queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics | vk::QueueFlagBits::eVideoDecodeKHR | vk::QueueFlagBits::eProtected | vk::QueueFlagBits::eOpticalFlowNV, compute_queue_family_index); const float compute_queue_priority = 1.0f; const float transfer_queue_priority = 1.0f; - vk::DeviceQueueCreateInfo device_queue_create_infos[] = { - {vk::DeviceQueueCreateFlags(), vk_compute_queue_family_index, 1, &compute_queue_priority}, - {vk::DeviceQueueCreateFlags(), vk_transfer_queue_family_index, 1, &transfer_queue_priority}, + std::vector device_queue_create_infos; + device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, &compute_queue_priority}); + + for 
(int i = 0; i < VK_TRANSFER_QUEUE_COUNT; i++) { + device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index[i], 1, &transfer_queue_priority}); }; vk::DeviceCreateInfo device_create_info; std::vector device_extensions; @@ -334,26 +447,29 @@ void ggml_vk_init(void) { vk_pipeline_f16_to_f32 = ggml_vk_create_pipeline("vk_shaders/f16_to_f32.spv", "main", 2, 1, {32, 1, 1}); vk_pipeline_dequant_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_q4_0.spv", "main", 2, 1, {32, 1, 1}); - // Command pools - vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(), vk_compute_queue_family_index); - vk_command_pool_compute = vk_device.createCommandPool(command_pool_create_info_compute); - - vk::CommandPoolCreateInfo command_pool_create_info_transfer(vk::CommandPoolCreateFlags(), vk_transfer_queue_family_index); - vk_command_pool_transfer = vk_device.createCommandPool(command_pool_create_info_transfer); - -#if defined(VK_CHK_KERNEL) - const int step = 5; - for (size_t m = 1; m < 12; m += step) { - for (size_t n = 1; n < 12; n += step) { - for (size_t k = 1; k < 12; k += step) { - ggml_vk_test_matmul_f32(m * 128, n * 128, k * 128); - ggml_vk_test_matmul_f16(m * 128, n * 128, k * 128); - } - } + // Queues + vk_compute_queue = ggml_vk_create_queue(compute_queue_family_index); + for (int i = 0; i < VK_TRANSFER_QUEUE_COUNT; i++) { + vk_transfer_queues[i] = ggml_vk_create_queue(transfer_queue_family_index[i]); } - for (size_t m = 1; m < 12; m += step) { - ggml_vk_test_f16_to_f32(m * 128); +#if defined(VK_CHK_KERNEL) + int step = 16; + for (size_t m = step; m < 64; m += step) { + ggml_vk_test_transfer(1024 * 1024 * m); + } + const std::vector vals { + 49, 49, 128, + 128, 49, 49, + 4096, 49, 4096, + 11008, 49, 4096, + 4096, 49, 11008, + 4096, 49, 4096, + 32000, 49, 4096, + }; + for (size_t i = 0; i < vals.size(); i += 3) { + ggml_vk_test_matmul_f32(vals[i], vals[i + 1], vals[i + 2]); + ggml_vk_test_matmul_f16(vals[i], vals[i + 1], vals[i + 2]); } #endif } @@ -397,54 +513,6 @@ struct scoped_spin_lock { static vk_buffer g_vk_buffer_pool[MAX_VK_BUFFERS]; static std::atomic_flag g_vk_pool_lock = ATOMIC_FLAG_INIT; -static vk_buffer ggml_vk_create_buffer(size_t size, VmaAllocationCreateFlags alloc_flags, VmaMemoryUsage vma_usage, VkMemoryPropertyFlags req_flags = 0) { - vk_buffer buf; - - buf.size = size; - vk::BufferCreateInfo buffer_create_info{ - vk::BufferCreateFlags(), - size, - vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst, - vk::SharingMode::eExclusive, - 1, - &vk_compute_queue_family_index - }; - - VmaAllocationCreateInfo allocation_info = {}; - allocation_info.requiredFlags = req_flags; - allocation_info.flags = alloc_flags; - allocation_info.usage = vma_usage; - - vmaCreateBuffer(vk_allocator, - (VkBufferCreateInfo*)&buffer_create_info, - &allocation_info, - (VkBuffer*)&buf.buffer, - &buf.allocation, - &buf.info); - - buf.sb_write = nullptr; - buf.sb_read = nullptr; - - return buf; -} - -static void ggml_vk_destroy_buffer(vk_buffer& buf) { - buf.size = 0; - vmaDestroyBuffer(vk_allocator, buf.buffer, buf.allocation); - - // Cleanup staging buffers - if (buf.sb_write != nullptr) { - vmaDestroyBuffer(vk_allocator, buf.sb_write->buffer, buf.sb_write->allocation); - free(buf.sb_write); - buf.sb_write = nullptr; - } - if (buf.sb_read != nullptr) { - vmaDestroyBuffer(vk_allocator, buf.sb_read->buffer, buf.sb_read->allocation); - free(buf.sb_read); - buf.sb_read = nullptr; - } 
-} - static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf, VmaAllocationCreateFlags alloc_flags) { scoped_spin_lock lock(g_vk_pool_lock); @@ -534,16 +602,15 @@ void ggml_vk_host_free(void* ptr) { ggml_vk_destroy_buffer(*buf); } -static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size) { +static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q) { VkMemoryPropertyFlags mem_prop_flags; vmaGetAllocationMemoryProperties(vk_allocator, dst->allocation, &mem_prop_flags); // Buffer is already mapped if(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { + GGML_ASSERT(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + memcpy((uint8_t *)dst->info.pMappedData + offset, src, size); - if (!(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { - vmaFlushAllocation(vk_allocator, dst->allocation, offset, size); - } } else { // Check if src is pinned memory vk_buffer* buf = nullptr; @@ -565,61 +632,63 @@ static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src offset, // dstOffset, size}; // size - vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_transfer); + vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(q); vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); cmd_buffer.begin(cmd_buffer_begin_info); vkCmdCopyBuffer(cmd_buffer, buf->buffer, dst->buffer, 1, &buf_copy); cmd_buffer.end(); - vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0); - vk::SubmitInfo submit_info(0, nullptr, nullptr, 1, &cmd_buffer); - queue.submit({ submit_info }, VK_NULL_HANDLE); + std::lock_guard guard(q.mutex); + q.queue.submit({ submit_info }, VK_NULL_HANDLE); + return; } // Staging buffer required, malloc because of async transfer if (dst->sb_write == nullptr) { - dst->sb_write = (vk_buffer *) malloc(sizeof(vk_buffer)); - *dst->sb_write = ggml_vk_create_buffer(dst->size, VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO, 0); + dst->sb_write = new vk_buffer; + *dst->sb_write = ggml_vk_create_buffer(dst->size, VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO_PREFER_HOST, 0); } - memcpy(((uint8_t *)dst->sb_write->info.pMappedData) + offset, src, size); - vmaFlushAllocation(vk_allocator, dst->sb_write->allocation, 0, VK_WHOLE_SIZE); + VkMemoryPropertyFlags mpf_staging; + vmaGetAllocationMemoryProperties(vk_allocator, dst->sb_write->allocation, &mpf_staging); + GGML_ASSERT(mpf_staging & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + VkBufferCopy buf_copy = { 0, // srcOffset offset, // dstOffset, size}; // size - vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_transfer); + vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(q); vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); cmd_buffer.begin(cmd_buffer_begin_info); vkCmdCopyBuffer(cmd_buffer, dst->sb_write->buffer, dst->buffer, 1, &buf_copy); cmd_buffer.end(); - vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0); + memcpy(dst->sb_write->info.pMappedData, src, size); vk::SubmitInfo submit_info(0, nullptr, nullptr, 1, &cmd_buffer); - queue.submit({ submit_info }, VK_NULL_HANDLE); + std::lock_guard guard(q.mutex); + q.queue.submit({ submit_info }, VK_NULL_HANDLE); } } -static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * 
dst, size_t size) { +static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q) { VkMemoryPropertyFlags mem_prop_flags; vmaGetAllocationMemoryProperties(vk_allocator, src->allocation, &mem_prop_flags); if(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) { - if (!(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) { - vmaInvalidateAllocation(vk_allocator, src->allocation, offset, size); - } + GGML_ASSERT(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + memcpy(dst, (uint8_t *) src->info.pMappedData + offset, size); } else { // Check if dst is pinned memory @@ -642,45 +711,42 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_ buf_offset, // dstOffset, size}; // size - vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_transfer); + vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(q); vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); cmd_buffer.begin(cmd_buffer_begin_info); vkCmdCopyBuffer(cmd_buffer, src->buffer, buf->buffer, 1, &buf_copy); cmd_buffer.end(); - vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0); - vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{}); - vk::SubmitInfo submit_info(0, nullptr, nullptr, 1, &cmd_buffer); - queue.submit({ submit_info }, fence); - vk::resultCheck(vk_device.waitForFences({ fence }, true, uint64_t(-1)), "vk_buffer_read pinned waitForFences"); - - vk_device.destroyFence(fence); + std::lock_guard guard(q.mutex); + q.queue.submit({ submit_info }, VK_NULL_HANDLE); return; } if (src->sb_read == nullptr) { - src->sb_read = (vk_buffer *) malloc(sizeof(vk_buffer)); + src->sb_read = new vk_buffer; *src->sb_read = ggml_vk_create_buffer(src->size, VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO, 0); } + VkMemoryPropertyFlags mpf_staging; + vmaGetAllocationMemoryProperties(vk_allocator, src->sb_read->allocation, &mpf_staging); + GGML_ASSERT(mpf_staging & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + VkBufferCopy buf_copy = { offset, // srcOffset 0, // dstOffset, size}; // size - vmaInvalidateAllocation(vk_allocator, src->sb_read->allocation, 0, VK_WHOLE_SIZE); - vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_transfer); + vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(q); vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); cmd_buffer.begin(cmd_buffer_begin_info); vkCmdCopyBuffer(cmd_buffer, src->buffer, src->sb_read->buffer, 1, &buf_copy); cmd_buffer.end(); - vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0); vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo{}); vk::SubmitInfo submit_info(0, @@ -688,17 +754,15 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_ nullptr, 1, &cmd_buffer); - queue.submit({ submit_info }, fence); + std::lock_guard guard(q.mutex); + q.queue.submit({ submit_info }, fence); vk::resultCheck(vk_device.waitForFences({ fence }, true, uint64_t(-1)), "vk_buffer_read staging waitForFences"); - memcpy(dst, src->sb_read->info.pMappedData, size); - vk_device.destroyFence(fence); - ggml_vk_destroy_buffer(*src->sb_read); - src->sb_read = nullptr; + memcpy(dst, src->sb_read->info.pMappedData, size); } } -static void ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2) { +static void 
ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, vk_queue& q) { const uint64_t ne0 = src->ne[0]; const uint64_t ne1 = src->ne[1]; const uint64_t nb0 = src->nb[0]; @@ -708,23 +772,26 @@ static void ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct gg const enum ggml_type type = src->type; const size_t ts = ggml_type_size(type); const size_t bs = ggml_blck_size(type); + const size_t row_length = ts*ne0/bs; const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); - if (nb0 == ts && nb1 == ts*ne0/bs) { - ggml_vk_buffer_write(dst, offset, x, ne1*nb1); + if (nb0 == ts && nb1 == row_length) { + ggml_vk_buffer_write(dst, offset, x, ne1*nb1, q); return; } if (nb0 == ts) { for (uint64_t i1 = 0; i1 < ne1; i1++) { - ggml_vk_buffer_write(dst, offset + ne0 * i1, (uint8_t *)x + ts*ne0/bs, ne0*nb0); + ggml_vk_buffer_write(dst, offset + i1 * row_length, (uint8_t *)x + i1 * nb1, row_length, q); } return; } + GGML_ASSERT(false); + // TODO: also needs handling of staging buffers uint8_t* dst_ptr = (uint8_t*) dst->info.pMappedData; uint8_t* xc = (uint8_t*)x; for (uint64_t i1 = 0; i1 < ne1; i1++) { for (uint64_t i0 = 0; i0 < ne0; i0++) { - dst_ptr[offset + i1 * ts*ne0/bs + i0 * ts] = xc[i1 * nb1 + i0 * nb0]; + dst_ptr[offset + i1 * row_length + i0 * ts] = xc[i1 * nb1 + i0 * nb0]; } } } @@ -756,15 +823,16 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr ggml_vk_pool_malloc(sizeof(float) * y_ne, &d_Y, 0); ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_D, 0); + vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_compute_queue); vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo()); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { // copy data to device if (src0->backend != GGML_BACKEND_GPU) { - ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02); + ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0]); } - ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02); + ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1]); // compute #ifdef VK_CHK_KERNEL @@ -772,9 +840,10 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr #endif // Wait for transfers to finish - vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle(); + vk_transfer_queues[0].queue.waitIdle(); + vk_transfer_queues[1].queue.waitIdle(); - ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D}, { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 }, { (uint32_t)ne01, (uint32_t)ne11, 1}, fence); + ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D}, { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 }, { (uint32_t)ne01, (uint32_t)ne11, 1}, cmd_buffer, fence); vk::resultCheck(vk_device.waitForFences({ fence }, true, uint64_t(-1)), "matmul_f32 waitForFences"); @@ -784,16 +853,20 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr std::cout << "m=" << ne01 << " n=" << ne11 << " k=" << ne10 << " matmul " << std::chrono::duration_cast(end-begin).count() / 1000.0 << "ms" << std::endl; #endif - vk_device.resetFences({fence}); - // copy dst to host float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne); + ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_transfer_queues[0]); + + vk_device.resetFences({fence}); } } vk_device.destroyFence(fence); + 
ggml_vk_queue_cleanup(vk_compute_queue); + ggml_vk_queue_cleanup(vk_transfer_queues[0]); + ggml_vk_queue_cleanup(vk_transfer_queues[1]); + if (src0->backend != GGML_BACKEND_GPU) { ggml_vk_pool_free(d_X); } @@ -841,13 +914,14 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr bool src1_cont_rows = nb10 == sizeof(float); bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); + vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_compute_queue); vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo()); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { // copy data to device if (src1->backend != GGML_BACKEND_GPU) { - ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02); + ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0]); } // convert src1 to fp16 @@ -872,17 +946,18 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } } } - ggml_vk_buffer_write(&d_Y, 0, tmp, sizeof(ggml_fp16_t) * y_ne); + ggml_vk_buffer_write(&d_Y, 0, tmp, sizeof(ggml_fp16_t) * y_ne, vk_transfer_queues[1]); // Wait for transfers to finish - vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle(); + vk_transfer_queues[0].queue.waitIdle(); + vk_transfer_queues[1].queue.waitIdle(); // compute #ifdef VK_CHK_KERNEL auto begin = std::chrono::high_resolution_clock::now(); #endif - ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f16, {&d_X, &d_Y, &d_D}, { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 }, { (uint32_t)ne01, (uint32_t)ne11, 1}, fence); + ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f16, {&d_X, &d_Y, &d_D}, { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 }, { (uint32_t)ne01, (uint32_t)ne11, 1}, cmd_buffer, fence); vk::resultCheck(vk_device.waitForFences({ fence }, true, uint64_t(-1)), "matmul_f16 waitForFences"); #ifdef VK_CHK_KERNEL @@ -891,16 +966,20 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr std::cout << "m=" << ne01 << " n=" << ne11 << " k=" << ne10 << " matmul " << std::chrono::duration_cast(end-begin).count() / 1000.0 << "ms" << std::endl; #endif - vk_device.resetFences({fence}); - // copy dst to host float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne); + ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_transfer_queues[0]); + + vk_device.resetFences({fence}); } } vk_device.destroyFence(fence); + ggml_vk_queue_cleanup(vk_compute_queue); + ggml_vk_queue_cleanup(vk_transfer_queues[0]); + ggml_vk_queue_cleanup(vk_transfer_queues[1]); + if (src0->backend != GGML_BACKEND_GPU) { ggml_vk_pool_free(d_X); } @@ -944,11 +1023,14 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * // vk_pipeline* dmmv = ggml_get_dequantize_mul_mat_vec_vk(type); GGML_ASSERT(to_fp32_vk != nullptr); + vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_compute_queue); + vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{}); + for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { // copy src0 to device if necessary if (src0->backend == GGML_BACKEND_CPU) { - ggml_vk_h2d_tensor_2d(&d_Q, 0, src0, i03, i02); + ggml_vk_h2d_tensor_2d(&d_Q, 0, src0, i03, i02, vk_transfer_queues[0]); } else if (src0->backend == GGML_BACKEND_GPU) { d_Q = *(vk_buffer *) src0->data; } else { @@ -972,39 +1054,53 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * // 
VK_CHECK(vkSetKernelArg(*dmmv, 4, sizeof(vk_int), &ncols)); // VK_CHECK(vkEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); } else { // general dequantization kernel + VK matrix matrix multiplication + // copy src1 to device + ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1]); + // convert src0 to fp32 on device // Wait for transfers to finish - vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle(); + vk_transfer_queues[0].queue.waitIdle(); - vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{}); - - ggml_vk_dispatch_pipeline(*to_fp32_vk, {&d_Q, &d_X}, { (int)x_ne }, { (uint32_t)x_ne, 1, 1}, fence); - - // copy src1 to device - ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02); + vk_device.resetFences({ fence }); + ggml_vk_dispatch_pipeline(*to_fp32_vk, {&d_Q, &d_X}, { (int)x_ne }, { (uint32_t)x_ne, 1, 1}, cmd_buffer, fence); // wait for conversion vk::resultCheck(vk_device.waitForFences({ fence }, true, uint64_t(-1)), "matmul_q_f32 src0 convert waitForFences"); - vk_device.resetFences({fence}); + vk_device.resetFences({ fence }); + cmd_buffer.reset(vk::CommandBufferResetFlags()); // Wait for transfers to finish - vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle(); + vk_transfer_queues[1].queue.waitIdle(); // compute - ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D}, { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 }, { (uint32_t)ne01, (uint32_t)ne11, 1}, fence); +#ifdef VK_CHK_KERNEL + auto begin = std::chrono::high_resolution_clock::now(); +#endif + + ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D}, { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 }, { (uint32_t)ne01, (uint32_t)ne11, 1}, cmd_buffer, fence); vk::resultCheck(vk_device.waitForFences({ fence }, true, uint64_t(-1)), "matmul_q_f32 matmul waitForFences"); - vk_device.destroyFence(fence); +#ifdef VK_CHK_KERNEL + auto end = std::chrono::high_resolution_clock::now(); + + std::cout << "m=" << ne01 << " n=" << ne11 << " k=" << ne10 << " matmul " << std::chrono::duration_cast(end-begin).count() / 1000.0 << "ms" << std::endl; +#endif } // copy dst to host float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne); + ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_transfer_queues[0]); } } + vk_device.destroyFence(fence); + + ggml_vk_queue_cleanup(vk_compute_queue); + ggml_vk_queue_cleanup(vk_transfer_queues[0]); + ggml_vk_queue_cleanup(vk_transfer_queues[1]); + if (!mul_mat_vec) { ggml_vk_pool_free(d_X); } @@ -1026,7 +1122,7 @@ bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && - ((ne0 >= 128 && ne1 >= 32 && ne10 >= 128) || src0->backend == GGML_BACKEND_GPU)) { + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { return true; } @@ -1083,6 +1179,49 @@ size_t ggml_vk_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct g } #ifdef VK_CHK_KERNEL +void ggml_vk_test_transfer(size_t ne) { + // Check transfers are correct + vk_buffer buffer = ggml_vk_create_buffer(sizeof(float) * ne, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT, VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE, 0); + + float* x = (float *) malloc(sizeof(float) * ne); + float* y = (float *) 
malloc(sizeof(float) * ne); + + for (size_t i = 0; i < ne; i++) { + x[i] = rand() / (float)RAND_MAX; + } + + auto begin = std::chrono::high_resolution_clock::now(); + + ggml_vk_buffer_write(&buffer, 0, x, sizeof(float) * ne, vk_transfer_queues[0]); + + vk_transfer_queues[0].queue.waitIdle(); + + auto end = std::chrono::high_resolution_clock::now(); + + double ms_to_gpu = std::chrono::duration_cast(end-begin).count() / 1000.0; + + begin = std::chrono::high_resolution_clock::now(); + + ggml_vk_buffer_read(&buffer, 0, y, sizeof(float) * ne, vk_transfer_queues[1]); + + end = std::chrono::high_resolution_clock::now(); + + double ms_from_gpu = std::chrono::duration_cast(end-begin).count() / 1000.0; + + double avg_err = 0.0; + for (size_t i = 0; i < ne; i++) { + avg_err += std::fabs(x[i] - y[i]); + } + + double kb = ne * sizeof(float) / 1024.0; + + std::cout << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl; + + ggml_vk_destroy_buffer(buffer); + + free(x); + free(y); +} void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k) { const size_t x_ne = m * k; const size_t y_ne = k * n; @@ -1106,24 +1245,27 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k) { y[i] = rand() / (float)RAND_MAX; } - ggml_vk_buffer_write(&d_X, 0, x, sizeof(float) * x_ne); - ggml_vk_buffer_write(&d_Y, 0, y, sizeof(float) * y_ne); + ggml_vk_buffer_write(&d_X, 0, x, sizeof(float) * x_ne, vk_transfer_queues[0]); + ggml_vk_buffer_write(&d_Y, 0, y, sizeof(float) * y_ne, vk_transfer_queues[1]); // Wait for transfers to finish - vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle(); + vk_transfer_queues[0].queue.waitIdle(); + vk_transfer_queues[1].queue.waitIdle(); vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{}); + vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_compute_queue); + auto begin = std::chrono::high_resolution_clock::now(); - ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D}, { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m }, { (uint32_t)m, (uint32_t)n, 1}, fence); + ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D}, { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m }, { (uint32_t)m, (uint32_t)n, 1}, cmd_buffer, fence); vk::resultCheck(vk_device.waitForFences({ fence }, true, uint64_t(-1)), "test_matmul_f32 waitForFences"); auto end = std::chrono::high_resolution_clock::now(); // copy dst to host - ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne); + ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_transfer_queues[0]); float * d_chk = (float *) malloc(sizeof(float) * d_ne); @@ -1147,6 +1289,10 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k) { vk_device.destroyFence(fence); + ggml_vk_queue_cleanup(vk_compute_queue); + ggml_vk_queue_cleanup(vk_transfer_queues[0]); + ggml_vk_queue_cleanup(vk_transfer_queues[1]); + ggml_vk_pool_free(d_X); ggml_vk_pool_free(d_Y); ggml_vk_pool_free(d_D); @@ -1182,21 +1328,25 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k) { y[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX); } - ggml_vk_buffer_write(&d_X, 0, x, sizeof(ggml_fp16_t) * x_ne); - ggml_vk_buffer_write(&d_Y, 0, y, sizeof(ggml_fp16_t) * y_ne); + ggml_vk_buffer_write(&d_X, 0, x, sizeof(ggml_fp16_t) * x_ne, vk_transfer_queues[0]); + ggml_vk_buffer_write(&d_Y, 0, y, sizeof(ggml_fp16_t) * y_ne, vk_transfer_queues[1]); + + 
vk_transfer_queues[0].queue.waitIdle(); + vk_transfer_queues[1].queue.waitIdle(); vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{}); + vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_compute_queue); auto begin = std::chrono::high_resolution_clock::now(); - ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f16, {&d_X, &d_Y, &d_D}, { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m }, { (uint32_t)m, (uint32_t)n, 1}, fence); + ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f16, {&d_X, &d_Y, &d_D}, { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m }, { (uint32_t)m, (uint32_t)n, 1}, cmd_buffer, fence); vk::resultCheck(vk_device.waitForFences({ fence }, true, uint64_t(-1)), "test_matmul_f16 waitForFences"); auto end = std::chrono::high_resolution_clock::now(); // copy dst to host - ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne); + ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_transfer_queues[0]); float * fx = (float *) malloc(sizeof(float) * x_ne); float * fy = (float *) malloc(sizeof(float) * y_ne); @@ -1227,6 +1377,10 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k) { vk_device.destroyFence(fence); + ggml_vk_queue_cleanup(vk_compute_queue); + ggml_vk_queue_cleanup(vk_transfer_queues[0]); + ggml_vk_queue_cleanup(vk_transfer_queues[1]); + ggml_vk_pool_free(d_X); ggml_vk_pool_free(d_Y); @@ -1237,55 +1391,4 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k) { free(y); free(d); } - -void ggml_vk_test_f16_to_f32(size_t m) { - vk_buffer d_X; - vk_buffer d_D; - ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * m, &d_X, 0); - ggml_vk_pool_malloc(sizeof(float) * m, &d_D, 0); - - ggml_fp16_t* x = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * m); - float* d = (float *) malloc(sizeof(float) * m); - - for (size_t i = 0; i < m; i++) { - x[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX); - } - - ggml_vk_buffer_write(&d_X, 0, x, sizeof(ggml_fp16_t) * m); - - vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{}); - - auto begin = std::chrono::high_resolution_clock::now(); - - ggml_vk_dispatch_pipeline(vk_pipeline_f16_to_f32, {&d_X, &d_D}, { (int)m }, { (uint32_t)m, 1, 1}, fence); - - vk::resultCheck(vk_device.waitForFences({ fence }, true, uint64_t(-1)), "test_f16_to_f32 waitForFences"); - - auto end = std::chrono::high_resolution_clock::now(); - - // copy dst to host - ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * m); - - float * d_chk = (float *) malloc(sizeof(float) * m); - - ggml_fp16_to_fp32_row(x, d_chk, m); - - double avg_err = 0.0; - - for (size_t r = 0; r < m; r++) { - avg_err += std::fabs(ggml_fp16_to_fp32(d[r]) - d_chk[r]); - } - - std::cout << "TEST convert m=" << m << " f16_to_f32 " << std::chrono::duration_cast(end-begin).count() / 1000.0 << "ms avg_err=" << avg_err / m << std::endl; - - free(d_chk); - - vk_device.destroyFence(fence); - - ggml_vk_pool_free(d_X); - ggml_vk_pool_free(d_D); - - free(x); - free(d); -} #endif diff --git a/vk_shaders/matmul_f16.glsl b/vk_shaders/matmul_f16.glsl index 8d6bfac09..0abab4827 100644 --- a/vk_shaders/matmul_f16.glsl +++ b/vk_shaders/matmul_f16.glsl @@ -57,12 +57,20 @@ void main() { [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) { const int lr = l % BK; const int lc = l / BK; - buf_a[(loadc + lc) * (BK+1) + loadr + lr] = data_a[pos_a + (loadc + lc) * p.stride_a + loadr + lr]; + if (ir * BM + loadc + lc < p.M && block + loadr + lr < p.K) { + buf_a[(loadc + lc) * (BK+1) + loadr + lr] = data_a[pos_a + (loadc + lc) * p.stride_a + loadr + lr]; + } else { + buf_a[(loadc + lc) * (BK+1) + loadr + 
lr] = 0.0hf; + } } [[unroll]] for (int l = 0; l < BN * BK; l += loadstride) { const int lr = l % BK; const int lc = l / BK; - buf_b[(loadc + lc) * (BK+1) + loadr + lr] = data_b[pos_b + (loadc + lc) * p.stride_b + loadr + lr]; + if (ic * BN + loadc + lc < p.N && block + loadr + lr < p.K) { + buf_b[(loadc + lc) * (BK+1) + loadr + lr] = data_b[pos_b + (loadc + lc) * p.stride_b + loadr + lr]; + } else { + buf_b[(loadc + lc) * (BK+1) + loadr + lr] = 0.0hf; + } } barrier(); @@ -94,7 +102,9 @@ void main() { [[unroll]] for (int cc = 0; cc < TN; cc++) { [[unroll]] for (int cr = 0; cr < TM; cr++) { - data_d[(dc + cc) * p.stride_d + dr + cr*rstride] = sums[cc * TM + cr]; + if (dr + cr*rstride < p.M && dc + cc < p.N) { + data_d[(dc + cc) * p.stride_d + dr + cr*rstride] = sums[cc * TM + cr]; + } } } } diff --git a/vk_shaders/matmul_f32.glsl b/vk_shaders/matmul_f32.glsl index 8fc894c37..dfc572a6f 100644 --- a/vk_shaders/matmul_f32.glsl +++ b/vk_shaders/matmul_f32.glsl @@ -56,12 +56,20 @@ void main() { [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) { const int lr = l % BK; const int lc = l / BK; - buf_a[(loadc + lc) * (BK+1) + loadr + lr] = data_a[pos_a + (loadc + lc) * p.stride_a + loadr + lr]; + if (ir * BM + loadc + lc < p.M && block + loadr + lr < p.K) { + buf_a[(loadc + lc) * (BK+1) + loadr + lr] = data_a[pos_a + (loadc + lc) * p.stride_a + loadr + lr]; + } else { + buf_a[(loadc + lc) * (BK+1) + loadr + lr] = 0.0f; + } } [[unroll]] for (int l = 0; l < BN * BK; l += loadstride) { const int lr = l % BK; const int lc = l / BK; - buf_b[(loadc + lc) * (BK+1) + loadr + lr] = data_b[pos_b + (loadc + lc) * p.stride_b + loadr + lr]; + if (ic * BN + loadc + lc < p.N && block + loadr + lr < p.K) { + buf_b[(loadc + lc) * (BK+1) + loadr + lr] = data_b[pos_b + (loadc + lc) * p.stride_b + loadr + lr]; + } else { + buf_b[(loadc + lc) * (BK+1) + loadr + lr] = 0.0f; + } } barrier(); @@ -93,7 +101,9 @@ void main() { [[unroll]] for (int cc = 0; cc < TN; cc++) { [[unroll]] for (int cr = 0; cr < TM; cr++) { - data_d[(dc + cc) * p.stride_d + dr + cr*rstride] = sums[cc * TM + cr]; + if (dr + cr*rstride < p.M && dc + cc < p.N) { + data_d[(dc + cc) * p.stride_d + dr + cr*rstride] = sums[cc * TM + cr]; + } } } }
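
For reference, a minimal sketch of how the per-queue API introduced above is meant to be used, mirroring the flow of ggml_vk_mul_mat_f32 and ggml_vk_test_matmul_f32: upload the two operands on the two dedicated transfer queues, wait for both, record and dispatch the compute work on the compute queue's command buffer, then read the result back on a transfer queue. The wrapper function name below is hypothetical and only illustrates the call sequence; error handling, pool reuse and the VK_CHK_KERNEL timing are omitted.

// Hypothetical illustration of the queue/buffer API added by this patch.
// Assumes ggml_vk_init() has run, so vk_compute_queue, vk_transfer_queues[]
// and vk_pipeline_matmul_f32 are valid. x, y and d point to host memory.
static void ggml_vk_example_matmul_f32(const float * x, const float * y, float * d, size_t m, size_t n, size_t k) {
    vk_buffer d_X, d_Y, d_D;
    ggml_vk_pool_malloc(sizeof(float) * m * k, &d_X, 0);
    ggml_vk_pool_malloc(sizeof(float) * k * n, &d_Y, 0);
    ggml_vk_pool_malloc(sizeof(float) * m * n, &d_D, 0);

    // Uploads go to the two dedicated transfer queues so they can overlap.
    ggml_vk_buffer_write(&d_X, 0, x, sizeof(float) * m * k, vk_transfer_queues[0]);
    ggml_vk_buffer_write(&d_Y, 0, y, sizeof(float) * k * n, vk_transfer_queues[1]);
    vk_transfer_queues[0].queue.waitIdle();
    vk_transfer_queues[1].queue.waitIdle();

    // Compute work is recorded into a command buffer from the compute queue's pool.
    vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_compute_queue);
    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
    ggml_vk_dispatch_pipeline(vk_pipeline_matmul_f32, {&d_X, &d_Y, &d_D},
                              { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m },
                              { (uint32_t)m, (uint32_t)n, 1 }, cmd_buffer, fence);
    vk::resultCheck(vk_device.waitForFences({ fence }, true, uint64_t(-1)), "example waitForFences");

    // Read back on a transfer queue; this waits internally when a staging buffer is needed.
    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * m * n, vk_transfer_queues[0]);

    vk_device.destroyFence(fence);
    ggml_vk_queue_cleanup(vk_compute_queue);
    ggml_vk_queue_cleanup(vk_transfer_queues[0]);
    ggml_vk_queue_cleanup(vk_transfer_queues[1]);
    ggml_vk_pool_free(d_X);
    ggml_vk_pool_free(d_Y);
    ggml_vk_pool_free(d_D);
}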