From e9e2be33fdca831a95825ac8dad7180dd5ccdbf0 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Mon, 18 Dec 2023 19:25:46 +0100
Subject: [PATCH] Use single queue per device to simplify code

---
 ggml-vulkan.cpp | 137 +++++++++++++++++++-----------------------------
 1 file changed, 55 insertions(+), 82 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index ecc2beb47..c2699e31c 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -37,8 +37,6 @@
 
 #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
 
-#define VK_TRANSFER_QUEUE_COUNT 2
-
 #define VK_VENDOR_ID_AMD 0x1002
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de
@@ -129,7 +127,7 @@ struct vk_device {
     vk::Device device;
     uint32_t vendor_id;
     vk_queue compute_queue;
-    vk_queue transfer_queues[VK_TRANSFER_QUEUE_COUNT];
+    vk_queue transfer_queue;
     uint32_t descriptor_set_mode;
 };
 
@@ -175,8 +173,7 @@ struct vk_staging_memcpy {
 struct ggml_vk_tensor_extra_gpu {
     bool ready;
 
     std::vector<vk_staging_memcpy> memcpys;
-    std::vector<vk_sequence> in0_seqs;
-    std::vector<vk_sequence> in1_seqs;
+    std::vector<vk_sequence> in_seqs;
     std::vector<vk_sequence> comp_seqs;
     std::vector<vk_sequence> out_seqs;
@@ -972,29 +969,17 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
 
     // Try to find a non-graphics compute queue and transfer-focused queues
     const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
-    const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 2);
-
-    uint32_t transfer_queue_count = VK_TRANSFER_QUEUE_COUNT;
-
-    // If not enough transfer queues are available
-    if (transfer_queue_count > queue_family_props[transfer_queue_family_index].queueCount) {
-        // If compute and transfer queues are same family
-        if (compute_queue_family_index == transfer_queue_family_index) {
-            transfer_queue_count = queue_family_props[transfer_queue_family_index].queueCount - 1;
-        } else {
-            transfer_queue_count = queue_family_props[transfer_queue_family_index].queueCount;
-        }
-    }
+    const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);
 
     const float compute_queue_priority = 1.0f;
-    const float transfer_queue_priority[] = { 1.0f, 1.0f, 1.0f };
+    const float transfer_queue_priority = 1.0f;
     std::vector<vk::DeviceQueueCreateInfo> device_queue_create_infos;
     if (compute_queue_family_index != transfer_queue_family_index) {
         device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, &compute_queue_priority});
-        GGML_ASSERT(transfer_queue_count > 0);
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, transfer_queue_count, transfer_queue_priority});
+        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, &transfer_queue_priority});
     } else {
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1 + transfer_queue_count, transfer_queue_priority});
+        const float priorities[] = { compute_queue_priority, transfer_queue_priority };
+        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities});
     }
     vk::DeviceCreateInfo device_create_info;
     std::vector<const char *> device_extensions;
@@ -1051,16 +1036,10 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
     ggml_vk_load_shaders();
 
     // Queues
-    uint32_t queue_index_offset = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
+    const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
 
     vk_device.compute_queue = ggml_vk_create_queue(compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer });
-    for (int i = 0; i < VK_TRANSFER_QUEUE_COUNT; i++) {
-        if (transfer_queue_count > 0) {
-            vk_device.transfer_queues[i] = ggml_vk_create_queue(transfer_queue_family_index, (queue_index_offset + i) % transfer_queue_count, { vk::PipelineStageFlagBits::eTransfer });
-        } else {
-            vk_device.transfer_queues[i] = vk_device.compute_queue;
-        }
-    }
+    vk_device.transfer_queue = ggml_vk_create_queue(transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer });
 
     vk_fence = vk_device.device.createFence({});
 
@@ -1988,8 +1967,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
     const int64_t r3 = ne13 / ne03;
 
     vk_queue& compq = vk_device.compute_queue;
-    vk_queue& tr0q = vk_device.transfer_queues[0];
-    vk_queue& tr1q = vk_device.transfer_queues[1];
+    vk_queue& trq = vk_device.transfer_queue;
 
     const bool load_x = src0->backend != GGML_BACKEND_GPU;
     const bool load_y = src1->backend != GGML_BACKEND_GPU;
@@ -2128,7 +2106,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
 
             if (load_x) {
                 // copy data to device
-                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, ne01, tr0q, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
+                extra->in_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, ne01, trq, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
             }
 
             if (qx_needs_dequant) {
@@ -2190,7 +2168,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
             if (y_non_contig) {
                 mm_semaphores.push_back(y_semaphore);
             } else if (load_y) {
-                extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, ne11, tr1q, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
+                extra->in_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, ne11, trq, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
                 mm_semaphores.push_back({ sem->s, sem->value + 1 });
             }
 
@@ -2200,7 +2178,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
             if (dst->backend == GGML_BACKEND_CPU) {
                 // copy dst to host
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, tr1q, { { sem->s, sem->value + 2 } }, {}));
+                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, trq, { { sem->s, sem->value + 2 } }, {}));
             }
 
             sem->value += 2;
@@ -2242,7 +2220,7 @@ static void ggml_vk_mul_mat_vec_q_f16(const ggml_tensor * src0, const ggml_tenso
     const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
 
     vk_queue& compq = vk_device.compute_queue;
-    vk_queue& tr0q = vk_device.transfer_queues[0];
+    vk_queue& trq = vk_device.transfer_queue;
 
     const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
     const bool qx_needs_dequant = x_non_contig;
@@ -2340,7 +2318,7 @@ static void ggml_vk_mul_mat_vec_q_f16(const ggml_tensor * src0, const ggml_tenso
         vk_semaphore * sem = ggml_vk_create_timeline_semaphore();
 
         // copy data to device
-        extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, 0, src0, 0, 0, ggml_nrows(src0), tr0q, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
+        extra->in_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, 0, src0, 0, 0, ggml_nrows(src0), trq, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
         semaphores.push_back({ sem->s, sem->value + 1 });
 
         sem->value += 1;
@@ -2691,7 +2669,7 @@ static void ggml_vk_op_repeat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
-    vk_submission s = ggml_vk_begin_submission(vk_device.transfer_queues[0]);
+    vk_submission s = ggml_vk_begin_submission(vk_device.transfer_queue);
     vkCmdCopyBuffer(s.buffer, src_buf->buffer, dst_buf->buffer, copies.size(), copies.data());
     ggml_vk_end_submission(s, {}, {});
     extra->out_seqs.push_back({ s });
@@ -2910,13 +2888,13 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
     // copy src0 to device
     if (transfer_src0) {
         vk_semaphore * sem_x = ggml_vk_create_timeline_semaphore();
-        extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, 0, src0, 0, 0, ggml_nrows(src0), vk_device.transfer_queues[0], {}, { { sem_x->s, sem_x->value + 1 } }, nullptr, &extra->memcpys));
+        extra->in_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, 0, src0, 0, 0, ggml_nrows(src0), vk_device.transfer_queue, {}, { { sem_x->s, sem_x->value + 1 } }, nullptr, &extra->memcpys));
        transfer_semaphores.push_back({ sem_x->s, sem_x->value + 1});
         sem_x->value += 1;
     }
     if (transfer_src1) {
         vk_semaphore * sem_y = ggml_vk_create_timeline_semaphore();
-        extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, 0, src1, 0, 0, ggml_nrows(src1), vk_device.transfer_queues[1], {}, { { sem_y->s, sem_y->value + 1 } }, nullptr, &extra->memcpys));
+        extra->in_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, 0, src1, 0, 0, ggml_nrows(src1), vk_device.transfer_queue, {}, { { sem_y->s, sem_y->value + 1 } }, nullptr, &extra->memcpys));
         transfer_semaphores.push_back({ sem_y->s, sem_y->value + 1 });
         sem_y->value += 1;
     }
@@ -2967,7 +2945,7 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
         extra->comp_seqs.push_back({ s });
 
         // copy dst to host
-        extra->out_seqs.push_back(ggml_vk_d2h_tensor_2d(d_D, 0, dst, vk_device.transfer_queues[1], { *fsem }, {}));
+        extra->out_seqs.push_back(ggml_vk_d2h_tensor_2d(d_D, 0, dst, vk_device.transfer_queue, { *fsem }, {}));
     } else if(dst->backend == GGML_BACKEND_CPU) {
         vk_semaphore * fsem = ggml_vk_create_binary_semaphore();
         ggml_vk_end_submission(s, std::move(transfer_semaphores), { *fsem });
@@ -2975,7 +2953,7 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
 
         // copy dst to host
         float * d = (float *) dst->data;
-        extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, 0, d, d_sz, vk_device.transfer_queues[1], { *fsem }, {}));
+        extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, 0, d, d_sz, vk_device.transfer_queue, { *fsem }, {}));
     } else {
         ggml_vk_end_submission(s, std::move(transfer_semaphores), {});
         extra->comp_seqs.push_back({ s });
@@ -3025,7 +3003,7 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
                 extra->comp_seqs.push_back({ s });
 
                 // copy dst to host
-                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz, vk_device.transfer_queues[1], { *fsem }, {}));
+                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz, vk_device.transfer_queue, { *fsem }, {}));
             } else {
                 ggml_vk_end_submission(s, std::move(transfer_semaphores), {});
                 extra->comp_seqs.push_back({ s });
@@ -3127,7 +3105,7 @@ static void ggml_vk_nop(const ggml_tensor * src0, ggml_tensor * dst) {
         ggml_vk_tensor_extra_gpu * extra = (ggml_vk_tensor_extra_gpu *) dst->extra;
         ggml_vk_tensor_extra_gpu * extra_src0 = (ggml_vk_tensor_extra_gpu *) src0->extra;
         vk_buffer * d_D = extra_src0->buffer_gpu;
-        extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, 0, dst->data, d_D->size, vk_device.transfer_queues[1], {}, {}));
+        extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, 0, dst->data, d_D->size, vk_device.transfer_queue, {}, {}));
     }
 }
 
@@ -3148,7 +3126,7 @@ static void ggml_vk_transform_tensor(void * data, ggml_tensor * tensor, bool buf
         extra->buffer_gpu = new vk_buffer;
         *extra->buffer_gpu = ggml_vk_pool_malloc(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
 
-    ggml_vk_buffer_write(extra->buffer_gpu, 0, data, size, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_write(extra->buffer_gpu, 0, data, size, vk_device.transfer_queue);
 
     extra->buffer_static = buffer_static;
 
@@ -3178,7 +3156,7 @@ void ggml_vk_assign_buffer(ggml_tensor * tensor) {
     extra->buffer_gpu = new vk_buffer;
     *extra->buffer_gpu = ggml_vk_create_buffer(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
 
-    ggml_vk_buffer_memset(extra->buffer_gpu, 0, 0, VK_WHOLE_SIZE, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_memset(extra->buffer_gpu, 0, 0, VK_WHOLE_SIZE, vk_device.transfer_queue);
 
     extra->buffer_static = true;
 }
@@ -3763,13 +3741,12 @@ bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor)
     for (auto& cpy : extra->memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
     }
-    ggml_vk_submit(vk_device.transfer_queues[0], extra->in0_seqs, VK_NULL_HANDLE);
-    ggml_vk_submit(vk_device.transfer_queues[1], extra->in1_seqs, VK_NULL_HANDLE);
+    ggml_vk_submit(vk_device.transfer_queue, extra->in_seqs, VK_NULL_HANDLE);
     if (extra->out_seqs.empty()) {
         ggml_vk_submit(vk_device.compute_queue, extra->comp_seqs, vk_fence);
     } else {
         ggml_vk_submit(vk_device.compute_queue, extra->comp_seqs, VK_NULL_HANDLE);
-        ggml_vk_submit(vk_device.transfer_queues[1], extra->out_seqs, vk_fence);
+        ggml_vk_submit(vk_device.transfer_queue, extra->out_seqs, vk_fence);
     }
 
     VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, uint64_t(-1)), "ggml_vk_compute_forward waitForFences");
@@ -3796,8 +3773,7 @@ void ggml_vk_graph_cleanup() {
     vk_gc.pipelines.clear();
 
     ggml_vk_queue_cleanup(vk_device.compute_queue);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
+    ggml_vk_queue_cleanup(vk_device.transfer_queue);
 
     for (size_t i = 0; i < vk_gc.semaphores.size(); i++) {
         vk_device.device.destroySemaphore({ vk_gc.semaphores[i].s });
@@ -3885,7 +3861,7 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
         const size_t tensor_size = ggml_vk_tensor_size(tensor);
         tensor_data = malloc(tensor_size);
 
-        ggml_vk_buffer_read((vk_buffer *)tensor->data, 0, tensor_data, tensor_size, vk_device.transfer_queues[0]);
+        ggml_vk_buffer_read((vk_buffer *)tensor->data, 0, tensor_data, tensor_size, vk_device.transfer_queue);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -3997,7 +3973,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
             for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                 for (int i2 = 0; i2 < src0->ne[2]; i2++) {
                     const int idx = i3*src0->ne[2] + i2;
-                    ggml_vk_buffer_read(extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1], vk_device.transfer_queues[0]);
+                    ggml_vk_buffer_read(extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1], vk_device.transfer_queue);
                 }
             }
 
@@ -4010,7 +3986,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
             if (offset + src0_size >= extra->buffer_gpu->size) {
                 src0_size = extra->buffer_gpu->size - offset;
             }
-            ggml_vk_buffer_read(extra->buffer_gpu, offset, src0_clone->data, src0_size, vk_device.transfer_queues[0]);
+            ggml_vk_buffer_read(extra->buffer_gpu, offset, src0_clone->data, src0_size, vk_device.transfer_queue);
             memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
         }
     } else {
@@ -4040,7 +4016,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
             for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                 for (int i2 = 0; i2 < src1->ne[2]; i2++) {
                     const int idx = i3*src1->ne[2] + i2;
-                    ggml_vk_buffer_read(extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1], vk_device.transfer_queues[0]);
+                    ggml_vk_buffer_read(extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1], vk_device.transfer_queue);
                 }
             }
 
@@ -4053,7 +4029,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
             if (offset + src1_size >= extra->buffer_gpu->size) {
                 src1_size = extra->buffer_gpu->size - offset;
             }
-            ggml_vk_buffer_read(extra->buffer_gpu, offset, src1_clone->data, src1_size, vk_device.transfer_queues[0]);
+            ggml_vk_buffer_read(extra->buffer_gpu, offset, src1_clone->data, src1_size, vk_device.transfer_queue);
             memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
         }
     } else {
@@ -4209,7 +4185,7 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor)
             tensor_size = extra->buffer_gpu->size - extra->offset;
         }
 
-        ggml_vk_buffer_read(extra->buffer_gpu, extra->offset, tensor_data, tensor_size, vk_device.transfer_queues[0]);
+        ggml_vk_buffer_read(extra->buffer_gpu, extra->offset, tensor_data, tensor_size, vk_device.transfer_queue);
     }
 
     float first_error_result = -1.0f;
@@ -4356,9 +4332,9 @@ void ggml_vk_test_transfer(size_t ne) {
 
     auto begin = std::chrono::high_resolution_clock::now();
 
-    ggml_vk_buffer_write(&buffer, 0, x, sizeof(float) * ne, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_write(&buffer, 0, x, sizeof(float) * ne, vk_device.transfer_queue);
 
-    vk_device.transfer_queues[0].queue.waitIdle();
+    vk_device.transfer_queue.queue.waitIdle();
 
     auto end = std::chrono::high_resolution_clock::now();
 
@@ -4366,7 +4342,7 @@ void ggml_vk_test_transfer(size_t ne) {
 
     begin = std::chrono::high_resolution_clock::now();
 
-    ggml_vk_buffer_read(&buffer, 0, y, sizeof(float) * ne, vk_device.transfer_queues[1]);
+    ggml_vk_buffer_read(&buffer, 0, y, sizeof(float) * ne, vk_device.transfer_queue);
 
     end = std::chrono::high_resolution_clock::now();
 
@@ -4483,13 +4459,13 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
         y[i] = 1.0f;  // (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
     }
 
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(float) * k, sizeof(float) * k, m, vk_device.transfer_queues[0], {}, {}));
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(float) * k, sizeof(float) * k, m, vk_device.transfer_queue, {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queue, {}, {}));
 
-    ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);
+    ggml_vk_submit(vk_device.transfer_queue, seq, VK_NULL_HANDLE);
 
     // Wait for transfers to finish
-    vk_device.transfer_queues[0].queue.waitIdle();
+    vk_device.transfer_queue.queue.waitIdle();
 
     auto begin = std::chrono::high_resolution_clock::now();
 
@@ -4504,7 +4480,7 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
     auto end = std::chrono::high_resolution_clock::now();
 
     // copy dst to host
-    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queue);
 
     float * d_chk = (float *) malloc(sizeof(float) * d_ne);
 
@@ -4538,7 +4514,7 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
 
     if (split_k > 1) {
         float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
-        ggml_vk_buffer_read(&vk_prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k, vk_device.transfer_queues[0]);
+        ggml_vk_buffer_read(&vk_prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k, vk_device.transfer_queue);
 
        std::cerr << "d_buf0: " << std::endl << std::endl;
         ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, n, m, 5, 5);
@@ -4558,8 +4534,7 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
 
     free(d_chk);
 
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
+    ggml_vk_queue_cleanup(vk_device.transfer_queue);
     ggml_vk_queue_cleanup(vk_device.compute_queue);
 
     ggml_vk_destroy_buffer(d_X);
@@ -4645,13 +4620,13 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int sp
         y[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX);
     }
 
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queues[0], {}, {}));
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, n, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queue, {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, n, vk_device.transfer_queue, {}, {}));
 
-    ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);
+    ggml_vk_submit(vk_device.transfer_queue, seq, VK_NULL_HANDLE);
 
     // Wait for transfers to finish
-    vk_device.transfer_queues[0].queue.waitIdle();
+    vk_device.transfer_queue.queue.waitIdle();
 
     auto begin = std::chrono::high_resolution_clock::now();
 
@@ -4666,7 +4641,7 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int sp
     auto end = std::chrono::high_resolution_clock::now();
 
     // copy dst to host
-    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queue);
 
     float * fx = (float *) malloc(sizeof(float) * x_ne);
     float * fy = (float *) malloc(sizeof(float) * y_ne);
@@ -4695,8 +4670,7 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int sp
     free(fy);
     free(d_chk);
 
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
+    ggml_vk_queue_cleanup(vk_device.transfer_queue);
     ggml_vk_queue_cleanup(vk_device.compute_queue);
 
     ggml_vk_destroy_buffer(d_X);
@@ -4783,13 +4757,13 @@ void ggml_vk_test_matmul_f16_f32(size_t m, size_t n, size_t k, size_t num_it, in
         y[i] = rand() / (float)RAND_MAX;
     }
 
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queues[0], {}, {}));
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queue, {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queue, {}, {}));
 
-    ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);
+    ggml_vk_submit(vk_device.transfer_queue, seq, VK_NULL_HANDLE);
 
     // Wait for transfers to finish
-    vk_device.transfer_queues[0].queue.waitIdle();
+    vk_device.transfer_queue.queue.waitIdle();
 
    auto begin = std::chrono::high_resolution_clock::now();
 
@@ -4804,7 +4778,7 @@ void ggml_vk_test_matmul_f16_f32(size_t m, size_t n, size_t k, size_t num_it, in
     auto end = std::chrono::high_resolution_clock::now();
 
     // copy dst to host
-    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queue);
 
     float * fx = (float *) malloc(sizeof(float) * x_ne);
     float * d_chk = (float *) malloc(sizeof(float) * d_ne);
@@ -4830,8 +4804,7 @@ void ggml_vk_test_matmul_f16_f32(size_t m, size_t n, size_t k, size_t num_it, in
     free(fx);
    free(d_chk);
 
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
+    ggml_vk_queue_cleanup(vk_device.transfer_queue);
     ggml_vk_queue_cleanup(vk_device.compute_queue);
 
     ggml_vk_destroy_buffer(d_X);
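
Reviewer note on the device-setup hunks (around old lines 972 and 1051): the patch collapses the transfer-queue array into a single queue and, when compute and transfer land in the same queue family, requests both queues through one VkDeviceQueueCreateInfo, because Vulkan forbids listing the same family twice in pQueueCreateInfos. Below is a minimal standalone sketch of that scheme against Vulkan-hpp. It is not code from ggml-vulkan.cpp: the helper name pick_family and its fallback behavior are illustrative assumptions, and the same-family branch assumes the family advertises queueCount >= 2.

// Minimal sketch of the single-compute-queue / single-transfer-queue setup,
// assuming Vulkan-hpp and at least one Vulkan-capable device.
#include <vulkan/vulkan.hpp>
#include <cstdint>
#include <vector>

// Prefer a family that has `required` but none of `avoid`; fall back to any
// family with `required`. Illustrative stand-in for the repo's
// ggml_vk_find_queue_family_index, not its exact semantics.
static uint32_t pick_family(const std::vector<vk::QueueFamilyProperties>& props,
                            vk::QueueFlags required, vk::QueueFlags avoid) {
    for (uint32_t i = 0; i < props.size(); i++) {
        if ((props[i].queueFlags & required) && !(props[i].queueFlags & avoid)) {
            return i;
        }
    }
    for (uint32_t i = 0; i < props.size(); i++) {
        if (props[i].queueFlags & required) {
            return i;
        }
    }
    return 0;  // family 0 always exists
}

int main() {
    vk::Instance instance = vk::createInstance({});
    vk::PhysicalDevice physdev = instance.enumeratePhysicalDevices()[0];
    std::vector<vk::QueueFamilyProperties> props = physdev.getQueueFamilyProperties();

    const uint32_t compute_family  = pick_family(props, vk::QueueFlagBits::eCompute,
                                                 vk::QueueFlagBits::eGraphics);
    const uint32_t transfer_family = pick_family(props, vk::QueueFlagBits::eTransfer,
                                                 vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics);

    // The same family may not appear twice in pQueueCreateInfos, so the
    // same-family case must request two queues from one create info
    // (assumption: the family exposes queueCount >= 2).
    const float priorities[] = { 1.0f, 1.0f };
    std::vector<vk::DeviceQueueCreateInfo> queue_infos;
    if (compute_family != transfer_family) {
        queue_infos.push_back({{}, compute_family, 1, priorities});
        queue_infos.push_back({{}, transfer_family, 1, priorities});
    } else {
        queue_infos.push_back({{}, compute_family, 2, priorities});
    }

    vk::DeviceCreateInfo device_info;
    device_info.setQueueCreateInfos(queue_infos);
    vk::Device device = physdev.createDevice(device_info);

    // Queue index 1 is only valid in the shared-family case, mirroring the
    // patch's `compute_queue_family_index == transfer_queue_family_index ? 1 : 0`.
    vk::Queue compute_queue  = device.getQueue(compute_family, 0);
    vk::Queue transfer_queue = device.getQueue(transfer_family,
                                               compute_family == transfer_family ? 1 : 0);

    (void) compute_queue; (void) transfer_queue;
    device.destroy();
    instance.destroy();
    return 0;
}

On a device whose sole compute-capable family exposes only one queue, getQueue(family, 1) would be invalid; a production version would need to fall back to sharing queue index 0 for both roles.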
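The ggml_vk_compute_forward hunk (around old line 3763) also pins down the ordering every graph node now follows: upload submissions on the transfer queue, compute on the compute queue, downloads back on the same transfer queue, with the fence attached to whichever submission comes last. Here is a hedged sketch of that three-submit chain using plain binary semaphores and a fence; the queue and command-buffer parameters are placeholders, and ggml's own vk_sequence/ggml_vk_submit/timeline-semaphore plumbing is deliberately not reproduced.

#include <vulkan/vulkan.hpp>
#include <cstdint>

// Chain upload -> compute -> download across two queues, then block on a
// fence, mirroring the submit order in ggml_vk_compute_forward. Assumes the
// three command buffers are already recorded.
void submit_chain(vk::Device device, vk::Queue transfer_queue, vk::Queue compute_queue,
                  vk::CommandBuffer upload_cb, vk::CommandBuffer compute_cb,
                  vk::CommandBuffer download_cb) {
    vk::Semaphore uploaded = device.createSemaphore({});
    vk::Semaphore computed = device.createSemaphore({});
    vk::Fence     done     = device.createFence({});

    // 1. Host-to-device copies; signal `uploaded` when complete.
    vk::SubmitInfo upload;
    upload.setCommandBuffers(upload_cb);
    upload.setSignalSemaphores(uploaded);
    transfer_queue.submit(upload);

    // 2. Compute waits for the upload before its shader stage runs.
    const vk::PipelineStageFlags compute_stage = vk::PipelineStageFlagBits::eComputeShader;
    vk::SubmitInfo compute;
    compute.setWaitSemaphores(uploaded);
    compute.setWaitDstStageMask(compute_stage);
    compute.setCommandBuffers(compute_cb);
    compute.setSignalSemaphores(computed);
    compute_queue.submit(compute);

    // 3. Device-to-host copies wait for compute; the fence marks the end.
    const vk::PipelineStageFlags transfer_stage = vk::PipelineStageFlagBits::eTransfer;
    vk::SubmitInfo download;
    download.setWaitSemaphores(computed);
    download.setWaitDstStageMask(transfer_stage);
    download.setCommandBuffers(download_cb);
    transfer_queue.submit(download, done);

    (void) device.waitForFences(done, true, UINT64_MAX);
    device.destroyFence(done);
    device.destroySemaphore(computed);
    device.destroySemaphore(uploaded);
}

With only one transfer queue, the upload and download submissions are additionally ordered against each other by queue submission order, which is part of what lets this patch drop the second transfer queue and the in0/in1 sequence split without changing results.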