diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index ffc0583c5..7fd149314 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -146,13 +146,6 @@ struct vk_staging_memcpy {
 };
 
 struct ggml_vk_tensor_extra_gpu {
-    uint32_t batch_size;
-    uint32_t d_buf_idx;
-    uint32_t qx_buf_idx;
-    uint32_t qy_buf_idx;
-    uint32_t x_buf_idx;
-    uint32_t y_buf_idx;
-
     std::vector<vk_staging_memcpy> memcpys;
     std::vector<vk_sequence> in0_seqs;
     std::vector<vk_sequence> in1_seqs;
@@ -165,7 +158,8 @@ struct ggml_vk_tensor_extra_gpu {
 struct ggml_vk_garbage_collector {
     std::vector<vk_pipeline *> pipelines;
     std::vector<ggml_vk_tensor_extra_gpu *> extras;
-    std::vector<vk::Semaphore> semaphores;
+    std::vector<vk_semaphore> tl_semaphores;
+    std::vector<vk_semaphore> semaphores;
 };
 
 vk::Instance vk_instance;
@@ -186,6 +180,7 @@ vk_pipeline vk_pipeline_mul_f32;
 vk_pipeline vk_pipeline_add_f32, vk_pipeline_add_f16_f32_f16;
 vk_pipeline vk_pipeline_scale_f32;
 
+static size_t vk_semaphore_idx;
 static ggml_vk_garbage_collector vk_gc;
 static std::vector<std::pair<void*, size_t>> vk_pinned_memory;
 static size_t vk_prealloc_size_d, vk_prealloc_size_qx, vk_prealloc_size_qy, vk_prealloc_size_x, vk_prealloc_size_y;
@@ -430,12 +425,10 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk::Fence fence) {
         tl_signal_semaphores.push_back({});
         for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
             stage_flags[idx].push_back(q.stage_flags);
-            GGML_ASSERT(submission.wait_semaphores[i].value > 0);
             tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
             tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
         }
         for (size_t i = 0; i < submission.signal_semaphores.size(); i++) {
-            GGML_ASSERT(submission.signal_semaphores[i].value > 0);
             tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value);
             tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s);
         }
@@ -526,15 +519,27 @@ static vk_queue ggml_vk_create_queue(uint32_t queue_family_index, uint32_t queue_index) {
     return q;
 }
 
-static vk::Semaphore ggml_vk_create_semaphore() {
+static vk_semaphore * ggml_vk_create_binary_semaphore() {
 #ifdef VK_DEBUG
-    std::cerr << "ggml_vk_create_semaphore()" << std::endl;
+    std::cerr << "ggml_vk_create_binary_semaphore()" << std::endl;
 #endif
-    vk::SemaphoreTypeCreateInfo info{ vk::SemaphoreType::eTimeline, 0 };
+    vk::SemaphoreTypeCreateInfo info{ vk::SemaphoreType::eBinary, 0 };
     vk::Semaphore semaphore = vk_device.device.createSemaphore(vk::SemaphoreCreateInfo{ {}, &info });
-    vk_gc.semaphores.push_back(semaphore);
+    vk_gc.semaphores.push_back({ semaphore, 0 });
+    return &vk_gc.semaphores[vk_gc.semaphores.size() - 1];
+}
 
-    return semaphore;
+static vk_semaphore * ggml_vk_create_timeline_semaphore() {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
+#endif
+    if (vk_semaphore_idx >= vk_gc.tl_semaphores.size()) {
+        vk::SemaphoreTypeCreateInfo info{ vk::SemaphoreType::eTimeline, 0 };
+        vk::Semaphore semaphore = vk_device.device.createSemaphore(vk::SemaphoreCreateInfo{ {}, &info });
+        vk_gc.tl_semaphores.push_back({ semaphore, 0 });
+        return &vk_gc.tl_semaphores[vk_semaphore_idx++];
+    }
+    return &vk_gc.tl_semaphores[vk_semaphore_idx++];
 }
 
 static void ggml_vk_queue_cleanup(vk_queue& q) {
@@ -1765,10 +1770,11 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 const uint32_t x_offset = load_x ? x_sz * (i03 * ne02 + i02) : 0;
                 // copy data to device
-                vk::Semaphore s = ggml_vk_create_semaphore();
-                x_semaphores.push_back({ s, 1 });
+                vk_semaphore * sem = ggml_vk_create_timeline_semaphore();
+                x_semaphores.push_back({ sem->s, sem->value + 1 });
                 // Wait for previous matmul to be done before writing to the input buffers again
-                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { { s, 1 } }, nullptr, &extra->memcpys));
+                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
+                sem->value += 1;
             }
         }
     }
@@ -1782,23 +1788,25 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
             const uint32_t y_offset = y_sz * (i13 * ne12 + i12);
             const uint32_t d_offset = d_sz * (i13 * ne12 + i12);
 
-            vk::Semaphore s = ggml_vk_create_semaphore();
-            std::vector<vk_semaphore> semaphores = { { s, 1 } };
+            vk_semaphore * sem = ggml_vk_create_timeline_semaphore();
+            std::vector<vk_semaphore> semaphores = { { sem->s, sem->value + 1 } };
 
             if (load_x) {
                 semaphores.push_back(x_semaphores[i03 * ne02 + i02]);
             }
 
-            extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i13, i12, vk_device.transfer_queues[1], {}, { { s, 1 } }, nullptr, &extra->memcpys));
+            extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i13, i12, vk_device.transfer_queues[1], {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
 
             // compute
-            extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, vk_device.compute_queue, std::move(semaphores), { { s, 2 } }));
+            extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, vk_device.compute_queue, std::move(semaphores), { { sem->s, sem->value + 2 } }));
 
             if (dst->backend == GGML_BACKEND_CPU) {
                 // copy dst to host
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, vk_device.transfer_queues[1], { { s, 2 } }, {}));
+                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, vk_device.transfer_queues[1], { { sem->s, sem->value + 2 } }, {}));
             }
+
+            sem->value += 2;
         }
     }
 }
@@ -1919,37 +1927,35 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
             const uint32_t qx_offset = qx_sz * it_idx0;
             const uint32_t x_offset = x_sz * it_idx0;
 
-            // TODO: Find out why one semaphore doesn't work
-            vk::Semaphore sem = ggml_vk_create_semaphore();
-            vk::Semaphore sem2 = ggml_vk_create_semaphore();
+            vk_semaphore * sem = ggml_vk_create_timeline_semaphore();
 
             if (load_x) {
                 // copy data to device
-                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, tr0q, {}, { { sem2, 1 } }, nullptr, &extra->memcpys));
+                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, tr0q, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
             }
 
             if (qx_needs_dequant) {
-                sem = ggml_vk_create_semaphore();
-
                 vk_submission s = ggml_vk_begin_submission(compq);
                 const std::vector<int> pc = { (int)ne01, (int)ne10, (int)ne10, (int)ne10 };
                 ggml_vk_sync_buffers(s.buffer, { { *d_Qx, qx_offset, qx_sz } }, compq, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
                 ggml_vk_sync_buffers(s.buffer, { { *d_X, x_offset, x_sz } }, compq, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
                 ggml_vk_dispatch_pipeline(s, *to_fp16_vk_0, { { *d_Qx, qx_offset, qx_sz }, { *d_X, x_offset, x_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)x_ne, 1, 1});
 
                 if (load_x) {
-                    ggml_vk_end_submission(s, { { sem2, 1 } }, { { sem, 1 } });
+                    ggml_vk_end_submission(s, { { sem->s, sem->value + 1 } }, { { sem->s, sem->value + 2 } });
                 } else {
-                    ggml_vk_end_submission(s, {}, { { sem, 1 } });
+                    ggml_vk_end_submission(s, {}, { { sem->s, sem->value + 2 } });
                 }
 
                 extra->comp_seqs.push_back({ s });
 
-                x_semaphores.push_back({ sem, 1 });
+                x_semaphores.push_back({ sem->s, sem->value + 2 });
             } else if (load_x) {
-                x_semaphores.push_back({ sem2, 1 });
+                x_semaphores.push_back({ sem->s, sem->value + 1 });
            } else {
-                x_semaphores.push_back({ sem, 0 });
+                x_semaphores.push_back({ sem->s, sem->value });
             }
+
+            sem->value += 2;
         }
     }
@@ -1965,7 +1971,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
             const uint32_t y_offset = y_sz * it_idx1;
             const uint32_t d_offset = d_sz * it_idx1;
 
-            const vk::Semaphore sem = ggml_vk_create_semaphore();
+            vk_semaphore * sem = ggml_vk_create_timeline_semaphore();
 
             std::vector<vk_semaphore> mm_semaphores;
 
@@ -1974,19 +1980,21 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
             }
             if (load_y) {
                 // Set semaphore to 1
-                extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, tr1q, {}, { { sem, 1 }}, nullptr, &extra->memcpys));
+                extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, tr1q, {}, { { sem->s, sem->value + 1 }}, nullptr, &extra->memcpys));
                 // Wait for semaphore val 1
-                mm_semaphores.push_back({ sem, 1 });
+                mm_semaphores.push_back({ sem->s, sem->value + 1 });
             }
 
             // compute
-            extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, compq, std::move(mm_semaphores), { { sem, 2 } }));
+            extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, compq, std::move(mm_semaphores), { { sem->s, sem->value + 2 } }));
 
             if (dst->backend == GGML_BACKEND_CPU) {
                 // copy dst to host
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, tr1q, { { sem, 2 } }, {}));
+                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, tr1q, { { sem->s, sem->value + 2 } }, {}));
             }
+
+            sem->value += 2;
         }
     }
 }
@@ -2349,18 +2357,18 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, float scale) {
             const uint32_t y_offset = transfer_src1 ? y_sz * it_idx : 0;
             const uint32_t d_offset = d_sz * it_idx;
 
-            const vk::Semaphore sem = ggml_vk_create_semaphore();
-            vk::Semaphore sem_x;
+            vk_semaphore * sem = ggml_vk_create_timeline_semaphore();
+            vk_semaphore * sem_x;
             std::vector<vk_semaphore> transfer_semaphores;
             // copy src0 to device
             if (transfer_src0) {
-                sem_x = ggml_vk_create_semaphore();
-                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { { sem_x, 1} }, nullptr, &extra->memcpys));
-                transfer_semaphores.push_back({ sem_x, 1 });
+                sem_x = ggml_vk_create_binary_semaphore();
+                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { *sem_x }, nullptr, &extra->memcpys));
+                transfer_semaphores.push_back(*sem_x);
             }
             if (transfer_src1) {
-                extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i03, i02, vk_device.transfer_queues[1], {}, { { sem, 1 } }, nullptr, &extra->memcpys));
-                transfer_semaphores.push_back({ sem, 1 });
+                extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i03, i02, vk_device.transfer_queues[1], {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
+                transfer_semaphores.push_back({ sem->s, sem->value + 1 });
             }
 
             const int64_t i13 = use_src1 ? i03%ne13 : i03;
@@ -2376,14 +2384,16 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, float scale) {
                 ggml_vk_sync_buffers(s.buffer, { ggml_vk_subbuffer(*d_X) }, vk_device.compute_queue, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
                 ggml_vk_dispatch_pipeline(s, *pipeline, { { *d_X, x_offset, x_sz }, { *d_D, d_offset, d_sz } }, sizeof(vk_op_push_constants), &pc, { (uint32_t)ne00, (uint32_t)ne01, 1});
             }
-            ggml_vk_end_submission(s, std::move(transfer_semaphores), { { sem, 2 } });
+            ggml_vk_end_submission(s, std::move(transfer_semaphores), { { sem->s, sem->value + 2 } });
             extra->comp_seqs.push_back({ s });
 
             if (dst->backend == GGML_BACKEND_CPU) {
                 // copy dst to host
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * ne00 * ne01, vk_device.transfer_queues[1], { { sem, 2 } }, {}));
+                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * ne00 * ne01, vk_device.transfer_queues[1], { { sem->s, sem->value + 2 } }, {}));
             }
+
+            sem->value += 2;
         }
     }
 }
@@ -2600,9 +2610,6 @@ void ggml_vk_preallocate_buffers() {
 }
 
 void ggml_vk_build_graph(ggml_tensor * node){
-#ifdef VK_DEBUG
-    std::cerr << "ggml_vk_build_graph(" << node << ")" << std::endl;
-#endif
     const bool any_on_device = node->backend == GGML_BACKEND_GPU
         || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
@@ -2611,6 +2618,12 @@ void ggml_vk_build_graph(ggml_tensor * node){
         return;
     }
 
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_build_graph(" << node << ")" << std::endl;
+#endif
+
+    vk_semaphore_idx = 0;
+
     switch (node->op) {
     case GGML_OP_REPEAT:
         ggml_vk_repeat(node->src[0], node->src[1], node);
@@ -2688,6 +2701,10 @@ bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor){
         return true;
    }
 
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_compute_forward(" << params << ", " << tensor << ")" << std::endl;
+#endif
+
 #ifdef GGML_VULKAN_CHECK_RESULTS
     ggml_vk_check_results_0(params, tensor);
 #endif
@@ -2718,16 +2735,22 @@ void ggml_vk_graph_cleanup() {
     for (auto * pipeline : vk_gc.pipelines) {
         ggml_vk_pipeline_cleanup(*pipeline);
     }
+    vk_gc.pipelines.clear();
 
     ggml_vk_queue_cleanup(vk_device.compute_queue);
     ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
     ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
 
     for (size_t i = 0; i < vk_gc.semaphores.size(); i++) {
-        vk_device.device.destroySemaphore({ vk_gc.semaphores[i] });
+        vk_device.device.destroySemaphore({ vk_gc.semaphores[i].s });
     }
     vk_gc.semaphores.clear();
 
+    for (size_t i = 0; i < vk_gc.tl_semaphores.size(); i++) {
+        vk_device.device.destroySemaphore({ vk_gc.tl_semaphores[i].s });
+    }
+    vk_gc.tl_semaphores.clear();
+
     for (auto * extra : vk_gc.extras) {
         delete extra;
     }
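Note on the semaphore scheme above (reviewer sketch, not part of the patch): a timeline semaphore carries a monotonically increasing 64-bit counter, so a single pooled semaphore can order the whole submission chain for one tensor. The host-to-device copy signals `value + 1`, the matmul waits on `value + 1` and signals `value + 2`, the optional device-to-host read waits on `value + 2`, and the code then advances `sem->value += 2`. Because `ggml_vk_build_graph` resets `vk_semaphore_idx` to 0, the semaphores in `vk_gc.tl_semaphores` are reused across graph nodes instead of being created fresh for every submission, as the old binary-semaphore path had to do. The sketch below illustrates this wait/signal pairing with Vulkan-Hpp timeline semaphores in a standalone setting; `device`, `queue`, and the command buffers are assumed to come from an existing Vulkan 1.2+ setup with the `timelineSemaphore` feature enabled, and all names here are hypothetical rather than taken from the patch.

```cpp
#include <cstdint>
#include <vulkan/vulkan.hpp>

// Create a timeline semaphore starting at value 0, mirroring
// ggml_vk_create_timeline_semaphore() in the patch.
vk::Semaphore make_timeline_semaphore(vk::Device device) {
    vk::SemaphoreTypeCreateInfo type_info{ vk::SemaphoreType::eTimeline, 0 };
    return device.createSemaphore(vk::SemaphoreCreateInfo{ {}, &type_info });
}

// Chain two submissions through one timeline semaphore:
// `upload` signals base + 1, `compute` waits on base + 1 and signals base + 2.
void submit_chain(vk::Device device, vk::Queue queue,
                  vk::CommandBuffer upload, vk::CommandBuffer compute,
                  vk::Semaphore sem, uint64_t base) {
    const uint64_t copy_done   = base + 1;
    const uint64_t matmul_done = base + 2;

    // Submission 1: signal copy_done when the upload finishes.
    vk::TimelineSemaphoreSubmitInfo ts1{};
    ts1.signalSemaphoreValueCount = 1;
    ts1.pSignalSemaphoreValues    = &copy_done;
    vk::SubmitInfo s1{};
    s1.pNext                = &ts1;
    s1.commandBufferCount   = 1;
    s1.pCommandBuffers      = &upload;
    s1.signalSemaphoreCount = 1;
    s1.pSignalSemaphores    = &sem;
    queue.submit({ s1 }, nullptr);

    // Submission 2: wait for copy_done, then signal matmul_done.
    vk::PipelineStageFlags wait_stage = vk::PipelineStageFlagBits::eAllCommands;
    vk::TimelineSemaphoreSubmitInfo ts2{};
    ts2.waitSemaphoreValueCount   = 1;
    ts2.pWaitSemaphoreValues      = &copy_done;
    ts2.signalSemaphoreValueCount = 1;
    ts2.pSignalSemaphoreValues    = &matmul_done;
    vk::SubmitInfo s2{};
    s2.pNext                = &ts2;
    s2.waitSemaphoreCount   = 1;
    s2.pWaitSemaphores      = &sem;
    s2.pWaitDstStageMask    = &wait_stage;
    s2.commandBufferCount   = 1;
    s2.pCommandBuffers      = &compute;
    s2.signalSemaphoreCount = 1;
    s2.pSignalSemaphores    = &sem;
    queue.submit({ s2 }, nullptr);

    // The host can block until the whole chain has executed.
    vk::SemaphoreWaitInfo wait_info{};
    wait_info.semaphoreCount = 1;
    wait_info.pSemaphores    = &sem;
    wait_info.pValues        = &matmul_done;
    (void)device.waitSemaphores(wait_info, UINT64_MAX);
}
```

The `vk_semaphore` struct in the patch pairs the handle with its last scheduled counter value, which is exactly what the `{ sem->s, sem->value + N }` literals construct; in the hunks above, binary semaphores remain only for the one-shot src0 transfer in `ggml_vk_op_f32`.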