Reduce number of semaphores used by utilizing timeline semaphore values properly

0cc4m 2023-10-31 14:24:37 +01:00
parent 1cb90e57e4
commit 2c7fa8de5a
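Context for the change: the backend already submitted through vk::TimelineSemaphoreSubmitInfo, but every wait and signal value was hard-coded to 1, so a fresh semaphore was created for each dependency edge between upload, compute, and readback. This commit pairs each semaphore with an explicit 64-bit timeline value, letting one timeline semaphore carry a whole chain. A minimal sketch of creating such a semaphore with Vulkan-Hpp (the helper name is hypothetical; ggml_vk_create_semaphore plays this role in the patch):

    #include <vulkan/vulkan.hpp>

    // Create a timeline semaphore whose counter starts at 0; every signal
    // operation must then move the counter to a strictly greater value.
    static vk::Semaphore create_timeline_semaphore(vk::Device device) {
        vk::SemaphoreTypeCreateInfo type_info{ vk::SemaphoreType::eTimeline, 0 };
        vk::SemaphoreCreateInfo create_info{};
        create_info.pNext = &type_info; // chain the timeline type into the create info
        return device.createSemaphore(create_info);
    }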


@@ -27,8 +27,8 @@
#include <limits>
#include <tuple>
#include <vector>
- #include <mutex>
#include <sstream>
+ #include <utility>
#include "ggml.h"
@@ -91,21 +91,6 @@ struct vk_pipeline {
};
struct vk_queue {
- vk_queue() {}
- vk_queue(const vk_queue& b) : queue_family_index(b.queue_family_index), queue(b.queue), pool(b.pool), cmd_buffer_idx(b.cmd_buffer_idx), cmd_buffers(b.cmd_buffers), stage_flags(b.stage_flags) {}
- vk_queue& operator=(const vk_queue& b) {
- if (this != &b) {
- queue_family_index = b.queue_family_index;
- queue = b.queue;
- pool = b.pool;
- cmd_buffer_idx = b.cmd_buffer_idx;
- cmd_buffers = b.cmd_buffers;
- stage_flags = b.stage_flags;
- }
- return *this;
- }
uint32_t queue_family_index;
vk::Queue queue;
vk::CommandPool pool;
@@ -113,14 +98,17 @@ struct vk_queue {
std::vector<vk::CommandBuffer> cmd_buffers;
vk::PipelineStageFlags stage_flags;
- std::mutex mutex;
};
+ struct vk_semaphore {
+ vk::Semaphore s;
+ uint64_t value;
+ };
struct vk_submission {
vk::CommandBuffer buffer;
- std::vector<vk::Semaphore> wait_semaphores;
- std::vector<vk::Semaphore> signal_semaphores;
+ std::vector<vk_semaphore> wait_semaphores;
+ std::vector<vk_semaphore> signal_semaphores;
};
typedef std::vector<vk_submission> vk_sequence;
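The new vk_semaphore pair reads as "this timeline semaphore, at this counter value", so one vk::Semaphore can appear at several points of a dependency chain with increasing values. A minimal sketch of the intended semantics (the wrapper function and variable names are illustrative, not part of the patch):

    // One timeline semaphore carries a whole upload -> compute -> readback
    // chain; each stage waits on the previous value and signals the next.
    void example_chain(vk::Semaphore sem) {
        vk_semaphore upload_done  = { sem, 1 };  // upload submission signals value 1
        vk_semaphore compute_done = { sem, 2 };  // compute waits on 1, signals 2
        (void) upload_done; (void) compute_done; // a readback would wait on { sem, 2 }
    }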
@@ -382,7 +370,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) {
return buf;
}
- static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
+ static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_create_submission()" << std::endl;
#endif
@@ -393,7 +381,7 @@ static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk::Sema
return s;
}
- static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
+ static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_create_sequence_1()" << std::endl;
#endif
@@ -410,6 +398,8 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
std::vector<std::vector<uint64_t>> tl_wait_vals;
std::vector<std::vector<uint64_t>> tl_signal_vals;
+ std::vector<std::vector<vk::Semaphore>> tl_wait_semaphores;
+ std::vector<std::vector<vk::Semaphore>> tl_signal_semaphores;
std::vector<vk::TimelineSemaphoreSubmitInfo> tl_submit_infos;
std::vector<vk::SubmitInfo> submit_infos;
int idx = -1;
@@ -422,7 +412,9 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
}
// Pre-reserve vectors to prevent reallocation, which invalidates pointers
+ tl_wait_semaphores.reserve(reserve);
tl_wait_vals.reserve(reserve);
+ tl_signal_semaphores.reserve(reserve);
tl_signal_vals.reserve(reserve);
tl_submit_infos.reserve(reserve);
submit_infos.reserve(reserve);
@@ -432,13 +424,21 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
for (const auto& submission : sequence) {
stage_flags.push_back({});
idx++;
+ tl_wait_vals.push_back({});
+ tl_wait_semaphores.push_back({});
+ tl_signal_vals.push_back({});
+ tl_signal_semaphores.push_back({});
for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
stage_flags[idx].push_back(q.stage_flags);
+ GGML_ASSERT(submission.wait_semaphores[i].value > 0);
+ tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
+ tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
}
+ for (size_t i = 0; i < submission.signal_semaphores.size(); i++) {
+ GGML_ASSERT(submission.signal_semaphores[i].value > 0);
+ tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value);
+ tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s);
+ }
- tl_wait_vals.push_back({});
- tl_wait_vals[idx].assign(submission.wait_semaphores.size(), 1);
- tl_signal_vals.push_back({});
- tl_signal_vals[idx].assign(submission.signal_semaphores.size(), 1);
tl_submit_infos.push_back({
(uint32_t) submission.wait_semaphores.size(),
tl_wait_vals[idx].data(),
@@ -449,19 +449,17 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
tl_submit_infos[idx].pNext = nullptr;
submit_infos.push_back({
(uint32_t) submission.wait_semaphores.size(),
- submission.wait_semaphores.data(),
+ tl_wait_semaphores[idx].data(),
stage_flags[idx].data(),
1,
&submission.buffer,
(uint32_t) submission.signal_semaphores.size(),
- submission.signal_semaphores.data(),
+ tl_signal_semaphores[idx].data(),
&tl_submit_infos[idx],
});
}
}
- std::lock_guard<std::mutex> guard(q.mutex);
q.queue.submit(submit_infos, fence);
sequences.clear();
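For a single submission with one wait and one signal on the same timeline semaphore, the structures built by the loop above reduce to the following (a sketch under the same Vulkan-Hpp conventions; the function and its parameters are assumptions, not code from the patch):

    // Submit one command buffer that waits for sem to reach value 1 and
    // signals it to value 2.
    void submit_chained(vk::Queue queue, vk::CommandBuffer cmd_buf, vk::Semaphore sem) {
        uint64_t wait_value = 1, signal_value = 2;
        vk::PipelineStageFlags wait_stage = vk::PipelineStageFlagBits::eAllCommands;
        // Timeline values travel in a pNext extension struct, parallel to the
        // semaphore arrays of the plain vk::SubmitInfo.
        vk::TimelineSemaphoreSubmitInfo tl_info{ 1, &wait_value, 1, &signal_value };
        vk::SubmitInfo submit_info{ 1, &sem, &wait_stage, 1, &cmd_buf, 1, &sem, &tl_info };
        queue.submit({ submit_info });
    }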
@@ -1295,14 +1293,14 @@ static void ggml_vk_dispatch_pipeline(vk_submission& s, vk_pipeline& pipeline, s
s.buffer.dispatch(wg0, wg1, wg2);
}
- static void ggml_vk_end_submission(vk_submission& s, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
+ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
s.buffer.end();
s.wait_semaphores = std::move(wait_semaphores);
s.signal_semaphores = std::move(signal_semaphores);
}
- static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
+ static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
#endif
@@ -1422,7 +1420,7 @@ static inline size_t ggml_vk_align_size(size_t width, size_t align) {
return CEIL_DIV(width, align) * align;
}
- static vk_sequence ggml_vk_buffer_write_async(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
+ static vk_sequence ggml_vk_buffer_write_async(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
#endif
@@ -1436,7 +1434,7 @@ static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src
ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1, q);
}
- static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr) {
+ static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_buffer_read_async(" << size << ")" << std::endl;
#endif
@@ -1546,7 +1544,6 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
nullptr,
1,
&cmd_buffer);
- std::lock_guard<std::mutex> guard(q.mutex);
q.queue.submit({ submit_info }, fence);
vk::resultCheck(vk_device.device.waitForFences({ fence }, true, uint64_t(-1)), "vk_buffer_read staging waitForFences");
vk_device.device.destroyFence(fence);
@@ -1554,7 +1551,7 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
}
}
- static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
+ static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_h2d_tensor_2d()" << std::endl;
#endif
@@ -1679,7 +1676,7 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, in
return aligned ? &vk_pipeline_matmul_f32_aligned_l : &vk_pipeline_matmul_f32_l;
}
- static vk_sequence ggml_vk_matmul(vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, int m, int n, int k, int stride_a, int stride_b, int stride_d, int split_k, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
+ static vk_sequence ggml_vk_matmul(vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, int m, int n, int k, int stride_a, int stride_b, int stride_d, int split_k, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_matmul(" << m << ", " << n << ", " << k << ")" << std::endl;
#endif
@@ -1767,7 +1764,7 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, ne12 * ne13);
}
- std::vector<vk::Semaphore> x_semaphores;
+ std::vector<vk_semaphore> x_semaphores;
if (load_x) {
for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -1775,9 +1772,9 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
const uint32_t x_offset = load_x ? x_sz * (i03 * ne02 + i02) : 0;
// copy data to device
vk::Semaphore s = ggml_vk_create_semaphore();
- x_semaphores.push_back(s);
+ x_semaphores.push_back({ s, 1 });
// Wait for previous matmul to be done before writing to the input buffers again
- extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { s }, nullptr, &extra->memcpys));
+ extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { { s, 1 } }, nullptr, &extra->memcpys));
}
}
}
@@ -1791,24 +1788,22 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
const uint32_t y_offset = y_sz * (i13 * ne12 + i12);
const uint32_t d_offset = d_sz * (i13 * ne12 + i12);
- vk::Semaphore s_x;
- if (load_x) {
- s_x = x_semaphores[i03 * ne02 + i02];
- }
- vk::Semaphore s_y = ggml_vk_create_semaphore();
- std::vector<vk::Semaphore> semaphores = { s_y };
+ vk::Semaphore s = ggml_vk_create_semaphore();
+ std::vector<vk_semaphore> semaphores = { { s, 1 } };
- extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i13, i12, vk_device.transfer_queues[1], {}, { s_y }, nullptr, &extra->memcpys));
+ if (load_x) {
+ semaphores.push_back(x_semaphores[i03 * ne02 + i02]);
+ }
+ extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i13, i12, vk_device.transfer_queues[1], {}, { { s, 1 } }, nullptr, &extra->memcpys));
// compute
- vk::Semaphore s_mm = ggml_vk_create_semaphore();
- extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, vk_device.compute_queue, std::move(semaphores), { s_mm }));
+ extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, vk_device.compute_queue, std::move(semaphores), { { s, 2 } }));
if (dst->backend == GGML_BACKEND_CPU) {
// copy dst to host
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
- extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, vk_device.transfer_queues[1], { s_mm }, {}));
+ extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, vk_device.transfer_queues[1], { { s, 2 } }, {}));
}
}
}
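The rewrite above replaces the per-iteration pair s_y/s_mm with one timeline semaphore s: the src1 upload signals { s, 1 }, the matmul waits on { s, 1 } (plus the x-upload semaphore, if any) and signals { s, 2 }, and the CPU readback waits on { s, 2 }. Because a timeline counter only moves forward, each stage signals a strictly larger value than it waited on. A submission may also wait on several (semaphore, value) pairs at once, which is how the matmul picks up x_semaphores; an illustrative wrapper around the patch's helpers:

    // Wait on both inputs (hypothetical wrapper, types from the patch),
    // then signal the same timeline semaphore at value 2.
    static vk_sequence matmul_dependencies(vk::Semaphore s, vk_semaphore x_ready, vk_queue& q) {
        std::vector<vk_semaphore> waits = { { s, 1 }, x_ready };
        return ggml_vk_create_sequence_1(q, std::move(waits), { { s, 2 } });
    }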
@@ -1922,7 +1917,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, ne12 * ne13);
}
- std::vector<vk::Semaphore> x_semaphores;
+ std::vector<vk_semaphore> x_semaphores;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -1930,17 +1925,17 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
const uint32_t qx_offset = qx_sz * it_idx0;
const uint32_t x_offset = x_sz * it_idx0;
- vk::Semaphore s_qx;
- vk::Semaphore s_x;
+ // TODO: Find out why one semaphore doesn't work
+ vk::Semaphore sem = ggml_vk_create_semaphore();
+ vk::Semaphore sem2 = ggml_vk_create_semaphore();
if (load_x) {
// copy data to device
- s_qx = ggml_vk_create_semaphore();
- extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { s_qx }, nullptr, &extra->memcpys));
+ extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, tr0q, {}, { { sem2, 1 } }, nullptr, &extra->memcpys));
}
if (qx_needs_dequant) {
- s_x = ggml_vk_create_semaphore();
+ sem = ggml_vk_create_semaphore();
vk_submission s = ggml_vk_begin_submission(compq);
const std::vector<int> pc = { (int)ne01, (int)ne10, (int)ne10, (int)ne10 };
@@ -1948,16 +1943,18 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
ggml_vk_sync_buffers(s.buffer, { { *d_X, x_offset, x_sz } }, compq, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
ggml_vk_dispatch_pipeline(s, *to_fp16_vk_0, { { *d_Qx, qx_offset, qx_sz }, { *d_X, x_offset, x_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)x_ne, 1, 1});
if (load_x) {
- ggml_vk_end_submission(s, { s_qx }, { s_x });
+ ggml_vk_end_submission(s, { { sem2, 1 } }, { { sem, 1 } });
} else {
- ggml_vk_end_submission(s, {}, { s_x });
+ ggml_vk_end_submission(s, {}, { { sem, 1 } });
}
extra->comp_seqs.push_back({ s });
- x_semaphores.push_back(s_x);
+ x_semaphores.push_back({ sem, 1 });
+ } else if (load_x) {
+ x_semaphores.push_back({ sem2, 1 });
} else {
- x_semaphores.push_back(s_qx);
+ x_semaphores.push_back({ sem, 0 });
}
}
}
@@ -1969,35 +1966,32 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
const uint32_t it_idx0 = (i03 * ne02 + i02);
const uint32_t it_idx1 = (i13 * ne12 + i12);
const uint32_t qx_offset = load_x ? qx_sz * it_idx0 : 0;
const uint32_t qy_offset = load_y ? qy_sz * it_idx1 : 0;
const uint32_t x_offset = x_sz * it_idx0;
const uint32_t y_offset = y_sz * it_idx1;
const uint32_t d_offset = d_sz * it_idx1;
- vk::Semaphore s_y;
- vk::Semaphore s_q;
+ const vk::Semaphore sem = ggml_vk_create_semaphore();
- const vk::Semaphore s_mm = ggml_vk_create_semaphore();
- std::vector<vk::Semaphore> mm_semaphores;
+ std::vector<vk_semaphore> mm_semaphores;
if (load_x) {
mm_semaphores.push_back(x_semaphores[it_idx0]);
}
if (load_y) {
- s_y = ggml_vk_create_semaphore();
- mm_semaphores.push_back(s_y);
- extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, tr1q, {}, { s_y }, nullptr, &extra->memcpys));
+ // Set semaphore to 1
+ extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, tr1q, {}, { { sem, 1 } }, nullptr, &extra->memcpys));
+ // Wait for semaphore val 1
+ mm_semaphores.push_back({ sem, 1 });
}
// compute
- extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, compq, std::move(mm_semaphores), { s_mm }));
+ extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, compq, std::move(mm_semaphores), { { sem, 2 } }));
if (dst->backend == GGML_BACKEND_CPU) {
// copy dst to host
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
- extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, tr1q, { s_mm }, {}));
+ extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, tr1q, { { sem, 2 } }, {}));
}
}
}
@@ -2095,7 +2089,7 @@ static void ggml_vk_mul_mat_vec_q_f16(const ggml_tensor * src0, const ggml_tenso
vk_submission s = ggml_vk_begin_submission(compq);
- vk::Semaphore s_x;
+ vk_semaphore s_x;
if (load_y) {
ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, compq, {}, {}, &s, &extra->memcpys);
}
@@ -2335,8 +2329,8 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
} else {
d_D = &vk_prealloc_d;
}
- vk_buffer* d_X;
- vk_buffer* d_Y;
+ vk_buffer* d_X = nullptr;
+ vk_buffer* d_Y = nullptr;
if (transfer_src0) {
d_X = &vk_prealloc_qx;
} else {
@@ -2361,20 +2355,18 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
const uint32_t y_offset = transfer_src1 ? y_sz * it_idx : 0;
const uint32_t d_offset = d_sz * it_idx;
- vk::Semaphore s_x;
- vk::Semaphore s_y;
- vk::Semaphore s_mm = ggml_vk_create_semaphore();
- std::vector<vk::Semaphore> transfer_semaphores;
+ const vk::Semaphore sem = ggml_vk_create_semaphore();
+ vk::Semaphore sem_x;
+ std::vector<vk_semaphore> transfer_semaphores;
// copy src0 to device
if (transfer_src0) {
- s_x = ggml_vk_create_semaphore();
- extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { s_x }, nullptr, &extra->memcpys));
- transfer_semaphores.push_back(s_x);
+ sem_x = ggml_vk_create_semaphore();
+ extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { { sem_x, 1 } }, nullptr, &extra->memcpys));
+ transfer_semaphores.push_back({ sem_x, 1 });
}
if (transfer_src1) {
- s_y = ggml_vk_create_semaphore();
- extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i03, i02, vk_device.transfer_queues[1], {}, { s_y }, nullptr, &extra->memcpys));
- transfer_semaphores.push_back(s_y);
+ extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i03, i02, vk_device.transfer_queues[1], {}, { { sem, 1 } }, nullptr, &extra->memcpys));
+ transfer_semaphores.push_back({ sem, 1 });
}
const int64_t i13 = use_src1 ? i03%ne13 : i03;
@@ -2390,13 +2382,13 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
ggml_vk_sync_buffers(s.buffer, { ggml_vk_subbuffer(*d_X) }, vk_device.compute_queue, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
ggml_vk_dispatch_pipeline(s, *pipeline, { { *d_X, x_offset, x_sz }, { *d_D, d_offset, d_sz } }, sizeof(vk_op_push_constants), &pc, { (uint32_t)ne00, (uint32_t)ne01, 1});
}
- ggml_vk_end_submission(s, std::move(transfer_semaphores), { s_mm });
+ ggml_vk_end_submission(s, std::move(transfer_semaphores), { { sem, 2 } });
extra->comp_seqs.push_back({ s });
if (dst->backend == GGML_BACKEND_CPU) {
// copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * ne00 * ne01, vk_device.transfer_queues[1], { s_mm }, {}));
+ extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * ne00 * ne01, vk_device.transfer_queues[1], { { sem, 2 } }, {}));
}
}
}
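Not exercised in this diff, but a related property of timeline semaphores worth noting: the host can block until a chain reaches a given value without creating a fence. A sketch with Vulkan-Hpp (the function and its parameters are assumptions, not code from the patch):

    // Block on the host until the timeline counter of sem reaches `value`.
    void wait_for_value(vk::Device device, vk::Semaphore sem, uint64_t value) {
        vk::SemaphoreWaitInfo wait_info{ {}, 1, &sem, &value };
        vk::resultCheck(device.waitSemaphores(wait_info, uint64_t(-1)), "timeline wait");
    }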