Reduce number of used semaphores by utilizing timelines more properly
This commit is contained in:
parent
1cb90e57e4
commit
2c7fa8de5a
1 changed files with 79 additions and 87 deletions
166
ggml-vulkan.cpp
166
ggml-vulkan.cpp
|
@ -27,8 +27,8 @@
|
|||
#include <limits>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
|
@ -91,21 +91,6 @@ struct vk_pipeline {
|
|||
};
|
||||
|
||||
struct vk_queue {
|
||||
vk_queue() {}
|
||||
vk_queue(const vk_queue& b) : queue_family_index(b.queue_family_index), queue(b.queue), pool(b.pool), cmd_buffer_idx(b.cmd_buffer_idx), cmd_buffers(b.cmd_buffers), stage_flags(b.stage_flags) {}
|
||||
|
||||
vk_queue& operator=(const vk_queue& b) {
|
||||
if (this != &b) {
|
||||
queue_family_index = b.queue_family_index;
|
||||
queue = b.queue;
|
||||
pool = b.pool;
|
||||
cmd_buffer_idx = b.cmd_buffer_idx;
|
||||
cmd_buffers = b.cmd_buffers;
|
||||
stage_flags = b.stage_flags;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
uint32_t queue_family_index;
|
||||
vk::Queue queue;
|
||||
vk::CommandPool pool;
|
||||
|
@ -113,14 +98,17 @@ struct vk_queue {
|
|||
std::vector<vk::CommandBuffer> cmd_buffers;
|
||||
|
||||
vk::PipelineStageFlags stage_flags;
|
||||
};
|
||||
|
||||
std::mutex mutex;
|
||||
struct vk_semaphore {
|
||||
vk::Semaphore s;
|
||||
uint64_t value;
|
||||
};
|
||||
|
||||
struct vk_submission {
|
||||
vk::CommandBuffer buffer;
|
||||
std::vector<vk::Semaphore> wait_semaphores;
|
||||
std::vector<vk::Semaphore> signal_semaphores;
|
||||
std::vector<vk_semaphore> wait_semaphores;
|
||||
std::vector<vk_semaphore> signal_semaphores;
|
||||
};
|
||||
|
||||
typedef std::vector<vk_submission> vk_sequence;
|
||||
|
@ -382,7 +370,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) {
|
|||
return buf;
|
||||
}
|
||||
|
||||
static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
|
||||
static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_create_submission()" << std::endl;
|
||||
#endif
|
||||
|
@ -393,7 +381,7 @@ static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk::Sema
|
|||
return s;
|
||||
}
|
||||
|
||||
static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
|
||||
static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_create_sequence_1()" << std::endl;
|
||||
#endif
|
||||
|
@ -410,6 +398,8 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
|
|||
|
||||
std::vector<std::vector<uint64_t>> tl_wait_vals;
|
||||
std::vector<std::vector<uint64_t>> tl_signal_vals;
|
||||
std::vector<std::vector<vk::Semaphore>> tl_wait_semaphores;
|
||||
std::vector<std::vector<vk::Semaphore>> tl_signal_semaphores;
|
||||
std::vector<vk::TimelineSemaphoreSubmitInfo> tl_submit_infos;
|
||||
std::vector<vk::SubmitInfo> submit_infos;
|
||||
int idx = -1;
|
||||
|
@ -422,7 +412,9 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
|
|||
}
|
||||
|
||||
// Pre-reserve vectors to prevent reallocation, which invalidates pointers
|
||||
tl_wait_semaphores.reserve(reserve);
|
||||
tl_wait_vals.reserve(reserve);
|
||||
tl_signal_semaphores.reserve(reserve);
|
||||
tl_signal_vals.reserve(reserve);
|
||||
tl_submit_infos.reserve(reserve);
|
||||
submit_infos.reserve(reserve);
|
||||
|
@ -432,13 +424,21 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
|
|||
for (const auto& submission : sequence) {
|
||||
stage_flags.push_back({});
|
||||
idx++;
|
||||
tl_wait_vals.push_back({});
|
||||
tl_wait_semaphores.push_back({});
|
||||
tl_signal_vals.push_back({});
|
||||
tl_signal_semaphores.push_back({});
|
||||
for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
|
||||
stage_flags[idx].push_back(q.stage_flags);
|
||||
GGML_ASSERT(submission.wait_semaphores[i].value > 0);
|
||||
tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
|
||||
tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
|
||||
}
|
||||
for (size_t i = 0; i < submission.signal_semaphores.size(); i++) {
|
||||
GGML_ASSERT(submission.signal_semaphores[i].value > 0);
|
||||
tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value);
|
||||
tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s);
|
||||
}
|
||||
tl_wait_vals.push_back({});
|
||||
tl_wait_vals[idx].assign(submission.wait_semaphores.size(), 1);
|
||||
tl_signal_vals.push_back({});
|
||||
tl_signal_vals[idx].assign(submission.signal_semaphores.size(), 1);
|
||||
tl_submit_infos.push_back({
|
||||
(uint32_t) submission.wait_semaphores.size(),
|
||||
tl_wait_vals[idx].data(),
|
||||
|
@ -449,19 +449,17 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
|
|||
tl_submit_infos[idx].pNext = nullptr;
|
||||
submit_infos.push_back({
|
||||
(uint32_t) submission.wait_semaphores.size(),
|
||||
submission.wait_semaphores.data(),
|
||||
tl_wait_semaphores[idx].data(),
|
||||
stage_flags[idx].data(),
|
||||
1,
|
||||
&submission.buffer,
|
||||
(uint32_t) submission.signal_semaphores.size(),
|
||||
submission.signal_semaphores.data(),
|
||||
+ &tl_submit_infos[idx],
|
||||
tl_signal_semaphores[idx].data(),
|
||||
&tl_submit_infos[idx],
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> guard(q.mutex);
|
||||
|
||||
q.queue.submit(submit_infos, fence);
|
||||
|
||||
sequences.clear();
|
||||
|
@ -1295,14 +1293,14 @@ static void ggml_vk_dispatch_pipeline(vk_submission& s, vk_pipeline& pipeline, s
|
|||
s.buffer.dispatch(wg0, wg1, wg2);
|
||||
}
|
||||
|
||||
static void ggml_vk_end_submission(vk_submission& s, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
|
||||
static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
|
||||
s.buffer.end();
|
||||
|
||||
s.wait_semaphores = std::move(wait_semaphores);
|
||||
s.signal_semaphores = std::move(signal_semaphores);
|
||||
}
|
||||
|
||||
static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
|
||||
static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1422,7 +1420,7 @@ static inline size_t ggml_vk_align_size(size_t width, size_t align) {
|
|||
return CEIL_DIV(width, align) * align;
|
||||
}
|
||||
|
||||
static vk_sequence ggml_vk_buffer_write_async(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
|
||||
static vk_sequence ggml_vk_buffer_write_async(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1436,7 +1434,7 @@ static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src
|
|||
ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1, q);
|
||||
}
|
||||
|
||||
static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr) {
|
||||
static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_read_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1546,7 +1544,6 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
|
|||
nullptr,
|
||||
1,
|
||||
&cmd_buffer);
|
||||
std::lock_guard<std::mutex> guard(q.mutex);
|
||||
q.queue.submit({ submit_info }, fence);
|
||||
vk::resultCheck(vk_device.device.waitForFences({ fence }, true, uint64_t(-1)), "vk_buffer_read staging waitForFences");
|
||||
vk_device.device.destroyFence(fence);
|
||||
|
@ -1554,7 +1551,7 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
|
|||
}
|
||||
}
|
||||
|
||||
static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
|
||||
static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_h2d_tensor_2d()" << std::endl;
|
||||
#endif
|
||||
|
@ -1679,7 +1676,7 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, in
|
|||
return aligned ? &vk_pipeline_matmul_f32_aligned_l : &vk_pipeline_matmul_f32_l;
|
||||
}
|
||||
|
||||
static vk_sequence ggml_vk_matmul(vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, int m, int n, int k, int stride_a, int stride_b, int stride_d, int split_k, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
|
||||
static vk_sequence ggml_vk_matmul(vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, int m, int n, int k, int stride_a, int stride_b, int stride_d, int split_k, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_matmul(" << m << ", " << n << ", " << k << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1767,7 +1764,7 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, ne12 * ne13);
|
||||
}
|
||||
|
||||
std::vector<vk::Semaphore> x_semaphores;
|
||||
std::vector<vk_semaphore> x_semaphores;
|
||||
|
||||
if (load_x) {
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
|
@ -1775,9 +1772,9 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
const uint32_t x_offset = load_x ? x_sz * (i03 * ne02 + i02) : 0;
|
||||
// copy data to device
|
||||
vk::Semaphore s = ggml_vk_create_semaphore();
|
||||
x_semaphores.push_back(s);
|
||||
x_semaphores.push_back({ s, 1 });
|
||||
// Wait for previous matmul to be done before writing to the input buffers again
|
||||
extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { s }, nullptr, &extra->memcpys));
|
||||
extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { { s, 1 } }, nullptr, &extra->memcpys));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1791,24 +1788,22 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
const uint32_t y_offset = y_sz * (i13 * ne12 + i12);
|
||||
const uint32_t d_offset = d_sz * (i13 * ne12 + i12);
|
||||
|
||||
vk::Semaphore s_x;
|
||||
if (load_x) {
|
||||
s_x = x_semaphores[i03 * ne02 + i02];
|
||||
}
|
||||
vk::Semaphore s_y = ggml_vk_create_semaphore();
|
||||
std::vector<vk::Semaphore> semaphores = { s_y };
|
||||
vk::Semaphore s = ggml_vk_create_semaphore();
|
||||
std::vector<vk_semaphore> semaphores = { { s, 1 } };
|
||||
|
||||
extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i13, i12, vk_device.transfer_queues[1], {}, { s_y }, nullptr, &extra->memcpys));
|
||||
if (load_x) {
|
||||
semaphores.push_back(x_semaphores[i03 * ne02 + i02]);
|
||||
}
|
||||
|
||||
extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i13, i12, vk_device.transfer_queues[1], {}, { { s, 1 } }, nullptr, &extra->memcpys));
|
||||
|
||||
// compute
|
||||
vk::Semaphore s_mm = ggml_vk_create_semaphore();
|
||||
|
||||
extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, vk_device.compute_queue, std::move(semaphores), { s_mm }));
|
||||
extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, vk_device.compute_queue, std::move(semaphores), { { s, 2 } }));
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, vk_device.transfer_queues[1], { s_mm }, {}));
|
||||
extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, vk_device.transfer_queues[1], { { s, 2 } }, {}));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1922,7 +1917,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
|
|||
ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, ne12 * ne13);
|
||||
}
|
||||
|
||||
std::vector<vk::Semaphore> x_semaphores;
|
||||
std::vector<vk_semaphore> x_semaphores;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
|
@ -1930,17 +1925,17 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
|
|||
const uint32_t qx_offset = qx_sz * it_idx0;
|
||||
const uint32_t x_offset = x_sz * it_idx0;
|
||||
|
||||
vk::Semaphore s_qx;
|
||||
vk::Semaphore s_x;
|
||||
// TODO: Find out why one semaphore doesn't work
|
||||
vk::Semaphore sem = ggml_vk_create_semaphore();
|
||||
vk::Semaphore sem2 = ggml_vk_create_semaphore();
|
||||
|
||||
if (load_x) {
|
||||
// copy data to device
|
||||
s_qx = ggml_vk_create_semaphore();
|
||||
extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { s_qx }, nullptr, &extra->memcpys));
|
||||
extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, tr0q, {}, { { sem2, 1 } }, nullptr, &extra->memcpys));
|
||||
}
|
||||
|
||||
if (qx_needs_dequant) {
|
||||
s_x = ggml_vk_create_semaphore();
|
||||
sem = ggml_vk_create_semaphore();
|
||||
|
||||
vk_submission s = ggml_vk_begin_submission(compq);
|
||||
const std::vector<int> pc = { (int)ne01, (int)ne10, (int)ne10, (int)ne10 };
|
||||
|
@ -1948,16 +1943,18 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
|
|||
ggml_vk_sync_buffers(s.buffer, { { *d_X, x_offset, x_sz } }, compq, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
|
||||
ggml_vk_dispatch_pipeline(s, *to_fp16_vk_0, { { *d_Qx, qx_offset, qx_sz }, { *d_X, x_offset, x_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)x_ne, 1, 1});
|
||||
if (load_x) {
|
||||
ggml_vk_end_submission(s, { s_qx }, { s_x });
|
||||
ggml_vk_end_submission(s, { { sem2, 1 } }, { { sem, 1 } });
|
||||
} else {
|
||||
ggml_vk_end_submission(s, {}, { s_x });
|
||||
ggml_vk_end_submission(s, {}, { { sem, 1 } });
|
||||
}
|
||||
|
||||
extra->comp_seqs.push_back({ s });
|
||||
|
||||
x_semaphores.push_back(s_x);
|
||||
x_semaphores.push_back({ sem, 1 });
|
||||
} else if (load_x) {
|
||||
x_semaphores.push_back({ sem2, 1 });
|
||||
} else {
|
||||
x_semaphores.push_back(s_qx);
|
||||
x_semaphores.push_back({ sem, 0 });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1969,35 +1966,32 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
|
|||
|
||||
const uint32_t it_idx0 = (i03 * ne02 + i02);
|
||||
const uint32_t it_idx1 = (i13 * ne12 + i12);
|
||||
const uint32_t qx_offset = load_x ? qx_sz * it_idx0 : 0;
|
||||
const uint32_t qy_offset = load_y ? qy_sz * it_idx1 : 0;
|
||||
const uint32_t x_offset = x_sz * it_idx0;
|
||||
const uint32_t y_offset = y_sz * it_idx1;
|
||||
const uint32_t d_offset = d_sz * it_idx1;
|
||||
|
||||
vk::Semaphore s_y;
|
||||
vk::Semaphore s_q;
|
||||
const vk::Semaphore sem = ggml_vk_create_semaphore();
|
||||
|
||||
const vk::Semaphore s_mm = ggml_vk_create_semaphore();
|
||||
|
||||
std::vector<vk::Semaphore> mm_semaphores;
|
||||
std::vector<vk_semaphore> mm_semaphores;
|
||||
|
||||
if (load_x) {
|
||||
mm_semaphores.push_back(x_semaphores[it_idx0]);
|
||||
}
|
||||
if (load_y) {
|
||||
s_y = ggml_vk_create_semaphore();
|
||||
mm_semaphores.push_back(s_y);
|
||||
extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, tr1q, {}, { s_y }, nullptr, &extra->memcpys));
|
||||
// Set semaphore to 1
|
||||
extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, tr1q, {}, { { sem, 1 }}, nullptr, &extra->memcpys));
|
||||
// Wait for semaphore val 1
|
||||
mm_semaphores.push_back({ sem, 1 });
|
||||
}
|
||||
|
||||
// compute
|
||||
extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, compq, std::move(mm_semaphores), { s_mm }));
|
||||
extra->comp_seqs.push_back(ggml_vk_matmul(*pipeline, { *d_X, x_offset, x_sz }, { *d_Y, y_offset, y_sz }, { *d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, compq, std::move(mm_semaphores), { { sem, 2 } }));
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, tr1q, { s_mm }, {}));
|
||||
extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, tr1q, { { sem, 2 } }, {}));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2095,7 +2089,7 @@ static void ggml_vk_mul_mat_vec_q_f16(const ggml_tensor * src0, const ggml_tenso
|
|||
|
||||
vk_submission s = ggml_vk_begin_submission(compq);
|
||||
|
||||
vk::Semaphore s_x;
|
||||
vk_semaphore s_x;
|
||||
if (load_y) {
|
||||
ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, compq, {}, {}, &s, &extra->memcpys);
|
||||
}
|
||||
|
@ -2335,8 +2329,8 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
|
|||
} else {
|
||||
d_D = &vk_prealloc_d;
|
||||
}
|
||||
vk_buffer* d_X;
|
||||
vk_buffer* d_Y;
|
||||
vk_buffer* d_X = nullptr;
|
||||
vk_buffer* d_Y = nullptr;
|
||||
if (transfer_src0) {
|
||||
d_X = &vk_prealloc_qx;
|
||||
} else {
|
||||
|
@ -2361,20 +2355,18 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
|
|||
const uint32_t y_offset = transfer_src1 ? y_sz * it_idx : 0;
|
||||
const uint32_t d_offset = d_sz * it_idx;
|
||||
|
||||
vk::Semaphore s_x;
|
||||
vk::Semaphore s_y;
|
||||
vk::Semaphore s_mm = ggml_vk_create_semaphore();
|
||||
std::vector<vk::Semaphore> transfer_semaphores;
|
||||
const vk::Semaphore sem = ggml_vk_create_semaphore();
|
||||
vk::Semaphore sem_x;
|
||||
std::vector<vk_semaphore> transfer_semaphores;
|
||||
// copy src0 to device
|
||||
if (transfer_src0) {
|
||||
s_x = ggml_vk_create_semaphore();
|
||||
extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { s_x }, nullptr, &extra->memcpys));
|
||||
transfer_semaphores.push_back(s_x);
|
||||
sem_x = ggml_vk_create_semaphore();
|
||||
extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, x_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { { sem_x, 1} }, nullptr, &extra->memcpys));
|
||||
transfer_semaphores.push_back({ sem_x, 1 });
|
||||
}
|
||||
if (transfer_src1) {
|
||||
s_y = ggml_vk_create_semaphore();
|
||||
extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i03, i02, vk_device.transfer_queues[1], {}, { s_y }, nullptr, &extra->memcpys));
|
||||
transfer_semaphores.push_back(s_y);
|
||||
extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, y_offset, src1, i03, i02, vk_device.transfer_queues[1], {}, { { sem, 1 } }, nullptr, &extra->memcpys));
|
||||
transfer_semaphores.push_back({ sem, 1 });
|
||||
}
|
||||
|
||||
const int64_t i13 = use_src1 ? i03%ne13 : i03;
|
||||
|
@ -2390,13 +2382,13 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
|
|||
ggml_vk_sync_buffers(s.buffer, { ggml_vk_subbuffer(*d_X) }, vk_device.compute_queue, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
|
||||
ggml_vk_dispatch_pipeline(s, *pipeline, { { *d_X, x_offset, x_sz }, { *d_D, d_offset, d_sz } }, sizeof(vk_op_push_constants), &pc, { (uint32_t)ne00, (uint32_t)ne01, 1});
|
||||
}
|
||||
ggml_vk_end_submission(s, std::move(transfer_semaphores), { s_mm });
|
||||
ggml_vk_end_submission(s, std::move(transfer_semaphores), { { sem, 2 } });
|
||||
extra->comp_seqs.push_back({ s });
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * ne00 * ne01, vk_device.transfer_queues[1], { s_mm }, {}));
|
||||
extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * ne00 * ne01, vk_device.transfer_queues[1], { { sem, 2 } }, {}));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue