Move queue into context
Add not-yet-enabled async backend ops
This commit is contained in:
parent
5a8a07ec60
commit
a5cca6cd8c
1 changed file with 207 additions and 113 deletions
320
ggml-vulkan.cpp
320
ggml-vulkan.cpp
|
@ -193,6 +193,8 @@ struct vk_context {
|
|||
|
||||
std::vector<vk_staging_memcpy> in_memcpys;
|
||||
std::vector<vk_staging_memcpy> out_memcpys;
|
||||
|
||||
vk_queue * q;
|
||||
};
|
||||
|
||||
struct ggml_tensor_extra_gpu {
|
||||
|
@ -469,11 +471,11 @@ static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector<vk_semapho
|
|||
return { ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores)) };
|
||||
}
|
||||
|
||||
static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk::Fence fence) {
|
||||
static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_submit(" << q.queue_family_index << " (" << q.queue << "), " << sequences.size() << ", " << fence << ")" << std::endl;
|
||||
std::cerr << "ggml_vk_submit(" << ctx.seqs.size() << ", " << fence << ")" << std::endl;
|
||||
#endif
|
||||
if (sequences.empty()) {
|
||||
if (ctx.seqs.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -488,7 +490,7 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
|
|||
|
||||
size_t reserve = 0;
|
||||
|
||||
for (const auto& sequence : sequences) {
|
||||
for (const auto& sequence : ctx.seqs) {
|
||||
reserve += sequence.size();
|
||||
}
|
||||
|
||||
|
@ -501,7 +503,7 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
|
|||
submit_infos.reserve(reserve);
|
||||
stage_flags.reserve(reserve);
|
||||
|
||||
for (const auto& sequence : sequences) {
|
||||
for (const auto& sequence : ctx.seqs) {
|
||||
for (const auto& submission : sequence) {
|
||||
stage_flags.push_back({});
|
||||
idx++;
|
||||
|
@ -510,7 +512,7 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
|
|||
tl_signal_vals.push_back({});
|
||||
tl_signal_semaphores.push_back({});
|
||||
for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
|
||||
stage_flags[idx].push_back(q.stage_flags);
|
||||
stage_flags[idx].push_back(ctx.q->stage_flags);
|
||||
tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
|
||||
tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
|
||||
}
|
||||
|
@ -540,9 +542,9 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
|
|||
}
|
||||
}
|
||||
|
||||
q.queue.submit(submit_infos, fence);
|
||||
ctx.q->queue.submit(submit_infos, fence);
|
||||
|
||||
sequences.clear();
|
||||
ctx.seqs.clear();
|
||||
}
|
||||
|
||||
static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
|
||||
|
@ -606,7 +608,7 @@ static vk_queue ggml_vk_create_queue(uint32_t queue_family_index, uint32_t queue
|
|||
return q;
|
||||
}
|
||||
|
||||
static vk_context * ggml_vk_create_context() {
|
||||
static vk_context * ggml_vk_create_context(vk_queue& q) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_create_context()" << std::endl;
|
||||
#endif
|
||||
|
@ -614,6 +616,7 @@ static vk_context * ggml_vk_create_context() {
|
|||
vk_context * result = &vk_gc.contexts[vk_gc.contexts.size() - 1];
|
||||
memset((void *) result, 0, sizeof(vk_context));
|
||||
result->idx = vk_gc.contexts.size() - 1;
|
||||
result->q = &q;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -713,15 +716,15 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
|
|||
return { buf, 0, VK_WHOLE_SIZE };
|
||||
}
|
||||
|
||||
static void ggml_vk_sync_buffers(vk_context& ctx, vk_queue& q) {
|
||||
static void ggml_vk_sync_buffers(vk_context& ctx) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_sync_buffers()" << std::endl;
|
||||
#endif
|
||||
const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
|
||||
|
||||
ctx.s->buffer.pipelineBarrier(
|
||||
q.stage_flags,
|
||||
q.stage_flags,
|
||||
ctx.q->stage_flags,
|
||||
ctx.q->stage_flags,
|
||||
{},
|
||||
mem_barriers,
|
||||
{},
|
||||
|
@ -1362,7 +1365,7 @@ static void ggml_vk_ctx_end(vk_context& ctx) {
|
|||
ctx.s = nullptr;
|
||||
}
|
||||
|
||||
static void ggml_vk_ctx_begin(vk_context& ctx, vk_queue& q) {
|
||||
static void ggml_vk_ctx_begin(vk_context& ctx) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_ctx_begin(" << &ctx << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1370,7 +1373,7 @@ static void ggml_vk_ctx_begin(vk_context& ctx, vk_queue& q) {
|
|||
ggml_vk_ctx_end(ctx);
|
||||
}
|
||||
|
||||
ctx.seqs.push_back({ ggml_vk_begin_submission(q) });
|
||||
ctx.seqs.push_back({ ggml_vk_begin_submission(*ctx.q) });
|
||||
ctx.s = ctx.seqs[ctx.seqs.size() - 1].data();
|
||||
}
|
||||
|
||||
|
@ -1386,7 +1389,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, vk_queue& q, bool sync_staging = false) {
|
||||
static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1453,7 +1456,7 @@ static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_
|
|||
}
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ctx.s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
|
||||
return;
|
||||
}
|
||||
|
@ -1479,7 +1482,7 @@ static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_
|
|||
|
||||
VkBufferCopy buf_copy{ staging_offset, offset, copy_size };
|
||||
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
vkCmdCopyBuffer(ctx.s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
|
||||
|
||||
for (uint64_t i3 = 0; i3 < ne3; i3++) {
|
||||
|
@ -1504,7 +1507,7 @@ static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_2d_async(vk_context& ctx, vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, bool sync_staging = false) {
|
||||
static void ggml_vk_buffer_write_2d_async(vk_context& ctx, vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1543,7 +1546,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context& ctx, vk_buffer* dst, size_
|
|||
}
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ctx.s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
|
||||
return;
|
||||
}
|
||||
|
@ -1574,7 +1577,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context& ctx, vk_buffer* dst, size_
|
|||
offset,
|
||||
copy_size};
|
||||
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
vkCmdCopyBuffer(ctx.s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
|
||||
|
||||
if (width == spitch) {
|
||||
|
@ -1586,14 +1589,14 @@ static void ggml_vk_buffer_write_2d_async(vk_context& ctx, vk_buffer* dst, size_
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_async(vk_context& ctx, vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, bool sync_staging = false) {
|
||||
static void ggml_vk_buffer_write_async(vk_context& ctx, vk_buffer* dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
return ggml_vk_buffer_write_2d_async(ctx, dst, offset, src, size, size, 1, q, sync_staging);
|
||||
return ggml_vk_buffer_write_2d_async(ctx, dst, offset, src, size, size, 1, sync_staging);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q) {
|
||||
static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1605,29 +1608,29 @@ static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void *
|
|||
memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
|
||||
}
|
||||
} else {
|
||||
vk_context * ctx = ggml_vk_create_context();
|
||||
ggml_vk_ctx_begin(*ctx, q);
|
||||
ggml_vk_buffer_write_2d_async(*ctx, dst, offset, src, spitch, width, height, q, true);
|
||||
vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue);
|
||||
ggml_vk_ctx_begin(*ctx);
|
||||
ggml_vk_buffer_write_2d_async(*ctx, dst, offset, src, spitch, width, height, true);
|
||||
ggml_vk_ctx_end(*ctx);
|
||||
|
||||
for (auto& cpy : ctx->in_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
|
||||
ggml_vk_submit(q, ctx->seqs, vk_fence);
|
||||
ggml_vk_submit(*ctx, vk_fence);
|
||||
VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
|
||||
vk_device.device.resetFences({ vk_fence });
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q) {
|
||||
static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
|
||||
#endif
|
||||
ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1, q);
|
||||
ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_read_2d_async(vk_context& ctx, vk_buffer* src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_queue& q, bool sync_staging = false) {
|
||||
static void ggml_vk_buffer_read_2d_async(vk_context& ctx, vk_buffer* src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1664,7 +1667,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context& ctx, vk_buffer* src, size_t
|
|||
|
||||
if (buf != nullptr) {
|
||||
// Memory is pinned, use as staging buffer
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ctx.s->buffer.copyBuffer(src->buffer, buf->buffer, slices);
|
||||
|
||||
return;
|
||||
|
@ -1690,17 +1693,17 @@ static void ggml_vk_buffer_read_2d_async(vk_context& ctx, vk_buffer* src, size_t
|
|||
}
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ctx.s->buffer.copyBuffer(src->buffer, staging->buffer, slices);
|
||||
|
||||
deferred_memcpy(dst, staging->ptr, copy_size, &ctx.out_memcpys);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_read_async(vk_context& ctx, vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, bool sync_staging = false) {
|
||||
return ggml_vk_buffer_read_2d_async(ctx, src, offset, dst, size, size, size, 1, q, sync_staging);
|
||||
static void ggml_vk_buffer_read_async(vk_context& ctx, vk_buffer* src, size_t offset, void * dst, size_t size, bool sync_staging = false) {
|
||||
return ggml_vk_buffer_read_2d_async(ctx, src, offset, dst, size, size, size, 1, sync_staging);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q) {
|
||||
static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1709,12 +1712,12 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
|
|||
|
||||
memcpy(dst, (uint8_t *) src->ptr + offset, size);
|
||||
} else {
|
||||
vk_context * ctx = ggml_vk_create_context();
|
||||
ggml_vk_ctx_begin(*ctx, q);
|
||||
ggml_vk_buffer_read_async(*ctx, src, offset, dst, size, q, true);
|
||||
vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue);
|
||||
ggml_vk_ctx_begin(*ctx);
|
||||
ggml_vk_buffer_read_async(*ctx, src, offset, dst, size, true);
|
||||
ggml_vk_ctx_end(*ctx);
|
||||
|
||||
ggml_vk_submit(q, ctx->seqs, vk_fence);
|
||||
ggml_vk_submit(*ctx, vk_fence);
|
||||
VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
|
||||
vk_device.device.resetFences({ vk_fence });
|
||||
|
||||
|
@ -1724,36 +1727,46 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_copy(vk_buffer * dst, size_t dst_offset, vk_buffer * src, size_t src_offset, size_t size, vk_queue& q) {
|
||||
static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer * dst, size_t dst_offset, vk_buffer * src, size_t src_offset, size_t size) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
VkBufferCopy bc{ src_offset, dst_offset, size };
|
||||
|
||||
vkCmdCopyBuffer(ctx.s->buffer, src->buffer, dst->buffer, 1, &bc);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_copy(vk_buffer * dst, size_t dst_offset, vk_buffer * src, size_t src_offset, size_t size) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_copy(" << size << ")" << std::endl;
|
||||
#endif
|
||||
VkBufferCopy bc{ src_offset, dst_offset, size };
|
||||
|
||||
vk_context * ctx = ggml_vk_create_context();
|
||||
ggml_vk_ctx_begin(*ctx, q);
|
||||
vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue);
|
||||
ggml_vk_ctx_begin(*ctx);
|
||||
vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc);
|
||||
ggml_vk_buffer_copy_async(*ctx, dst, dst_offset, src, src_offset, size);
|
||||
ggml_vk_ctx_end(*ctx);
|
||||
ggml_vk_submit(q, ctx->seqs, vk_fence);
|
||||
ggml_vk_submit(*ctx, vk_fence);
|
||||
VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
|
||||
vk_device.device.resetFences({ vk_fence });
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_memset(vk_buffer* dst, size_t offset, uint32_t c, size_t size, vk_queue& q) {
|
||||
static void ggml_vk_buffer_memset(vk_buffer* dst, size_t offset, uint32_t c, size_t size) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl;
|
||||
#endif
|
||||
vk_context * ctx = ggml_vk_create_context();
|
||||
ggml_vk_ctx_begin(*ctx, q);
|
||||
vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue);
|
||||
ggml_vk_ctx_begin(*ctx);
|
||||
ctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
|
||||
ggml_vk_ctx_end(*ctx);
|
||||
|
||||
ggml_vk_submit(q, ctx->seqs, vk_fence);
|
||||
ggml_vk_submit(*ctx, vk_fence);
|
||||
VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_memset waitForFences");
|
||||
vk_device.device.resetFences({ vk_fence });
|
||||
}
|
||||
|
||||
static void ggml_vk_h2d_tensor_2d(vk_context& ctx, vk_buffer * dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1, vk_queue& q) {
|
||||
static void ggml_vk_h2d_tensor_2d(vk_context& ctx, vk_buffer * dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1770,20 +1783,20 @@ static void ggml_vk_h2d_tensor_2d(vk_context& ctx, vk_buffer * dst, size_t offse
|
|||
|
||||
const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
|
||||
if (nb0 == ts && nb1 == row_length) {
|
||||
return ggml_vk_buffer_write_async(ctx, dst, offset, x, i1*nb1, q);
|
||||
return ggml_vk_buffer_write_async(ctx, dst, offset, x, i1*nb1);
|
||||
}
|
||||
if (nb0 == ts && (i1 == ne1 || !ggml_is_permuted(src))) {
|
||||
return ggml_vk_buffer_write_2d_async(ctx, dst, offset, x, nb1, row_length, i1, q);
|
||||
return ggml_vk_buffer_write_2d_async(ctx, dst, offset, x, nb1, row_length, i1);
|
||||
}
|
||||
|
||||
GGML_ASSERT(i3 == 0);
|
||||
GGML_ASSERT(i2 == 0);
|
||||
GGML_ASSERT(i1 == (uint64_t) ggml_nrows(src));
|
||||
|
||||
return ggml_vk_buffer_write_nc_async(ctx, dst, offset, src, q);
|
||||
return ggml_vk_buffer_write_nc_async(ctx, dst, offset, src);
|
||||
}
|
||||
|
||||
static void ggml_vk_d2h_tensor_2d(vk_context& ctx, vk_buffer * src, size_t offset, const ggml_tensor * dst, vk_queue& q) {
|
||||
static void ggml_vk_d2h_tensor_2d(vk_context& ctx, vk_buffer * src, size_t offset, const ggml_tensor * dst) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl;
|
||||
#endif
|
||||
|
@ -1801,10 +1814,10 @@ static void ggml_vk_d2h_tensor_2d(vk_context& ctx, vk_buffer * src, size_t offse
|
|||
const size_t row_length = ts*ne0/bs;
|
||||
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
return ggml_vk_buffer_read_async(ctx, src, offset, dst->data, ne1*nb1*ne2*ne3, q);
|
||||
return ggml_vk_buffer_read_async(ctx, src, offset, dst->data, ne1*nb1*ne2*ne3);
|
||||
}
|
||||
if (nb0 == ts) {
|
||||
return ggml_vk_buffer_read_2d_async(ctx, src, offset, dst->data, nb1, nb1, row_length, ne1*ne2*ne3, q);
|
||||
return ggml_vk_buffer_read_2d_async(ctx, src, offset, dst->data, nb1, nb1, row_length, ne1*ne2*ne3);
|
||||
}
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
@ -1902,12 +1915,12 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, in
|
|||
return aligned ? &vk_pipeline_matmul_f32_aligned_l : &vk_pipeline_matmul_f32_l;
|
||||
}
|
||||
|
||||
static void ggml_vk_matmul(vk_context& ctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t d_offset, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, vk_queue& q) {
|
||||
static void ggml_vk_matmul(vk_context& ctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t d_offset, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_matmul(a: (" << a.buffer.buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer.buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer.buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer.buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", d_offset: " << d_offset << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
|
||||
#endif
|
||||
if (split_k == 1) {
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
const std::array<uint32_t, 15> pc = { m, n, k, stride_a, stride_b, stride_d, k, d_offset, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d };
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, pipeline, { a, b, d }, pc.size() * sizeof(uint32_t), pc.data(), { m, n, batch });
|
||||
return;
|
||||
|
@ -1915,11 +1928,11 @@ static void ggml_vk_matmul(vk_context& ctx, vk_pipeline& pipeline, vk_subbuffer&
|
|||
|
||||
GGML_ASSERT(batch_stride_d == m * n);
|
||||
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
// Synchronize the two submissions
|
||||
const std::array<uint32_t, 15> pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), 0, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d * split_k };
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { m * split_k, n, batch });
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
const std::array<uint32_t, 3> pc2 = { (uint32_t)(m * n * batch), split_k, d_offset };
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, vk_pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
|
||||
return;
|
||||
|
@ -1952,8 +1965,6 @@ static void ggml_vk_cpy_to_contiguous(vk_context& ctx, vk_pipeline * pipeline, c
|
|||
std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
|
||||
std::cerr << "buffer in size=" << in.buffer.size << ", buffer out size=" << out.buffer.size << ")" << std::endl;
|
||||
#endif
|
||||
vk_queue& compq = vk_device.compute_queue;
|
||||
|
||||
const int tensor_type_size = ggml_type_size(tensor->type);
|
||||
const int dst_type_size = ggml_type_size(buffer_type);
|
||||
|
||||
|
@ -1967,7 +1978,7 @@ static void ggml_vk_cpy_to_contiguous(vk_context& ctx, vk_pipeline * pipeline, c
|
|||
(uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], 1 , (uint32_t)tensor->ne[0] , nb2,
|
||||
0,
|
||||
};
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, *pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 });
|
||||
}
|
||||
|
||||
|
@ -1996,8 +2007,6 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
|
|||
const uint64_t r2 = ne12 / ne02;
|
||||
const uint64_t r3 = ne13 / ne03;
|
||||
|
||||
vk_queue& compq = vk_device.compute_queue;
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
|
||||
|
@ -2110,20 +2119,20 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
|
|||
} else if (load_x || qx_needs_dequant) {
|
||||
if (load_x) {
|
||||
// copy data to device
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0), compq);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
|
||||
vk_staging_offset = qx_sz * ne02 * ne03;
|
||||
}
|
||||
|
||||
if (qx_needs_dequant) {
|
||||
const std::vector<int> pc = { (int)ne01, (int)ne10, (int)ne10, (int)ne10 };
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, *to_fp16_vk_0, { { *d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { *d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
|
||||
}
|
||||
}
|
||||
if (y_non_contig) {
|
||||
ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_1, src1, { *d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { *d_Y, 0, VK_WHOLE_SIZE }, dst->type);
|
||||
} else if (load_y) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1), compq);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
|
||||
}
|
||||
|
||||
uint32_t stride_batch_x = ne00*ne01;
|
||||
|
@ -2138,12 +2147,12 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
|
|||
}
|
||||
|
||||
// compute
|
||||
ggml_vk_matmul(ctx, *pipeline, { *d_X, x_buf_offset, x_sz * ne02 * ne03 }, { *d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { *d_D, d_buf_offset, d_sz * ne12 * ne13 }, { vk_prealloc_split_k, 0, split_k_d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, 0, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21, compq); // NOLINT
|
||||
ggml_vk_matmul(ctx, *pipeline, { *d_X, x_buf_offset, x_sz * ne02 * ne03 }, { *d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { *d_D, d_buf_offset, d_sz * ne12 * ne13 }, { vk_prealloc_split_k, 0, split_k_d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, 0, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13, compq);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2180,7 +2189,6 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context& ctx, const ggml_tensor * src0,
|
|||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
|
||||
vk_queue& compq = vk_device.compute_queue;
|
||||
const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
|
||||
|
||||
const bool qx_needs_dequant = x_non_contig;
|
||||
|
@ -2269,13 +2277,13 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context& ctx, const ggml_tensor * src0,
|
|||
ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_0, src0, { *d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { *d_X, 0, VK_WHOLE_SIZE }, src0->type);
|
||||
} else if (load_x) {
|
||||
// copy data to device
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0), compq);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
|
||||
}
|
||||
if (y_non_contig) {
|
||||
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
|
||||
ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_1, src1, { *d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { *d_Y, 0, VK_WHOLE_SIZE }, src1->type);
|
||||
} else if (load_y) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1), compq);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
|
||||
}
|
||||
|
||||
for (uint64_t i13 = 0; i13 < ne13; i13++) {
|
||||
|
@ -2298,20 +2306,20 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context& ctx, const ggml_tensor * src0,
|
|||
|
||||
if (!y_non_contig && qy_needs_dequant) {
|
||||
const std::vector<int> pc = { (int)ne11, (int)ne10, (int)ne10, (int)ne10 };
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, *to_fp16_vk_1, { { *d_Qy, qy_offset, qy_sz }, { *d_Y, y_offset, y_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)y_ne, 1, 1});
|
||||
}
|
||||
|
||||
// compute
|
||||
const std::array<int, 3> pc = { (int)ne00, (int)(y_shader_offset / ggml_type_size(src1->type)), (int)(d_shader_offset / ggml_type_size(dst->type))};
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, *dmmv, { { *d_X, x_offset, x_sz }, { *d_Y, y_buffer_offset, y_sz + y_shader_offset }, { *d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_offset, d, sizeof(float) * d_ne, compq);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_offset, d, sizeof(float) * d_ne);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2344,8 +2352,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context& ctx, const ggml_tensor
|
|||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
|
||||
vk_queue& compq = vk_device.compute_queue;
|
||||
|
||||
const uint64_t x_ne = ne00 * ne01 * ne02;
|
||||
const uint64_t y_ne = ne10 * ne11 * ne12;
|
||||
const uint64_t d_ne = ne01 * ne11 * ne12;
|
||||
|
@ -2385,19 +2391,19 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context& ctx, const ggml_tensor
|
|||
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
||||
|
||||
if (load_y) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1), compq);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
|
||||
}
|
||||
|
||||
// compute
|
||||
const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, vk_pipeline_mul_mat_vec_p021_f16_f32, { { *d_Qx, qx_buf_offset, qx_sz }, { *d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { *d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne, compq);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2431,8 +2437,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context& ctx, const ggml_tensor *
|
|||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
|
||||
vk_queue& compq = vk_device.compute_queue;
|
||||
|
||||
const uint64_t d_ne = ne01 * ne11 * ne12;
|
||||
|
||||
const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
|
||||
|
@ -2473,19 +2477,19 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context& ctx, const ggml_tensor *
|
|||
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
||||
|
||||
if (load_y) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1), compq);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
|
||||
}
|
||||
|
||||
// compute
|
||||
const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, vk_pipeline_mul_mat_vec_nc_f16_f32, { { *d_Qx, qx_buf_offset, qx_sz }, { *d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { *d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
||||
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne, compq);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2580,7 +2584,7 @@ static void ggml_vk_op_repeat(vk_context& ctx, const ggml_tensor * src0, const g
|
|||
}
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(ctx, vk_device.compute_queue);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ctx.s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies);
|
||||
|
||||
(void) src1;
|
||||
|
@ -2812,11 +2816,11 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
|
|||
|
||||
// copy src0 to device
|
||||
if (transfer_src0) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_X, 0, src0, 0, 0, ggml_nrows(src0), vk_device.compute_queue);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
|
||||
vk_staging_offset = x_sz * ne02 * ne03;
|
||||
}
|
||||
if (transfer_src1) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1), vk_device.compute_queue);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
|
||||
}
|
||||
|
||||
// Single call if dimension 2 is contiguous
|
||||
|
@ -2848,21 +2852,21 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
|
|||
|
||||
if (!use_src1 && op == GGML_OP_SOFT_MAX) {
|
||||
// Empty src1 is possible on soft_max, but the shader needs a buffer
|
||||
ggml_vk_sync_buffers(ctx, vk_device.compute_queue);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, *pipeline, { { *d_X, x_buf_offset, x_sz }, { vk_prealloc_y, 0, vk_prealloc_y.size }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
} else if (use_src1) {
|
||||
ggml_vk_sync_buffers(ctx, vk_device.compute_queue);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, *pipeline, { { *d_X, x_buf_offset, x_sz }, { *d_Y, y_buf_offset, y_sz }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
} else {
|
||||
ggml_vk_sync_buffers(ctx, vk_device.compute_queue);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, *pipeline, { { *d_X, x_buf_offset, x_sz }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
}
|
||||
if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) {
|
||||
ggml_vk_d2h_tensor_2d(ctx, d_D, 0, dst, vk_device.compute_queue);
|
||||
ggml_vk_d2h_tensor_2d(ctx, d_D, 0, dst);
|
||||
} else if(dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_buffer_read_async(ctx, d_D, 0, d, d_sz, vk_device.compute_queue);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, 0, d, d_sz);
|
||||
}
|
||||
} else {
|
||||
ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, ne02 * ne03);
|
||||
|
@ -2892,18 +2896,18 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
|
|||
|
||||
if (!use_src1 && op == GGML_OP_SOFT_MAX) {
|
||||
// Empty src1 is possible on soft_max, but the shader needs a buffer
|
||||
ggml_vk_sync_buffers(ctx, vk_device.compute_queue);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, *pipeline, { { *d_X, x_buf_offset, x_sz }, { vk_prealloc_y, 0, vk_prealloc_y.size }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
} else if (use_src1) {
|
||||
ggml_vk_sync_buffers(ctx, vk_device.compute_queue);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, *pipeline, { { *d_X, x_buf_offset + x_offset, x_sz }, { *d_Y, y_buf_offset + y_offset, y_sz }, { *d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
} else {
|
||||
ggml_vk_sync_buffers(ctx, vk_device.compute_queue);
|
||||
ggml_vk_sync_buffers(ctx);
|
||||
ggml_vk_dispatch_pipeline(*ctx.s, *pipeline, { { *d_X, x_buf_offset + x_offset, x_sz }, { *d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
}
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz, vk_device.compute_queue);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3008,11 +3012,10 @@ static void ggml_vk_rope(vk_context& ctx, const ggml_tensor * src0, const ggml_t
|
|||
// No-op for the GPU: nothing is dispatched. The only work happens when dst
// belongs to the CPU backend, in which case the whole device buffer backing
// src0 (offset 0, d_D->size bytes) is read back into dst->data.
//
// ctx  - context the sync and the asynchronous read are recorded into
// src0 - source tensor; its `extra` is used as ggml_tensor_extra_gpu
// dst  - destination tensor; only touched when dst->backend == GGML_BACKEND_CPU
static void ggml_vk_nop(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
    // If backend is CPU, data from src0 has to be copied off the device
    if (dst->backend == GGML_BACKEND_CPU) {
        ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
        vk_buffer * d_D = &extra_src0->buffer_gpu;
        // Queue-less overloads only: the context supplies the queue, so the
        // sync and the read-back are each issued exactly once.
        ggml_vk_sync_buffers(ctx);
        ggml_vk_buffer_read_async(ctx, d_D, 0, dst->data, d_D->size);
    }
}
|
||||
|
||||
|
@ -3824,8 +3827,8 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){
|
|||
}
|
||||
|
||||
if (vk_ctx == nullptr) {
|
||||
vk_ctx = ggml_vk_create_context();
|
||||
ggml_vk_ctx_begin(*vk_ctx, vk_device.compute_queue);
|
||||
vk_ctx = ggml_vk_create_context(vk_device.compute_queue);
|
||||
ggml_vk_ctx_begin(*vk_ctx);
|
||||
}
|
||||
|
||||
switch (node->op) {
|
||||
|
@ -4013,7 +4016,7 @@ bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
|
||||
ggml_vk_submit(vk_device.compute_queue, ctx.seqs, vk_fence);
|
||||
ggml_vk_submit(ctx, vk_fence);
|
||||
}
|
||||
|
||||
if (tensor == ctx.exit_tensor) {
|
||||
|
@ -4194,7 +4197,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
|
|||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
ggml_vk_buffer_write(&extra->buffer_gpu, extra->offset + offset, data, size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_write(&extra->buffer_gpu, extra->offset + offset, data, size);
|
||||
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
@ -4207,7 +4210,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
|
|||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset + offset, data, size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset + offset, data, size);
|
||||
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
@ -4217,7 +4220,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
|
|||
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
ggml_vk_buffer_copy(&src_extra->buffer_gpu, src_extra->offset, &dst_extra->buffer_gpu, dst_extra->offset, ggml_nbytes(src), vk_device.transfer_queue);
|
||||
ggml_vk_buffer_copy(&src_extra->buffer_gpu, src_extra->offset, &dst_extra->buffer_gpu, dst_extra->offset, ggml_nbytes(src));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -4229,7 +4232,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
|
|||
// Fills the whole device buffer behind `buffer` with the byte `value`
// (offset 0, buffer->size bytes).
//
// buffer - backend buffer whose context is a ggml_backend_vk_buffer_context
// value  - byte value written to every position
GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

    // Single memset using the queue-less signature; the stale call that also
    // passed vk_device.transfer_queue is dropped so the fill happens once.
    ggml_vk_buffer_memset(&ctx->dev_buffer, 0, value, buffer->size);
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
|
||||
|
@ -4396,6 +4399,96 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t
|
|||
UNUSED(backend);
|
||||
}
|
||||
|
||||
// Records an asynchronous host-to-device write of `size` bytes from `data`
// into `tensor`'s GPU buffer at (extra->offset + offset). The write goes into
// the shared vk_ctx; if none exists yet, one is created on the transfer queue
// and begun first.
//
// backend - unused
// tensor  - must use the Vulkan buffer type and have backend GGML_BACKEND_GPU
// data    - host source pointer
// offset  - byte offset added to the tensor's own offset inside its buffer
// size    - number of bytes to write
GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
#ifdef VK_DEBUG
    std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
#endif
    UNUSED(backend);

    GGML_ASSERT(tensor->buffer->buft == ggml_backend_vk_buffer_type() && "unsupported buffer type");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

    ggml_tensor_extra_gpu * const gpu_extra = (ggml_tensor_extra_gpu *) tensor->extra;
    const size_t dst_offset = gpu_extra->offset + offset;

    if (!vk_ctx) {
        // Initialize new transfer context
        vk_ctx = ggml_vk_create_context(vk_device.transfer_queue);
        ggml_vk_ctx_begin(*vk_ctx);
    }

    ggml_vk_buffer_write_async(*vk_ctx, &gpu_extra->buffer_gpu, dst_offset, data, size);
}
|
||||
|
||||
// Records an asynchronous device-to-host read of `size` bytes from `tensor`'s
// GPU buffer at (extra->offset + offset) into `data`. The read goes into the
// shared vk_ctx; if none exists yet, one is created on the transfer queue and
// begun first.
//
// backend - unused
// tensor  - must use the Vulkan buffer type and have backend GGML_BACKEND_GPU
// data    - host destination pointer
// offset  - byte offset added to the tensor's own offset inside its buffer
// size    - number of bytes to read
GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
#ifdef VK_DEBUG
    std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
#endif
    UNUSED(backend);

    GGML_ASSERT(tensor->buffer->buft == ggml_backend_vk_buffer_type() && "unsupported buffer type");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);

    ggml_tensor_extra_gpu * const gpu_extra = (ggml_tensor_extra_gpu *) tensor->extra;
    const size_t src_offset = gpu_extra->offset + offset;

    if (!vk_ctx) {
        // Initialize new transfer context
        vk_ctx = ggml_vk_create_context(vk_device.transfer_queue);
        ggml_vk_ctx_begin(*vk_ctx);
    }

    ggml_vk_buffer_read_async(*vk_ctx, &gpu_extra->buffer_gpu, src_offset, data, size);
}
|
||||
|
||||
// Records an asynchronous device-to-device copy of ggml_nbytes(src) bytes from
// src's GPU buffer to dst's GPU buffer, each at its tensor's own offset.
//
// backend - unused
// src     - source tensor; its buffer must satisfy ggml_backend_buffer_is_vk
// dst     - destination tensor; its buffer type must be the Vulkan buffer type
//
// Returns true when the copy was recorded, false when the buffer combination
// is not handled here (nothing is recorded in that case).
GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
#ifdef VK_DEBUG
    std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
#endif
    // Placed up front so it is reachable on every path (it used to sit after
    // the final return statement).
    UNUSED(backend);

    if (dst->buffer->buft != ggml_backend_vk_buffer_type() || !ggml_backend_buffer_is_vk(src->buffer)) {
        return false;
    }

    ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

    if (vk_ctx == nullptr) {
        // Initialize new transfer context
        vk_ctx = ggml_vk_create_context(vk_device.transfer_queue);
        ggml_vk_ctx_begin(*vk_ctx);
    }

    ggml_vk_buffer_copy_async(*vk_ctx, &src_extra->buffer_gpu, src_extra->offset, &dst_extra->buffer_gpu, dst_extra->offset, ggml_nbytes(src));
    return true;
}
|
||||
|
||||
// Flushes the pending shared context, if any, and waits for it to finish.
// Steps, in order:
//   1. ggml_vk_ctx_end on the context
//   2. perform every queued in_memcpys copy
//   3. submit the context with vk_fence, wait on the fence (no timeout), reset it
//   4. perform every queued out_memcpys copy
//   5. clear vk_ctx so the next async call starts a fresh context
// Does nothing when there is no pending context.
//
// backend - unused
GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
#ifdef VK_DEBUG
    std::cerr << "ggml_backend_vk_synchronize()" << std::endl;
#endif
    UNUSED(backend);

    if (!vk_ctx) {
        return;
    }

    auto& pending = *vk_ctx;

    ggml_vk_ctx_end(pending);

    // These copies run before the submit below.
    for (auto& staged : pending.in_memcpys) {
        memcpy(staged.dst, staged.src, staged.n);
    }

    ggml_vk_submit(pending, vk_fence);
    VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
    vk_device.device.resetFences({ vk_fence });

    // These copies run only after the fence wait has returned.
    for (auto& staged : pending.out_memcpys) {
        memcpy(staged.dst, staged.src, staged.n);
    }

    vk_ctx = nullptr;
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
// ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context;
|
||||
|
||||
|
@ -4532,14 +4625,15 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|||
UNUSED(backend);
|
||||
}
|
||||
|
||||
// TODO: enable async and synchronize
|
||||
static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .get_name = */ ggml_backend_vk_name,
|
||||
/* .free = */ ggml_backend_vk_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
/* .synchronize = */ NULL,
|
||||
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
||||
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
||||
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue