Simplify gpu_extras by removing events and putting staging memcpys into contexts
This commit is contained in:
parent
82ce1c4da2
commit
5a8a07ec60
1 changed files with 61 additions and 88 deletions
149
ggml-vulkan.cpp
149
ggml-vulkan.cpp
|
@ -190,15 +190,13 @@ struct vk_context {
|
|||
std::vector<vk_sequence> seqs;
|
||||
|
||||
ggml_tensor * exit_tensor;
|
||||
|
||||
std::vector<vk_staging_memcpy> in_memcpys;
|
||||
std::vector<vk_staging_memcpy> out_memcpys;
|
||||
};
|
||||
|
||||
struct ggml_tensor_extra_gpu {
|
||||
bool ready;
|
||||
std::vector<vk_staging_memcpy> in_memcpys;
|
||||
std::vector<vk_staging_memcpy> out_memcpys;
|
||||
|
||||
vk::Event in0_staging_event;
|
||||
vk::Event in1_staging_event;
|
||||
|
||||
size_t ctx_idx;
|
||||
|
||||
|
@ -207,10 +205,6 @@ struct ggml_tensor_extra_gpu {
|
|||
|
||||
void reset() {
|
||||
ready = false;
|
||||
in_memcpys.clear();
|
||||
out_memcpys.clear();
|
||||
in0_staging_event = (vk::Event) VK_NULL_HANDLE;
|
||||
in1_staging_event = (vk::Event) VK_NULL_HANDLE;
|
||||
ctx_idx = 0;
|
||||
buffer_gpu.size = 0;
|
||||
offset = 0;
|
||||
|
@ -1392,7 +1386,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, vk_queue& q, std::vector<vk_staging_memcpy>* memcpys = nullptr, vk::Event * event = nullptr, bool sync_staging = false) {
|
||||
static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, vk_queue& q, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1485,10 +1479,6 @@ static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_
|
|||
|
||||
VkBufferCopy buf_copy{ staging_offset, offset, copy_size };
|
||||
|
||||
if (event != nullptr) {
|
||||
*event = ggml_vk_create_event();
|
||||
ggml_vk_wait_events(ctx.s->buffer, { *event }, { vk::PipelineStageFlagBits::eHost }, q.stage_flags);
|
||||
}
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
vkCmdCopyBuffer(ctx.s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
|
||||
|
||||
|
@ -1496,16 +1486,16 @@ static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_
|
|||
for (uint64_t i2 = 0; i2 < ne2; i2++) {
|
||||
// Find longest contiguous slice
|
||||
if (ne1*nb1 == dstnb2) {
|
||||
deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, memcpys);
|
||||
deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &ctx.in_memcpys);
|
||||
} else {
|
||||
for (uint64_t i1 = 0; i1 < ne1; i1++) {
|
||||
if (ne0*nb0/bs == dstnb1) {
|
||||
deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, memcpys);
|
||||
deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &ctx.in_memcpys);
|
||||
} else {
|
||||
const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
|
||||
const uint64_t d_off = staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
|
||||
for (uint64_t i0 = 0; i0 < ne0; i0++) {
|
||||
deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, memcpys);
|
||||
deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &ctx.in_memcpys);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1514,7 +1504,7 @@ static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_2d_async(vk_context& ctx, vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, std::vector<vk_staging_memcpy>* memcpys = nullptr, vk::Event * event = nullptr, bool sync_staging = false) {
|
||||
static void ggml_vk_buffer_write_2d_async(vk_context& ctx, vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1584,27 +1574,23 @@ static void ggml_vk_buffer_write_2d_async(vk_context& ctx, vk_buffer* dst, size_
|
|||
offset,
|
||||
copy_size};
|
||||
|
||||
if (event != nullptr) {
|
||||
*event = ggml_vk_create_event();
|
||||
ggml_vk_wait_events(ctx.s->buffer, { *event }, { vk::PipelineStageFlagBits::eHost }, q.stage_flags);
|
||||
}
|
||||
ggml_vk_sync_buffers(ctx, q);
|
||||
vkCmdCopyBuffer(ctx.s->buffer, staging->buffer, dst->buffer, 1, &buf_copy);
|
||||
|
||||
if (width == spitch) {
|
||||
deferred_memcpy((uint8_t *)staging->ptr + staging_offset, src, width * height, memcpys);
|
||||
deferred_memcpy((uint8_t *)staging->ptr + staging_offset, src, width * height, &ctx.in_memcpys);
|
||||
} else {
|
||||
for (size_t i = 0; i < height; i++) {
|
||||
deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, memcpys);
|
||||
deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, &ctx.in_memcpys);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_async(vk_context& ctx, vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, std::vector<vk_staging_memcpy>* memcpys = nullptr, vk::Event * event = nullptr, bool sync_staging = false) {
|
||||
static void ggml_vk_buffer_write_async(vk_context& ctx, vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
return ggml_vk_buffer_write_2d_async(ctx, dst, offset, src, size, size, 1, q, memcpys, event, sync_staging);
|
||||
return ggml_vk_buffer_write_2d_async(ctx, dst, offset, src, size, size, 1, q, sync_staging);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q) {
|
||||
|
@ -1621,8 +1607,13 @@ static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void *
|
|||
} else {
|
||||
vk_context * ctx = ggml_vk_create_context();
|
||||
ggml_vk_ctx_begin(*ctx, q);
|
||||
ggml_vk_buffer_write_2d_async(*ctx, dst, offset, src, spitch, width, height, q, nullptr, nullptr, true);
|
||||
ggml_vk_buffer_write_2d_async(*ctx, dst, offset, src, spitch, width, height, q, true);
|
||||
ggml_vk_ctx_end(*ctx);
|
||||
|
||||
for (auto& cpy : ctx->in_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
|
||||
ggml_vk_submit(q, ctx->seqs, vk_fence);
|
||||
VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
|
||||
vk_device.device.resetFences({ vk_fence });
|
||||
|
@ -1636,7 +1627,7 @@ static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src
|
|||
ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1, q);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_read_2d_async(vk_context& ctx, vk_buffer* src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_queue& q, std::vector<vk_staging_memcpy>* memcpys = nullptr, bool sync_staging = false) {
|
||||
static void ggml_vk_buffer_read_2d_async(vk_context& ctx, vk_buffer* src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, vk_queue& q, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1702,13 +1693,11 @@ static void ggml_vk_buffer_read_2d_async(vk_context& ctx, vk_buffer* src, size_t
|
|||
ggml_vk_sync_buffers(ctx, q);
|
||||
ctx.s->buffer.copyBuffer(src->buffer, staging->buffer, slices);
|
||||
|
||||
GGML_ASSERT(memcpys != nullptr);
|
||||
|
||||
deferred_memcpy(dst, staging->ptr, copy_size, memcpys);
|
||||
deferred_memcpy(dst, staging->ptr, copy_size, &ctx.out_memcpys);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_read_async(vk_context& ctx, vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, std::vector<vk_staging_memcpy>* memcpys = nullptr, bool sync_staging = false) {
|
||||
return ggml_vk_buffer_read_2d_async(ctx, src, offset, dst, size, size, size, 1, q, memcpys, sync_staging);
|
||||
static void ggml_vk_buffer_read_async(vk_context& ctx, vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, bool sync_staging = false) {
|
||||
return ggml_vk_buffer_read_2d_async(ctx, src, offset, dst, size, size, size, 1, q, sync_staging);
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q) {
|
||||
|
@ -1722,14 +1711,14 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
|
|||
} else {
|
||||
vk_context * ctx = ggml_vk_create_context();
|
||||
ggml_vk_ctx_begin(*ctx, q);
|
||||
std::vector<vk_staging_memcpy> staging;
|
||||
ggml_vk_buffer_read_async(*ctx, src, offset, dst, size, q, &staging, true);
|
||||
ggml_vk_buffer_read_async(*ctx, src, offset, dst, size, q, true);
|
||||
ggml_vk_ctx_end(*ctx);
|
||||
|
||||
ggml_vk_submit(q, ctx->seqs, vk_fence);
|
||||
VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
|
||||
vk_device.device.resetFences({ vk_fence });
|
||||
|
||||
for (auto& cpy : staging) {
|
||||
for (auto& cpy : ctx->out_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
}
|
||||
|
@ -1764,7 +1753,7 @@ static void ggml_vk_buffer_memset(vk_buffer* dst, size_t offset, uint32_t c, siz
|
|||
vk_device.device.resetFences({ vk_fence });
|
||||
}
|
||||
|
||||
static void ggml_vk_h2d_tensor_2d(vk_context& ctx, vk_buffer * dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1, vk_queue& q, std::vector<vk_staging_memcpy> * memcpys = nullptr, vk::Event * event = nullptr) {
|
||||
static void ggml_vk_h2d_tensor_2d(vk_context& ctx, vk_buffer * dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1, vk_queue& q) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl;
|
||||
#endif
|
||||
|
@ -1781,17 +1770,17 @@ static void ggml_vk_h2d_tensor_2d(vk_context& ctx, vk_buffer * dst, size_t offse
|
|||
|
||||
const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
|
||||
if (nb0 == ts && nb1 == row_length) {
|
||||
return ggml_vk_buffer_write_async(ctx, dst, offset, x, i1*nb1, q, memcpys, event);
|
||||
return ggml_vk_buffer_write_async(ctx, dst, offset, x, i1*nb1, q);
|
||||
}
|
||||
if (nb0 == ts && (i1 == ne1 || !ggml_is_permuted(src))) {
|
||||
return ggml_vk_buffer_write_2d_async(ctx, dst, offset, x, nb1, row_length, i1, q, memcpys, event);
|
||||
return ggml_vk_buffer_write_2d_async(ctx, dst, offset, x, nb1, row_length, i1, q);
|
||||
}
|
||||
|
||||
GGML_ASSERT(i3 == 0);
|
||||
GGML_ASSERT(i2 == 0);
|
||||
GGML_ASSERT(i1 == (uint64_t) ggml_nrows(src));
|
||||
|
||||
return ggml_vk_buffer_write_nc_async(ctx, dst, offset, src, q, memcpys, event);
|
||||
return ggml_vk_buffer_write_nc_async(ctx, dst, offset, src, q);
|
||||
}
|
||||
|
||||
static void ggml_vk_d2h_tensor_2d(vk_context& ctx, vk_buffer * src, size_t offset, const ggml_tensor * dst, vk_queue& q) {
|
||||
|
@ -2121,7 +2110,7 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
|
|||
} else if (load_x || qx_needs_dequant) {
|
||||
if (load_x) {
|
||||
// copy data to device
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0), compq, &extra->in_memcpys, &extra->in0_staging_event);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0), compq);
|
||||
vk_staging_offset = qx_sz * ne02 * ne03;
|
||||
}
|
||||
|
||||
|
@ -2134,7 +2123,7 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
|
|||
if (y_non_contig) {
|
||||
ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_1, src1, { *d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { *d_Y, 0, VK_WHOLE_SIZE }, dst->type);
|
||||
} else if (load_y) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1), compq, &extra->in_memcpys, &extra->in1_staging_event);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1), compq);
|
||||
}
|
||||
|
||||
uint32_t stride_batch_x = ne00*ne01;
|
||||
|
@ -2154,7 +2143,7 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
|
|||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13, compq, &extra->out_memcpys);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13, compq);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2280,13 +2269,13 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context& ctx, const ggml_tensor * src0,
|
|||
ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_0, src0, { *d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { *d_X, 0, VK_WHOLE_SIZE }, src0->type);
|
||||
} else if (load_x) {
|
||||
// copy data to device
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0), compq, &extra->in_memcpys, &extra->in0_staging_event);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0), compq);
|
||||
}
|
||||
if (y_non_contig) {
|
||||
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
|
||||
ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_1, src1, { *d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { *d_Y, 0, VK_WHOLE_SIZE }, src1->type);
|
||||
} else if (load_y) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1), compq, &extra->in_memcpys, &extra->in1_staging_event);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1), compq);
|
||||
}
|
||||
|
||||
for (uint64_t i13 = 0; i13 < ne13; i13++) {
|
||||
|
@ -2322,7 +2311,7 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context& ctx, const ggml_tensor * src0,
|
|||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_offset, d, sizeof(float) * d_ne, compq, &extra->out_memcpys);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_offset, d, sizeof(float) * d_ne, compq);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2396,7 +2385,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context& ctx, const ggml_tensor
|
|||
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
||||
|
||||
if (load_y) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1), compq, &extra->in_memcpys, &extra->in1_staging_event);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1), compq);
|
||||
}
|
||||
|
||||
// compute
|
||||
|
@ -2408,7 +2397,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context& ctx, const ggml_tensor
|
|||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne, compq, &extra->out_memcpys);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne, compq);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2484,7 +2473,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context& ctx, const ggml_tensor *
|
|||
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
||||
|
||||
if (load_y) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1), compq, &extra->in_memcpys, &extra->in1_staging_event);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1), compq);
|
||||
}
|
||||
|
||||
// compute
|
||||
|
@ -2496,7 +2485,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context& ctx, const ggml_tensor *
|
|||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_sync_buffers(ctx, compq);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne, compq, &extra->out_memcpys);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne, compq);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2823,11 +2812,11 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
|
|||
|
||||
// copy src0 to device
|
||||
if (transfer_src0) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_X, 0, src0, 0, 0, ggml_nrows(src0), vk_device.compute_queue, &extra->in_memcpys, &extra->in0_staging_event);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_X, 0, src0, 0, 0, ggml_nrows(src0), vk_device.compute_queue);
|
||||
vk_staging_offset = x_sz * ne02 * ne03;
|
||||
}
|
||||
if (transfer_src1) {
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1), vk_device.compute_queue, &extra->in_memcpys, &extra->in1_staging_event);
|
||||
ggml_vk_h2d_tensor_2d(ctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1), vk_device.compute_queue);
|
||||
}
|
||||
|
||||
// Single call if dimension 2 is contiguous
|
||||
|
@ -2873,7 +2862,7 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
|
|||
} else if(dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
float * d = (float *) dst->data;
|
||||
ggml_vk_buffer_read_async(ctx, d_D, 0, d, d_sz, vk_device.compute_queue, &extra->out_memcpys);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, 0, d, d_sz, vk_device.compute_queue);
|
||||
}
|
||||
} else {
|
||||
ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, ne02 * ne03);
|
||||
|
@ -2914,7 +2903,7 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
|
|||
}
|
||||
if (dst->backend == GGML_BACKEND_CPU) {
|
||||
// copy dst to host
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz, vk_device.compute_queue, &extra->out_memcpys);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz, vk_device.compute_queue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3023,7 +3012,7 @@ static void ggml_vk_nop(vk_context& ctx, const ggml_tensor * src0, ggml_tensor *
|
|||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
vk_buffer * d_D = &extra_src0->buffer_gpu;
|
||||
ggml_vk_sync_buffers(ctx, vk_device.compute_queue);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, 0, dst->data, d_D->size, vk_device.compute_queue, &extra->out_memcpys);
|
||||
ggml_vk_buffer_read_async(ctx, d_D, 0, dst->data, d_D->size, vk_device.compute_queue);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3548,17 +3537,6 @@ static void ggml_vk_tensor_stride_order(const ggml_tensor * tensor, std::array<i
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: Remove
|
||||
static size_t ggml_vk_tensor_size(const ggml_tensor * tensor) {
|
||||
std::array<int, 4> order;
|
||||
ggml_vk_tensor_stride_order(tensor, order);
|
||||
// Handle weird stride configurations
|
||||
if (tensor->nb[order[2]] == tensor->nb[order[3]] && tensor->ne[order[2]] > tensor->ne[order[3]]) {
|
||||
return tensor->ne[order[2]]*tensor->nb[order[2]];
|
||||
}
|
||||
return tensor->ne[order[3]]*tensor->nb[order[3]];
|
||||
}
|
||||
|
||||
static ggml_tensor * ggml_vk_find_last_use(const ggml_tensor * node, ggml_cgraph * graph) {
|
||||
GGML_ASSERT(node != nullptr);
|
||||
|
||||
|
@ -4026,35 +4004,30 @@ bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
|
||||
GGML_ASSERT(extra->ready);
|
||||
|
||||
// Do staging buffer copies
|
||||
for (auto& cpy : extra->in_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
// Trigger staging events if any exists
|
||||
if (extra->in0_staging_event != (vk::Event) VK_NULL_HANDLE) {
|
||||
vk_device.device.setEvent(extra->in0_staging_event);
|
||||
}
|
||||
if (extra->in1_staging_event != (vk::Event) VK_NULL_HANDLE) {
|
||||
vk_device.device.setEvent(extra->in1_staging_event);
|
||||
}
|
||||
|
||||
vk_context& ctx = vk_gc.contexts[extra->ctx_idx];
|
||||
|
||||
ggml_vk_submit(vk_device.compute_queue, ctx.seqs, vk_fence);
|
||||
// Only run if ctx hasn't been submitted yet
|
||||
if (!ctx.seqs.empty()) {
|
||||
// Do staging buffer copies
|
||||
for (auto& cpy : ctx.in_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
|
||||
ggml_vk_submit(vk_device.compute_queue, ctx.seqs, vk_fence);
|
||||
}
|
||||
|
||||
if (tensor == ctx.exit_tensor) {
|
||||
VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
|
||||
vk_device.device.resetFences({ vk_fence });
|
||||
|
||||
// Do staging buffer copies
|
||||
for (auto& cpy : extra->out_memcpys) {
|
||||
for (auto& cpy : ctx.out_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
}
|
||||
extra->out_memcpys.clear();
|
||||
ctx.in_memcpys.clear();
|
||||
ctx.out_memcpys.clear();
|
||||
}
|
||||
|
||||
extra->in_memcpys.clear();
|
||||
|
||||
extra->ready = false;
|
||||
|
||||
return true;
|
||||
|
@ -4662,7 +4635,7 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
|||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
const size_t tensor_size = ggml_vk_tensor_size(tensor);
|
||||
const size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
ggml_vk_buffer_read((vk_buffer *)tensor->data, 0, tensor_data, tensor_size, vk_device.transfer_queue);
|
||||
|
@ -4762,7 +4735,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
if (src0 != nullptr) {
|
||||
src0_clone = ggml_dup_tensor(ctx, src0);
|
||||
|
||||
src0_size = ggml_vk_tensor_size(src0);
|
||||
src0_size = ggml_nbytes(src0);
|
||||
|
||||
src0_buffer = malloc(src0_size);
|
||||
src0_clone->data = src0_buffer;
|
||||
|
@ -4805,7 +4778,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
if (src1 != nullptr) {
|
||||
src1_clone = ggml_dup_tensor(ctx, src1);
|
||||
|
||||
src1_size = ggml_vk_tensor_size(src1);
|
||||
src1_size = ggml_nbytes(src1);
|
||||
|
||||
src1_buffer = malloc(src1_size);
|
||||
src1_clone->data = src1_buffer;
|
||||
|
@ -4951,7 +4924,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
ggml_vk_print_tensor(tensor_clone, "tensor_clone");
|
||||
}
|
||||
|
||||
comp_size = ggml_vk_tensor_size(tensor_clone);
|
||||
comp_size = ggml_nbytes(tensor_clone);
|
||||
|
||||
comp_result = malloc(comp_size);
|
||||
memcpy(comp_result, tensor_clone->data, comp_size);
|
||||
|
@ -4984,7 +4957,7 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
size_t tensor_size = ggml_vk_tensor_size(tensor);
|
||||
size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue