Fix compiler warnings
This commit is contained in:
parent
f84c54fe23
commit
c0f3474ed5
1 changed files with 178 additions and 157 deletions
335
ggml-vulkan.cpp
335
ggml-vulkan.cpp
|
@ -197,6 +197,22 @@ struct ggml_tensor_extra_gpu {
|
||||||
uint64_t view_offset;
|
uint64_t view_offset;
|
||||||
|
|
||||||
bool prepared;
|
bool prepared;
|
||||||
|
|
||||||
|
void reset() {
|
||||||
|
ready = false;
|
||||||
|
in_memcpys.clear();
|
||||||
|
out_memcpys.clear();
|
||||||
|
in0_staging_event = (vk::Event) VK_NULL_HANDLE;
|
||||||
|
in1_staging_event = (vk::Event) VK_NULL_HANDLE;
|
||||||
|
ctx_idx = 0;
|
||||||
|
d_idx = 0;
|
||||||
|
tensor_size = 0;
|
||||||
|
buffer_static = 0;
|
||||||
|
buffer_gpu = nullptr;
|
||||||
|
base_buffer_offset = 0;
|
||||||
|
view_offset = 0;
|
||||||
|
prepared = false;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_vk_garbage_collector {
|
struct ggml_vk_garbage_collector {
|
||||||
|
@ -1397,19 +1413,19 @@ static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_
|
||||||
// Memory is pinned, use as staging buffer
|
// Memory is pinned, use as staging buffer
|
||||||
std::vector<vk::BufferCopy> slices;
|
std::vector<vk::BufferCopy> slices;
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (uint64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
for (uint64_t i2 = 0; i2 < ne2; i2++) {
|
||||||
// Find longest contiguous slice
|
// Find longest contiguous slice
|
||||||
if (ne1*nb1 == dstnb2) {
|
if (ne1*nb1 == dstnb2) {
|
||||||
slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 });
|
slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 });
|
||||||
} else {
|
} else {
|
||||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
for (uint64_t i1 = 0; i1 < ne1; i1++) {
|
||||||
if (ne0*nb0/bs == dstnb1) {
|
if (ne0*nb0/bs == dstnb1) {
|
||||||
slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 });
|
slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 });
|
||||||
} else {
|
} else {
|
||||||
const size_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
|
const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
|
||||||
const size_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
|
const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
|
||||||
for (size_t i0 = 0; i0 < ne0; i0++) {
|
for (uint64_t i0 = 0; i0 < ne0; i0++) {
|
||||||
slices.push_back({ s_off + i1*nb0, d_off + i0*dstnb0, dstnb0 });
|
slices.push_back({ s_off + i1*nb0, d_off + i0*dstnb0, dstnb0 });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1445,19 +1461,19 @@ static void ggml_vk_buffer_write_nc_async(vk_context& ctx, vk_buffer* dst, size_
|
||||||
ggml_vk_sync_buffers(ctx.s->buffer, { ggml_vk_subbuffer(*dst) }, q, vk::AccessFlagBits::eMemoryRead, vk::AccessFlagBits::eMemoryWrite, true);
|
ggml_vk_sync_buffers(ctx.s->buffer, { ggml_vk_subbuffer(*dst) }, q, vk::AccessFlagBits::eMemoryRead, vk::AccessFlagBits::eMemoryWrite, true);
|
||||||
vkCmdCopyBuffer(ctx.s->buffer, vk_staging.buffer, dst->buffer, 1, &buf_copy);
|
vkCmdCopyBuffer(ctx.s->buffer, vk_staging.buffer, dst->buffer, 1, &buf_copy);
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (uint64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
for (uint64_t i2 = 0; i2 < ne2; i2++) {
|
||||||
// Find longest contiguous slice
|
// Find longest contiguous slice
|
||||||
if (ne1*nb1 == dstnb2) {
|
if (ne1*nb1 == dstnb2) {
|
||||||
deferred_memcpy((uint8_t *)vk_staging.ptr + vk_staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, memcpys);
|
deferred_memcpy((uint8_t *)vk_staging.ptr + vk_staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, memcpys);
|
||||||
} else {
|
} else {
|
||||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
for (uint64_t i1 = 0; i1 < ne1; i1++) {
|
||||||
if (ne0*nb0/bs == dstnb1) {
|
if (ne0*nb0/bs == dstnb1) {
|
||||||
deferred_memcpy((uint8_t *)vk_staging.ptr + vk_staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, memcpys);
|
deferred_memcpy((uint8_t *)vk_staging.ptr + vk_staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, memcpys);
|
||||||
} else {
|
} else {
|
||||||
const size_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
|
const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
|
||||||
const size_t d_off = vk_staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
|
const uint64_t d_off = vk_staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
|
||||||
for (size_t i0 = 0; i0 < ne0; i0++) {
|
for (uint64_t i0 = 0; i0 < ne0; i0++) {
|
||||||
deferred_memcpy((uint8_t *)vk_staging.ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, memcpys);
|
deferred_memcpy((uint8_t *)vk_staging.ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, memcpys);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1742,7 +1758,7 @@ static void ggml_vk_h2d_tensor_2d(vk_context& ctx, vk_buffer * dst, size_t offse
|
||||||
|
|
||||||
GGML_ASSERT(i3 == 0);
|
GGML_ASSERT(i3 == 0);
|
||||||
GGML_ASSERT(i2 == 0);
|
GGML_ASSERT(i2 == 0);
|
||||||
GGML_ASSERT(i1 == ggml_nrows(src));
|
GGML_ASSERT(i1 == (uint64_t) ggml_nrows(src));
|
||||||
|
|
||||||
return ggml_vk_buffer_write_nc_async(ctx, dst, offset, src, q, memcpys, event);
|
return ggml_vk_buffer_write_nc_async(ctx, dst, offset, src, q, memcpys, event);
|
||||||
}
|
}
|
||||||
|
@ -1757,8 +1773,8 @@ static void ggml_vk_d2h_tensor_2d(vk_context& ctx, vk_buffer * src, size_t offse
|
||||||
const uint64_t ne3 = dst->ne[3];
|
const uint64_t ne3 = dst->ne[3];
|
||||||
const uint64_t nb0 = dst->nb[0];
|
const uint64_t nb0 = dst->nb[0];
|
||||||
const uint64_t nb1 = dst->nb[1];
|
const uint64_t nb1 = dst->nb[1];
|
||||||
const uint64_t nb2 = dst->nb[2];
|
// const uint64_t nb2 = dst->nb[2];
|
||||||
const uint64_t nb3 = dst->nb[3];
|
// const uint64_t nb3 = dst->nb[3];
|
||||||
const enum ggml_type type = dst->type;
|
const enum ggml_type type = dst->type;
|
||||||
const size_t ts = ggml_type_size(type);
|
const size_t ts = ggml_type_size(type);
|
||||||
const size_t bs = ggml_blck_size(type);
|
const size_t bs = ggml_blck_size(type);
|
||||||
|
@ -1773,7 +1789,7 @@ static void ggml_vk_d2h_tensor_2d(vk_context& ctx, vk_buffer * src, size_t offse
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int ggml_vk_guess_split_k(int m, int n, int k, bool aligned) {
|
static uint32_t ggml_vk_guess_split_k(int m, int n, int k, bool aligned) {
|
||||||
#ifdef VK_DEBUG
|
#ifdef VK_DEBUG
|
||||||
std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ", " << aligned << ")";
|
std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ", " << aligned << ")";
|
||||||
#endif
|
#endif
|
||||||
|
@ -1958,9 +1974,6 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
|
||||||
const uint64_t ne20 = dst->ne[0];
|
const uint64_t ne20 = dst->ne[0];
|
||||||
const uint64_t ne21 = dst->ne[1];
|
const uint64_t ne21 = dst->ne[1];
|
||||||
|
|
||||||
const size_t nb2 = dst->nb[2];
|
|
||||||
const size_t nb3 = dst->nb[3];
|
|
||||||
|
|
||||||
const uint64_t r2 = ne12 / ne02;
|
const uint64_t r2 = ne12 / ne02;
|
||||||
const uint64_t r3 = ne13 / ne03;
|
const uint64_t r3 = ne13 / ne03;
|
||||||
|
|
||||||
|
@ -1984,10 +1997,10 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
|
||||||
const int y_ne = ne11 * ne10;
|
const int y_ne = ne11 * ne10;
|
||||||
const int d_ne = ne11 * ne01;
|
const int d_ne = ne11 * ne01;
|
||||||
|
|
||||||
const int kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ne01, ne11));
|
const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ne01, ne11));
|
||||||
const bool aligned = ne10 == kpad;
|
const bool aligned = ne10 == kpad;
|
||||||
|
|
||||||
const int split_k = ggml_vk_guess_split_k(ne01, ne11, ne10, aligned);
|
const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10, aligned);
|
||||||
|
|
||||||
vk_pipeline * pipeline = ggml_vk_guess_matmul_pipeline(true, !f16_f32_kernel, ne01, ne11, aligned);
|
vk_pipeline * pipeline = ggml_vk_guess_matmul_pipeline(true, !f16_f32_kernel, ne01, ne11, aligned);
|
||||||
|
|
||||||
|
@ -1998,13 +2011,6 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
|
||||||
const uint64_t split_k_d_sz = sizeof(float) * d_ne * split_k;
|
const uint64_t split_k_d_sz = sizeof(float) * d_ne * split_k;
|
||||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||||
|
|
||||||
if (dst->backend == GGML_BACKEND_GPU) {
|
|
||||||
if (d_sz != nb2) {
|
|
||||||
std::cerr << "ERROR: incompatible tensor alignment d_sz=" << d_sz << " nb2=" << nb2 << std::endl;
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||||
|
@ -2014,9 +2020,9 @@ static void ggml_vk_mul_mat_q_f16(vk_context& ctx, const ggml_tensor * src0, con
|
||||||
GGML_ASSERT(d_D != nullptr);
|
GGML_ASSERT(d_D != nullptr);
|
||||||
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
|
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
|
||||||
vk_buffer* d_Qx;
|
vk_buffer* d_Qx;
|
||||||
uint32_t qx_buf_offset = 0;
|
uint64_t qx_buf_offset = 0;
|
||||||
vk_buffer* d_Qy;
|
vk_buffer* d_Qy;
|
||||||
uint32_t qy_buf_offset = 0;
|
uint64_t qy_buf_offset = 0;
|
||||||
vk_buffer* d_X;
|
vk_buffer* d_X;
|
||||||
uint64_t x_buf_offset = 0;
|
uint64_t x_buf_offset = 0;
|
||||||
vk_buffer* d_Y;
|
vk_buffer* d_Y;
|
||||||
|
@ -2134,23 +2140,23 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context& ctx, const ggml_tensor * src0,
|
||||||
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
|
||||||
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
|
GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
const uint64_t ne00 = src0->ne[0];
|
||||||
const int64_t ne01 = src0->ne[1];
|
const uint64_t ne01 = src0->ne[1];
|
||||||
const int64_t ne02 = src0->ne[2];
|
const uint64_t ne02 = src0->ne[2];
|
||||||
const int64_t ne03 = src0->ne[3];
|
const uint64_t ne03 = src0->ne[3];
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
const uint64_t ne10 = src1->ne[0];
|
||||||
const int64_t ne11 = src1->ne[1];
|
const uint64_t ne11 = src1->ne[1];
|
||||||
const int64_t ne12 = src1->ne[2];
|
const uint64_t ne12 = src1->ne[2];
|
||||||
const int64_t ne13 = src1->ne[3];
|
const uint64_t ne13 = src1->ne[3];
|
||||||
|
|
||||||
GGML_ASSERT(ne11 == 1);
|
GGML_ASSERT(ne11 == 1);
|
||||||
|
|
||||||
const int nb2 = dst->nb[2];
|
const uint64_t nb2 = dst->nb[2];
|
||||||
const int nb3 = dst->nb[3];
|
const uint64_t nb3 = dst->nb[3];
|
||||||
|
|
||||||
const int64_t r2 = ne12 / ne02;
|
const uint64_t r2 = ne12 / ne02;
|
||||||
const int64_t r3 = ne13 / ne03;
|
const uint64_t r3 = ne13 / ne03;
|
||||||
|
|
||||||
const bool load_x = src0->backend != GGML_BACKEND_GPU;
|
const bool load_x = src0->backend != GGML_BACKEND_GPU;
|
||||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||||
|
@ -2164,9 +2170,9 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context& ctx, const ggml_tensor * src0,
|
||||||
const bool qx_needs_dequant = x_non_contig;
|
const bool qx_needs_dequant = x_non_contig;
|
||||||
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;
|
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;
|
||||||
|
|
||||||
const int x_ne = ne01 * ne00;
|
const uint64_t x_ne = ne01 * ne00;
|
||||||
const int y_ne = ne11 * ne10;
|
const uint64_t y_ne = ne11 * ne10;
|
||||||
const int d_ne = ne11 * ne01;
|
const uint64_t d_ne = ne11 * ne01;
|
||||||
|
|
||||||
const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), vk_device.properties.limits.minStorageBufferOffsetAlignment);
|
const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), vk_device.properties.limits.minStorageBufferOffsetAlignment);
|
||||||
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
||||||
|
@ -2256,23 +2262,23 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context& ctx, const ggml_tensor * src0,
|
||||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1), compq, &extra->in_memcpys, &extra->in1_staging_event);
|
ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1), compq, &extra->in_memcpys, &extra->in1_staging_event);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
for (uint64_t i13 = 0; i13 < ne13; i13++) {
|
||||||
const int64_t i03 = i13 / r3;
|
const uint64_t i03 = i13 / r3;
|
||||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
for (uint64_t i12 = 0; i12 < ne12; i12++) {
|
||||||
const int64_t i02 = i12 / r2;
|
const uint64_t i02 = i12 / r2;
|
||||||
|
|
||||||
const uint32_t it_idx0 = (i03 * ne02 + i02);
|
const uint64_t it_idx0 = (i03 * ne02 + i02);
|
||||||
const uint32_t it_idx1 = (i13 * ne12 + i12);
|
const uint64_t it_idx1 = (i13 * ne12 + i12);
|
||||||
const uint32_t x_offset = x_buf_offset + x_sz * it_idx0;
|
const uint64_t x_offset = x_buf_offset + x_sz * it_idx0;
|
||||||
const uint32_t qy_offset = qy_buf_offset + qy_sz * it_idx1;
|
const uint64_t qy_offset = qy_buf_offset + qy_sz * it_idx1;
|
||||||
const uint32_t y_offset = y_buf_offset + y_sz * it_idx1;
|
const uint64_t y_offset = y_buf_offset + y_sz * it_idx1;
|
||||||
const uint32_t d_offset = d_buf_offset + d_sz * it_idx1;
|
const uint64_t d_offset = d_buf_offset + d_sz * it_idx1;
|
||||||
|
|
||||||
const uint32_t y_buffer_offset = (y_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
const uint64_t y_buffer_offset = (y_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
||||||
const uint32_t y_shader_offset = y_offset - y_buffer_offset;
|
const uint64_t y_shader_offset = y_offset - y_buffer_offset;
|
||||||
|
|
||||||
const uint32_t d_buffer_offset = (d_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
const uint64_t d_buffer_offset = (d_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
||||||
const uint32_t d_shader_offset = d_offset - d_buffer_offset;
|
const uint64_t d_shader_offset = d_offset - d_buffer_offset;
|
||||||
|
|
||||||
if (!y_non_contig && qy_needs_dequant) {
|
if (!y_non_contig && qy_needs_dequant) {
|
||||||
const std::vector<int> pc = { (int)ne11, (int)ne10, (int)ne10, (int)ne10 };
|
const std::vector<int> pc = { (int)ne11, (int)ne10, (int)ne10, (int)ne10 };
|
||||||
|
@ -2310,21 +2316,18 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context& ctx, const ggml_tensor
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
const uint64_t ne00 = src0->ne[0];
|
||||||
const int64_t ne01 = src0->ne[1];
|
const uint64_t ne01 = src0->ne[1];
|
||||||
const int64_t ne02 = src0->ne[2];
|
const uint64_t ne02 = src0->ne[2];
|
||||||
const int64_t ne03 = src0->ne[3];
|
// const uint64_t ne03 = src0->ne[3];
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
const uint64_t ne10 = src1->ne[0];
|
||||||
const int64_t ne11 = src1->ne[1];
|
const uint64_t ne11 = src1->ne[1];
|
||||||
const int64_t ne12 = src1->ne[2];
|
const uint64_t ne12 = src1->ne[2];
|
||||||
const int64_t ne13 = src1->ne[3];
|
// const uint64_t ne13 = src1->ne[3];
|
||||||
|
|
||||||
GGML_ASSERT(ne11 == 1);
|
GGML_ASSERT(ne11 == 1);
|
||||||
|
|
||||||
const int nb2 = dst->nb[2];
|
|
||||||
const int nb3 = dst->nb[3];
|
|
||||||
|
|
||||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||||
|
|
||||||
vk_queue& compq = vk_device.compute_queue;
|
vk_queue& compq = vk_device.compute_queue;
|
||||||
|
@ -2345,9 +2348,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context& ctx, const ggml_tensor
|
||||||
const uint64_t d_buf_offset = extra->base_buffer_offset + extra->view_offset;
|
const uint64_t d_buf_offset = extra->base_buffer_offset + extra->view_offset;
|
||||||
GGML_ASSERT(d_D != nullptr);
|
GGML_ASSERT(d_D != nullptr);
|
||||||
vk_buffer* d_Qx;
|
vk_buffer* d_Qx;
|
||||||
const uint32_t qx_buf_offset = extra_src0->base_buffer_offset + extra_src0->view_offset;
|
const uint64_t qx_buf_offset = extra_src0->base_buffer_offset + extra_src0->view_offset;
|
||||||
vk_buffer* d_Qy;
|
vk_buffer* d_Qy;
|
||||||
uint32_t qy_buf_offset = 0;
|
uint64_t qy_buf_offset = 0;
|
||||||
d_Qx = extra_src0->buffer_gpu;
|
d_Qx = extra_src0->buffer_gpu;
|
||||||
GGML_ASSERT(d_Qx != nullptr);
|
GGML_ASSERT(d_Qx != nullptr);
|
||||||
if (load_y) {
|
if (load_y) {
|
||||||
|
@ -2361,11 +2364,11 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context& ctx, const ggml_tensor
|
||||||
// Allocate descriptor sets
|
// Allocate descriptor sets
|
||||||
ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_mul_mat_vec_p021_f16_f32, 1);
|
ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_mul_mat_vec_p021_f16_f32, 1);
|
||||||
|
|
||||||
const uint32_t qy_buffer_offset = (qy_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
const uint64_t qy_buffer_offset = (qy_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
||||||
const uint32_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
|
const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
|
||||||
|
|
||||||
const uint32_t d_buffer_offset = (d_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
const uint64_t d_buffer_offset = (d_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
||||||
const uint32_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
||||||
|
|
||||||
if (load_y) {
|
if (load_y) {
|
||||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1), compq, &extra->in_memcpys, &extra->in1_staging_event);
|
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1), compq, &extra->in_memcpys, &extra->in1_staging_event);
|
||||||
|
@ -2398,24 +2401,21 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context& ctx, const ggml_tensor *
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
const uint64_t ne00 = src0->ne[0];
|
||||||
const int64_t ne01 = src0->ne[1];
|
const uint64_t ne01 = src0->ne[1];
|
||||||
const int64_t ne02 = src0->ne[2];
|
const uint64_t ne02 = src0->ne[2];
|
||||||
const int64_t ne03 = src0->ne[3];
|
// const uint64_t ne03 = src0->ne[3];
|
||||||
|
|
||||||
const int64_t nb01 = src0->nb[1];
|
const uint64_t nb01 = src0->nb[1];
|
||||||
const int64_t nb02 = src0->nb[2];
|
const uint64_t nb02 = src0->nb[2];
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
// const uint64_t ne10 = src1->ne[0];
|
||||||
const int64_t ne11 = src1->ne[1];
|
const uint64_t ne11 = src1->ne[1];
|
||||||
const int64_t ne12 = src1->ne[2];
|
const uint64_t ne12 = src1->ne[2];
|
||||||
const int64_t ne13 = src1->ne[3];
|
// const uint64_t ne13 = src1->ne[3];
|
||||||
|
|
||||||
GGML_ASSERT(ne11 == 1);
|
GGML_ASSERT(ne11 == 1);
|
||||||
|
|
||||||
const int nb2 = dst->nb[2];
|
|
||||||
const int nb3 = dst->nb[3];
|
|
||||||
|
|
||||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||||
|
|
||||||
vk_queue& compq = vk_device.compute_queue;
|
vk_queue& compq = vk_device.compute_queue;
|
||||||
|
@ -2437,9 +2437,9 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context& ctx, const ggml_tensor *
|
||||||
const uint64_t d_buf_offset = extra->base_buffer_offset + extra->view_offset;
|
const uint64_t d_buf_offset = extra->base_buffer_offset + extra->view_offset;
|
||||||
GGML_ASSERT(d_D != nullptr);
|
GGML_ASSERT(d_D != nullptr);
|
||||||
vk_buffer* d_Qx;
|
vk_buffer* d_Qx;
|
||||||
const uint32_t qx_buf_offset = extra_src0->base_buffer_offset + extra_src0->view_offset;
|
const uint64_t qx_buf_offset = extra_src0->base_buffer_offset + extra_src0->view_offset;
|
||||||
vk_buffer* d_Qy;
|
vk_buffer* d_Qy;
|
||||||
uint32_t qy_buf_offset = 0;
|
uint64_t qy_buf_offset = 0;
|
||||||
d_Qx = extra_src0->buffer_gpu;
|
d_Qx = extra_src0->buffer_gpu;
|
||||||
GGML_ASSERT(d_Qx != nullptr);
|
GGML_ASSERT(d_Qx != nullptr);
|
||||||
if (load_y) {
|
if (load_y) {
|
||||||
|
@ -2453,11 +2453,11 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context& ctx, const ggml_tensor *
|
||||||
// Allocate descriptor sets
|
// Allocate descriptor sets
|
||||||
ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_mul_mat_vec_nc_f16_f32, 1);
|
ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_mul_mat_vec_nc_f16_f32, 1);
|
||||||
|
|
||||||
const uint32_t qy_buffer_offset = (qy_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
const uint64_t qy_buffer_offset = (qy_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
||||||
const uint32_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
|
const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
|
||||||
|
|
||||||
const uint32_t d_buffer_offset = (d_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
const uint64_t d_buffer_offset = (d_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
||||||
const uint32_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
||||||
|
|
||||||
if (load_y) {
|
if (load_y) {
|
||||||
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1), compq, &extra->in_memcpys, &extra->in1_staging_event);
|
ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1), compq, &extra->in_memcpys, &extra->in1_staging_event);
|
||||||
|
@ -2478,10 +2478,10 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context& ctx, const ggml_tensor *
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
|
bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
|
||||||
const int64_t ne10 = src1->ne[0];
|
const uint64_t ne10 = src1->ne[0];
|
||||||
|
|
||||||
const int64_t ne0 = dst->ne[0];
|
const uint64_t ne0 = dst->ne[0];
|
||||||
const int64_t ne1 = dst->ne[1];
|
const uint64_t ne1 = dst->ne[1];
|
||||||
|
|
||||||
// TODO: find the optimal values for these
|
// TODO: find the optimal values for these
|
||||||
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||||
|
@ -2507,30 +2507,30 @@ static void ggml_vk_mul_mat(vk_context& ctx, const struct ggml_tensor * src0, co
|
||||||
|
|
||||||
static void ggml_vk_op_repeat(vk_context& ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_vk_op_repeat(vk_context& ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
// guaranteed to be an integer due to the check in ggml_can_repeat
|
// guaranteed to be an integer due to the check in ggml_can_repeat
|
||||||
const int64_t ne0 = dst->ne[0];
|
const uint64_t ne0 = dst->ne[0];
|
||||||
const int64_t ne1 = dst->ne[1];
|
const uint64_t ne1 = dst->ne[1];
|
||||||
const int64_t ne2 = dst->ne[2];
|
const uint64_t ne2 = dst->ne[2];
|
||||||
const int64_t ne3 = dst->ne[3];
|
const uint64_t ne3 = dst->ne[3];
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
const uint64_t ne00 = src0->ne[0];
|
||||||
const int64_t ne01 = src0->ne[1];
|
const uint64_t ne01 = src0->ne[1];
|
||||||
const int64_t ne02 = src0->ne[2];
|
const uint64_t ne02 = src0->ne[2];
|
||||||
const int64_t ne03 = src0->ne[3];
|
const uint64_t ne03 = src0->ne[3];
|
||||||
|
|
||||||
const size_t nb0 = dst->nb[0];
|
const uint64_t nb0 = dst->nb[0];
|
||||||
const size_t nb1 = dst->nb[1];
|
const uint64_t nb1 = dst->nb[1];
|
||||||
const size_t nb2 = dst->nb[2];
|
const uint64_t nb2 = dst->nb[2];
|
||||||
const size_t nb3 = dst->nb[3];
|
const uint64_t nb3 = dst->nb[3];
|
||||||
|
|
||||||
const size_t nb00 = src0->nb[0];
|
const uint64_t nb00 = src0->nb[0];
|
||||||
const size_t nb01 = src0->nb[1];
|
const uint64_t nb01 = src0->nb[1];
|
||||||
const size_t nb02 = src0->nb[2];
|
const uint64_t nb02 = src0->nb[2];
|
||||||
const size_t nb03 = src0->nb[3];
|
const uint64_t nb03 = src0->nb[3];
|
||||||
|
|
||||||
const int nr0 = (int)(ne0/ne00);
|
const uint64_t nr0 = ne0/ne00;
|
||||||
const int nr1 = (int)(ne1/ne01);
|
const uint64_t nr1 = ne1/ne01;
|
||||||
const int nr2 = (int)(ne2/ne02);
|
const uint64_t nr2 = ne2/ne02;
|
||||||
const int nr3 = (int)(ne3/ne03);
|
const uint64_t nr3 = ne3/ne03;
|
||||||
|
|
||||||
// TODO: support for transposed / permuted tensors
|
// TODO: support for transposed / permuted tensors
|
||||||
GGML_ASSERT(nb0 == sizeof(float));
|
GGML_ASSERT(nb0 == sizeof(float));
|
||||||
|
@ -2548,13 +2548,13 @@ static void ggml_vk_op_repeat(vk_context& ctx, const ggml_tensor * src0, const g
|
||||||
|
|
||||||
std::vector<vk::BufferCopy> copies;
|
std::vector<vk::BufferCopy> copies;
|
||||||
|
|
||||||
for (int i3 = 0; i3 < nr3; i3++) {
|
for (uint64_t i3 = 0; i3 < nr3; i3++) {
|
||||||
for (int k3 = 0; k3 < ne03; k3++) {
|
for (uint64_t k3 = 0; k3 < ne03; k3++) {
|
||||||
for (int i2 = 0; i2 < nr2; i2++) {
|
for (uint64_t i2 = 0; i2 < nr2; i2++) {
|
||||||
for (int k2 = 0; k2 < ne02; k2++) {
|
for (uint64_t k2 = 0; k2 < ne02; k2++) {
|
||||||
for (int i1 = 0; i1 < nr1; i1++) {
|
for (uint64_t i1 = 0; i1 < nr1; i1++) {
|
||||||
for (int k1 = 0; k1 < ne01; k1++) {
|
for (uint64_t k1 = 0; k1 < ne01; k1++) {
|
||||||
for (int i0 = 0; i0 < nr0; i0++) {
|
for (uint64_t i0 = 0; i0 < nr0; i0++) {
|
||||||
copies.push_back({
|
copies.push_back({
|
||||||
src_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
|
src_offset + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
|
||||||
dst_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
|
dst_offset + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
|
||||||
|
@ -2694,20 +2694,20 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
|
||||||
GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
|
GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
|
||||||
GGML_ASSERT(src1 == nullptr || ggml_vk_dim01_contiguous(src1)); // NOLINT
|
GGML_ASSERT(src1 == nullptr || ggml_vk_dim01_contiguous(src1)); // NOLINT
|
||||||
GGML_ASSERT(dst->extra != nullptr);
|
GGML_ASSERT(dst->extra != nullptr);
|
||||||
const int64_t ne00 = src0->ne[0];
|
const uint64_t ne00 = src0->ne[0];
|
||||||
const int64_t ne01 = src0->ne[1];
|
const uint64_t ne01 = src0->ne[1];
|
||||||
const int64_t ne02 = src0->ne[2];
|
const uint64_t ne02 = src0->ne[2];
|
||||||
const int64_t ne03 = src0->ne[3];
|
const uint64_t ne03 = src0->ne[3];
|
||||||
const int64_t ne0 = ne00 * ne01;
|
const uint64_t ne0 = ne00 * ne01;
|
||||||
const bool use_src1 = src1 != nullptr;
|
const bool use_src1 = src1 != nullptr;
|
||||||
const int64_t ne10 = use_src1 ? src1->ne[0] : 0;
|
const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
|
||||||
const int64_t ne11 = use_src1 ? src1->ne[1] : 0;
|
const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
|
||||||
const int64_t ne12 = use_src1 ? src1->ne[2] : 0;
|
const uint64_t ne12 = use_src1 ? src1->ne[2] : 0;
|
||||||
const int64_t ne13 = use_src1 ? src1->ne[3] : 0;
|
const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
|
||||||
const int64_t ne1 = ne10 * ne11;
|
const uint64_t ne1 = ne10 * ne11;
|
||||||
const int64_t nb10 = use_src1 ? src1->nb[0] : 0;
|
// const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
|
||||||
const int nb2 = dst->nb[2];
|
const uint64_t nb2 = dst->nb[2];
|
||||||
const int nb3 = dst->nb[3];
|
const uint64_t nb3 = dst->nb[3];
|
||||||
|
|
||||||
vk_pipeline * pipeline = ggml_vk_op_get_pipeline(src0, src1, dst, op);
|
vk_pipeline * pipeline = ggml_vk_op_get_pipeline(src0, src1, dst, op);
|
||||||
ggml_vk_func_t op_func;
|
ggml_vk_func_t op_func;
|
||||||
|
@ -2788,7 +2788,7 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
|
||||||
}
|
}
|
||||||
|
|
||||||
// Single call if dimension 2 is contiguous
|
// Single call if dimension 2 is contiguous
|
||||||
if (op == GGML_OP_CPY || ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1))) {
|
if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
|
||||||
ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, 1);
|
ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, 1);
|
||||||
|
|
||||||
switch (dst->op) {
|
switch (dst->op) {
|
||||||
|
@ -2851,8 +2851,8 @@ static void ggml_vk_op_f32(vk_context& ctx, const ggml_tensor * src0, const ggml
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
for (uint64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
for (uint64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
const uint32_t it_idx0 = (i03 * ne02 + i02);
|
const uint32_t it_idx0 = (i03 * ne02 + i02);
|
||||||
const uint32_t it_idx1 = use_src1 ? ((i03 % ne13) * ne12 + (i02 % ne12)) : 0;
|
const uint32_t it_idx1 = use_src1 ? ((i03 % ne13) * ne12 + (i02 % ne12)) : 0;
|
||||||
const uint32_t x_offset = x_sz * it_idx0;
|
const uint32_t x_offset = x_sz * it_idx0;
|
||||||
|
@ -2897,7 +2897,8 @@ static void ggml_vk_mul(vk_context& ctx, const ggml_tensor * src0, const ggml_te
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_scale(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_scale(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, src0, nullptr, dst, GGML_OP_SCALE, { (uint32_t)ggml_nelements(src0), 0, ((float *)dst->op_params)[0], 0.0f });
|
float * op_params = (float *)dst->op_params;
|
||||||
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, src0, nullptr, dst, GGML_OP_SCALE, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_sqr(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_sqr(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
|
@ -2905,7 +2906,8 @@ static void ggml_vk_sqr(vk_context& ctx, const ggml_tensor * src0, ggml_tensor *
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_clamp(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_clamp(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, src0, nullptr, dst, GGML_OP_CLAMP, { (uint32_t)ggml_nelements(src0), 0, ((float *)dst->op_params)[0], ((float *)dst->op_params)[1] });
|
float * op_params = (float *)dst->op_params;
|
||||||
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, src0, nullptr, dst, GGML_OP_CLAMP, { (uint32_t)ggml_nelements(src0), 0, op_params[0], op_params[1] });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_cpy(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_cpy(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
|
@ -2926,7 +2928,8 @@ static void ggml_vk_norm(vk_context& ctx, const ggml_tensor * src0, ggml_tensor
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_rms_norm(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_rms_norm(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], ((float *)dst->op_params)[0], 0.0f });
|
float * op_params = (float *)dst->op_params;
|
||||||
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_unary(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_unary(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
|
@ -2934,17 +2937,19 @@ static void ggml_vk_unary(vk_context& ctx, const ggml_tensor * src0, ggml_tensor
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_diag_mask_inf(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
static void ggml_vk_diag_mask_inf(vk_context& ctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||||
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], ((int32_t *)dst->op_params)[0] });
|
int32_t * op_params = (int32_t *)dst->op_params;
|
||||||
|
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_soft_max(vk_context& ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_vk_soft_max(vk_context& ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, src0, src1, dst, GGML_OP_SOFT_MAX, { (uint32_t)src0->ne[0], (uint32_t)(src1 != nullptr ? ggml_nrows(src1) : 0), ((float *)dst->op_params)[0], 0.0f });
|
float * op_params = (float *)dst->op_params;
|
||||||
|
ggml_vk_op_f32<vk_op_push_constants>(ctx, src0, src1, dst, GGML_OP_SOFT_MAX, { (uint32_t)src0->ne[0], (uint32_t)(src1 != nullptr ? ggml_nrows(src1) : 0), op_params[0], 0.0f });
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_rope(vk_context& ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_vk_rope(vk_context& ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||||
const int mode = ((int32_t *) dst->op_params)[2];
|
const int mode = ((int32_t *) dst->op_params)[2];
|
||||||
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||||
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
||||||
const float freq_base = ((float *) dst->op_params)[5];
|
const float freq_base = ((float *) dst->op_params)[5];
|
||||||
const float freq_scale = ((float *) dst->op_params)[6];
|
const float freq_scale = ((float *) dst->op_params)[6];
|
||||||
|
@ -3481,7 +3486,7 @@ static void ggml_vk_transform_tensor(const void * data, ggml_tensor * tensor, bo
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||||
if (extra == nullptr) {
|
if (extra == nullptr) {
|
||||||
extra = new ggml_tensor_extra_gpu;
|
extra = new ggml_tensor_extra_gpu;
|
||||||
memset((void *) extra, 0, sizeof(ggml_tensor_extra_gpu));
|
extra->reset();
|
||||||
tensor->extra = extra;
|
tensor->extra = extra;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3514,7 +3519,7 @@ void ggml_vk_assign_buffer(ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(tensor->extra == nullptr);
|
GGML_ASSERT(tensor->extra == nullptr);
|
||||||
|
|
||||||
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
||||||
memset((void *) extra, 0, sizeof(ggml_tensor_extra_gpu));
|
extra->reset();
|
||||||
tensor->extra = extra;
|
tensor->extra = extra;
|
||||||
|
|
||||||
extra->buffer_gpu = new vk_buffer;
|
extra->buffer_gpu = new vk_buffer;
|
||||||
|
@ -3528,8 +3533,7 @@ static void ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
|
||||||
std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
|
std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
||||||
memset((void *) extra, 0, sizeof(ggml_tensor_extra_gpu));
|
extra->reset();
|
||||||
extra->d_idx = -1;
|
|
||||||
tensor->extra = extra;
|
tensor->extra = extra;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3689,7 +3693,7 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node, ggml_cgraph * graph){
|
||||||
const bool qvec_kernel = use_src0 && use_src1 && src1->ne[1] == 1 && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type));
|
const bool qvec_kernel = use_src0 && use_src1 && src1->ne[1] == 1 && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type));
|
||||||
const bool qx_needs_dequant = use_src0 && !qvec_kernel && !x_non_contig && (src0->type != GGML_TYPE_F16 || x_non_contig);
|
const bool qx_needs_dequant = use_src0 && !qvec_kernel && !x_non_contig && (src0->type != GGML_TYPE_F16 || x_non_contig);
|
||||||
const bool f16_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32;
|
const bool f16_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32;
|
||||||
const bool qy_needs_dequant = use_src1 && (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig;
|
const bool qy_needs_dequant = (use_src1 && (src1->type != GGML_TYPE_F16 && !f16_f32_kernel)) || y_non_contig;
|
||||||
|
|
||||||
int split_k;
|
int split_k;
|
||||||
if (node->op == GGML_OP_MUL_MAT) {
|
if (node->op == GGML_OP_MUL_MAT) {
|
||||||
|
@ -3923,8 +3927,6 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){
|
||||||
const ggml_tensor * src1 = node->src[1];
|
const ggml_tensor * src1 = node->src[1];
|
||||||
|
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
|
||||||
ggml_tensor_extra_gpu * src0_extra = src0 != nullptr ? (ggml_tensor_extra_gpu *) src0->extra : nullptr;
|
|
||||||
ggml_tensor_extra_gpu * src1_extra = src1 != nullptr ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
|
||||||
|
|
||||||
// Set data to vk_buffer
|
// Set data to vk_buffer
|
||||||
// This can't be done earlier cause the buffer may not exist yet
|
// This can't be done earlier cause the buffer may not exist yet
|
||||||
|
@ -4280,7 +4282,7 @@ struct ggml_backend_vk_buffer_context {
|
||||||
size_t alloc_index = temp_tensor_extra_index;
|
size_t alloc_index = temp_tensor_extra_index;
|
||||||
temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES;
|
temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES;
|
||||||
ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
|
ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
|
||||||
memset(extra, 0, sizeof(*extra));
|
extra->reset();
|
||||||
|
|
||||||
return extra;
|
return extra;
|
||||||
}
|
}
|
||||||
|
@ -4331,6 +4333,8 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
|
||||||
} else {
|
} else {
|
||||||
ggml_vk_preallocate_buffers_graph(tensor, nullptr);
|
ggml_vk_preallocate_buffers_graph(tensor, nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
|
@ -4344,6 +4348,10 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
|
||||||
// ggml_vk_buffer_write(&ctx->dev_buffer, offset, data, size, vk_device.transfer_queue);
|
// ggml_vk_buffer_write(&ctx->dev_buffer, offset, data, size, vk_device.transfer_queue);
|
||||||
|
|
||||||
ggml_vk_transform_tensor_static(data, tensor);
|
ggml_vk_transform_tensor_static(data, tensor);
|
||||||
|
|
||||||
|
UNUSED(buffer);
|
||||||
|
UNUSED(offset);
|
||||||
|
UNUSED(size);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
|
@ -4355,6 +4363,8 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
|
||||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||||
|
|
||||||
ggml_vk_buffer_read(extra->buffer_gpu, offset, data, size, vk_device.transfer_queue);
|
ggml_vk_buffer_read(extra->buffer_gpu, offset, data, size, vk_device.transfer_queue);
|
||||||
|
|
||||||
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||||
|
@ -4367,12 +4377,19 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
|
||||||
// return true;
|
// return true;
|
||||||
// }
|
// }
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
UNUSED(buffer);
|
||||||
|
UNUSED(src);
|
||||||
|
UNUSED(dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
// ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||||
|
|
||||||
// ggml_vk_buffer_memset(&ctx->dev_buffer, 0, value, buffer->size, vk_device.transfer_queue);
|
// ggml_vk_buffer_memset(&ctx->dev_buffer, 0, value, buffer->size, vk_device.transfer_queue);
|
||||||
|
|
||||||
|
UNUSED(buffer);
|
||||||
|
UNUSED(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
|
static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
|
||||||
|
@ -4425,6 +4442,8 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||||
return ggml_backend_is_vk(backend);
|
return ggml_backend_is_vk(backend);
|
||||||
|
|
||||||
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
||||||
|
@ -4530,7 +4549,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||||
ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context;
|
// ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context;
|
||||||
|
|
||||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||||
ggml_tensor * node = cgraph->leafs[i];
|
ggml_tensor * node = cgraph->leafs[i];
|
||||||
|
@ -4573,6 +4592,8 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
||||||
ggml_vk_graph_cleanup();
|
ggml_vk_graph_cleanup();
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue