Split off matrix vector multiplication for separate optimization
parent 6bd9bd9026
commit 2231618450

1 changed file with 220 additions and 93 deletions
ggml-vulkan.cpp | 311

--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -144,9 +144,6 @@ vk_pipeline vk_pipeline_dequant_mul_mat_vec_f16_f32, vk_pipeline_dequant_mul_mat
 vk_pipeline vk_pipeline_mul_f32;
 vk_pipeline vk_pipeline_f32_to_f16, vk_pipeline_dequant_q4_0;
 
-void * vk_pinned_workspace;
-size_t vk_pinned_workspace_size;
-
 static std::vector<std::tuple<void*, size_t, vk_buffer>> vk_pinned_memory;
 
 static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<int>&& specialization_constants, uint32_t align) {
@@ -724,9 +721,6 @@ void ggml_vk_init(void) {
 
     vk_device.descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN;
 
-    vk_pinned_workspace = nullptr;
-    vk_pinned_workspace_size = 0;
-
     // Prepare matmul values
     auto warptile_l = { 128, 128, 128, 16, 64, 64, 2, 4, 4 };
     auto warptile_m = { 128, 64, 64, 16, 32, 32, 2, 4, 2 };
@@ -1490,7 +1484,7 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(int m, int n) {
 
 static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
 #ifdef VK_DEBUG
-    std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16 << ", " << m << ", " << n << ", " << aligned << ")";
+    std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
 #endif
     if (bit16_x && bit16_y) {
         if (m <= 32 || n <= 32) {
@@ -1744,19 +1738,6 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const bool load_x = src0->backend != GGML_BACKEND_GPU;
 
-    const size_t workspace_size = sizeof(ggml_fp16_t) * y_ne * (ne02 * ne03);
-
-    if (vk_pinned_workspace == nullptr) {
-        vk_pinned_workspace = ggml_vk_host_malloc(workspace_size);
-        vk_pinned_workspace_size = workspace_size;
-    } else if (vk_pinned_workspace_size < workspace_size) {
-        ggml_vk_host_free(vk_pinned_workspace);
-        vk_pinned_workspace = ggml_vk_host_malloc(workspace_size);
-        vk_pinned_workspace_size = workspace_size;
-    }
-
-    ggml_fp16_t * const fp16_staging = (ggml_fp16_t *) vk_pinned_workspace;
-
     // Allocate descriptor sets
     ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, ne02 * ne03);
     if (split_k > 1) {
@@ -1854,11 +1835,13 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
-    const bool mul_mat_vec = ne11 == 1 && src0->type != GGML_TYPE_F16;
 
+    vk_queue& compq = vk_device.compute_queue;
+    vk_queue& tr0q = vk_device.transfer_queues[0];
+    vk_queue& tr1q = vk_device.transfer_queues[1];
     const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
 
-    const bool qx_needs_dequant = src0->type != GGML_TYPE_F16 && !mul_mat_vec;
+    const bool qx_needs_dequant = src0->type != GGML_TYPE_F16;
     const bool qy_needs_dequant = src1->type != GGML_TYPE_F16 && !f16_f32_kernel;
     const bool dq = qx_needs_dequant || qy_needs_dequant;
 
@@ -1869,7 +1852,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;
 
-    const int split_k = mul_mat_vec ? 1 : ggml_vk_guess_split_k(ne01, ne11, ne10);
+    const int split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
 
     const int kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ne01, ne11));
 
@@ -1900,7 +1883,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
         ggml_vk_pool_malloc(x_sz * ne02 * ne03, &d_X, {});
     } else {
         d_X = d_Qx;
-        GGML_ASSERT(qx_sz == x_sz || mul_mat_vec); // NOLINT
+        GGML_ASSERT(qx_sz == x_sz); // NOLINT
     }
     if (qy_needs_dequant) {
         ggml_vk_pool_malloc(y_sz * ne02 * ne03, &d_Y, {});
@@ -1912,10 +1895,8 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
 
     vk_pipeline* to_fp16_vk_0 = ggml_vk_get_to_fp16(src0->type);
     vk_pipeline* to_fp16_vk_1 = ggml_vk_get_to_fp16(src1->type);
-    vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(src0->type, !f16_f32_kernel);
     GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
     GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
-    GGML_ASSERT(dmmv != nullptr);
 
     std::vector<vk_sequence> compute_seqs;
     std::vector<vk_sequence> transfer_0_seqs;
@@ -1930,9 +1911,6 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
     if (qy_needs_dequant) {
         ggml_vk_pipeline_allocate_descriptor_sets(*to_fp16_vk_1, ne02 * ne03);
     }
-    if (mul_mat_vec) {
-        ggml_vk_pipeline_allocate_descriptor_sets(*dmmv, ne02 * ne03);
-    }
     if (split_k > 1) {
         ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, ne02 * ne03);
     }
@@ -1952,7 +1930,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
             vk::Semaphore s_y;
             vk::Semaphore s_q;
 
-            const vk::Semaphore s_mm = ggml_vk_create_semaphore(vk_device.compute_queue);
+            const vk::Semaphore s_mm = ggml_vk_create_semaphore(compq);
 
             std::vector<vk::Semaphore> q_semaphores;
             std::vector<vk::Semaphore> mm_semaphores;
@@ -1960,44 +1938,44 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
             submit_counter++;
 
             if (load_x) {
-                s_x = ggml_vk_create_semaphore(vk_device.transfer_queues[0]);
+                s_x = ggml_vk_create_semaphore(tr0q);
                 if (qx_needs_dequant) {
                     q_semaphores.push_back(s_x);
                 } else {
                     mm_semaphores.push_back(s_x);
                 }
-                transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Qx, qx_offset, src0, i03, i02, vk_device.transfer_queues[0], {}, { s_x }));
+                transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Qx, qx_offset, src0, i03, i02, tr0q, {}, { s_x }));
             }
             if (it_idx == 0 || submit_counter >= VK_SUBMIT_BATCH) {
-                ggml_vk_submit(vk_device.transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
+                ggml_vk_submit(tr0q, transfer_0_seqs, VK_NULL_HANDLE);
             }
             if (load_y) {
-                s_y = ggml_vk_create_semaphore(vk_device.transfer_queues[1]);
+                s_y = ggml_vk_create_semaphore(tr1q);
                 if (qy_needs_dequant) {
                     q_semaphores.push_back(s_y);
                 } else {
                     mm_semaphores.push_back(s_y);
                 }
-                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Qy, qy_offset, src1, i03, i02, vk_device.transfer_queues[1], {}, { s_y }));
+                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Qy, qy_offset, src1, i03, i02, tr1q, {}, { s_y }));
             }
             if (it_idx == 0 || submit_counter >= VK_SUBMIT_BATCH) {
-                ggml_vk_submit(vk_device.transfer_queues[1], transfer_1_seqs, VK_NULL_HANDLE);
+                ggml_vk_submit(tr1q, transfer_1_seqs, VK_NULL_HANDLE);
             }
 
             if (dq) {
-                s_q = ggml_vk_create_semaphore(vk_device.compute_queue);
-                vk_submission s = ggml_vk_begin_submission(vk_device.compute_queue);
+                s_q = ggml_vk_create_semaphore(tr0q);
+                vk_submission s = ggml_vk_begin_submission(compq);
                 if (qx_needs_dequant) {
                     const std::vector<int> pc = { (int)ne01, (int)ne10, (int)ne10, (int)ne10 };
-                    ggml_vk_sync_buffers(s.buffer, { { d_Qx, qx_offset, qx_sz } }, vk_device.compute_queue, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
-                    ggml_vk_sync_buffers(s.buffer, { { d_X, x_offset, x_sz } }, vk_device.compute_queue, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
+                    ggml_vk_sync_buffers(s.buffer, { { d_Qx, qx_offset, qx_sz } }, compq, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
+                    ggml_vk_sync_buffers(s.buffer, { { d_X, x_offset, x_sz } }, compq, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
                     ggml_vk_dispatch_pipeline(s, *to_fp16_vk_0, { { d_Qx, qx_offset, qx_sz }, { d_X, x_offset, x_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)x_ne, 1, 1});
                 }
 
                 if (qy_needs_dequant) {
                     const std::vector<int> pc = { (int)ne11, (int)ne10, (int)ne10, (int)ne10 };
-                    ggml_vk_sync_buffers(s.buffer, { { d_Qy, qy_offset, qy_sz } }, vk_device.compute_queue, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
-                    ggml_vk_sync_buffers(s.buffer, { { d_Y, y_offset, y_sz } }, vk_device.compute_queue, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
+                    ggml_vk_sync_buffers(s.buffer, { { d_Qy, qy_offset, qy_sz } }, compq, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
+                    ggml_vk_sync_buffers(s.buffer, { { d_Y, y_offset, y_sz } }, compq, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
                     ggml_vk_dispatch_pipeline(s, *to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)y_ne, 1, 1});
                 }
                 ggml_vk_end_submission(s, std::move(q_semaphores), { s_q });
@@ -2006,44 +1984,34 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
                 mm_semaphores.push_back(s_q);
             }
 
-            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                // compute
-                const int ncols = ne00;
-                vk_submission s = ggml_vk_begin_submission(vk_device.compute_queue);
-                ggml_vk_sync_buffers(s.buffer, { ggml_vk_subbuffer(d_Qx), ggml_vk_subbuffer(d_Y) }, vk_device.compute_queue, vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eShaderRead, false);
-                ggml_vk_sync_buffers(s.buffer, { ggml_vk_subbuffer(d_D) }, vk_device.compute_queue, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
-                ggml_vk_dispatch_pipeline(s, *dmmv, { { d_Qx, qx_offset, qx_sz }, { d_Y, y_offset, y_sz }, { d_D, d_offset, d_sz } }, sizeof(int), &ncols, { (uint32_t)ne01, 1, 1});
-                ggml_vk_end_submission(s, std::move(mm_semaphores), { s_mm });
-                compute_seqs.push_back({ s });
-            } else {
-                // compute
-                compute_seqs.push_back(ggml_vk_matmul(*pipeline, { d_X, x_offset, x_sz }, { d_Y, y_offset, y_sz }, { d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, vk_device.compute_queue, std::move(mm_semaphores), { s_mm }));
-            }
+            // compute
+            compute_seqs.push_back(ggml_vk_matmul(*pipeline, { d_X, x_offset, x_sz }, { d_Y, y_offset, y_sz }, { d_D, d_offset, d_sz }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, compq, std::move(mm_semaphores), { s_mm }));
             if (it_idx == 0 || submit_counter >= VK_SUBMIT_BATCH) {
-                ggml_vk_submit(vk_device.compute_queue, compute_seqs, VK_NULL_HANDLE);
+                ggml_vk_submit(compq, compute_seqs, VK_NULL_HANDLE);
             }
 
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            transfer_2_seqs.push_back(ggml_vk_buffer_read_async(&d_D, d_offset, d, sizeof(float) * d_ne, vk_device.transfer_queues[0], { s_mm }, {}));
+            transfer_2_seqs.push_back(ggml_vk_buffer_read_async(&d_D, d_offset, d, sizeof(float) * d_ne, tr0q, { s_mm }, {}));
 
             if (it_idx == 0 || submit_counter >= VK_SUBMIT_BATCH) {
-                ggml_vk_submit(vk_device.transfer_queues[0], transfer_2_seqs, VK_NULL_HANDLE);
+                ggml_vk_submit(tr0q, transfer_2_seqs, VK_NULL_HANDLE);
                 submit_counter = 0;
             }
         }
     }
 
-    ggml_vk_submit(vk_device.transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
-    ggml_vk_submit(vk_device.transfer_queues[1], transfer_1_seqs, VK_NULL_HANDLE);
-    ggml_vk_submit(vk_device.compute_queue, compute_seqs, VK_NULL_HANDLE);
-    ggml_vk_submit(vk_device.transfer_queues[0], transfer_2_seqs, VK_NULL_HANDLE);
+    ggml_vk_submit(tr0q, transfer_0_seqs, VK_NULL_HANDLE);
+    ggml_vk_submit(tr1q, transfer_1_seqs, VK_NULL_HANDLE);
+    ggml_vk_submit(compq, compute_seqs, VK_NULL_HANDLE);
+    ggml_vk_submit(tr0q, transfer_2_seqs, VK_NULL_HANDLE);
 
-    vk_device.transfer_queues[0].queue.waitIdle();
+    tr0q.queue.waitIdle();
 
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
-    ggml_vk_queue_cleanup(vk_device.compute_queue);
+    ggml_vk_queue_cleanup(tr0q);
+    ggml_vk_queue_cleanup(tr1q);
+    ggml_vk_queue_cleanup(compq);
 
     ggml_vk_pipeline_cleanup(*pipeline);
     if (qx_needs_dequant) {
@@ -2052,7 +2020,6 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
     if (qy_needs_dequant) {
         ggml_vk_pipeline_cleanup(*to_fp16_vk_1);
     }
-    ggml_vk_pipeline_cleanup(*dmmv);
     ggml_vk_pipeline_cleanup(vk_pipeline_matmul_split_k_reduce);
 
     if (qx_needs_dequant) {
@@ -2070,6 +2037,182 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
     }
 }
 
+static void ggml_vk_mul_mat_vec_q_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_mul_mat_vec_q_f16((type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3];
+    std::cerr << "), (type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3];
+    std::cerr << "), (type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << "),)" << std::endl;
+#endif
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    GGML_ASSERT(ne11 == 1);
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    vk_queue& compq = vk_device.compute_queue;
+    const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
+
+    const bool qy_needs_dequant = src1->type != GGML_TYPE_F16 && !f16_f32_kernel;
+
+    const bool load_x = src0->backend != GGML_BACKEND_GPU;
+    const bool load_y = src1->backend != GGML_BACKEND_GPU;
+
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+
+    const uint32_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), vk_device.properties.limits.minStorageBufferOffsetAlignment);
+    const uint32_t qy_sz = ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), vk_device.properties.limits.minStorageBufferOffsetAlignment);
+    const uint32_t y_sz = ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment);
+    const uint32_t d_sz = ggml_vk_align_size(sizeof(float) * d_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment);
+
+    vk_buffer d_Qx;
+    if (load_x) {
+        ggml_vk_pool_malloc(qx_sz * ne02 * ne03, &d_Qx, {});
+    } else {
+        d_Qx = *(vk_buffer *) src0->data;
+    }
+    vk_buffer d_Qy;
+    if (load_y) {
+        ggml_vk_pool_malloc(qy_sz * ne02 * ne03, &d_Qy, {});
+    } else {
+        d_Qy = *(vk_buffer *) src1->data;
+    }
+    vk_buffer d_X;
+    vk_buffer d_Y;
+    vk_buffer d_D;
+    if (qy_needs_dequant) {
+        ggml_vk_pool_malloc(y_sz * ne02 * ne03, &d_Y, {});
+    } else {
+        d_Y = d_Qy;
+        GGML_ASSERT(qy_sz == y_sz);
+    }
+    ggml_vk_pool_malloc(d_sz * ne02 * ne03, &d_D, {});
+
+    vk_pipeline* to_fp16_vk_1 = ggml_vk_get_to_fp16(src1->type);
+    vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(src0->type, !f16_f32_kernel);
+    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
+    GGML_ASSERT(dmmv != nullptr);
+
+    std::vector<vk_sequence> compute_seqs;
+    std::vector<vk_sequence> transfer_0_seqs;
+    std::vector<vk_sequence> transfer_1_seqs;
+    std::vector<vk_sequence> transfer_2_seqs;
+
+    // Allocate descriptor sets
+    if (qy_needs_dequant) {
+        ggml_vk_pipeline_allocate_descriptor_sets(*to_fp16_vk_1, ne02 * ne03);
+    }
+    ggml_vk_pipeline_allocate_descriptor_sets(*dmmv, ne02 * ne03);
+
+    int submit_counter = 0;
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const uint32_t it_idx = (i03 * ne02 + i02);
+            const uint32_t qx_offset = load_x ? qx_sz * it_idx : 0;
+            const uint32_t qy_offset = load_y ? qy_sz * it_idx : 0;
+            const uint32_t y_offset = y_sz * it_idx;
+            const uint32_t d_offset = d_sz * it_idx;
+
+            vk::Semaphore s_x;
+            vk::Semaphore s_y;
+            vk::Semaphore s_q;
+
+            const vk::Semaphore s_mm = ggml_vk_create_semaphore(compq);
+
+            std::vector<vk::Semaphore> q_semaphores;
+            std::vector<vk::Semaphore> mm_semaphores;
+
+            submit_counter++;
+
+            if (load_x) {
+                s_x = ggml_vk_create_semaphore(compq);
+                mm_semaphores.push_back(s_x);
+                transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Qx, qx_offset, src0, i03, i02, compq, {}, { s_x }));
+            }
+            if (it_idx == 0 || submit_counter >= VK_SUBMIT_BATCH) {
+                ggml_vk_submit(compq, transfer_0_seqs, VK_NULL_HANDLE);
+            }
+            if (load_y) {
+                s_y = ggml_vk_create_semaphore(compq);
+                if (qy_needs_dequant) {
+                    q_semaphores.push_back(s_y);
+                } else {
+                    mm_semaphores.push_back(s_y);
+                }
+                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Qy, qy_offset, src1, i03, i02, compq, {}, { s_y }));
+            }
+            if (it_idx == 0 || submit_counter >= VK_SUBMIT_BATCH) {
+                ggml_vk_submit(compq, transfer_1_seqs, VK_NULL_HANDLE);
+            }
+
+            if (qy_needs_dequant) {
+                s_q = ggml_vk_create_semaphore(compq);
+                vk_submission s = ggml_vk_begin_submission(compq);
+                const std::vector<int> pc = { (int)ne11, (int)ne10, (int)ne10, (int)ne10 };
+                ggml_vk_sync_buffers(s.buffer, { { d_Qy, qy_offset, qy_sz } }, compq, vk::AccessFlagBits::eTransferWrite, vk::AccessFlagBits::eShaderRead, false);
+                ggml_vk_sync_buffers(s.buffer, { { d_Y, y_offset, y_sz } }, compq, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
+                ggml_vk_dispatch_pipeline(s, *to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)y_ne, 1, 1});
+                ggml_vk_end_submission(s, std::move(q_semaphores), { s_q });
+                compute_seqs.push_back({ s });
+
+                mm_semaphores.push_back(s_q);
+            }
+
+            // compute
+            const int ncols = ne00;
+            vk_submission s = ggml_vk_begin_submission(compq);
+            ggml_vk_sync_buffers(s.buffer, { ggml_vk_subbuffer(d_Qx), ggml_vk_subbuffer(d_Y) }, compq, vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eShaderRead, false);
+            ggml_vk_sync_buffers(s.buffer, { ggml_vk_subbuffer(d_D) }, compq, vk::AccessFlagBits::eShaderRead, vk::AccessFlagBits::eShaderWrite, false);
+            ggml_vk_dispatch_pipeline(s, *dmmv, { { d_Qx, qx_offset, qx_sz }, { d_Y, y_offset, y_sz }, { d_D, d_offset, d_sz } }, sizeof(int), &ncols, { (uint32_t)ne01, 1, 1});
+            ggml_vk_end_submission(s, std::move(mm_semaphores), { s_mm });
+            compute_seqs.push_back({ s });
+
+            if (it_idx == 0 || submit_counter >= VK_SUBMIT_BATCH) {
+                ggml_vk_submit(compq, compute_seqs, VK_NULL_HANDLE);
+            }
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            transfer_2_seqs.push_back(ggml_vk_buffer_read_async(&d_D, d_offset, d, sizeof(float) * d_ne, compq, { s_mm }, {}));
+
+            if (it_idx == 0 || submit_counter >= VK_SUBMIT_BATCH) {
+                ggml_vk_submit(compq, transfer_2_seqs, VK_NULL_HANDLE);
+                submit_counter = 0;
+            }
+        }
+    }
+
+    ggml_vk_submit(compq, compute_seqs, VK_NULL_HANDLE);
+
+    compq.queue.waitIdle();
+
+    ggml_vk_queue_cleanup(compq);
+
+    if (qy_needs_dequant) {
+        ggml_vk_pipeline_cleanup(*to_fp16_vk_1);
+    }
+    ggml_vk_pipeline_cleanup(*dmmv);
+
+    if (qy_needs_dequant) {
+        ggml_vk_pool_free(d_Y);
+    }
+    ggml_vk_pool_free(d_D);
+    if (load_x) {
+        ggml_vk_pool_free(d_Qx);
+    }
+    if (load_y) {
+        ggml_vk_pool_free(d_Qy);
+    }
+}
+
 static bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     const int64_t ne10 = src1->ne[0];
@@ -2077,6 +2220,13 @@ static bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct gg
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
+    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+        (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
+        dst->type == GGML_TYPE_F16 &&
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
+        std::cerr << "FP16 dst required" << std::endl;
+    }
+
     // TODO: find the optimal values for these
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_vk_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
|
|
||||||
#ifdef VK_DEBUG
|
|
||||||
std::cerr << "ggml_vk_mul_mat_use_f16(" << src0 << ", " << src1 << ")" << std::endl;
|
|
||||||
#endif
|
|
||||||
// If device doesn't support FP16
|
|
||||||
if (!vk_device.fp16) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t src0_sz = ggml_nbytes(src0);
|
|
||||||
size_t src1_sz = ggml_nbytes(src1);
|
|
||||||
|
|
||||||
// mul_mat_q: src0 is converted to fp32 on device
|
|
||||||
size_t mul_mat_q_transfer = src0_sz + src1_sz;
|
|
||||||
|
|
||||||
// mul_mat_f16: src1 is converted to fp16 on cpu
|
|
||||||
size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1);
|
|
||||||
|
|
||||||
// choose the smaller one to transfer to the device
|
|
||||||
// TODO: this is not always the best choice due to the overhead of converting to fp16
|
|
||||||
return mul_mat_f16_transfer < mul_mat_q_transfer;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
static void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||||
#ifdef VK_DEBUG
|
#ifdef VK_DEBUG
|
||||||
std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
|
std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl;
|
||||||
|
@@ -2119,10 +2246,10 @@ static void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_t
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_vk_mul_mat_f32(src0, src1, dst);
-    } else if (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) {
-        ggml_vk_mul_mat_q_f16(src0, src1, dst);
+    } else if (src1->ne[1] == 1 && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
+        ggml_vk_mul_mat_vec_q_f16(src0, src1, dst);
     } else {
-        GGML_ASSERT(false);
+        ggml_vk_mul_mat_q_f16(src0, src1, dst);
     }
 }
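
Note: the sketch below is only an illustration of what the split-off ggml_vk_mul_mat_vec_q_f16 path asks the dequantize_mul_mat_vec shader to compute when src1 is a single column (ne11 == 1): one dot product per row of src0. It is a plain CPU reference over already-dequantized rows, not code from this commit; the name mul_mat_vec_reference is made up for the example, and on the GPU the dmmv shader additionally dequantizes each src0 row on the fly.

    #include <cstddef>
    #include <vector>

    // CPU reference for the matrix-vector case: d[row] = dot(x[row, :], y).
    static std::vector<float> mul_mat_vec_reference(const std::vector<std::vector<float>>& x_rows,
                                                    const std::vector<float>& y) {
        std::vector<float> d(x_rows.size(), 0.0f);
        for (size_t row = 0; row < x_rows.size(); ++row) {
            float sum = 0.0f;
            // assumes every row has y.size() elements (ne00 == ne10)
            for (size_t col = 0; col < y.size(); ++col) {
                sum += x_rows[row][col] * y[col];
            }
            d[row] = sum;
        }
        return d;
    }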