Use single queue per device to simplify code

0cc4m 2023-12-18 19:25:46 +01:00
parent 5fef0d6bc9
commit e9e2be33fd


@@ -37,8 +37,6 @@
 #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
 
-#define VK_TRANSFER_QUEUE_COUNT 2
-
 #define VK_VENDOR_ID_AMD 0x1002
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de
@@ -129,7 +127,7 @@ struct vk_device {
     vk::Device device;
     uint32_t vendor_id;
     vk_queue compute_queue;
-    vk_queue transfer_queues[VK_TRANSFER_QUEUE_COUNT];
+    vk_queue transfer_queue;
     uint32_t descriptor_set_mode;
 };
@@ -175,8 +173,7 @@ struct vk_staging_memcpy {
 struct ggml_vk_tensor_extra_gpu {
     bool ready;
 
     std::vector<vk_staging_memcpy> memcpys;
-    std::vector<vk_sequence> in0_seqs;
-    std::vector<vk_sequence> in1_seqs;
+    std::vector<vk_sequence> in_seqs;
     std::vector<vk_sequence> comp_seqs;
     std::vector<vk_sequence> out_seqs;
@@ -972,29 +969,17 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
     // Try to find a non-graphics compute queue and transfer-focused queues
     const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
-    const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 2);
-
-    uint32_t transfer_queue_count = VK_TRANSFER_QUEUE_COUNT;
-
-    // If not enough transfer queues are available
-    if (transfer_queue_count > queue_family_props[transfer_queue_family_index].queueCount) {
-        // If compute and transfer queues are same family
-        if (compute_queue_family_index == transfer_queue_family_index) {
-            transfer_queue_count = queue_family_props[transfer_queue_family_index].queueCount - 1;
-        } else {
-            transfer_queue_count = queue_family_props[transfer_queue_family_index].queueCount;
-        }
-    }
+    const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);
 
     const float compute_queue_priority = 1.0f;
-    const float transfer_queue_priority[] = { 1.0f, 1.0f, 1.0f };
+    const float transfer_queue_priority = 1.0f;
 
     std::vector<vk::DeviceQueueCreateInfo> device_queue_create_infos;
     if (compute_queue_family_index != transfer_queue_family_index) {
         device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, &compute_queue_priority});
-        GGML_ASSERT(transfer_queue_count > 0);
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, transfer_queue_count, transfer_queue_priority});
+        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, &transfer_queue_priority});
     } else {
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1 + transfer_queue_count, transfer_queue_priority});
+        const float priorities[] = { compute_queue_priority, transfer_queue_priority };
+        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities});
     }
 
     vk::DeviceCreateInfo device_create_info;
     std::vector<const char *> device_extensions;
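A note on the branch above: Vulkan allows each queue family to appear at most once in `pQueueCreateInfos`, so when compute and transfer resolve to the same family the code must request two queues in a single entry rather than listing the family twice. A minimal sketch of the pattern with self-contained, assumed names (and assuming the shared family reports `queueCount >= 2`):

```cpp
#include <vulkan/vulkan.hpp>
#include <vector>

// Sketch only: mirrors the branch above, not the repository's code.
std::vector<vk::DeviceQueueCreateInfo> make_queue_infos(uint32_t compute_family,
                                                        uint32_t transfer_family,
                                                        const float * priorities) {
    std::vector<vk::DeviceQueueCreateInfo> infos;
    if (compute_family != transfer_family) {
        // Distinct families: one queue from each.
        infos.push_back({vk::DeviceQueueCreateFlags(), compute_family,  1, &priorities[0]});
        infos.push_back({vk::DeviceQueueCreateFlags(), transfer_family, 1, &priorities[1]});
    } else {
        // Shared family: a single entry requesting two queues.
        infos.push_back({vk::DeviceQueueCreateFlags(), compute_family, 2, priorities});
    }
    return infos;
}
```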
@@ -1051,16 +1036,10 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
     ggml_vk_load_shaders();
 
     // Queues
-    uint32_t queue_index_offset = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
+    const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
 
     vk_device.compute_queue = ggml_vk_create_queue(compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer });
-    for (int i = 0; i < VK_TRANSFER_QUEUE_COUNT; i++) {
-        if (transfer_queue_count > 0) {
-            vk_device.transfer_queues[i] = ggml_vk_create_queue(transfer_queue_family_index, (queue_index_offset + i) % transfer_queue_count, { vk::PipelineStageFlagBits::eTransfer });
-        } else {
-            vk_device.transfer_queues[i] = vk_device.compute_queue;
-        }
-    }
+    vk_device.transfer_queue = ggml_vk_create_queue(transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer });
 
     vk_fence = vk_device.device.createFence({});
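For context: queue handles are addressed by family index plus index within that family, which is why `transfer_queue_index` becomes 1 when both roles share a family — compute already owns queue 0 of it. A sketch of the underlying `vk::Device::getQueue` calls (handles assumed; not the repository's `ggml_vk_create_queue` helper):

```cpp
#include <vulkan/vulkan.hpp>

// Sketch: fetch both queue handles after device creation.
void get_queues(vk::Device device, uint32_t compute_family, uint32_t transfer_family) {
    vk::Queue compute_q  = device.getQueue(compute_family, 0);
    // Second queue of the shared family, or queue 0 of a dedicated family.
    vk::Queue transfer_q = device.getQueue(transfer_family,
                                           compute_family == transfer_family ? 1 : 0);
    (void)compute_q; (void)transfer_q;
}
```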
@@ -1988,8 +1967,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
     const int64_t r3 = ne13 / ne03;
 
     vk_queue& compq = vk_device.compute_queue;
-    vk_queue& tr0q = vk_device.transfer_queues[0];
-    vk_queue& tr1q = vk_device.transfer_queues[1];
+    vk_queue& trq = vk_device.transfer_queue;
 
     const bool load_x = src0->backend != GGML_BACKEND_GPU;
     const bool load_y = src1->backend != GGML_BACKEND_GPU;
@@ -2128,7 +2106,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
             if (load_x) {
                 // copy data to device
-                extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, ne01, tr0q, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
+                extra->in_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, qx_offset, src0, i03, i02, ne01, trq, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
             }
 
             if (qx_needs_dequant) {
@@ -2190,7 +2168,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
             if (y_non_contig) {
                 mm_semaphores.push_back(y_semaphore);
             } else if (load_y) {
-                extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, ne11, tr1q, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
+                extra->in_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qy, qy_offset, src1, i13, i12, ne11, trq, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
                 mm_semaphores.push_back({ sem->s, sem->value + 1 });
             }
@@ -2200,7 +2178,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
             if (dst->backend == GGML_BACKEND_CPU) {
                 // copy dst to host
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, tr1q, { { sem->s, sem->value + 2 } }, {}));
+                extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_offset, d, sizeof(float) * d_ne, trq, { { sem->s, sem->value + 2 } }, {}));
             }
 
             sem->value += 2;
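The `sem->value + 1` / `sem->value + 2` pairs above are points on a timeline semaphore: the upload signals the counter to `v+1`, the matmul waits on `v+1` and signals `v+2`, and the readback waits on `v+2` — which is what lets one transfer queue and one compute queue stay correctly ordered. A hedged vulkan-hpp sketch of one such chained submission (handles and the current counter value are assumed, and the `timelineSemaphore` feature must be enabled; this is not the repository's `ggml_vk_submit`):

```cpp
#include <vulkan/vulkan.hpp>

// Sketch: submit one command buffer that waits on point v+1 and signals v+2.
void submit_chained(vk::Queue queue, vk::CommandBuffer cmd,
                    vk::Semaphore timeline, uint64_t v) {
    uint64_t wait_value   = v + 1;  // signaled by the preceding upload
    uint64_t signal_value = v + 2;  // waited on by the following readback
    vk::TimelineSemaphoreSubmitInfo timeline_info(1, &wait_value, 1, &signal_value);
    vk::PipelineStageFlags wait_stage = vk::PipelineStageFlagBits::eComputeShader;
    vk::SubmitInfo submit(1, &timeline, &wait_stage, 1, &cmd, 1, &timeline);
    submit.pNext = &timeline_info;
    queue.submit({ submit });  // no fence; completion is tracked via the timeline value
}
```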
@@ -2242,7 +2220,7 @@ static void ggml_vk_mul_mat_vec_q_f16(const ggml_tensor * src0, const ggml_tenso
     const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
 
     vk_queue& compq = vk_device.compute_queue;
-    vk_queue& tr0q = vk_device.transfer_queues[0];
+    vk_queue& trq = vk_device.transfer_queue;
 
     const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
     const bool qx_needs_dequant = x_non_contig;
@@ -2340,7 +2318,7 @@ static void ggml_vk_mul_mat_vec_q_f16(const ggml_tensor * src0, const ggml_tenso
         vk_semaphore * sem = ggml_vk_create_timeline_semaphore();
         // copy data to device
-        extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, 0, src0, 0, 0, ggml_nrows(src0), tr0q, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
+        extra->in_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Qx, 0, src0, 0, 0, ggml_nrows(src0), trq, {}, { { sem->s, sem->value + 1 } }, nullptr, &extra->memcpys));
         semaphores.push_back({ sem->s, sem->value + 1 });
 
         sem->value += 1;
@@ -2691,7 +2669,7 @@ static void ggml_vk_op_repeat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
-    vk_submission s = ggml_vk_begin_submission(vk_device.transfer_queues[0]);
+    vk_submission s = ggml_vk_begin_submission(vk_device.transfer_queue);
     vkCmdCopyBuffer(s.buffer, src_buf->buffer, dst_buf->buffer, copies.size(), copies.data());
    ggml_vk_end_submission(s, {}, {});
    extra->out_seqs.push_back({ s });
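As the hunk shows, the repeat op stays a pure transfer: every destination region is described once, and a single `vkCmdCopyBuffer` records the whole batch. A sketch of that batching pattern (handles, chunk size, and repeat count are assumed names, not the repository's):

```cpp
#include <vulkan/vulkan.h>
#include <vector>

// Sketch: tile one source chunk across a destination buffer in one command.
void record_repeat_copies(VkCommandBuffer cmd, VkBuffer src, VkBuffer dst,
                          VkDeviceSize chunk_size, uint32_t n_repeats) {
    std::vector<VkBufferCopy> copies(n_repeats);
    for (uint32_t i = 0; i < n_repeats; i++) {
        copies[i] = { /*srcOffset=*/0, /*dstOffset=*/i * chunk_size, /*size=*/chunk_size };
    }
    vkCmdCopyBuffer(cmd, src, dst, (uint32_t) copies.size(), copies.data());
}
```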
@@ -2910,13 +2888,13 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
     // copy src0 to device
     if (transfer_src0) {
         vk_semaphore * sem_x = ggml_vk_create_timeline_semaphore();
-        extra->in0_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, 0, src0, 0, 0, ggml_nrows(src0), vk_device.transfer_queues[0], {}, { { sem_x->s, sem_x->value + 1 } }, nullptr, &extra->memcpys));
+        extra->in_seqs.push_back(ggml_vk_h2d_tensor_2d(d_X, 0, src0, 0, 0, ggml_nrows(src0), vk_device.transfer_queue, {}, { { sem_x->s, sem_x->value + 1 } }, nullptr, &extra->memcpys));
         transfer_semaphores.push_back({ sem_x->s, sem_x->value + 1});
         sem_x->value += 1;
     }
     if (transfer_src1) {
         vk_semaphore * sem_y = ggml_vk_create_timeline_semaphore();
-        extra->in1_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, 0, src1, 0, 0, ggml_nrows(src1), vk_device.transfer_queues[1], {}, { { sem_y->s, sem_y->value + 1 } }, nullptr, &extra->memcpys));
+        extra->in_seqs.push_back(ggml_vk_h2d_tensor_2d(d_Y, 0, src1, 0, 0, ggml_nrows(src1), vk_device.transfer_queue, {}, { { sem_y->s, sem_y->value + 1 } }, nullptr, &extra->memcpys));
         transfer_semaphores.push_back({ sem_y->s, sem_y->value + 1 });
         sem_y->value += 1;
     }
@@ -2967,7 +2945,7 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
         extra->comp_seqs.push_back({ s });
 
         // copy dst to host
-        extra->out_seqs.push_back(ggml_vk_d2h_tensor_2d(d_D, 0, dst, vk_device.transfer_queues[1], { *fsem }, {}));
+        extra->out_seqs.push_back(ggml_vk_d2h_tensor_2d(d_D, 0, dst, vk_device.transfer_queue, { *fsem }, {}));
     } else if(dst->backend == GGML_BACKEND_CPU) {
         vk_semaphore * fsem = ggml_vk_create_binary_semaphore();
         ggml_vk_end_submission(s, std::move(transfer_semaphores), { *fsem });
@@ -2975,7 +2953,7 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
         // copy dst to host
         float * d = (float *) dst->data;
-        extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, 0, d, d_sz, vk_device.transfer_queues[1], { *fsem }, {}));
+        extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, 0, d, d_sz, vk_device.transfer_queue, { *fsem }, {}));
     } else {
         ggml_vk_end_submission(s, std::move(transfer_semaphores), {});
         extra->comp_seqs.push_back({ s });
@@ -3025,7 +3003,7 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
             extra->comp_seqs.push_back({ s });
 
             // copy dst to host
-            extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz, vk_device.transfer_queues[1], { *fsem }, {}));
+            extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz, vk_device.transfer_queue, { *fsem }, {}));
         } else {
             ggml_vk_end_submission(s, std::move(transfer_semaphores), {});
             extra->comp_seqs.push_back({ s });
@@ -3127,7 +3105,7 @@ static void ggml_vk_nop(const ggml_tensor * src0, ggml_tensor * dst) {
         ggml_vk_tensor_extra_gpu * extra = (ggml_vk_tensor_extra_gpu *) dst->extra;
         ggml_vk_tensor_extra_gpu * extra_src0 = (ggml_vk_tensor_extra_gpu *) src0->extra;
         vk_buffer * d_D = extra_src0->buffer_gpu;
-        extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, 0, dst->data, d_D->size, vk_device.transfer_queues[1], {}, {}));
+        extra->out_seqs.push_back(ggml_vk_buffer_read_async(d_D, 0, dst->data, d_D->size, vk_device.transfer_queue, {}, {}));
     }
 }
@@ -3148,7 +3126,7 @@ static void ggml_vk_transform_tensor(void * data, ggml_tensor * tensor, bool buf
     extra->buffer_gpu = new vk_buffer;
     *extra->buffer_gpu = ggml_vk_pool_malloc(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    ggml_vk_buffer_write(extra->buffer_gpu, 0, data, size, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_write(extra->buffer_gpu, 0, data, size, vk_device.transfer_queue);
 
     extra->buffer_static = buffer_static;
@@ -3178,7 +3156,7 @@ void ggml_vk_assign_buffer(ggml_tensor * tensor) {
     extra->buffer_gpu = new vk_buffer;
     *extra->buffer_gpu = ggml_vk_create_buffer(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
-    ggml_vk_buffer_memset(extra->buffer_gpu, 0, 0, VK_WHOLE_SIZE, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_memset(extra->buffer_gpu, 0, 0, VK_WHOLE_SIZE, vk_device.transfer_queue);
 
     extra->buffer_static = true;
 }
@@ -3763,13 +3741,12 @@ bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor)
     for (auto& cpy : extra->memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
     }
 
-    ggml_vk_submit(vk_device.transfer_queues[0], extra->in0_seqs, VK_NULL_HANDLE);
-    ggml_vk_submit(vk_device.transfer_queues[1], extra->in1_seqs, VK_NULL_HANDLE);
+    ggml_vk_submit(vk_device.transfer_queue, extra->in_seqs, VK_NULL_HANDLE);
     if (extra->out_seqs.empty()) {
         ggml_vk_submit(vk_device.compute_queue, extra->comp_seqs, vk_fence);
     } else {
         ggml_vk_submit(vk_device.compute_queue, extra->comp_seqs, VK_NULL_HANDLE);
-        ggml_vk_submit(vk_device.transfer_queues[1], extra->out_seqs, vk_fence);
+        ggml_vk_submit(vk_device.transfer_queue, extra->out_seqs, vk_fence);
     }
 
     VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, uint64_t(-1)), "ggml_vk_compute_forward waitForFences");
@@ -3796,8 +3773,7 @@ void ggml_vk_graph_cleanup() {
     vk_gc.pipelines.clear();
 
     ggml_vk_queue_cleanup(vk_device.compute_queue);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
+    ggml_vk_queue_cleanup(vk_device.transfer_queue);
 
     for (size_t i = 0; i < vk_gc.semaphores.size(); i++) {
         vk_device.device.destroySemaphore({ vk_gc.semaphores[i].s });
@@ -3885,7 +3861,7 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
         const size_t tensor_size = ggml_vk_tensor_size(tensor);
         tensor_data = malloc(tensor_size);
 
-        ggml_vk_buffer_read((vk_buffer *)tensor->data, 0, tensor_data, tensor_size, vk_device.transfer_queues[0]);
+        ggml_vk_buffer_read((vk_buffer *)tensor->data, 0, tensor_data, tensor_size, vk_device.transfer_queue);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -3997,7 +3973,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
                         const int idx = i3*src0->ne[2] + i2;
-                        ggml_vk_buffer_read(extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1], vk_device.transfer_queues[0]);
+                        ggml_vk_buffer_read(extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1], vk_device.transfer_queue);
                     }
                 }
@@ -4010,7 +3986,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
                 if (offset + src0_size >= extra->buffer_gpu->size) {
                     src0_size = extra->buffer_gpu->size - offset;
                 }
-                ggml_vk_buffer_read(extra->buffer_gpu, offset, src0_clone->data, src0_size, vk_device.transfer_queues[0]);
+                ggml_vk_buffer_read(extra->buffer_gpu, offset, src0_clone->data, src0_size, vk_device.transfer_queue);
                 memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
             }
         } else {
@@ -4040,7 +4016,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
                         const int idx = i3*src1->ne[2] + i2;
-                        ggml_vk_buffer_read(extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1], vk_device.transfer_queues[0]);
+                        ggml_vk_buffer_read(extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1], vk_device.transfer_queue);
                     }
                 }
@@ -4053,7 +4029,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
                 if (offset + src1_size >= extra->buffer_gpu->size) {
                     src1_size = extra->buffer_gpu->size - offset;
                 }
-                ggml_vk_buffer_read(extra->buffer_gpu, offset, src1_clone->data, src1_size, vk_device.transfer_queues[0]);
+                ggml_vk_buffer_read(extra->buffer_gpu, offset, src1_clone->data, src1_size, vk_device.transfer_queue);
                 memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
             }
         } else {
@@ -4209,7 +4185,7 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor)
             tensor_size = extra->buffer_gpu->size - extra->offset;
         }
 
-        ggml_vk_buffer_read(extra->buffer_gpu, extra->offset, tensor_data, tensor_size, vk_device.transfer_queues[0]);
+        ggml_vk_buffer_read(extra->buffer_gpu, extra->offset, tensor_data, tensor_size, vk_device.transfer_queue);
     }
 
     float first_error_result = -1.0f;
@@ -4356,9 +4332,9 @@ void ggml_vk_test_transfer(size_t ne) {
     auto begin = std::chrono::high_resolution_clock::now();
 
-    ggml_vk_buffer_write(&buffer, 0, x, sizeof(float) * ne, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_write(&buffer, 0, x, sizeof(float) * ne, vk_device.transfer_queue);
 
-    vk_device.transfer_queues[0].queue.waitIdle();
+    vk_device.transfer_queue.queue.waitIdle();
 
     auto end = std::chrono::high_resolution_clock::now();
@@ -4366,7 +4342,7 @@ void ggml_vk_test_transfer(size_t ne) {
     begin = std::chrono::high_resolution_clock::now();
 
-    ggml_vk_buffer_read(&buffer, 0, y, sizeof(float) * ne, vk_device.transfer_queues[1]);
+    ggml_vk_buffer_read(&buffer, 0, y, sizeof(float) * ne, vk_device.transfer_queue);
 
     end = std::chrono::high_resolution_clock::now();
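With a single transfer queue, the write and the read in this benchmark now exercise the same queue back to back. For reference, a sketch of turning such a timestamp pair into throughput (a self-contained helper with assumed names, not code from the repository):

```cpp
#include <chrono>
#include <iostream>

// Sketch: report throughput for `bytes` moved between `begin` and `end`.
void report_transfer_rate(std::chrono::high_resolution_clock::time_point begin,
                          std::chrono::high_resolution_clock::time_point end,
                          size_t bytes) {
    const double ms = std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0;
    const double mb = bytes / (1024.0 * 1024.0);
    std::cerr << mb << " MB in " << ms << " ms (" << mb / (ms / 1000.0) << " MB/s)" << std::endl;
}
```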
@@ -4483,13 +4459,13 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
         y[i] = 1.0f;  // (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
     }
 
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(float) * k, sizeof(float) * k, m, vk_device.transfer_queues[0], {}, {}));
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(float) * k, sizeof(float) * k, m, vk_device.transfer_queue, {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queue, {}, {}));
 
-    ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);
+    ggml_vk_submit(vk_device.transfer_queue, seq, VK_NULL_HANDLE);
 
     // Wait for transfers to finish
-    vk_device.transfer_queues[0].queue.waitIdle();
+    vk_device.transfer_queue.queue.waitIdle();
 
     auto begin = std::chrono::high_resolution_clock::now();
@@ -4504,7 +4480,7 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
     auto end = std::chrono::high_resolution_clock::now();
 
     // copy dst to host
-    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queue);
 
     float * d_chk = (float *) malloc(sizeof(float) * d_ne);
@@ -4538,7 +4514,7 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
     if (split_k > 1) {
         float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
-        ggml_vk_buffer_read(&vk_prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k, vk_device.transfer_queues[0]);
+        ggml_vk_buffer_read(&vk_prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k, vk_device.transfer_queue);
 
         std::cerr << "d_buf0: " << std::endl << std::endl;
         ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, n, m, 5, 5);
@@ -4558,8 +4534,7 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
     free(d_chk);
 
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
+    ggml_vk_queue_cleanup(vk_device.transfer_queue);
     ggml_vk_queue_cleanup(vk_device.compute_queue);
 
     ggml_vk_destroy_buffer(d_X);
@@ -4645,13 +4620,13 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int sp
         y[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX);
     }
 
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queues[0], {}, {}));
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, n, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queue, {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, n, vk_device.transfer_queue, {}, {}));
 
-    ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);
+    ggml_vk_submit(vk_device.transfer_queue, seq, VK_NULL_HANDLE);
 
     // Wait for transfers to finish
-    vk_device.transfer_queues[0].queue.waitIdle();
+    vk_device.transfer_queue.queue.waitIdle();
 
     auto begin = std::chrono::high_resolution_clock::now();
@@ -4666,7 +4641,7 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int sp
     auto end = std::chrono::high_resolution_clock::now();
 
     // copy dst to host
-    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queue);
 
     float * fx = (float *) malloc(sizeof(float) * x_ne);
     float * fy = (float *) malloc(sizeof(float) * y_ne);
@@ -4695,8 +4670,7 @@ void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int sp
     free(fy);
     free(d_chk);
 
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
+    ggml_vk_queue_cleanup(vk_device.transfer_queue);
     ggml_vk_queue_cleanup(vk_device.compute_queue);
 
     ggml_vk_destroy_buffer(d_X);
@@ -4783,13 +4757,13 @@ void ggml_vk_test_matmul_f16_f32(size_t m, size_t n, size_t k, size_t num_it, in
         y[i] = rand() / (float)RAND_MAX;
     }
 
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queues[0], {}, {}));
-    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queues[0], {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_X, 0, x, sizeof(ggml_fp16_t) * k, sizeof(ggml_fp16_t) * k, m, vk_device.transfer_queue, {}, {}));
+    seq.push_back(ggml_vk_buffer_write_2d_async(&d_Y, 0, y, sizeof(float) * k, sizeof(float) * k, n, vk_device.transfer_queue, {}, {}));
 
-    ggml_vk_submit(vk_device.transfer_queues[0], seq, VK_NULL_HANDLE);
+    ggml_vk_submit(vk_device.transfer_queue, seq, VK_NULL_HANDLE);
 
     // Wait for transfers to finish
-    vk_device.transfer_queues[0].queue.waitIdle();
+    vk_device.transfer_queue.queue.waitIdle();
 
     auto begin = std::chrono::high_resolution_clock::now();
@@ -4804,7 +4778,7 @@ void ggml_vk_test_matmul_f16_f32(size_t m, size_t n, size_t k, size_t num_it, in
     auto end = std::chrono::high_resolution_clock::now();
 
     // copy dst to host
-    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queues[0]);
+    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne, vk_device.transfer_queue);
 
     float * fx = (float *) malloc(sizeof(float) * x_ne);
     float * d_chk = (float *) malloc(sizeof(float) * d_ne);
@@ -4830,8 +4804,7 @@ void ggml_vk_test_matmul_f16_f32(size_t m, size_t n, size_t k, size_t num_it, in
     free(fx);
     free(d_chk);
 
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[0]);
-    ggml_vk_queue_cleanup(vk_device.transfer_queues[1]);
+    ggml_vk_queue_cleanup(vk_device.transfer_queue);
     ggml_vk_queue_cleanup(vk_device.compute_queue);
 
     ggml_vk_destroy_buffer(d_X);