Fix synchronization on AMD, add barriers for buffer ownership transfer, add debug flag and prints

parent 8dd585e8cb, commit 0c4d841cf1
2 changed files with 324 additions and 79 deletions

ggml-vulkan.cpp (367 lines changed)

@@ -71,9 +71,11 @@ struct vk_buffer {
     // Staging buffers
     vk_buffer * sb_write;
     vk_buffer * sb_read;
 
+    uint32_t qf_owner;
 };
 
 struct vk_pipeline {
+    std::string name;
     vk::DescriptorSetLayout dsl;
     vk::DescriptorPool descriptor_pool;
     vk::DescriptorSet descriptor_set;
@@ -121,7 +123,7 @@ uint32_t vk_device_vendor_id;
 vk_queue vk_compute_queue;
 vk_queue vk_transfer_queues[VK_TRANSFER_QUEUE_COUNT];
 VmaAllocator vk_allocator;
-vk::PipelineStageFlags vk_stage_flags[8] = { vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands };
+vk::PipelineStageFlags vk_stage_flags[8] = { vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader };
 vk_pipeline vk_pipeline_matmul_f32_l, vk_pipeline_matmul_f32_m, vk_pipeline_matmul_f32_s, vk_pipeline_matmul_f16_l, vk_pipeline_matmul_f16_m, vk_pipeline_matmul_f16_s;
 vk_pipeline vk_pipeline_matmul_split_k_reduce;
 vk_pipeline vk_pipeline_f16_to_f32, vk_pipeline_dequant_q4_0;
@@ -133,11 +135,15 @@ bool vk_fp16_support = false;
 static std::vector<std::tuple<void*, size_t, vk_buffer>> vk_buf_list;
 
 static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_count, std::array<uint32_t, 3> wg_denoms, std::vector<int>&& specialization_constants) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_pipeline(" << path << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_count << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants)" << std::endl;
+#endif
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0);
 
     vk_pipeline pipeline;
 
+    pipeline.name = path;
     pipeline.parameter_count = parameter_count;
     pipeline.push_constant_size = push_constant_count * sizeof(int);
     pipeline.wg_denoms = wg_denoms;
@@ -227,6 +233,9 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::s
 }
 
 static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
+#endif
     if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
         // Reuse command buffer
         return q.cmd_buffers[q.cmd_buffer_idx++];
@@ -246,6 +255,9 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) {
 }
 
 static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_submission()" << std::endl;
+#endif
     vk_submission s;
     s.buffer = ggml_vk_create_cmd_buffer(q);
     s.wait_semaphores = wait_semaphores;
@@ -254,10 +266,16 @@ static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector<vk::Sema
 }
 
 static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_sequence_1()" << std::endl;
+#endif
     return { ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores)) };
 }
 
 static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk::Fence fence) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_submit()" << std::endl;
+#endif
     if (sequences.empty()) {
         return;
     }
@@ -285,39 +303,10 @@ static void ggml_vk_submit(vk_queue& q, std::vector<vk_sequence>& sequences, vk:
     sequences.clear();
 }
 
-static vk_submission ggml_vk_submit_pipeline(vk_pipeline& pipeline, std::vector<vk_buffer *> buffers, const std::vector<int>&& push_constants, std::array<uint32_t, 3> elements, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
-    std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
-    std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
-    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
-        descriptor_buffer_infos.push_back({buffers[i]->buffer, 0, buffers[i]->size});
-    }
-    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
-        write_descriptor_sets.push_back({pipeline.descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
-    }
-
-    vk_device.updateDescriptorSets(write_descriptor_sets, {});
-
-    vk_submission s;
-    s.buffer = ggml_vk_create_cmd_buffer(q);
-
-    s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
-    s.buffer.pushConstants<int>(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
-    s.buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline);
-    s.buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                pipeline.layout,
-                0,
-                { pipeline.descriptor_set },
-                {});
-    s.buffer.dispatch(CEIL_DIV(elements[0], pipeline.wg_denoms[0]), CEIL_DIV(elements[1], pipeline.wg_denoms[1]), CEIL_DIV(elements[2], pipeline.wg_denoms[2]));
-    s.buffer.end();
-
-    s.wait_semaphores = wait_semaphores;
-    s.signal_semaphores = signal_semaphores;
-
-    return s;
-}
-
 static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
+#endif
     const uint32_t qfsize = queue_family_props.size();
 
     // Try with avoid preferences first
@@ -341,6 +330,13 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
         }
     }
 
+    // Fall back to ignoring min_num_queues
+    for (size_t i = 0; i < qfsize; i++) {
+        if (queue_family_props[i].queueFlags & required) {
+            return i;
+        }
+    }
+
     std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
 
     for(auto &q_family : queue_family_props) {
@@ -350,6 +346,9 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
 }
 
 static vk_queue ggml_vk_create_queue(uint32_t queue_family_index, uint32_t queue_index) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_queue()" << std::endl;
+#endif
     vk_queue q;
     q.queue_family_index = queue_family_index;
 
@@ -364,6 +363,9 @@ static vk_queue ggml_vk_create_queue(uint32_t queue_family_index, uint32_t queue
 }
 
 static vk::Semaphore ggml_vk_create_semaphore(vk_queue& q) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_semaphore()" << std::endl;
+#endif
     vk::Semaphore semaphore = vk_device.createSemaphore({});
     q.semaphores.push_back(semaphore);
 
@@ -371,6 +373,9 @@ static vk::Semaphore ggml_vk_create_semaphore(vk_queue& q) {
 }
 
 static void ggml_vk_queue_cleanup(vk_queue& q) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
+#endif
     q.queue.waitIdle();
 
     for (auto semaphore : q.semaphores) {
@@ -383,6 +388,9 @@ static void ggml_vk_queue_cleanup(vk_queue& q) {
 }
 
 static vk_buffer ggml_vk_create_buffer(size_t size, VmaAllocationCreateFlags alloc_flags, VmaMemoryUsage vma_usage, VkMemoryPropertyFlags req_flags = 0) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_create_buffer(" << size << ")" << std::endl;
+#endif
     vk_buffer buf;
 
     buf.size = size;
@@ -391,7 +399,7 @@ static vk_buffer ggml_vk_create_buffer(size_t size, VmaAllocationCreateFlags all
         size,
         vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
         vk::SharingMode::eExclusive,
-        1,
+        0,
         nullptr,
     };
 
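
Note on the "1" to "0" change above: a buffer's queue family index list is only read for vk::SharingMode::eConcurrent; with eExclusive it is ignored, so the count is set to 0 (pQueueFamilyIndices is already nullptr) and cross-queue handoff is instead done explicitly with the ownership-transfer barriers added below. A minimal sketch of the resulting create info, using the same vulkan.hpp constructor order as the code (illustrative, not part of the commit):

    #include <vulkan/vulkan.hpp>

    // Exclusive-mode storage buffer: no queue family list at creation time;
    // ownership moves later via buffer memory barriers.
    vk::BufferCreateInfo make_buffer_info(vk::DeviceSize size) {
        return vk::BufferCreateInfo{
            vk::BufferCreateFlags{},
            size,
            vk::BufferUsageFlagBits::eStorageBuffer
                | vk::BufferUsageFlagBits::eTransferSrc
                | vk::BufferUsageFlagBits::eTransferDst,
            vk::SharingMode::eExclusive,
            0,       // queueFamilyIndexCount: ignored for eExclusive
            nullptr, // pQueueFamilyIndices
        };
    }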
@@ -412,10 +420,47 @@ static vk_buffer ggml_vk_create_buffer(size_t size, VmaAllocationCreateFlags all
     buf.sb_write = nullptr;
     buf.sb_read = nullptr;
 
+    buf.qf_owner = vk::QueueFamilyIgnored;
+
     return buf;
 }
 
+static void ggml_vk_sync_buffers(vk::CommandBuffer& cmd_buffer, std::vector<vk_buffer>&& buffers, vk_queue& q, vk::AccessFlags src_mask, vk::AccessFlags dst_mask) {
+    std::vector<vk::BufferMemoryBarrier> bmem_barriers;
+
+    uint32_t sfi;
+    uint32_t dfi;
+
+    for (auto& buf : buffers) {
+        if (buf.qf_owner != vk::QueueFamilyIgnored && buf.qf_owner != q.queue_family_index) {
+            sfi = buf.qf_owner;
+            dfi = q.queue_family_index;
+            buf.qf_owner = dfi;
+        } else {
+            sfi = vk::QueueFamilyIgnored;
+            dfi = vk::QueueFamilyIgnored;
+        }
+        bmem_barriers.push_back({ src_mask, dst_mask, sfi, dfi, buf.buffer, 0, VK_WHOLE_SIZE });
+    }
+
+    if (bmem_barriers.empty()) {
+        return;
+    }
+
+    cmd_buffer.pipelineBarrier(
+        vk::PipelineStageFlagBits::eComputeShader,
+        vk::PipelineStageFlagBits::eComputeShader,
+        {},
+        {},
+        bmem_barriers,
+        {}
+    );
+}
+
 static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl;
+#endif
     buf.size = 0;
     PROFILE("ggml_vk_destroy_buffer",
     vmaDestroyBuffer(vk_allocator, buf.buffer, buf.allocation);
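
Note: ggml_vk_sync_buffers doubles as a plain memory barrier and as a queue family ownership transfer (QFOT). When a buffer's tracked qf_owner differs from the family of the queue the commands will run on, the barrier carries real source/destination family indices and the tracked owner is updated; otherwise both indices are vk::QueueFamilyIgnored and it degenerates to an ordinary barrier. For reference, the Vulkan spec phrases an exclusive-mode QFOT as a release barrier on the source queue paired with an acquire barrier on the destination queue; the helper above records only the acquire-style half on the destination queue. A sketch of the two spec-described halves (illustrative names, not from the commit):

    #include <vulkan/vulkan.hpp>

    // Recorded on the queue that currently owns the buffer: makes writes
    // available and releases ownership to dst_family.
    vk::BufferMemoryBarrier release_barrier(vk::Buffer buf, uint32_t src_family, uint32_t dst_family) {
        return { vk::AccessFlagBits::eMemoryWrite, vk::AccessFlags{},
                 src_family, dst_family, buf, 0, VK_WHOLE_SIZE };
    }

    // Recorded on the receiving queue with identical family indices:
    // acquires ownership and makes the data visible to subsequent reads.
    vk::BufferMemoryBarrier acquire_barrier(vk::Buffer buf, uint32_t src_family, uint32_t dst_family) {
        return { vk::AccessFlags{}, vk::AccessFlagBits::eMemoryRead,
                 src_family, dst_family, buf, 0, VK_WHOLE_SIZE };
    }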
@@ -439,12 +484,17 @@ void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k, size_t num_it, int sp
 void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k, size_t num_it, int split_k, int shader_size);
 
 void ggml_vk_init(void) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_init()" << std::endl;
+#endif
     char* GGML_VULKAN_DEVICE = getenv("GGML_VULKAN_DEVICE");
     int dev_num = (GGML_VULKAN_DEVICE == NULL ? 0 : atoi(GGML_VULKAN_DEVICE));
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
     const std::vector<const char*> layers = {
-        // "VK_LAYER_KHRONOS_validation",
+#ifdef VK_VALIDATE
+        "VK_LAYER_KHRONOS_validation",
+#endif
     };
     vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers.size(), layers.data());
     vk_instance = vk::createInstance(instance_create_info);
@@ -560,7 +610,7 @@ void ggml_vk_init(void) {
     vk_pipeline_matmul_split_k_reduce = ggml_vk_create_pipeline("vk_shaders/matmul_split_k_reduce.spv", "main", 1, 3, {32, 32, 1}, {});
 
     vk_pipeline_f16_to_f32 = ggml_vk_create_pipeline("vk_shaders/f16_to_f32.spv", "main", 2, 1, {64, 1, 1}, {});
-    vk_pipeline_dequant_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_q4_0.spv", "main", 2, 1, {32, 1, 1}, {});
+    vk_pipeline_dequant_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_q4_0.spv", "main", 2, 1, {256*32, 1, 1}, {}); // Group size * values per quant group
 
     // Queues
     vk_compute_queue = ggml_vk_create_queue(compute_queue_family_index, 0);
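
Note: the new wg_denoms value matches the reworked shader below, where local_size_x is raised to 256 and each invocation unpacks one full 32-value q4_0 block, so a single workgroup now covers 256*32 = 8192 output values. Since dispatch counts are computed as CEIL_DIV(elements, wg_denoms), the denominator must be that product. A quick check of the arithmetic (illustrative):

    #include <cstdio>

    #define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))

    int main() {
        const unsigned x_ne = 4096u * 4096u;  // example element count
        const unsigned wg_denom = 256 * 32;   // workgroup size * values per quant block
        std::printf("workgroups: %u\n", CEIL_DIV(x_ne, wg_denom));  // prints 2048
        return 0;
    }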
@@ -612,6 +662,9 @@ void ggml_vk_init(void) {
 }
 
 static vk_pipeline* ggml_get_to_fp32_vk(ggml_type type) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_get_to_fp32_vk()" << std::endl;
+#endif
     switch (type) {
         case GGML_TYPE_Q4_0:
             return &vk_pipeline_dequant_q4_0;
@@ -651,6 +704,9 @@ static vk_buffer g_vk_buffer_pool[MAX_VK_BUFFERS];
 static std::atomic_flag g_vk_pool_lock = ATOMIC_FLAG_INIT;
 
 static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf, VmaAllocationCreateFlags alloc_flags) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
+#endif
     PROFILE("ggml_vk_pool_malloc",
     scoped_spin_lock lock(g_vk_pool_lock);
 
@@ -687,6 +743,9 @@ static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf, VmaAllocationCreate
 }
 
 static void ggml_vk_pool_free(vk_buffer& buffer) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_pool_free(" << buffer.size << ")" << std::endl;
+#endif
     PROFILE("ggml_vk_pool_free",
     scoped_spin_lock lock(g_vk_pool_lock);
 
@@ -694,6 +753,8 @@ static void ggml_vk_pool_free(vk_buffer& buffer) {
         vk_buffer& b = g_vk_buffer_pool[i];
         if (b.size == 0) {
             b = buffer;
+            // Set owning queue family index to ignored to avoid synchronization on next use
+            b.qf_owner = VK_QUEUE_FAMILY_IGNORED;
             return;
         }
     }
@@ -703,6 +764,9 @@ static void ggml_vk_pool_free(vk_buffer& buffer) {
 }
 
 void* ggml_vk_host_malloc(size_t size) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
+#endif
     if (getenv("GGML_VK_NO_PINNED") != nullptr) {
         return nullptr;
     }
@@ -726,6 +790,9 @@ void* ggml_vk_host_malloc(size_t size) {
 }
 
 void ggml_vk_host_free(void* ptr) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_host_free()" << std::endl;
+#endif
     vk_buffer* buf = nullptr;
     for (size_t i = 0; i < vk_buf_list.size(); i++) {
         const uint8_t* addr = (const uint8_t*) std::get<0>(vk_buf_list[i]);
@@ -743,7 +810,55 @@ void ggml_vk_host_free(void* ptr) {
     ggml_vk_destroy_buffer(*buf);
 }
 
+static vk_submission ggml_vk_begin_submission(vk_queue& q) {
+    vk_submission s;
+    s.buffer = ggml_vk_create_cmd_buffer(q);
+    s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+
+    return s;
+}
+
+static void ggml_vk_dispatch_pipeline(vk_submission& s, vk_pipeline& pipeline, std::vector<vk_buffer>&& buffers, const std::vector<int>&& push_constants, std::array<uint32_t, 3> elements, vk_queue& q) {
+    uint32_t wg0 = CEIL_DIV(elements[0], pipeline.wg_denoms[0]);
+    uint32_t wg1 = CEIL_DIV(elements[1], pipeline.wg_denoms[1]);
+    uint32_t wg2 = CEIL_DIV(elements[2], pipeline.wg_denoms[2]);
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline.name << ", (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
+#endif
+    std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
+    std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
+    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
+        descriptor_buffer_infos.push_back({buffers[i].buffer, 0, buffers[i].size});
+    }
+    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
+        write_descriptor_sets.push_back({pipeline.descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
+    }
+
+    vk_device.updateDescriptorSets(write_descriptor_sets, {});
+
+    ggml_vk_sync_buffers(s.buffer, std::move(buffers), q, vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryRead);
+
+    s.buffer.pushConstants<int>(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
+    s.buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline);
+    s.buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+                pipeline.layout,
+                0,
+                { pipeline.descriptor_set },
+                {});
+    s.buffer.dispatch(wg0, wg1, wg2);
+}
+
+static void ggml_vk_end_submission(vk_submission& s, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+    s.buffer.end();
+
+    s.wait_semaphores = wait_semaphores;
+    s.signal_semaphores = signal_semaphores;
+}
+
 static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl;
+#endif
     VkMemoryPropertyFlags mem_prop_flags;
     vmaGetAllocationMemoryProperties(vk_allocator, dst->allocation, &mem_prop_flags);
 
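
Note: these three helpers split the removed ggml_vk_submit_pipeline into record/dispatch/finish phases, so several dispatches and barriers can share one command buffer (as ggml_vk_matmul does below). A hypothetical call site, sketched with placeholder buffers and semaphores (not from the commit):

    // Record one command buffer holding a single barrier-guarded dispatch,
    // then attach the semaphores it should wait on and signal.
    vk_submission s = ggml_vk_begin_submission(vk_compute_queue);
    ggml_vk_dispatch_pipeline(s, vk_pipeline_f16_to_f32,
                              { src_buf, dst_buf },            // placeholder vk_buffer values
                              { (int)n_elements },             // push constants
                              { (uint32_t)n_elements, 1, 1 },  // element counts
                              vk_compute_queue);
    ggml_vk_end_submission(s, { s_wait }, { s_signal });       // placeholder semaphores
    compute_seqs.push_back({ s });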
@@ -778,6 +893,7 @@ static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset,
     }
 
     s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+    ggml_vk_sync_buffers(s.buffer, { *dst }, q, vk::AccessFlagBits::eMemoryRead, vk::AccessFlagBits::eMemoryWrite);
     vkCmdCopyBuffer(s.buffer, buf->buffer, dst->buffer, height, slices.data());
     s.buffer.end();
     return { s };
@@ -799,6 +915,7 @@ static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset,
         width * height};
 
     s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+    ggml_vk_sync_buffers(s.buffer, { *dst }, q, vk::AccessFlagBits::eMemoryRead, vk::AccessFlagBits::eMemoryWrite);
     vkCmdCopyBuffer(s.buffer, dst->sb_write->buffer, dst->buffer, 1, &buf_copy);
     s.buffer.end();
 
@@ -810,6 +927,9 @@ static vk_sequence ggml_vk_buffer_write_2d_async(vk_buffer* dst, size_t offset,
 }
 
 static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, vk_queue& q) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl;
+#endif
     VkMemoryPropertyFlags mem_prop_flags;
     vmaGetAllocationMemoryProperties(vk_allocator, dst->allocation, &mem_prop_flags);
 
@@ -831,14 +951,23 @@ static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void *
 }
 
 static vk_sequence ggml_vk_buffer_write_async(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
+#endif
     return ggml_vk_buffer_write_2d_async(dst, offset, src, 0, size, 1, q, std::move(wait_semaphores), std::move(signal_semaphores));
 }
 
 static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl;
+#endif
     ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1, q);
 }
 
 static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_buffer_read_async(" << size << ")" << std::endl;
+#endif
     // Check if dst is pinned memory
     vk_buffer* buf = nullptr;
     size_t buf_offset = 0;
@@ -864,6 +993,7 @@ static vk_sequence ggml_vk_buffer_read_async(vk_buffer* src, size_t offset, void
 
     vk_submission s = ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores));
     s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+    ggml_vk_sync_buffers(s.buffer, { *src }, q, vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryRead);
     vkCmdCopyBuffer(s.buffer, src->buffer, buf->buffer, 1, &buf_copy);
     s.buffer.end();
 
|
|
||||||
static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q) {
|
static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size, vk_queue& q) {
|
||||||
|
#ifdef VK_DEBUG
|
||||||
|
std::cerr << "ggml_vk_buffer_read(" << size << ")" << std::endl;
|
||||||
|
#endif
|
||||||
VkMemoryPropertyFlags mem_prop_flags;
|
VkMemoryPropertyFlags mem_prop_flags;
|
||||||
vmaGetAllocationMemoryProperties(vk_allocator, src->allocation, &mem_prop_flags);
|
vmaGetAllocationMemoryProperties(vk_allocator, src->allocation, &mem_prop_flags);
|
||||||
|
|
||||||
|
@@ -902,6 +1035,7 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
 
     std::vector<vk_sequence> s = { ggml_vk_create_sequence_1(q, {}, {}) };
     s[0][0].buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
+    ggml_vk_sync_buffers(s[0][0].buffer, { *src }, q, vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryRead);
     vkCmdCopyBuffer(s[0][0].buffer, src->buffer, buf->buffer, 1, &buf_copy);
     s[0][0].buffer.end();
     ggml_vk_submit(q, s, fence);
@@ -926,6 +1060,7 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
     vk::CommandBuffer cmd_buffer = ggml_vk_create_cmd_buffer(q);
     vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
     cmd_buffer.begin(cmd_buffer_begin_info);
+    ggml_vk_sync_buffers(cmd_buffer, { *src }, q, vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryRead);
    vkCmdCopyBuffer(cmd_buffer, src->buffer, src->sb_read->buffer, 1, &buf_copy);
     cmd_buffer.end();
 
@@ -945,6 +1080,9 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
 }
 
 static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_h2d_tensor_2d()" << std::endl;
+#endif
     const uint64_t ne0 = src->ne[0];
     const uint64_t ne1 = src->ne[1];
     const uint64_t nb0 = src->nb[0];
@@ -975,6 +1113,9 @@ static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const st
 }
 
 static int ggml_vk_guess_split_k(int m, int n, int k) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_guess_split_k()" << std::endl;
+#endif
     if (k > 128 && (m < 128 || n < 128)) {
         return 4;
     }
@@ -983,6 +1124,9 @@ static int ggml_vk_guess_split_k(int m, int n, int k) {
 }
 
 static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16, int m, int n) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_guess_matmul_pipeline()" << std::endl;
+#endif
     if (bit16) {
         if (m <= 32 || n <= 32) {
             return &vk_pipeline_matmul_f16_s;
@@ -1003,20 +1147,37 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16, int m, int n) {
 }
 
 static vk_sequence ggml_vk_matmul(vk_pipeline& pipeline, vk_buffer& a, vk_buffer& b, vk_buffer& d, int m, int n, int k, int split_k, vk_queue& q, std::vector<vk::Semaphore>&& wait_semaphores, std::vector<vk::Semaphore>&& signal_semaphores) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_matmul(" << m << ", " << n << ", " << k << ")" << std::endl;
+#endif
+    vk_submission s = ggml_vk_begin_submission(q);
     if (split_k == 1) {
-        return { ggml_vk_submit_pipeline(pipeline, { &a, &b, &d }, { m, n, k, k, k, m, k }, { (uint32_t)m, (uint32_t)n, 1 }, q, std::move(wait_semaphores), std::move(signal_semaphores)) };
+        ggml_vk_dispatch_pipeline(s, pipeline, { a, b, d }, { m, n, k, k, k, m, k }, { (uint32_t)m, (uint32_t)n, 1 }, q);
+        ggml_vk_end_submission(s, std::move(wait_semaphores), std::move(signal_semaphores));
+        return { s };
     }
 
     // Synchronize the two submissions
-    vk::Semaphore semaphore = ggml_vk_create_semaphore(q);
+    ggml_vk_dispatch_pipeline(s, pipeline, { a, b, d }, { m, n, k, k, k, m, CEIL_DIV(k, split_k) }, { (uint32_t)m * split_k, (uint32_t)n, 1 }, q);
+    s.buffer.pipelineBarrier(
+        vk::PipelineStageFlagBits::eComputeShader,
+        vk::PipelineStageFlagBits::eComputeShader,
+        {},
+        {},
+        {
+            { vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, vk::QueueFamilyIgnored, vk::QueueFamilyIgnored, d.buffer, 0, VK_WHOLE_SIZE } },
+        {}
+    );
+    ggml_vk_dispatch_pipeline(s, vk_pipeline_matmul_split_k_reduce, { d }, { m, n, split_k }, { (uint32_t)m, (uint32_t)n, 1 }, q);
+    ggml_vk_end_submission(s, std::move(wait_semaphores), std::move(signal_semaphores));
 
-    vk_submission s1 = ggml_vk_submit_pipeline(pipeline, { &a, &b, &d }, { m, n, k, k, k, m, CEIL_DIV(k, split_k) }, { (uint32_t)m * split_k, (uint32_t)n, 1 }, q, std::move(wait_semaphores), { semaphore });
-    vk_submission s2 = ggml_vk_submit_pipeline(vk_pipeline_matmul_split_k_reduce, { &d }, { m, n, split_k }, { (uint32_t)m, (uint32_t)n, 1 }, q, { semaphore }, std::move(signal_semaphores));
-
-    return { s1, s2 };
+    return { s };
 }
 
 static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_mul_mat_f32()" << std::endl;
+#endif
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
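
Note: the split-k path now records both the partial matmul and the reduce into one submission, replacing the old semaphore between two submissions with a pipeline barrier on d inside the command buffer. What is being synchronized is just a chunked dot product: each of the split_k passes handles k/split_k of the inner dimension, and the reduce pass sums the partial results. A CPU sketch of that arithmetic (illustrative, not from the commit):

    #include <cstdio>
    #include <vector>

    int main() {
        const int k = 8, split_k = 4, chunk = k / split_k;
        const std::vector<float> a = {1, 2, 3, 4, 5, 6, 7, 8};  // 1x8 row of A
        const std::vector<float> b = {1, 1, 1, 1, 1, 1, 1, 1};  // 8x1 column of B

        // "matmul" passes: one partial sum per k-slice
        std::vector<float> partial(split_k, 0.0f);
        for (int s = 0; s < split_k; s++) {
            for (int i = 0; i < chunk; i++) {
                partial[s] += a[s * chunk + i] * b[s * chunk + i];
            }
        }

        // "split_k_reduce" pass: sums the partials; it must observe the
        // writes above, hence the barrier on d in the GPU version
        float d = 0.0f;
        for (float p : partial) {
            d += p;
        }
        std::printf("dot = %f\n", d);  // 36.000000
        return 0;
    }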
@@ -1050,26 +1211,55 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     std::vector<vk_sequence> transfer_0_seqs;
     std::vector<vk_sequence> transfer_1_seqs;
 
+    vk::Semaphore s_it_x;
+    vk::Semaphore s_it_y;
+
+    const bool load_x = src0->backend != GGML_BACKEND_CPU;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const bool first = i03 == 0 && i02 == 0;
+            const bool last = i03 == ne03 - 1 && i02 == ne02 - 1;
+
             vk::Semaphore s_x;
             vk::Semaphore s_y = ggml_vk_create_semaphore(vk_compute_queue);
             std::vector<vk::Semaphore> semaphores = { s_y };
             // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (load_x) {
                 s_x = ggml_vk_create_semaphore(vk_compute_queue);
                 semaphores.push_back(s_x);
-                transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
+                if (first) {
+                    transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
+                } else {
+                    // Wait for previous matmul to be done before writing to the input buffers again
+                    transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], { s_it_x }, { s_x }));
+                }
             }
 
             ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
 
-            transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], {}, { s_y }));
+            if (first) {
+                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], {}, { s_y }));
+            } else {
+                // Wait for previous matmul to be done before writing to the input buffers again
+                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], { s_it_y }, { s_y }));
+            }
 
             // compute
             vk::Semaphore s_mm = ggml_vk_create_semaphore(vk_compute_queue);
 
-            compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
+            if (!last) {
+                if (load_x) {
+                    s_it_x = ggml_vk_create_semaphore(vk_compute_queue);
+                    s_it_y = ggml_vk_create_semaphore(vk_compute_queue);
+                    compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm, s_it_x, s_it_y }));
+                } else {
+                    s_it_y = ggml_vk_create_semaphore(vk_compute_queue);
+                    compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm, s_it_y }));
+                }
+            } else {
+                compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
+            }
 
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata) {
|
static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata) {
|
||||||
|
#ifdef VK_DEBUG
|
||||||
|
std::cerr << "ggml_vk_mul_mat_f16()" << std::endl;
|
||||||
|
#endif
|
||||||
GGML_ASSERT(vk_fp16_support);
|
GGML_ASSERT(vk_fp16_support);
|
||||||
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||||
|
@ -1141,17 +1334,30 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
std::vector<vk_sequence> transfer_0_seqs;
|
std::vector<vk_sequence> transfer_0_seqs;
|
||||||
std::vector<vk_sequence> transfer_1_seqs;
|
std::vector<vk_sequence> transfer_1_seqs;
|
||||||
|
|
||||||
|
vk::Semaphore s_it_x;
|
||||||
|
vk::Semaphore s_it_y;
|
||||||
|
|
||||||
|
const bool load_x = src1->backend != GGML_BACKEND_GPU;
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
|
const bool first = i03 == 0 && i02 == 0;
|
||||||
|
const bool last = i03 == ne03 - 1 && i02 == ne02 - 1;
|
||||||
|
|
||||||
vk::Semaphore s_x;
|
vk::Semaphore s_x;
|
||||||
vk::Semaphore s_y = ggml_vk_create_semaphore(vk_compute_queue);
|
vk::Semaphore s_y = ggml_vk_create_semaphore(vk_compute_queue);
|
||||||
std::vector<vk::Semaphore> semaphores = { s_y };
|
std::vector<vk::Semaphore> semaphores = { s_y };
|
||||||
|
|
||||||
// copy data to device
|
// copy data to device
|
||||||
if (src1->backend != GGML_BACKEND_GPU) {
|
if (load_x) {
|
||||||
s_x = ggml_vk_create_semaphore(vk_compute_queue);
|
s_x = ggml_vk_create_semaphore(vk_compute_queue);
|
||||||
semaphores.push_back(s_x);
|
semaphores.push_back(s_x);
|
||||||
transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
|
if (first) {
|
||||||
|
transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
|
||||||
|
} else {
|
||||||
|
// Wait for previous matmul to be done before writing to the input buffers again
|
||||||
|
transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_X, 0, src0, i03, i02, vk_transfer_queues[0], { s_it_x }, { s_x }));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
|
ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
|
||||||
|
@ -1179,11 +1385,27 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
transfer_1_seqs.push_back(ggml_vk_buffer_write_async(&d_Y, 0, tmp, sizeof(ggml_fp16_t) * y_ne, vk_transfer_queues[1], {}, { s_y }));
|
if (first) {
|
||||||
|
transfer_1_seqs.push_back(ggml_vk_buffer_write_async(&d_Y, 0, tmp, sizeof(ggml_fp16_t) * y_ne, vk_transfer_queues[1], {}, { s_y }));
|
||||||
|
} else {
|
||||||
|
// Wait for previous matmul to be done before writing to the input buffers again
|
||||||
|
transfer_1_seqs.push_back(ggml_vk_buffer_write_async(&d_Y, 0, tmp, sizeof(ggml_fp16_t) * y_ne, vk_transfer_queues[1], { s_it_y }, { s_y }));
|
||||||
|
}
|
||||||
|
|
||||||
// compute
|
// compute
|
||||||
vk::Semaphore s_mm = ggml_vk_create_semaphore(vk_compute_queue);
|
vk::Semaphore s_mm = ggml_vk_create_semaphore(vk_compute_queue);
|
||||||
compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
|
if (!last) {
|
||||||
|
if (load_x) {
|
||||||
|
s_it_x = ggml_vk_create_semaphore(vk_compute_queue);
|
||||||
|
s_it_y = ggml_vk_create_semaphore(vk_compute_queue);
|
||||||
|
compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm, s_it_x, s_it_y }));
|
||||||
|
} else {
|
||||||
|
s_it_y = ggml_vk_create_semaphore(vk_compute_queue);
|
||||||
|
compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm, s_it_y }));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
|
||||||
|
}
|
||||||
|
|
||||||
// copy dst to host
|
// copy dst to host
|
||||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||||
|
@@ -1195,6 +1417,7 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     }
 
     ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
+    // vk_transfer_queues[0].queue.waitIdle();
 
     // cleanup waits for the queue to be done
     ggml_vk_queue_cleanup(vk_transfer_queues[0]);
@@ -1209,6 +1432,9 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 }
 
 static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#ifdef VK_DEBUG
+    std::cerr << "ggml_vk_mul_mat_q_f32()" << std::endl;
+#endif
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1251,8 +1477,16 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     std::vector<vk_sequence> transfer_0_seqs;
    std::vector<vk_sequence> transfer_1_seqs;
 
+    vk::Semaphore s_it_x;
+    vk::Semaphore s_it_y;
+
+    const bool load_x = src0->backend != GGML_BACKEND_GPU;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
+            const bool first = i03 == 0 && i02 == 0;
+            const bool last = i03 == ne03 - 1 && i02 == ne02 - 1;
+
             vk::Semaphore s_x;
             vk::Semaphore s_y = ggml_vk_create_semaphore(vk_compute_queue);
             vk::Semaphore s_q = ggml_vk_create_semaphore(vk_compute_queue);
@@ -1263,10 +1497,15 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             vk::Semaphore s_mm = ggml_vk_create_semaphore(vk_compute_queue);
 
             // copy src0 to device if necessary
-            if (src0->backend == GGML_BACKEND_CPU) {
+            if (load_x) {
                 s_x = ggml_vk_create_semaphore(vk_compute_queue);
-                transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Q, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
                 q_semaphores.push_back(s_x);
+                if (first) {
+                    transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Q, 0, src0, i03, i02, vk_transfer_queues[0], {}, { s_x }));
+                } else {
+                    // Wait for previous dequant to be done before writing to the input buffers again
+                    transfer_0_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Q, 0, src0, i03, i02, vk_transfer_queues[0], { s_it_x }, { s_x }));
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = *(vk_buffer *) src0->data;
             } else {
@@ -1276,7 +1515,12 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
 
             // copy src1 to device
-            transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], {}, { s_y }));
+            if (first) {
+                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], {}, { s_y }));
+            } else {
+                // Wait for previous matmul to be done before writing to the input buffers again
+                transfer_1_seqs.push_back(ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02, vk_transfer_queues[1], { s_it_y }, { s_y }));
+            }
 
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 GGML_ASSERT(false);
@@ -1298,10 +1542,23 @@ static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             } else { // general dequantization kernel + VK matrix matrix multiplication
 
                 // convert src0 to fp32 on device
-                compute_seqs.push_back({ ggml_vk_submit_pipeline(*to_fp32_vk, {&d_Q, &d_X}, { (int)x_ne }, { (uint32_t)x_ne, 1, 1}, vk_compute_queue, std::move(q_semaphores), { s_q }) });
+                vk_submission s = ggml_vk_begin_submission(vk_compute_queue);
+                ggml_vk_dispatch_pipeline(s, *to_fp32_vk, {d_Q, d_X}, { (int)x_ne }, { (uint32_t)x_ne, 1, 1}, vk_compute_queue);
+                if (load_x && !last) {
+                    s_it_x = ggml_vk_create_semaphore(vk_compute_queue);
+                    ggml_vk_end_submission(s, std::move(q_semaphores), { s_q, s_it_x });
+                } else {
+                    ggml_vk_end_submission(s, std::move(q_semaphores), { s_q });
+                }
+                compute_seqs.push_back({ s });
 
                 // compute
-                compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
+                if (!last) {
+                    s_it_y = ggml_vk_create_semaphore(vk_compute_queue);
+                    compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm, s_it_y }));
+                } else {
+                    compute_seqs.push_back(ggml_vk_matmul(*pipeline, d_X, d_Y, d_D, ne01, ne11, ne10, split_k, vk_compute_queue, std::move(semaphores), { s_mm }));
+                }
             }
 
             // copy dst to host
dequant_q4_0 compute shader (second changed file; path not shown in this view)

@@ -1,5 +1,6 @@
 #version 450
 
+#extension GL_EXT_control_flow_attributes : require
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
 #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
@@ -7,7 +8,7 @@
 #define QUANT_K 32
 #define QUANT_R 2
 
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
 
 struct block_q4_0
 {
@@ -24,34 +25,21 @@ layout (push_constant) uniform parameter
 } p;
 
 void main() {
-    const int idx = int(gl_GlobalInvocationID.x);
-
-    const int i = int(gl_WorkGroupID.x * gl_WorkGroupSize.x + gl_LocalInvocationID.x*2);
+    const int i = int(gl_GlobalInvocationID.x);
 
-    if (idx >= p.N) {
+    if (i >= p.N) {
         return;
     }
 
-    const int qk = QUANT_K;
-    const int qr = QUANT_R;
-
-    const int ib = i/qk; // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
+    const block_q4_0 blk = x[i];
 
-    // dequantize
-    float v0, v1;
-    const float d = float(x[ib].d);
+    const float d = float(blk.d);
 
-    const uint8_t vui = x[ib].qs[iqs];
-
-    const int8_t vi0 = int8_t(vui & 0xF);
-    const int8_t vi1 = int8_t(vui >> 4);
-
-    v0 = (vi0 - 8)*d;
-    v1 = (vi1 - 8)*d;
-
-    y[iybs + iqs + 0] = v0;
-    y[iybs + iqs + y_offset] = v1;
+    [[unroll]] for (int j = 0; j < QUANT_K/2; ++j) {
+        const int x0 = (blk.qs[j] & 0x0F) - 8;
+        const int x1 = (blk.qs[j] >> 4) - 8;
+
+        y[i*QUANT_K + j + 0 ] = x0*d;
+        y[i*QUANT_K + j + QUANT_K/2] = x1*d;
+    }
 }
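
Note: the reworked shader assigns one invocation per q4_0 block instead of one per output value: each thread loads a 32-value block once, writes the 16 low nibbles to the first half of its output block and the 16 high nibbles to the second half. A CPU reference of the same unpacking (illustrative C++, not part of the commit; the GPU scale is fp16, simplified to float here):

    #include <cstdint>
    #include <cstdio>

    constexpr int QUANT_K = 32;

    struct block_q4_0 {
        float   d;                // per-block scale
        uint8_t qs[QUANT_K / 2];  // 32 packed 4-bit values
    };

    // Mirrors the shader loop: (nibble - 8) * d; low nibbles fill the first
    // half of the output block, high nibbles the second half.
    void dequant_q4_0(const block_q4_0& blk, float* y) {
        for (int j = 0; j < QUANT_K / 2; ++j) {
            const int x0 = (blk.qs[j] & 0x0F) - 8;
            const int x1 = (blk.qs[j] >> 4) - 8;
            y[j]               = x0 * blk.d;
            y[j + QUANT_K / 2] = x1 * blk.d;
        }
    }

    int main() {
        block_q4_0 blk = { 0.5f, {} };
        blk.qs[0] = 0x9F;  // low nibble 15, high nibble 9
        float y[QUANT_K] = {};
        dequant_q4_0(blk, y);
        std::printf("%f %f\n", y[0], y[QUANT_K / 2]);  // 3.500000 0.500000
        return 0;
    }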