From 3adc7b1d60376295cd591aa16ad4a48c3d021aa6 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Wed, 28 Jun 2023 07:36:56 +0200
Subject: [PATCH] First FP16 attempt, disabled for now

---
 Makefile                  |   3 +-
 ggml-vulkan.cpp           | 636 +++++++++++++++---
 vk_shaders/matmul_f16.spv | Bin 0 -> 2144 bytes
 .../matmul_f32.glsl       |   0
 4 files changed, 547 insertions(+), 92 deletions(-)
 create mode 100644 vk_shaders/matmul_f16.spv
 rename ggml-vulkan-matmul.comp => vk_shaders/matmul_f32.glsl (100%)

diff --git a/Makefile b/Makefile
index e6d8f9e00..d176453a2 100644
--- a/Makefile
+++ b/Makefile
@@ -220,7 +220,8 @@ ifdef LLAMA_VULKAN
 OBJS += ggml-vulkan.o
 ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
-	glslc -fshader-stage=compute --target-env=vulkan1.2 -O ggml-vulkan-matmul.comp -o ggml-vulkan-matmul.spv
+	glslc -fshader-stage=compute --target-env=vulkan1.2 -O vk_shaders/matmul_f32.glsl -o vk_shaders/matmul_f32.spv
+	glslc -fshader-stage=compute --target-env=vulkan1.2 -O vk_shaders/matmul_f16.glsl -o vk_shaders/matmul_f16.spv
 endif
 
 ifneq ($(filter aarch64%,$(UNAME_M)),)
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 6832653c4..9a6cf68dc 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -3,6 +3,7 @@
 #ifdef VK_CHK_KERNEL
 #include <cblas.h>
 #include <cmath>
+#include <chrono>
 #endif
 
 #include <vulkan/vulkan.hpp>
@@ -34,7 +35,6 @@ inline static void* ggml_aligned_malloc(size_t size, size_t alignment) {
 #include <vk_mem_alloc.h>
 #include <atomic>
 #include <fstream>
-#include <chrono>
 #include <iostream>
 #include <vector>
 
@@ -44,6 +44,22 @@ inline static void* ggml_aligned_malloc(size_t size, size_t alignment) {
 
 #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
 
+struct vk_buffer {
+    vk::Buffer buffer;
+    VmaAllocation allocation;
+    VmaAllocationInfo info;
+    size_t size = 0;
+    // Staging buffers
+    vk_buffer * sb_write;
+    vk_buffer * sb_read;
+};
+
+struct vk_pipeline {
+    vk::DescriptorSetLayout dsl;
+    vk::PipelineLayout layout;
+    vk::Pipeline pipeline;
+};
+
 vk::Instance vk_instance;
 uint32_t vk_compute_queue_family_index;
 uint32_t vk_transfer_queue_family_index;
@@ -51,29 +67,71 @@ vk::PhysicalDevice vk_physical_device;
 vk::Device vk_device;
 vk::CommandPool vk_command_pool_compute, vk_command_pool_transfer;
 VmaAllocator vk_allocator;
-vk::DescriptorSetLayout vk_pipeline_matmul_dsl;
-vk::PipelineLayout vk_pipeline_matmul_layout;
-vk::Pipeline vk_pipeline_matmul;
+vk_pipeline vk_pipeline_matmul_f32, vk_pipeline_matmul_f16;
 VmaAllocation vk_buffer_qa_alloc, vk_buffer_a_alloc, vk_buffer_b_alloc, vk_buffer_c_alloc;
 vk::Buffer vk_buffer_qa, vk_buffer_a, vk_buffer_b, vk_buffer_c;
 
 bool vk_fp16_support = false;
 
-struct vk_buffer {
-    vk::Buffer buffer;
-    VmaAllocation allocation;
-    VmaAllocationInfo info;
-    size_t size = 0;
-};
-
 static std::vector<std::tuple<void*, size_t, vk_buffer>> vk_buf_list;
 
+static vk_pipeline ggml_vk_create_pipeline(const std::string& path, const std::string& entrypoint, const std::vector<vk::DescriptorSetLayoutBinding>& dsl_binding, const vk::PushConstantRange& pcr) {
+    vk_pipeline pipeline;
+
+    std::vector<char> matmul_shader_contents;
+    if (std::ifstream shader_file{ path, std::ios::binary | std::ios::ate }) {
+        const size_t file_size = shader_file.tellg();
+        shader_file.seekg(0);
+        matmul_shader_contents.resize(file_size, '\0');
+        shader_file.read(matmul_shader_contents.data(), file_size);
+    } else {
+        std::cerr << "ggml_vulkan: Invalid shader path " << path << std::endl;
+        abort();
+    }
+
+    vk::ShaderModuleCreateInfo shader_module_create_info(
+        vk::ShaderModuleCreateFlags(),
+        matmul_shader_contents.size(),
+        reinterpret_cast<const uint32_t*>(matmul_shader_contents.data())
+    );
+    vk::ShaderModule shader_module = vk_device.createShaderModule(shader_module_create_info);
+
+    vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
+        vk::DescriptorSetLayoutCreateFlags(),
+        dsl_binding);
+    pipeline.dsl = vk_device.createDescriptorSetLayout(descriptor_set_layout_create_info);
+
+    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline.dsl, pcr);
+    pipeline.layout = vk_device.createPipelineLayout(pipeline_layout_create_info);
+    vk::PipelineCache pipeline_cache = vk_device.createPipelineCache(vk::PipelineCacheCreateInfo());
+
+    vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
+        vk::PipelineShaderStageCreateFlags(),
+        vk::ShaderStageFlagBits::eCompute,
+        shader_module,
+        entrypoint.c_str());
+    vk::ComputePipelineCreateInfo compute_pipeline_create_info(
+        vk::PipelineCreateFlags(),
+        pipeline_shader_create_info,
+        pipeline.layout);
+    pipeline.pipeline = vk_device.createComputePipeline(pipeline_cache, compute_pipeline_create_info).value;
+
+    return pipeline;
+}
+
+void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k);
+void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k);
+
 void ggml_vk_init(void) {
     char* GGML_VULKAN_DEVICE = getenv("GGML_VULKAN_DEVICE");
     int dev_num = (GGML_VULKAN_DEVICE == NULL ? 0 : atoi(GGML_VULKAN_DEVICE));
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
-    const std::vector<const char*> layers = { "VK_LAYER_KHRONOS_validation" };
+    const std::vector<const char*> layers = {
+#ifdef GGML_DEBUG
+        "VK_LAYER_KHRONOS_validation",
+#endif
+    };
     vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers.size(), layers.data());
     vk_instance = vk::createInstance(instance_create_info);
@@ -135,57 +193,35 @@ void ggml_vk_init(void) {
     vmaCreateAllocator(&allocator_info, &vk_allocator);
 
     // Shaders
-    std::vector<char> matmul_shader_contents;
-    if (std::ifstream shader_file{ "ggml-vulkan-matmul.spv", std::ios::binary | std::ios::ate }) {
-        const size_t file_size = shader_file.tellg();
-        shader_file.seekg(0);
-        matmul_shader_contents.resize(file_size, '\0');
-        shader_file.read(matmul_shader_contents.data(), file_size);
-    }
-
-    vk::ShaderModuleCreateInfo shader_module_create_info(
-        vk::ShaderModuleCreateFlags(),
-        matmul_shader_contents.size(),
-        reinterpret_cast<const uint32_t*>(matmul_shader_contents.data())
-    );
-    vk::ShaderModule shader_module = vk_device.createShaderModule(shader_module_create_info);
-
-    const std::vector<vk::DescriptorSetLayoutBinding> descriptor_set_layout_binding = {
+    std::vector<vk::DescriptorSetLayoutBinding> dsl_binding = {
         {0, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
         {1, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute},
         {2, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute}
     };
-    vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
-        vk::DescriptorSetLayoutCreateFlags(),
-        descriptor_set_layout_binding);
-    vk_pipeline_matmul_dsl = vk_device.createDescriptorSetLayout(descriptor_set_layout_create_info);
 
-    vk::PushConstantRange push_constant_range(
+    vk::PushConstantRange pcr(
         vk::ShaderStageFlagBits::eCompute,
         0,
         6 * sizeof(int)
     );
+    vk_pipeline_matmul_f32 = ggml_vk_create_pipeline("vk_shaders/matmul_f32.spv", "main", dsl_binding, pcr);
+    vk_pipeline_matmul_f16 = ggml_vk_create_pipeline("vk_shaders/matmul_f16.spv", "main", dsl_binding, pcr);
 
-    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), vk_pipeline_matmul_dsl, push_constant_range);
-    vk_pipeline_matmul_layout = vk_device.createPipelineLayout(pipeline_layout_create_info);
-    vk::PipelineCache pipeline_cache = vk_device.createPipelineCache(vk::PipelineCacheCreateInfo());
-
-    vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
-        vk::PipelineShaderStageCreateFlags(),
-        vk::ShaderStageFlagBits::eCompute,
-        shader_module,
-        "main");
-    vk::ComputePipelineCreateInfo compute_pipeline_create_info(
-        vk::PipelineCreateFlags(),
-        pipeline_shader_create_info,
-        vk_pipeline_matmul_layout);
-    vk_pipeline_matmul = vk_device.createComputePipeline(pipeline_cache, compute_pipeline_create_info).value;
-
+    // Command pools
     vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(), vk_compute_queue_family_index);
     vk_command_pool_compute = vk_device.createCommandPool(command_pool_create_info_compute);
 
     vk::CommandPoolCreateInfo command_pool_create_info_transfer(vk::CommandPoolCreateFlags(), vk_transfer_queue_family_index);
     vk_command_pool_transfer = vk_device.createCommandPool(command_pool_create_info_transfer);
+
+    // for (size_t m = 1; m < 10; m++) {
+    //     for (size_t n = 1; n < 10; n++) {
+    //         for (size_t k = 1; k < 10; k++) {
+    //             ggml_vk_test_matmul_f32(m * 128, n * 128, k * 128);
+    //             ggml_vk_test_matmul_f16(m * 128, n * 128, k * 128);
+    //         }
+    //     }
+    // }
 }
 
 // buffer pool for vulkan
@@ -233,12 +269,27 @@ static vk_buffer ggml_vk_create_buffer(size_t size, VmaAllocationCreateFlags all
                     &buf.allocation,
                     &buf.info);
 
+    buf.sb_write = nullptr;
+    buf.sb_read = nullptr;
+
     return buf;
 }
 
 static void ggml_vk_destroy_buffer(vk_buffer& buf) {
     buf.size = 0;
     vmaDestroyBuffer(vk_allocator, buf.buffer, buf.allocation);
+
+    // Cleanup staging buffers
+    if (buf.sb_write != nullptr) {
+        vmaDestroyBuffer(vk_allocator, buf.sb_write->buffer, buf.sb_write->allocation);
+        free(buf.sb_write);
+        buf.sb_write = nullptr;
+    }
+    if (buf.sb_read != nullptr) {
+        vmaDestroyBuffer(vk_allocator, buf.sb_read->buffer, buf.sb_read->allocation);
+        free(buf.sb_read);
+        buf.sb_read = nullptr;
+    }
 }
 
 static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf, VmaAllocationCreateFlags alloc_flags) {
@@ -294,7 +345,7 @@ void* ggml_vk_host_malloc(size_t size) {
         return nullptr;
     }
 
-    vk_buffer buf = ggml_vk_create_buffer(size, VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO_PREFER_HOST, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
+    vk_buffer buf = ggml_vk_create_buffer(size, VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO_PREFER_HOST, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
 
     VkMemoryPropertyFlags mem_prop_flags;
     vmaGetAllocationMemoryProperties(vk_allocator, buf.allocation, &mem_prop_flags);
@@ -307,8 +358,6 @@ void* ggml_vk_host_malloc(size_t size) {
         return nullptr;
     }
 
-    printf("allocate %.2f MB of pinned memory\n", size/1024.0/1024.0);
-
     vk_buf_list.push_back(std::make_tuple(buf.info.pMappedData, size, buf));
 
     return buf.info.pMappedData;
@@ -347,9 +396,9 @@ static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src
 
     // Buffer is already mapped
     if(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
-        memcpy(dst->info.pMappedData, src, size);
+        memcpy((uint8_t *)dst->info.pMappedData + offset, src, size);
 
         if (!(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
-            vmaFlushAllocation(vk_allocator, dst->allocation, 0, VK_WHOLE_SIZE);
+            vmaFlushAllocation(vk_allocator, dst->allocation, offset, size);
         }
     } else {
         // Check if src is pinned memory
@@ -379,26 +428,24 @@ static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src
     cmd_buffer.end();
 
     vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0);
-    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
 
     vk::SubmitInfo submit_info(0,
                                nullptr,
                                nullptr,
                                1,
                                &cmd_buffer);
-    queue.submit({ submit_info }, fence);
-    vk_device.waitForFences({ fence },
-                            true,
-                            uint64_t(-1));
-    vk_device.destroyFence(fence);
+    queue.submit({ submit_info }, VK_NULL_HANDLE);
 
     return;
     }
 
-    // Staging buffer required
-    vk_buffer staging_buf = ggml_vk_create_buffer(size, VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO, 0);
+    // Staging buffer required, malloc because of async transfer
+    if (dst->sb_write == nullptr) {
+        dst->sb_write = (vk_buffer *) malloc(sizeof(vk_buffer));
+        *dst->sb_write = ggml_vk_create_buffer(size, VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO, 0);
+    }
 
-    memcpy(((uint8_t *)staging_buf.info.pMappedData) + offset, src, size);
-    vmaFlushAllocation(vk_allocator, staging_buf.allocation, 0, VK_WHOLE_SIZE);
+    // The staging buffer is only `size` bytes; write to its start, buf_copy below reads from srcOffset 0
+    memcpy(dst->sb_write->info.pMappedData, src, size);
+    vmaFlushAllocation(vk_allocator, dst->sb_write->allocation, 0, VK_WHOLE_SIZE);
 
     VkBufferCopy buf_copy = {
         0, // srcOffset
         offset, // dstOffset,
         size}; // size
 
     vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_transfer);
     vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
     cmd_buffer.begin(cmd_buffer_begin_info);
-    vkCmdCopyBuffer(cmd_buffer, staging_buf.buffer, dst->buffer, 1, &buf_copy);
+    vkCmdCopyBuffer(cmd_buffer, dst->sb_write->buffer, dst->buffer, 1, &buf_copy);
     cmd_buffer.end();
 
     vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0);
-    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
 
     vk::SubmitInfo submit_info(0,
                                nullptr,
                                nullptr,
                                1,
                                &cmd_buffer);
-    queue.submit({ submit_info }, fence);
-    vk_device.waitForFences({ fence },
-                            true,
-                            uint64_t(-1));
-
-    vk_device.destroyFence(fence);
-    ggml_vk_destroy_buffer(staging_buf);
+    queue.submit({ submit_info }, VK_NULL_HANDLE);
     }
 }
 
@@ -434,9 +474,9 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
 
     if(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
         if (!(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
-            vmaInvalidateAllocation(vk_allocator, src->allocation, 0, VK_WHOLE_SIZE);
+            vmaInvalidateAllocation(vk_allocator, src->allocation, offset, size);
        }
-        memcpy(dst, src->info.pMappedData, size);
+        memcpy(dst, (uint8_t *) src->info.pMappedData + offset, size);
     } else {
         // Check if dst is pinned memory
         vk_buffer* buf = nullptr;
@@ -465,7 +505,7 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
         cmd_buffer.end();
 
         vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0);
-        vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
+        vk::Fence fence = vk_device.createFence(vk::FenceCreateFlags{});
 
         vk::SubmitInfo submit_info(0,
                                    nullptr,
@@ -476,25 +516,30 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
         vk_device.waitForFences({ fence },
                                 true,
                                 uint64_t(-1));
+        vk_device.destroyFence(fence);
 
         return;
     }
 
-    vk_buffer staging_buf = ggml_vk_create_buffer(size, VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO, 0);
+
+    if (src->sb_read == nullptr) {
+        src->sb_read = (vk_buffer *) malloc(sizeof(vk_buffer));
+        *src->sb_read = ggml_vk_create_buffer(size, VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO, 0);
+    }
 
     VkBufferCopy buf_copy = {
         offset, // srcOffset
         0, // dstOffset,
         size}; // size
 
-    vmaInvalidateAllocation(vk_allocator, staging_buf.allocation, 0, VK_WHOLE_SIZE);
+    vmaInvalidateAllocation(vk_allocator, src->sb_read->allocation, 0, VK_WHOLE_SIZE);
     vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_transfer);
     vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
     cmd_buffer.begin(cmd_buffer_begin_info);
-    vkCmdCopyBuffer(cmd_buffer, src->buffer, staging_buf.buffer, 1, &buf_copy);
+    vkCmdCopyBuffer(cmd_buffer, src->buffer, src->sb_read->buffer, 1, &buf_copy);
     cmd_buffer.end();
 
     vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0);
-    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
+    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo{});
 
     vk::SubmitInfo submit_info(0,
                                nullptr,
@@ -505,10 +550,11 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
     vk_device.waitForFences({ fence },
                             true,
                             uint64_t(-1));
 
-    memcpy(dst, staging_buf.info.pMappedData, size);
+    memcpy(dst, src->sb_read->info.pMappedData, size);
 
     vk_device.destroyFence(fence);
-    ggml_vk_destroy_buffer(staging_buf);
+    ggml_vk_destroy_buffer(*src->sb_read);
+    free(src->sb_read);
+    src->sb_read = nullptr;
     }
 }
 
@@ -555,8 +601,6 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
 
-    const float alpha = 1.0f;
-    const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;
@@ -567,16 +611,16 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     if (src0->backend == GGML_BACKEND_GPU) {
         d_X = *(vk_buffer*) src0->data;
     } else {
-        ggml_vk_pool_malloc(ggml_type_size(src0->type) * x_ne, &d_X, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
+        ggml_vk_pool_malloc(ggml_type_size(src0->type) * x_ne, &d_X, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
     }
-    ggml_vk_pool_malloc(sizeof(float) * y_ne, &d_Y, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
-    ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_D, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
+    ggml_vk_pool_malloc(sizeof(float) * y_ne, &d_Y, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_D, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
 
     vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, 3);
     vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size);
     vk::DescriptorPool descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info);
 
-    vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &vk_pipeline_matmul_dsl);
+    vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &vk_pipeline_matmul_f32.dsl);
     const std::vector<vk::DescriptorSet> descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info);
     vk::DescriptorSet descriptor_set = descriptor_sets.front();
     vk::DescriptorBufferInfo d_X_buffer_info(d_X.buffer, 0, sizeof(float) * x_ne);
@@ -605,14 +649,16 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02);
 
             // compute
+#ifdef VK_CHK_KERNEL
             auto begin = std::chrono::high_resolution_clock::now();
+#endif
 
             vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
             cmd_buffer.begin(cmd_buffer_begin_info);
-            cmd_buffer.pushConstants<int>(vk_pipeline_matmul_layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
-            cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul);
+            cmd_buffer.pushConstants<int>(vk_pipeline_matmul_f32.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
+            cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul_f32.pipeline);
             cmd_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                                          vk_pipeline_matmul_layout,
+                                          vk_pipeline_matmul_f32.layout,
                                           0,
                                           { descriptor_set },
                                           {});
@@ -626,14 +672,20 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                                    nullptr,
                                    1,
                                    &cmd_buffer);
+
+            // Wait for transfers to finish
+            vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle();
+
             queue.submit({ submit_info }, fence);
 
             vk_device.waitForFences({ fence },
                                     true,
                                     uint64_t(-1));
+#ifdef VK_CHK_KERNEL
             auto end = std::chrono::high_resolution_clock::now();
 
             std::cout << "m=" << ne01 << " n=" << ne11 << " k=" << ne10 << " matmul " << std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0 << "ms" << std::endl;
+#endif
 
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
@@ -671,6 +723,165 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     ggml_vk_pool_free(d_D);
 }
 
+static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata) {
+    GGML_ASSERT(vk_fp16_support);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb00 = src0->nb[0];
+    const int nb01 = src0->nb[1];
+    const int nb02 = src0->nb[2];
+    const int nb03 = src0->nb[3];
+
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+
+    vk_buffer d_X;
+    vk_buffer d_Y;
+    vk_buffer d_D;
+    if (src0->backend == GGML_BACKEND_GPU) {
+        d_X = *(vk_buffer*) src0->data;
+    } else {
+        ggml_vk_pool_malloc(sizeof(float) * x_ne, &d_X, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    }
+    ggml_vk_pool_malloc(sizeof(float) * y_ne, &d_Y, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_D, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+
+    bool src0_cont_rows = nb00 == sizeof(ggml_fp16_t);
+    bool src0_cont_cols = (size_t)nb01 == ne00*sizeof(ggml_fp16_t);
+
+    vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, 3);
+    vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size);
+    vk::DescriptorPool descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info);
+
+    // src0 is converted to fp32 below, so this path still runs the fp32 pipeline for now
+    vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &vk_pipeline_matmul_f32.dsl);
+    const std::vector<vk::DescriptorSet> descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info);
+    vk::DescriptorSet descriptor_set = descriptor_sets.front();
+    vk::DescriptorBufferInfo d_X_buffer_info(d_X.buffer, 0, sizeof(float) * x_ne);
+    vk::DescriptorBufferInfo d_Y_buffer_info(d_Y.buffer, 0, sizeof(float) * y_ne);
+    vk::DescriptorBufferInfo d_D_buffer_info(d_D.buffer, 0, sizeof(float) * d_ne);
+
+    const std::vector<vk::WriteDescriptorSet> write_descriptor_sets = {
+        {descriptor_set, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_X_buffer_info},
+        {descriptor_set, 1, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_Y_buffer_info},
+        {descriptor_set, 2, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_D_buffer_info},
+    };
+    vk_device.updateDescriptorSets(write_descriptor_sets, {});
+
+    std::array<int, 6> push_constants = { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 };
+    assert( ( sizeof( push_constants ) <= vk_physical_device.getProperties().limits.maxPushConstantsSize ) && "Too many push constants" );
+
+    vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_compute);
+    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            // copy data to device
+            if (src1->backend != GGML_BACKEND_GPU) {
+                ggml_vk_h2d_tensor_2d(&d_Y, 0, src1, i03, i02);
+            }
+
+            // convert src0 to fp32 on host
+            // TODO: use multiple threads
+            float * const tmp = (float *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
+            char * src0i = (char *) src0->data + i03*nb03 + i02*nb02;
+            if (src0_cont_rows) {
+                if (src0_cont_cols) {
+                    ggml_fp16_to_fp32_row((ggml_fp16_t *) src0i, tmp, ne00*ne01);
+                }
+                else {
+                    for (int64_t i01 = 0; i01 < ne01; i01++) {
+                        ggml_fp16_to_fp32_row((ggml_fp16_t *) (src0i + i01*nb01), tmp + i01*ne00, ne00);
+                    }
+                }
+            }
+            else {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        // very slow due to no inlining
+                        tmp[i01*ne00 + i00] = ggml_fp16_to_fp32(*(ggml_fp16_t *) (src0i + i01*nb01 + i00*nb00));
+                    }
+                }
+            }
+            ggml_vk_buffer_write(&d_X, 0, tmp, sizeof(float) * x_ne);
+
+            // compute
+#ifdef VK_CHK_KERNEL
+            auto begin = std::chrono::high_resolution_clock::now();
+#endif
+
+            vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
+            cmd_buffer.begin(cmd_buffer_begin_info);
+            cmd_buffer.pushConstants<int>(vk_pipeline_matmul_f32.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
+            cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul_f32.pipeline);
+            cmd_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+                                          vk_pipeline_matmul_f32.layout,
+                                          0,
+                                          { descriptor_set },
+                                          {});
+            cmd_buffer.dispatch(CEIL_DIV(ne01, 128), CEIL_DIV(ne11, 128), 1);
+            cmd_buffer.end();
+
+            vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
+
+            vk::SubmitInfo submit_info(0,
+                                       nullptr,
+                                       nullptr,
+                                       1,
+                                       &cmd_buffer);
+
+            // Wait for transfers to finish
+            vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle();
+
+            queue.submit({ submit_info }, fence);
+            vk_device.waitForFences({ fence },
+                                    true,
+                                    uint64_t(-1));
+
+#ifdef VK_CHK_KERNEL
+            auto end = std::chrono::high_resolution_clock::now();
+
+            std::cout << "m=" << ne01 << " n=" << ne11 << " k=" << ne10 << " matmul " << std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0 << "ms" << std::endl;
+#endif
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            ggml_vk_buffer_read(&d_D, 0, tmp, sizeof(float) * d_ne);
+
+#ifdef VK_CHK_KERNEL
+            for (size_t i = 0; i < (size_t) d_ne; i++) {
+                if (std::fabs(tmp[i] - d[i]) > 0.01f) {
+                    printf("d[%zu] = %f d_chk[%zu] = %f\n", i, tmp[i], i, d[i]);
+                    abort();
+                }
+            }
+#else
+            // the kernel result is already fp32, copy it into dst
+            memcpy(d, tmp, sizeof(float) * d_ne);
+#endif
+        }
+    }
+
+    vk_device.destroyFence(fence);
+
+    if (src0->backend != GGML_BACKEND_GPU) {
+        ggml_vk_pool_free(d_X);
+    }
+    ggml_vk_pool_free(d_Y);
+    ggml_vk_pool_free(d_D);
+}
+
 static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     assert(false);
 //    const int64_t ne00 = src0->ne[0];
@@ -841,7 +1052,7 @@ void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
     }
     else if (src0->type == GGML_TYPE_F16) {
         if (ggml_vk_mul_mat_use_f16(src0, src1, dst)) {
-            // ggml_vk_mul_mat_f16(src0, src1, dst, wdata, wsize);
+            ggml_vk_mul_mat_f16(src0, src1, dst, wdata);
         }
         else {
             ggml_vk_mul_mat_q_f32(src0, src1, dst);
@@ -861,3 +1072,246 @@ size_t ggml_vk_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct g
     }
     return 0;
 }
+
+#ifdef VK_CHK_KERNEL
+void ggml_vk_test_matmul_f32(size_t m, size_t n, size_t k) {
+    const size_t x_ne = m * k;
+    const size_t y_ne = k * n;
+    const size_t d_ne = m * n;
+
+    vk_buffer d_X;
+    vk_buffer d_Y;
+    vk_buffer d_D;
+    ggml_vk_pool_malloc(sizeof(float) * x_ne, &d_X, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    ggml_vk_pool_malloc(sizeof(float) * y_ne, &d_Y, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_D, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+
+    vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, 3);
+    vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size);
+    vk::DescriptorPool descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info);
+
+    vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &vk_pipeline_matmul_f32.dsl);
+    const std::vector<vk::DescriptorSet> descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info);
+    vk::DescriptorSet descriptor_set = descriptor_sets.front();
+    vk::DescriptorBufferInfo d_X_buffer_info(d_X.buffer, 0, sizeof(float) * x_ne);
+    vk::DescriptorBufferInfo d_Y_buffer_info(d_Y.buffer, 0, sizeof(float) * y_ne);
+    vk::DescriptorBufferInfo d_D_buffer_info(d_D.buffer, 0, sizeof(float) * d_ne);
+
+    const std::vector<vk::WriteDescriptorSet> write_descriptor_sets = {
+        {descriptor_set, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_X_buffer_info},
+        {descriptor_set, 1, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_Y_buffer_info},
+        {descriptor_set, 2, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_D_buffer_info},
+    };
+    vk_device.updateDescriptorSets(write_descriptor_sets, {});
+
+    std::array<int, 6> push_constants = { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m };
+    assert( ( sizeof( push_constants ) <= vk_physical_device.getProperties().limits.maxPushConstantsSize ) && "Too many push constants" );
+
+    float* x = (float *) malloc(sizeof(float) * x_ne);
+    float* y = (float *) malloc(sizeof(float) * y_ne);
+    float* d = (float *) malloc(sizeof(float) * d_ne);
+
+    for (size_t i = 0; i < x_ne; i++) {
+        x[i] = rand() / (float)RAND_MAX;
+    }
+    for (size_t i = 0; i < y_ne; i++) {
+        y[i] = rand() / (float)RAND_MAX;
+    }
+
+    ggml_vk_buffer_write(&d_X, 0, x, sizeof(float) * x_ne);
+    ggml_vk_buffer_write(&d_Y, 0, y, sizeof(float) * y_ne);
+
+    vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_compute);
+    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
+
+    // compute
+    auto begin = std::chrono::high_resolution_clock::now();
+
+    vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
+    cmd_buffer.begin(cmd_buffer_begin_info);
+    cmd_buffer.pushConstants<int>(vk_pipeline_matmul_f32.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
+    cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul_f32.pipeline);
+    cmd_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+                                  vk_pipeline_matmul_f32.layout,
+                                  0,
+                                  { descriptor_set },
+                                  {});
+    cmd_buffer.dispatch(CEIL_DIV(m, 128), CEIL_DIV(n, 128), 1);
+    cmd_buffer.end();
+
+    vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
+
+    vk::SubmitInfo submit_info(0,
+                               nullptr,
+                               nullptr,
+                               1,
+                               &cmd_buffer);
+
+    // Wait for transfers to finish
+    vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle();
+
+    queue.submit({ submit_info }, fence);
+    vk_device.waitForFences({ fence },
+                            true,
+                            uint64_t(-1));
+
+    auto end = std::chrono::high_resolution_clock::now();
+
+    // copy dst to host
+    ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne);
+
+    float * d_chk = (float *) malloc(sizeof(float) * d_ne);
+
+    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+                m, n, k,
+                1.0f, x, k,
+                y, k,
+                0.0f, d_chk, m);
+
+    double avg_err = 0.0;
+
+    for (size_t r = 0; r < m; r++) {
+        for (size_t c = 0; c < n; c++) {
+            avg_err += std::fabs(d[c * m + r] - d_chk[c * m + r]);
+        }
+    }
+
+    std::cout << "TEST FP32 m=" << m << " n=" << n << " k=" << k << " matmul " << std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0 << "ms avg_err=" << avg_err / (m * n) << std::endl;
+
+    free(d_chk);
+
+    vk_device.destroyFence(fence);
+
+    ggml_vk_pool_free(d_X);
+    ggml_vk_pool_free(d_Y);
+    ggml_vk_pool_free(d_D);
+
+    free(x);
+    free(y);
+    free(d);
+}
+
+void ggml_vk_test_matmul_f16(size_t m, size_t n, size_t k) {
+    const size_t x_ne = m * k;
+    const size_t y_ne = k * n;
+    const size_t d_ne = m * n;
+
+    vk_buffer d_X;
+    vk_buffer d_Y;
+    vk_buffer d_D;
+    ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &d_X, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &d_Y, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+    ggml_vk_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_D, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT);
+
+    vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, 3);
+    vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size);
+    vk::DescriptorPool descriptor_pool = vk_device.createDescriptorPool(descriptor_pool_create_info);
+
+    // this test exercises the fp16 shader, so use the fp16 pipeline's layout
+    vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(descriptor_pool, 1, &vk_pipeline_matmul_f16.dsl);
+    const std::vector<vk::DescriptorSet> descriptor_sets = vk_device.allocateDescriptorSets(descriptor_set_alloc_info);
+    vk::DescriptorSet descriptor_set = descriptor_sets.front();
+    vk::DescriptorBufferInfo d_X_buffer_info(d_X.buffer, 0, sizeof(ggml_fp16_t) * x_ne);
+    vk::DescriptorBufferInfo d_Y_buffer_info(d_Y.buffer, 0, sizeof(ggml_fp16_t) * y_ne);
+    vk::DescriptorBufferInfo d_D_buffer_info(d_D.buffer, 0, sizeof(ggml_fp16_t) * d_ne);
+
+    const std::vector<vk::WriteDescriptorSet> write_descriptor_sets = {
+        {descriptor_set, 0, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_X_buffer_info},
+        {descriptor_set, 1, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_Y_buffer_info},
+        {descriptor_set, 2, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &d_D_buffer_info},
+    };
+    vk_device.updateDescriptorSets(write_descriptor_sets, {});
+
+    std::array<int, 6> push_constants = { (int)m, (int)n, (int)k, (int)k, (int)k, (int)m };
+    assert( ( sizeof( push_constants ) <= vk_physical_device.getProperties().limits.maxPushConstantsSize ) && "Too many push constants" );
+
+    ggml_fp16_t* x = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * x_ne);
+    ggml_fp16_t* y = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * y_ne);
+    ggml_fp16_t* d = (ggml_fp16_t *) malloc(sizeof(ggml_fp16_t) * d_ne);
+
+    for (size_t i = 0; i < x_ne; i++) {
+        x[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX);
+    }
+    for (size_t i = 0; i < y_ne; i++) {
+        y[i] = ggml_fp32_to_fp16(rand() / (float)RAND_MAX);
+    }
+
+    ggml_vk_buffer_write(&d_X, 0, x, sizeof(ggml_fp16_t) * x_ne);
+    ggml_vk_buffer_write(&d_Y, 0, y, sizeof(ggml_fp16_t) * y_ne);
+
+    vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_compute);
+    vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
+
+    // compute
+    auto begin = std::chrono::high_resolution_clock::now();
+
+    vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
+    cmd_buffer.begin(cmd_buffer_begin_info);
+    cmd_buffer.pushConstants<int>(vk_pipeline_matmul_f16.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constants);
+    cmd_buffer.bindPipeline(vk::PipelineBindPoint::eCompute, vk_pipeline_matmul_f16.pipeline);
+    cmd_buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
+                                  vk_pipeline_matmul_f16.layout,
+                                  0,
+                                  { descriptor_set },
+                                  {});
+    cmd_buffer.dispatch(CEIL_DIV(m, 32), CEIL_DIV(n, 32), 1);
+    cmd_buffer.end();
+
+    vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
+
+    vk::SubmitInfo submit_info(0,
+                               nullptr,
+                               nullptr,
+                               1,
+                               &cmd_buffer);
+
+    // Wait for transfers to finish
+    vk_device.getQueue(vk_transfer_queue_family_index, 0).waitIdle();
+
+    queue.submit({ submit_info }, fence);
+    vk_device.waitForFences({ fence },
+                            true,
+                            uint64_t(-1));
+
+    auto end = std::chrono::high_resolution_clock::now();
+
+    // copy dst to host
+    ggml_vk_buffer_read(&d_D, 0, d, sizeof(ggml_fp16_t) * d_ne);
+
+    float * fx = (float *) malloc(sizeof(float) * x_ne);
+    float * fy = (float *) malloc(sizeof(float) * y_ne);
+    float * d_chk = (float *) malloc(sizeof(float) * d_ne);
+
+    ggml_fp16_to_fp32_row(x, fx, x_ne);
+    ggml_fp16_to_fp32_row(y, fy, y_ne);
+
+    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+                m, n, k,
+                1.0f, fx, k,
+                fy, k,
+                0.0f, d_chk, m);
+
+    double avg_err = 0.0;
+
+    for (size_t r = 0; r < m; r++) {
+        for (size_t c = 0; c < n; c++) {
+            avg_err += std::fabs(ggml_fp16_to_fp32(d[c * m + r]) - d_chk[c * m + r]);
+        }
+    }
+
+    std::cout << "TEST FP16 m=" << m << " n=" << n << " k=" << k << " matmul " << std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0 << "ms avg_err=" << avg_err / (m * n) << std::endl;
+
+    free(fx);
+    free(fy);
+    free(d_chk);
+
+    vk_device.destroyFence(fence);
+
+    ggml_vk_pool_free(d_X);
+    ggml_vk_pool_free(d_Y);
+    ggml_vk_pool_free(d_D);
+
+    free(x);
+    free(y);
+    free(d);
+}
+#endif
diff --git a/vk_shaders/matmul_f16.spv b/vk_shaders/matmul_f16.spv
new file mode 100644
index 0000000000000000000000000000000000000000..a52c8f676765148d8f96ce795b26684721d894ac
GIT binary patch
literal 2144
[base85 payload truncated in this copy of the patch; the full 2144-byte SPIR-V binary is in the original commit]