Add F32 dmmv shaders

This commit is contained in:
0cc4m 2023-07-28 06:38:23 +02:00
parent d0bd120814
commit 44065df367
4 changed files with 145 additions and 7 deletions

View file

@ -244,6 +244,8 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_q4_0.glsl -o vk_shaders/dequant_q4_0.spv & \
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_mul_mat_vec_f16.glsl -o vk_shaders/dequant_mul_mat_vec_f16.spv & \
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_mul_mat_vec_q4_0.glsl -o vk_shaders/dequant_mul_mat_vec_q4_0.spv & \
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_mul_mat_vec_f16_f32.glsl -o vk_shaders/dequant_mul_mat_vec_f16_f32.spv & \
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_mul_mat_vec_q4_0_f32.glsl -o vk_shaders/dequant_mul_mat_vec_q4_0_f32.spv & \
glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/mul_f32.glsl -o vk_shaders/mul_f32.spv & \
wait
endif

View file

@ -138,6 +138,7 @@ vk_pipeline vk_pipeline_matmul_f16_f32_l, vk_pipeline_matmul_f16_f32_m, vk_pipel
vk_pipeline vk_pipeline_matmul_f16_f32_aligned_l, vk_pipeline_matmul_f16_f32_aligned_m, vk_pipeline_matmul_f16_f32_aligned_s;
vk_pipeline vk_pipeline_matmul_split_k_reduce;
vk_pipeline vk_pipeline_dequant_mul_mat_vec_f16, vk_pipeline_dequant_mul_mat_vec_q4_0;
vk_pipeline vk_pipeline_dequant_mul_mat_vec_f16_f32, vk_pipeline_dequant_mul_mat_vec_q4_0_f32;
vk_pipeline vk_pipeline_mul_f32;
vk_pipeline vk_pipeline_f32_to_f16, vk_pipeline_dequant_q4_0;
@ -750,14 +751,17 @@ void ggml_vk_init(void) {
vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("vk_shaders/matmul_f16_f32_aligned.spv", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128);
vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("vk_shaders/matmul_f16_f32_aligned.spv", "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64);
vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("vk_shaders/matmul_f16_f32_aligned.spv", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32);
vk_pipeline_dequant_mul_mat_vec_f16 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_f16.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1);
vk_pipeline_dequant_mul_mat_vec_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_q4_0.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1);
}
vk_pipeline_matmul_split_k_reduce = ggml_vk_create_pipeline("vk_shaders/matmul_split_k_reduce.spv", "main", 1, 3 * sizeof(int), {32, 32, 1}, {}, 1);
vk_pipeline_f32_to_f16 = ggml_vk_create_pipeline("vk_shaders/f32_to_f16.spv", "main", 2, 4 * sizeof(int), {64, 1, 1}, {}, 1);
vk_pipeline_dequant_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_q4_0.spv", "main", 2, 4 * sizeof(int), {256*32, 1, 1}, {}, 1);
vk_pipeline_dequant_mul_mat_vec_f16 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_f16.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1);
vk_pipeline_dequant_mul_mat_vec_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_q4_0.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1);
vk_pipeline_dequant_mul_mat_vec_f16_f32 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_f16_f32.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1);
vk_pipeline_dequant_mul_mat_vec_q4_0_f32 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_q4_0_f32.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1);
vk_pipeline_mul_f32 = ggml_vk_create_pipeline("vk_shaders/mul_f32.spv", "main", 3, 8 * sizeof(int), {32, 32, 1}, {}, 1);
@ -840,15 +844,15 @@ static vk_pipeline* ggml_vk_get_to_fp16(ggml_type type) {
}
}
static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type) {
static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type, bool f16_y) {
#ifdef VK_DEBUG
std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
#endif
switch (type) {
case GGML_TYPE_Q4_0:
return &vk_pipeline_dequant_mul_mat_vec_q4_0;
return f16_y ? &vk_pipeline_dequant_mul_mat_vec_q4_0 : &vk_pipeline_dequant_mul_mat_vec_q4_0_f32;
case GGML_TYPE_F16:
return &vk_pipeline_dequant_mul_mat_vec_f16;
return f16_y ? &vk_pipeline_dequant_mul_mat_vec_f16 : &vk_pipeline_dequant_mul_mat_vec_f16_f32;
default:
return nullptr;
}
@ -1850,7 +1854,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
const int nb3 = dst->nb[3];
const bool mul_mat_vec = ne11 == 1 && src0->type != GGML_TYPE_F16;
const bool f16_f32_kernel = src1->type == GGML_TYPE_F32 && !mul_mat_vec;
const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
const bool qx_needs_dequant = src0->type != GGML_TYPE_F16 && !mul_mat_vec;
const bool qy_needs_dequant = src1->type != GGML_TYPE_F16 && !f16_f32_kernel;
@ -1906,7 +1910,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor *
vk_pipeline* to_fp16_vk_0 = ggml_vk_get_to_fp16(src0->type);
vk_pipeline* to_fp16_vk_1 = ggml_vk_get_to_fp16(src1->type);
vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(src0->type);
vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(src0->type, !f16_f32_kernel);
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
GGML_ASSERT(dmmv != nullptr);

View file

@ -0,0 +1,59 @@
#version 450

#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require

// Mat(f16) x vec(f32) -> f32 "dequantize mul mat vec" kernel.
// Plain f16 weights are not quantized: one "block" is a single value and
// there is no second sub-block half, so QUANT_K = 1 and QUANT_R = 1.
// (The previous revision carried over the q4_0 constants QUANT_K 32 /
// QUANT_R 2, which made `ib` collapse 32 columns onto the same x element
// and read the wrong weights.)
#define QUANT_K 1
#define QUANT_R 2
#undef QUANT_R
#define QUANT_R 1

#define BLOCK_SIZE 32

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A { float16_t x[]; };  // weight matrix, row-major
layout (binding = 1) readonly buffer B { float y[]; };      // input vector
layout (binding = 2) writeonly buffer D { float dst[]; };   // output vector, one value per row

layout (push_constant) uniform parameter
{
    int ncols;
} p;

shared float tmp[BLOCK_SIZE];

void main() {
    const int block_size = int(gl_WorkGroupSize.x);
    const int row = int(gl_WorkGroupID.x);      // one workgroup per matrix row
    const int tid = int(gl_LocalInvocationID.x);

    // With QUANT_R == 1 the two values handled per iteration are adjacent
    // in y; for quantized formats they are QUANT_K/2 apart.
    const int y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;

    tmp[tid] = 0.0hf;

    // Each thread accumulates a partial dot product, two elements per
    // iteration. Assumes p.ncols is a multiple of 2*block_size — TODO
    // confirm at the dispatch site.
    [[unroll]] for (int i = 0; i < p.ncols/block_size; i += 2) {
        const int col = i*block_size + 2*tid;
        const int ib = (row*p.ncols + col)/QUANT_K; // block index
        const int iqs = (col%QUANT_K)/QUANT_R;      // quant index
        const int iybs = col - col%QUANT_K;         // y block start index

        // "dequantize": f16 weights are used as-is
        float16_t v0 = x[ib + iqs + 0];
        float16_t v1 = x[ib + iqs + 1];

        // matrix multiplication (float16_t is implicitly widened to float)
        tmp[tid] += v0 * y[iybs + iqs + 0];
        tmp[tid] += v1 * y[iybs + iqs + y_offset];
    }

    // sum up partial sums in shared memory and write back the result
    barrier();
    [[unroll]] for (int s=block_size/2; s>0; s>>=1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier();
    }
    if (tid == 0) {
        dst[row] = tmp[0];
    }
}

View file

@ -0,0 +1,73 @@
#version 450

#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require

// q4_0 layout: 32 weights per block (QUANT_K), packed two 4-bit values per
// byte (QUANT_R), plus a single f16 scale.
#define QUANT_K 32
#define QUANT_R 2
#define BLOCK_SIZE 32

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

struct block_q4_0
{
    float16_t d;
    uint8_t qs[16];
};

layout (binding = 0) readonly buffer A { block_q4_0 x[]; };  // quantized weight matrix
layout (binding = 1) readonly buffer B { float y[]; };       // f32 input vector
layout (binding = 2) writeonly buffer D { float dst[]; };    // f32 output, one value per row

layout (push_constant) uniform parameter
{
    int ncols;
} p;

shared float tmp[BLOCK_SIZE];

void main() {
    const int bsize = int(gl_WorkGroupSize.x);
    const int row   = int(gl_WorkGroupID.x);      // one workgroup per matrix row
    const int tid   = int(gl_LocalInvocationID.x);

    // The low nibble pairs with y[col], the high nibble with the element
    // half a quant block further on.
    const int y_offset = QUANT_K/2;

    tmp[tid] = 0.0f;

    // Each thread builds a partial dot product of this row with y,
    // handling two packed weights per iteration.
    [[unroll]] for (int iter = 0; iter < p.ncols/bsize; iter += 2) {
        const int cidx = iter*bsize + 2*tid;
        const int ib   = (row*p.ncols + cidx)/QUANT_K; // index of the q4_0 block
        const int iqs  = (cidx%QUANT_K)/QUANT_R;       // byte index inside the block
        const int iybs = cidx - cidx%QUANT_K;          // start of the matching y span

        // Unpack both nibbles and rescale: q4_0 stores (value/d + 8).
        const float16_t scale  = x[ib].d;
        const uint8_t   packed = x[ib].qs[iqs];

        const int8_t q_lo = int8_t(packed & 0xF);
        const int8_t q_hi = int8_t(packed >> 4);

        const float16_t val0 = float16_t(q_lo - 8)*scale;
        const float16_t val1 = float16_t(q_hi - 8)*scale;

        // Multiply-accumulate against the two corresponding y elements.
        tmp[tid] += val0 * y[iybs + iqs + 0];
        tmp[tid] += val1 * y[iybs + iqs + y_offset];
    }

    // Tree-reduce the per-thread partial sums in shared memory; thread 0
    // writes the final dot product for this row.
    barrier();
    [[unroll]] for (int stride = bsize/2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            tmp[tid] += tmp[tid + stride];
        }
        barrier();
    }
    if (tid == 0) {
        dst[row] = tmp[0];
    }
}