From 44065df367ff5e2fa2f87782438caa930dca6c02 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 28 Jul 2023 06:38:23 +0200 Subject: [PATCH] Add F32 dmmv shaders --- Makefile | 2 + ggml-vulkan.cpp | 18 +++-- vk_shaders/dequant_mul_mat_vec_f16_f32.glsl | 59 ++++++++++++++++ vk_shaders/dequant_mul_mat_vec_q4_0_f32.glsl | 73 ++++++++++++++++++++ 4 files changed, 145 insertions(+), 7 deletions(-) create mode 100644 vk_shaders/dequant_mul_mat_vec_f16_f32.glsl create mode 100644 vk_shaders/dequant_mul_mat_vec_q4_0_f32.glsl diff --git a/Makefile b/Makefile index 38379bd1f..3bda7257f 100644 --- a/Makefile +++ b/Makefile @@ -244,6 +244,8 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_q4_0.glsl -o vk_shaders/dequant_q4_0.spv & \ glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_mul_mat_vec_f16.glsl -o vk_shaders/dequant_mul_mat_vec_f16.spv & \ glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_mul_mat_vec_q4_0.glsl -o vk_shaders/dequant_mul_mat_vec_q4_0.spv & \ + glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_mul_mat_vec_f16_f32.glsl -o vk_shaders/dequant_mul_mat_vec_f16_f32.spv & \ + glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/dequant_mul_mat_vec_q4_0_f32.glsl -o vk_shaders/dequant_mul_mat_vec_q4_0_f32.spv & \ glslc -fshader-stage=compute --target-env=vulkan1.2 vk_shaders/mul_f32.glsl -o vk_shaders/mul_f32.spv & \ wait endif diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index f5f2d1a3b..43587b9e4 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -138,6 +138,7 @@ vk_pipeline vk_pipeline_matmul_f16_f32_l, vk_pipeline_matmul_f16_f32_m, vk_pipel vk_pipeline vk_pipeline_matmul_f16_f32_aligned_l, vk_pipeline_matmul_f16_f32_aligned_m, vk_pipeline_matmul_f16_f32_aligned_s; vk_pipeline vk_pipeline_matmul_split_k_reduce; vk_pipeline vk_pipeline_dequant_mul_mat_vec_f16, vk_pipeline_dequant_mul_mat_vec_q4_0; +vk_pipeline 
vk_pipeline_dequant_mul_mat_vec_f16_f32, vk_pipeline_dequant_mul_mat_vec_q4_0_f32; vk_pipeline vk_pipeline_mul_f32; vk_pipeline vk_pipeline_f32_to_f16, vk_pipeline_dequant_q4_0; @@ -750,14 +751,17 @@ void ggml_vk_init(void) { vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("vk_shaders/matmul_f16_f32_aligned.spv", "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128); vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("vk_shaders/matmul_f16_f32_aligned.spv", "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64); vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("vk_shaders/matmul_f16_f32_aligned.spv", "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32); + + vk_pipeline_dequant_mul_mat_vec_f16 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_f16.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); + vk_pipeline_dequant_mul_mat_vec_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_q4_0.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); } vk_pipeline_matmul_split_k_reduce = ggml_vk_create_pipeline("vk_shaders/matmul_split_k_reduce.spv", "main", 1, 3 * sizeof(int), {32, 32, 1}, {}, 1); vk_pipeline_f32_to_f16 = ggml_vk_create_pipeline("vk_shaders/f32_to_f16.spv", "main", 2, 4 * sizeof(int), {64, 1, 1}, {}, 1); vk_pipeline_dequant_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_q4_0.spv", "main", 2, 4 * sizeof(int), {256*32, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f16 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_f16.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_q4_0 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_q4_0.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); + vk_pipeline_dequant_mul_mat_vec_f16_f32 = ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_f16_f32.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); + vk_pipeline_dequant_mul_mat_vec_q4_0_f32 = 
ggml_vk_create_pipeline("vk_shaders/dequant_mul_mat_vec_q4_0_f32.spv", "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); vk_pipeline_mul_f32 = ggml_vk_create_pipeline("vk_shaders/mul_f32.spv", "main", 3, 8 * sizeof(int), {32, 32, 1}, {}, 1); @@ -840,15 +844,15 @@ static vk_pipeline* ggml_vk_get_to_fp16(ggml_type type) { } } -static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type) { +static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type, bool f16_y) { #ifdef VK_DEBUG std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl; #endif switch (type) { case GGML_TYPE_Q4_0: - return &vk_pipeline_dequant_mul_mat_vec_q4_0; + return f16_y ? &vk_pipeline_dequant_mul_mat_vec_q4_0 : &vk_pipeline_dequant_mul_mat_vec_q4_0_f32; case GGML_TYPE_F16: - return &vk_pipeline_dequant_mul_mat_vec_f16; + return f16_y ? &vk_pipeline_dequant_mul_mat_vec_f16 : &vk_pipeline_dequant_mul_mat_vec_f16_f32; default: return nullptr; } @@ -1850,7 +1854,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor * const int nb3 = dst->nb[3]; const bool mul_mat_vec = ne11 == 1 && src0->type != GGML_TYPE_F16; - const bool f16_f32_kernel = src1->type == GGML_TYPE_F32 && !mul_mat_vec; + const bool f16_f32_kernel = src1->type == GGML_TYPE_F32; const bool qx_needs_dequant = src0->type != GGML_TYPE_F16 && !mul_mat_vec; const bool qy_needs_dequant = src1->type != GGML_TYPE_F16 && !f16_f32_kernel; @@ -1906,7 +1910,7 @@ static void ggml_vk_mul_mat_q_f16(const ggml_tensor * src0, const ggml_tensor * vk_pipeline* to_fp16_vk_0 = ggml_vk_get_to_fp16(src0->type); vk_pipeline* to_fp16_vk_1 = ggml_vk_get_to_fp16(src1->type); - vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(src0->type); + vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(src0->type, !f16_f32_kernel); GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT GGML_ASSERT(dmmv != nullptr); diff --git 
// ---------------------------------------------------------------------------
// vk_shaders/dequant_mul_mat_vec_f16_f32.glsl
//
// Matrix (f16) x vector (f32) multiply: one workgroup per output row,
// BLOCK_SIZE threads cooperate on the dot product and tree-reduce in
// shared memory. dst is f32.
// ---------------------------------------------------------------------------
#version 450

#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require

// F16 data is not quantized: one "block" is a single scalar and each element
// maps 1:1 onto y (see convert_f16 in the CUDA backend, dispatched with
// qk = 1, qr = 1). The original values (32/2) were copied from the q4_0
// shader and made ib/iqs/iybs index the wrong elements of x and y.
#define QUANT_K 1
#define QUANT_R 2
#define BLOCK_SIZE 32

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A { float16_t x[]; };
layout (binding = 1) readonly buffer B { float y[]; };
layout (binding = 2) writeonly buffer D { float dst[]; };

layout (push_constant) uniform parameter
{
    int ncols; // row length; assumed to be a multiple of 2*BLOCK_SIZE — TODO confirm against the dispatch in ggml-vulkan.cpp
} p;

shared float tmp[BLOCK_SIZE];

void main() {
    const int block_size = int(gl_WorkGroupSize.x);
    const int row = int(gl_WorkGroupID.x);
    const int tid = int(gl_LocalInvocationID.x);

    // Distance between the two y elements consumed per iteration.
    // QUANT_R == 1 means the pair is adjacent (dense data); otherwise the
    // second value lives half a quant block further on.
    const int y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;

    tmp[tid] = 0.0f; // tmp is f32; the f16 literal 0.0hf only relied on implicit widening

    [[unroll]] for (int i = 0; i < p.ncols/block_size; i += 2) {
        const int col = i*block_size + 2*tid;
        const int ib = (row*p.ncols + col)/QUANT_K; // block index (== element index for QUANT_K 1)
        const int iqs = (col%QUANT_K)/QUANT_R;      // quant index (== 0 for QUANT_K 1)
        const int iybs = col - col%QUANT_K;         // y block start index (== col for QUANT_K 1)

        // "dequantize": f16 -> f32 is an implicit widening conversion
        const float16_t v0 = x[ib + 0];
        const float16_t v1 = x[ib + 1];

        // matrix multiplication
        tmp[tid] += v0 * y[iybs + iqs + 0];
        tmp[tid] += v1 * y[iybs + iqs + y_offset];
    }

    // sum up partial sums and write back result
    barrier();
    [[unroll]] for (int s = block_size/2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier();
    }
    if (tid == 0) {
        dst[row] = tmp[0];
    }
}

// ---------------------------------------------------------------------------
// vk_shaders/dequant_mul_mat_vec_q4_0_f32.glsl
//
// Matrix (q4_0-quantized) x vector (f32) multiply: one workgroup per output
// row; each q4_0 block packs 32 weights as 16 nibble pairs plus one f16
// scale d. dst is f32.
// ---------------------------------------------------------------------------
#version 450

#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require

#define QUANT_K 32  // weights per q4_0 block
#define QUANT_R 2   // two weights per stored byte (low/high nibble)
#define BLOCK_SIZE 32

layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

struct block_q4_0
{
    float16_t d;    // per-block scale
    uint8_t qs[16]; // QUANT_K/2 packed 4-bit weights
};

layout (binding = 0) readonly buffer A { block_q4_0 x[]; };
layout (binding = 1) readonly buffer B { float y[]; };
layout (binding = 2) writeonly buffer D { float dst[]; };

layout (push_constant) uniform parameter
{
    int ncols; // row length; assumed to be a multiple of 2*BLOCK_SIZE — TODO confirm against the dispatch in ggml-vulkan.cpp
} p;

shared float tmp[BLOCK_SIZE];

void main() {
    const int block_size = int(gl_WorkGroupSize.x);
    const int row = int(gl_WorkGroupID.x);
    const int tid = int(gl_LocalInvocationID.x);

    // The low nibble pairs with y[col], the high nibble with the element
    // half a quant block later (mirrors the CUDA dmmv y_offset = qk/2).
    const int y_offset = QUANT_K/2;

    tmp[tid] = 0.0f; // tmp is f32; the f16 literal 0.0hf only relied on implicit widening

    [[unroll]] for (int i = 0; i < p.ncols/block_size; i += 2) {
        const int col = i*block_size + 2*tid;
        const int ib = (row*p.ncols + col)/QUANT_K; // block index
        const int iqs = (col%QUANT_K)/QUANT_R;      // quant index within the block
        const int iybs = col - col%QUANT_K;         // y block start index

        // dequantize: unpack both nibbles, recenter by -8, scale by d
        const float16_t d = x[ib].d;

        const uint8_t vui = x[ib].qs[iqs];

        const int8_t vi0 = int8_t(vui & 0xF);
        const int8_t vi1 = int8_t(vui >> 4);

        const float16_t v0 = float16_t(vi0 - 8)*d;
        const float16_t v1 = float16_t(vi1 - 8)*d;

        // matrix multiplication
        tmp[tid] += v0 * y[iybs + iqs + 0];
        tmp[tid] += v1 * y[iybs + iqs + y_offset];
    }

    // sum up partial sums and write back result
    barrier();
    [[unroll]] for (int s = block_size/2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier();
    }
    if (tid == 0) {
        dst[row] = tmp[0];
    }
}