From e9be24f9addf430295d4e74e2c6f4cac057c885f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 14 Aug 2023 11:07:55 +0200 Subject: [PATCH] Fix fp32 fallback if device doesn't support fp16, add force disable env var GGML_VULKAN_DISABLE_F16 --- ggml-vulkan-shaders.hpp | 129 ++++++++++++++++++++++------------------ ggml-vulkan.cpp | 101 +++++++++++++++++++++---------- 2 files changed, 141 insertions(+), 89 deletions(-) diff --git a/ggml-vulkan-shaders.hpp b/ggml-vulkan-shaders.hpp index 2599bdf2d..a8d35ac38 100644 --- a/ggml-vulkan-shaders.hpp +++ b/ggml-vulkan-shaders.hpp @@ -11,38 +11,27 @@ const std::string shader_f16 = R"( const std::string shader_int8_ext = R"( #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require )"; -const std::string shader_output_f16 = R"( -#define OUT_TYPE float16_t -)"; -const std::string shader_output_f32 = R"( -#define OUT_TYPE float -)"; // MULMAT const std::string mulmat_head = R"( #version 450 +#extension GL_EXT_control_flow_attributes : enable +#extension GL_EXT_shader_16bit_storage : require + #define WARP 32 -#extension GL_EXT_control_flow_attributes : enable +#ifndef LOAD_VEC +#define LOAD_VEC 1 +#endif )"; const std::string mulmat_body = R"( layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -#ifdef ALIGNED_INPUT -#define LOAD_VEC 8 layout (binding = 0) readonly buffer A { A_TYPE data_a[]; }; layout (binding = 1) readonly buffer B { B_TYPE data_b[]; }; - -#else - -#define LOAD_VEC 1 -layout (binding = 0) readonly buffer A { A_TYPE data_a[]; }; -layout (binding = 1) readonly buffer B { B_TYPE data_b[]; }; -#endif - layout (binding = 2) writeonly buffer D { D_TYPE data_d[]; }; layout (push_constant) uniform parameter @@ -107,16 +96,22 @@ void main() { [[unroll]] for (int block = start_k; block < end_k; block += BK) { [[unroll]] for (int l = 0; l < BM; l += loadstride) { -#ifdef ALIGNED_INPUT - A_TYPE tmp = data_a[pos_a + (loadc + l) * p.stride_a / LOAD_VEC + loadr]; - buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 0] = FLOAT_TYPE(tmp[0].x); - buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 1] = FLOAT_TYPE(tmp[0].y); - buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 2] = FLOAT_TYPE(tmp[0].z); - buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 3] = FLOAT_TYPE(tmp[0].w); - buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 4] = FLOAT_TYPE(tmp[1].x); - buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 5] = FLOAT_TYPE(tmp[1].y); - buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 6] = FLOAT_TYPE(tmp[1].z); - buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 7] = FLOAT_TYPE(tmp[1].w); +#if LOAD_VEC == 8 + const int idx = pos_a + (loadc + l) * p.stride_a / LOAD_VEC + loadr; + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 0] = FLOAT_TYPE(data_a[idx][0].x); + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 1] = FLOAT_TYPE(data_a[idx][0].y); + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 2] = FLOAT_TYPE(data_a[idx][0].z); + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 3] = FLOAT_TYPE(data_a[idx][0].w); + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 4] = FLOAT_TYPE(data_a[idx][1].x); + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 5] = FLOAT_TYPE(data_a[idx][1].y); + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 6] = FLOAT_TYPE(data_a[idx][1].z); + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 7] = FLOAT_TYPE(data_a[idx][1].w); +#elif LOAD_VEC == 4 + const int idx = pos_a + (loadc + l) * p.stride_a / LOAD_VEC + loadr; + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 0] = FLOAT_TYPE(data_a[idx].x); + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 1] = FLOAT_TYPE(data_a[idx].y); + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 2] = FLOAT_TYPE(data_a[idx].z); + buf_a[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 3] = FLOAT_TYPE(data_a[idx].w); #else if (ir * BM + loadc + l < p.M && block + loadr < p.K) { buf_a[(loadc + l) * (BK+1) + loadr] = FLOAT_TYPE(data_a[pos_a + (loadc + l) * p.stride_a + loadr]); @@ -126,16 +121,22 @@ void main() { #endif } [[unroll]] for (int l = 0; l < BN; l += loadstride) { -#ifdef ALIGNED_INPUT - B_TYPE tmp = data_b[pos_b + (loadc + l) * p.stride_b / LOAD_VEC + loadr]; - buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 0] = FLOAT_TYPE(tmp[0].x); - buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 1] = FLOAT_TYPE(tmp[0].y); - buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 2] = FLOAT_TYPE(tmp[0].z); - buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 3] = FLOAT_TYPE(tmp[0].w); - buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 4] = FLOAT_TYPE(tmp[1].x); - buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 5] = FLOAT_TYPE(tmp[1].y); - buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 6] = FLOAT_TYPE(tmp[1].z); - buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 7] = FLOAT_TYPE(tmp[1].w); +#if LOAD_VEC == 8 + const int idx = pos_b + (loadc + l) * p.stride_b / LOAD_VEC + loadr; + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 0] = FLOAT_TYPE(data_b[idx][0].x); + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 1] = FLOAT_TYPE(data_b[idx][0].y); + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 2] = FLOAT_TYPE(data_b[idx][0].z); + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 3] = FLOAT_TYPE(data_b[idx][0].w); + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 4] = FLOAT_TYPE(data_b[idx][1].x); + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 5] = FLOAT_TYPE(data_b[idx][1].y); + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 6] = FLOAT_TYPE(data_b[idx][1].z); + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 7] = FLOAT_TYPE(data_b[idx][1].w); +#elif LOAD_VEC == 4 + const int idx = pos_b + (loadc + l) * p.stride_b / LOAD_VEC + loadr; + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 0] = FLOAT_TYPE(data_b[idx].x); + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 1] = FLOAT_TYPE(data_b[idx].y); + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 2] = FLOAT_TYPE(data_b[idx].z); + buf_b[(loadc + l) * (BK+1) + loadr * LOAD_VEC + 3] = FLOAT_TYPE(data_b[idx].w); #else if (ic * BN + loadc + l < p.N && block + loadr < p.K) { buf_b[(loadc + l) * (BK+1) + loadr] = FLOAT_TYPE(data_b[pos_b + (loadc + l) * p.stride_b + loadr]); @@ -259,7 +260,7 @@ const std::string dequant_body = R"( layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A { A_TYPE x[]; }; -layout (binding = 1) writeonly buffer D { OUT_TYPE y[]; }; +layout (binding = 1) writeonly buffer D { D_TYPE y[]; }; layout (push_constant) uniform parameter { @@ -282,15 +283,15 @@ void main() { const int stride_a = p.stride_a / QUANT_K; - const A_TYPE blk = x[col * stride_a + row]; - const OUT_TYPE d = blk.d; + const int idx = col * stride_a + row; + const FLOAT_TYPE d = FLOAT_TYPE(x[idx].d); [[unroll]] for (int j = 0; j < QUANT_K/2; ++j) { - const OUT_TYPE x0 = OUT_TYPE((blk.qs[j] & 0x0F) - 8); - const OUT_TYPE x1 = OUT_TYPE((blk.qs[j] >> 4) - 8); + const FLOAT_TYPE x0 = FLOAT_TYPE((x[idx].qs[j] & 0x0F) - 8); + const FLOAT_TYPE x1 = FLOAT_TYPE((x[idx].qs[j] >> 4) - 8); - y[col * p.stride_b + row*QUANT_K + j + 0 ] = x0*d; - y[col * p.stride_b + row*QUANT_K + j + QUANT_K/2] = x1*d; + y[col * p.stride_b + row*QUANT_K + j + 0 ] = D_TYPE(x0*d); + y[col * p.stride_b + row*QUANT_K + j + QUANT_K/2] = D_TYPE(x1*d); } } )"; @@ -304,25 +305,24 @@ const std::string mul_mat_vec_head = R"( #extension GL_EXT_shader_8bit_storage : require )"; -const std::string mul_mat_vec_b_type_f32 = R"( -#define B_TYPE float -)"; - -const std::string mul_mat_vec_b_type_f16 = R"( -#define B_TYPE float16_t -)"; - const std::string mul_mat_vec_f16_defines = R"( #define QUANT_K 32 #define QUANT_R 2 #define BLOCK_SIZE 32 #define A_TYPE float16_t +)"; +const std::string mul_mat_vec_f16_dequant_func = R"( #define DEQUANT_FUNC float16_t v0 = x[ib + 0]; \ float16_t v1 = x[ib + 1]; )"; +const std::string mul_mat_vec_f16_dequant_func_compat = R"( +#define DEQUANT_FUNC float v0 = float(x[ib + 0]); \ +float v1 = float(x[ib + 1]); +)"; + const std::string mul_mat_vec_q4_0_defines = R"( #define QUANT_K 32 #define QUANT_R 2 @@ -334,7 +334,9 @@ struct block_q4_0 uint8_t qs[16]; }; #define A_TYPE block_q4_0 +)"; +const std::string mul_mat_vec_q4_0_dequant_func = R"( #define DEQUANT_FUNC const float16_t d = x[ib].d; \ const uint8_t vui = x[ib].qs[iqs]; \ const int8_t vi0 = int8_t(vui & 0xF); \ @@ -343,12 +345,21 @@ float16_t v0 = float16_t(vi0 - 8)*d; \ float16_t v1 = float16_t(vi1 - 8)*d; )"; +const std::string mul_mat_vec_q4_0_dequant_func_compat = R"( +#define DEQUANT_FUNC const float d = float(x[ib].d); \ +const uint vui = uint(x[ib].qs[iqs]); \ +const int vi0 = int(vui) & 0xF; \ +const int vi1 = int(vui) >> 4; \ +float v0 = float(vi0 - 8)*d; \ +float v1 = float(vi1 - 8)*d; +)"; + const std::string mul_mat_vec_body = R"( layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A { A_TYPE x[]; }; layout (binding = 1) readonly buffer B { B_TYPE y[]; }; -layout (binding = 2) writeonly buffer D { OUT_TYPE dst[]; }; +layout (binding = 2) writeonly buffer D { D_TYPE dst[]; }; layout (push_constant) uniform parameter { @@ -364,7 +375,7 @@ void main() { const int y_offset = QUANT_K/2; - tmp[tid] = 0.0hf; + tmp[tid] = FLOAT_TYPE(0.0f); [[unroll]] for (int i = 0; i < p.ncols/block_size; i += 2) { const int col = i*block_size + 2*tid; @@ -375,8 +386,8 @@ void main() { DEQUANT_FUNC // matrix multiplication - tmp[tid] += FLOAT_TYPE(v0 * y[iybs + iqs + 0]); - tmp[tid] += FLOAT_TYPE(v1 * y[iybs + iqs + y_offset]); + tmp[tid] += FLOAT_TYPE(v0) * FLOAT_TYPE(y[iybs + iqs + 0]); + tmp[tid] += FLOAT_TYPE(v1) * FLOAT_TYPE(y[iybs + iqs + y_offset]); } // sum up partial sums and write back result @@ -388,7 +399,7 @@ void main() { barrier(); } if (tid == 0) { - dst[row] = OUT_TYPE(tmp[0]); + dst[row] = D_TYPE(tmp[0]); } } )"; @@ -460,6 +471,8 @@ void main() { // ADD const std::string add_head = R"( #version 450 + +#extension GL_EXT_shader_16bit_storage : require )"; const std::string add_body = R"( @@ -489,7 +502,7 @@ void main() { return; } - data_d[p.d_offset + y * p.stride_d + x] = D_TYPE(data_x[p.x_offset + y * p.stride_x + x]) + D_TYPE(data_y[p.y_offset + x]); + data_d[p.d_offset + y * p.stride_d + x] = D_TYPE(FLOAT_TYPE(data_x[p.x_offset + y * p.stride_x + x]) + FLOAT_TYPE(data_y[p.y_offset + x])); } )"; diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 96f39ce2b..ebe355ce8 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -662,85 +662,119 @@ static void ggml_vk_generate_shaders() { auto warptile_m = { 128, 64, 64, 16, 32, 32, 2, 4, 2 }; auto warptile_s = { 32, 32, 32, 8, 32, 32, 2, 2, 2 }; + std::string shader_float_type; + std::string load_vec; + std::string vec_type_f16; + std::string vec_type; + if (vk_device.fp16) { + shader_float_type = shader_f16; + load_vec = "8"; + vec_type_f16 = "f16mat2x4"; + vec_type = "mat2x4"; + } else { + shader_float_type = shader_f32; + load_vec = "4"; + vec_type_f16 = "f16vec4"; + vec_type = "vec4"; + } + std::stringstream stream; - stream << mulmat_head << shader_f32 << mulmat_body; + stream << mulmat_head << shader_float_type << mulmat_body; vk_pipeline_matmul_f32_l = ggml_vk_create_pipeline_from_string("matmul_f32_l", stream.str(), { "A_TYPE", "float", "B_TYPE", "float", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128); vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline_from_string("matmul_f32_m", stream.str(), { "A_TYPE", "float", "B_TYPE", "float", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64); vk_pipeline_matmul_f32_s = ggml_vk_create_pipeline_from_string("matmul_f32_s", stream.str(), { "A_TYPE", "float", "B_TYPE", "float", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32); - vk_pipeline_matmul_f32_aligned_l = ggml_vk_create_pipeline_from_string("matmul_f32_aligned_l", stream.str(), { "ALIGNED_INPUT", "", "A_TYPE", "mat2x4", "B_TYPE", "mat2x4", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128); - vk_pipeline_matmul_f32_aligned_m = ggml_vk_create_pipeline_from_string("matmul_f32_aligned_m", stream.str(), { "ALIGNED_INPUT", "", "A_TYPE", "mat2x4", "B_TYPE", "mat2x4", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64); - vk_pipeline_matmul_f32_aligned_s = ggml_vk_create_pipeline_from_string("matmul_f32_aligned_s", stream.str(), { "ALIGNED_INPUT", "", "A_TYPE", "mat2x4", "B_TYPE", "mat2x4", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32); + vk_pipeline_matmul_f32_aligned_l = ggml_vk_create_pipeline_from_string("matmul_f32_aligned_l", stream.str(), { "LOAD_VEC", load_vec, "A_TYPE", vec_type, "B_TYPE", vec_type, "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128); + vk_pipeline_matmul_f32_aligned_m = ggml_vk_create_pipeline_from_string("matmul_f32_aligned_m", stream.str(), { "LOAD_VEC", load_vec, "A_TYPE", vec_type, "B_TYPE", vec_type, "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64); + vk_pipeline_matmul_f32_aligned_s = ggml_vk_create_pipeline_from_string("matmul_f32_aligned_s", stream.str(), { "LOAD_VEC", load_vec, "A_TYPE", vec_type, "B_TYPE", vec_type, "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32); stream.str(""); stream.clear(); - stream << mulmat_head << shader_f16 << mulmat_body; + stream << mulmat_head << shader_float_type << mulmat_body; vk_pipeline_matmul_f16_l = ggml_vk_create_pipeline_from_string("matmul_f16_l", stream.str(), { "A_TYPE", "float16_t", "B_TYPE", "float16_t", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128); vk_pipeline_matmul_f16_m = ggml_vk_create_pipeline_from_string("matmul_f16_m", stream.str(), { "A_TYPE", "float16_t", "B_TYPE", "float16_t", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64); vk_pipeline_matmul_f16_s = ggml_vk_create_pipeline_from_string("matmul_f16_s", stream.str(), { "A_TYPE", "float16_t", "B_TYPE", "float16_t", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32); - vk_pipeline_matmul_f16_aligned_l = ggml_vk_create_pipeline_from_string("matmul_f16_aligned_l", stream.str(), { "ALIGNED_INPUT", "", "A_TYPE", "f16mat2x4", "B_TYPE", "f16mat2x4", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128); - vk_pipeline_matmul_f16_aligned_m = ggml_vk_create_pipeline_from_string("matmul_f16_aligned_m", stream.str(), { "ALIGNED_INPUT", "", "A_TYPE", "f16mat2x4", "B_TYPE", "f16mat2x4", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64); - vk_pipeline_matmul_f16_aligned_s = ggml_vk_create_pipeline_from_string("matmul_f16_aligned_s", stream.str(), { "ALIGNED_INPUT", "", "A_TYPE", "f16mat2x4", "B_TYPE", "f16mat2x4", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32); + vk_pipeline_matmul_f16_aligned_l = ggml_vk_create_pipeline_from_string("matmul_f16_aligned_l", stream.str(), { "LOAD_VEC", load_vec, "A_TYPE", vec_type_f16, "B_TYPE", vec_type_f16, "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128); + vk_pipeline_matmul_f16_aligned_m = ggml_vk_create_pipeline_from_string("matmul_f16_aligned_m", stream.str(), { "LOAD_VEC", load_vec, "A_TYPE", vec_type_f16, "B_TYPE", vec_type_f16, "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64); + vk_pipeline_matmul_f16_aligned_s = ggml_vk_create_pipeline_from_string("matmul_f16_aligned_s", stream.str(), { "LOAD_VEC", load_vec, "A_TYPE", vec_type_f16, "B_TYPE", vec_type_f16, "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32); vk_pipeline_matmul_f16_f32_l = ggml_vk_create_pipeline_from_string("matmul_f16_f32_l", stream.str(), { "A_TYPE", "float16_t", "B_TYPE", "float", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128); vk_pipeline_matmul_f16_f32_m = ggml_vk_create_pipeline_from_string("matmul_f16_f32_m", stream.str(), { "A_TYPE", "float16_t", "B_TYPE", "float", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64); vk_pipeline_matmul_f16_f32_s = ggml_vk_create_pipeline_from_string("matmul_f16_f32_s", stream.str(), { "A_TYPE", "float16_t", "B_TYPE", "float", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32); - vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline_from_string("matmul_f16_f32_aligned_l", stream.str(), { "ALIGNED_INPUT", "", "A_TYPE", "f16mat2x4", "B_TYPE", "mat2x4", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128); - vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline_from_string("matmul_f16_f32_aligned_m", stream.str(), { "ALIGNED_INPUT", "", "A_TYPE", "f16mat2x4", "B_TYPE", "mat2x4", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64); - vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline_from_string("matmul_f16_f32_aligned_s", stream.str(), { "ALIGNED_INPUT", "", "A_TYPE", "f16mat2x4", "B_TYPE", "mat2x4", "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32); + vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline_from_string("matmul_f16_f32_aligned_l", stream.str(), { "LOAD_VEC", load_vec, "A_TYPE", vec_type_f16, "B_TYPE", vec_type, "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), {128, 128, 1}, warptile_l, 128); + vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline_from_string("matmul_f16_f32_aligned_m", stream.str(), { "LOAD_VEC", load_vec, "A_TYPE", vec_type_f16, "B_TYPE", vec_type, "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 64, 64, 1}, warptile_m, 64); + vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline_from_string("matmul_f16_f32_aligned_s", stream.str(), { "LOAD_VEC", load_vec, "A_TYPE", vec_type_f16, "B_TYPE", vec_type, "D_TYPE", "float" }, "main", 3, 7 * sizeof(int), { 32, 32, 1}, warptile_s, 32); // Build dequant q4_0 stream.str(""); stream.clear(); - stream << dequant_head; - if (vk_device.fp16) { - stream << shader_f16 << shader_output_f16; - } else { - stream << shader_output_f32; - } - stream << dequant_q4_0_defines << dequant_body; + stream << dequant_head << shader_float_type << dequant_q4_0_defines << dequant_body; - vk_pipeline_dequant_q4_0 = ggml_vk_create_pipeline_from_string("dequant_q4_0", stream.str(), {}, "main", 2, 4 * sizeof(int), {256*32, 1, 1}, {}, 1); + vk_pipeline_dequant_q4_0 = ggml_vk_create_pipeline_from_string("dequant_q4_0", stream.str(), { "D_TYPE", "float16_t" }, "main", 2, 4 * sizeof(int), {256*32, 1, 1}, {}, 1); // mul mat vec stream.str(""); stream.clear(); - stream << mul_mat_vec_head << shader_f16 << shader_int8_ext << shader_output_f32 << mul_mat_vec_b_type_f16 << mul_mat_vec_q4_0_defines << mul_mat_vec_body; + stream << mul_mat_vec_head << shader_float_type; + if (vk_device.fp16) { + stream << shader_int8_ext << mul_mat_vec_q4_0_dequant_func; + } else { + stream << mul_mat_vec_q4_0_dequant_func_compat; + } + stream << mul_mat_vec_q4_0_defines << mul_mat_vec_body; - vk_pipeline_dequant_mul_mat_vec_q4_0 = ggml_vk_create_pipeline_from_string("mul_mat_vec_q4_0", stream.str(), {}, "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); + vk_pipeline_dequant_mul_mat_vec_q4_0 = ggml_vk_create_pipeline_from_string("mul_mat_vec_q4_0", stream.str(), { "D_TYPE", "float", "B_TYPE", "float16_t" }, "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); stream.str(""); stream.clear(); - stream << mul_mat_vec_head << shader_f16 << shader_int8_ext << shader_output_f32 << mul_mat_vec_b_type_f32 << mul_mat_vec_q4_0_defines << mul_mat_vec_body; + stream << mul_mat_vec_head << shader_float_type; + if (vk_device.fp16) { + stream << shader_int8_ext << mul_mat_vec_q4_0_dequant_func; + } else { + stream << mul_mat_vec_q4_0_dequant_func_compat; + } + stream << mul_mat_vec_q4_0_defines << mul_mat_vec_body; - vk_pipeline_dequant_mul_mat_vec_q4_0_f32 = ggml_vk_create_pipeline_from_string("mul_mat_vec_q4_0_f32", stream.str(), {}, "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); + vk_pipeline_dequant_mul_mat_vec_q4_0_f32 = ggml_vk_create_pipeline_from_string("mul_mat_vec_q4_0_f32", stream.str(), { "D_TYPE", "float", "B_TYPE", "float" }, "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); stream.str(""); stream.clear(); - stream << mul_mat_vec_head << shader_f16 << shader_output_f32 << mul_mat_vec_b_type_f16 << mul_mat_vec_f16_defines << mul_mat_vec_body; + stream << mul_mat_vec_head << shader_float_type; + if (vk_device.fp16) { + stream << shader_int8_ext << mul_mat_vec_f16_dequant_func; + } else { + stream << mul_mat_vec_f16_dequant_func_compat; + } + stream << mul_mat_vec_f16_defines << mul_mat_vec_body; - vk_pipeline_dequant_mul_mat_vec_f16 = ggml_vk_create_pipeline_from_string("mul_mat_vec_f16", stream.str(), {}, "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); + vk_pipeline_dequant_mul_mat_vec_f16 = ggml_vk_create_pipeline_from_string("mul_mat_vec_f16", stream.str(), { "D_TYPE", "float", "B_TYPE", "float16_t" }, "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); stream.str(""); stream.clear(); - stream << mul_mat_vec_head << shader_f16 << shader_output_f32 << mul_mat_vec_b_type_f32 << mul_mat_vec_f16_defines << mul_mat_vec_body; - vk_pipeline_dequant_mul_mat_vec_f16_f32 = ggml_vk_create_pipeline_from_string("mul_mat_vec_f16_f32", stream.str(), {}, "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); + stream << mul_mat_vec_head << shader_float_type; + if (vk_device.fp16) { + stream << shader_int8_ext << mul_mat_vec_f16_dequant_func; + } else { + stream << mul_mat_vec_f16_dequant_func_compat; + } + stream << mul_mat_vec_f16_defines << mul_mat_vec_body; + vk_pipeline_dequant_mul_mat_vec_f16_f32 = ggml_vk_create_pipeline_from_string("mul_mat_vec_f16_f32", stream.str(), { "D_TYPE", "float", "B_TYPE", "float" }, "main", 3, 1 * sizeof(int), {1, 1, 1}, {}, 1); // add stream.str(""); stream.clear(); - stream << add_head << add_body; + stream << add_head << shader_float_type << add_body; vk_pipeline_add_f32 = ggml_vk_create_pipeline_from_string("add_f32", stream.str(), { "X_TYPE", "float", "Y_TYPE", "float", "D_TYPE", "float" }, "main", 3, sizeof(vk_op_push_constants), {32, 32, 1}, {}, 1); stream.str(""); stream.clear(); - stream << add_head << shader_f16 << add_body; + stream << add_head << shader_float_type << add_body; vk_pipeline_add_f16_f32_f16 = ggml_vk_create_pipeline_from_string("add_f16_f32_f16", stream.str(), { "X_TYPE", "float16_t", "Y_TYPE", "float", "D_TYPE", "float16_t" }, "main", 3, sizeof(vk_op_push_constants), {32, 32, 1}, {}, 1); // Static shaders @@ -761,7 +795,7 @@ void ggml_vk_init(void) { #ifdef VK_DEBUG std::cerr << "ggml_vk_init()" << std::endl; #endif - char* GGML_VULKAN_DEVICE = getenv("GGML_VULKAN_DEVICE"); + const char* GGML_VULKAN_DEVICE = getenv("GGML_VULKAN_DEVICE"); int dev_num = (GGML_VULKAN_DEVICE == NULL ? 0 : atoi(GGML_VULKAN_DEVICE)); vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION }; @@ -806,7 +840,10 @@ void ggml_vk_init(void) { } } - vk_device.fp16 = fp16_storage && fp16_compute; + const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16"); + bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != NULL; + + vk_device.fp16 = !force_disable_f16 && fp16_storage && fp16_compute; std::vector queue_family_props = vk_device.physical_device.getQueueFamilyProperties(); @@ -875,6 +912,8 @@ void ggml_vk_init(void) { if (vk_device.fp16) { std::cerr << "ggml_vulkan: 16-bit enabled" << std::endl; device_extensions.push_back("VK_KHR_shader_float16_int8"); + } else if (force_disable_f16) { + std::cerr << "ggml_vulkan: 16-bit force-disabled" << std::endl; } device_create_info = { vk::DeviceCreateFlags(),