support different subgroup sizes (tested)

Eve 2024-12-29 22:28:27 -05:00
parent 5641108a33
commit bdd1e4ddc7
2 changed files with 38 additions and 28 deletions


@@ -1877,7 +1877,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {64, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
@@ -1891,7 +1891,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {64, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
@@ -1905,7 +1905,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
-    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {64, rm_kq}, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
     // dequant shaders
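The host-side change above replaces the hard-coded 64-invocation workgroup of the Q6_K mul_mat_vec pipelines with the same subgroup_size_16 value the other k-quant pipelines already use; the specialization-constant pair sets the shader's workgroup size (BLOCK_SIZE) and the number of rows per workgroup. The definition of subgroup_size_16 is not part of this diff; the sketch below is only a plausible C++ illustration of how such a value could be derived from the device's reported subgroup size, clamped so the shader's 16-thread groups still fit.

// Hypothetical helper, not taken from this commit: derive a workgroup size
// from the device subgroup size, never smaller than 16 invocations.
#include <algorithm>
#include <cstdint>

static uint32_t pick_subgroup_size_16(uint32_t device_subgroup_size) {
    // assumption: BLOCK_SIZE must cover at least one 16-thread group
    return std::max(device_subgroup_size, 16u);
}

With this, a 32-wide device would build the Q6_K pipelines with BLOCK_SIZE 32 instead of failing the old assumption of 64 threads.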


@@ -13,6 +13,29 @@ shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];
 shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16];
 shared block_q6_K_packed16 blkcache[BLOCK_SIZE/16];
 
+uint fill_blkcache_its(uint wg_size) {
+    // subgroup sizes are always a power of 2
+    if (wg_size > 64)
+        return 1;
+    else if (wg_size == 64)
+        return 2;
+    else if (wg_size == 32)
+        return 4;
+    else
+        return 8;
+}
+
+void fill_blkcache(const int num_blocks, const uint ib0, const uint i0, const uint tid, const uint fbi) {
+    uint bc_t = 104 / fbi;
+    if (tid < bc_t) {
+        [[unroll]] for (int l = 0; l < num_blocks; ++l) {
+            [[unroll]] for (int m = 0; m < fbi; ++m)
+                // cache full superblock into shared memory with coalesced reads
+                blkcache[l].blk[tid + m*bc_t] = data_a_packed16[ib0 + i0 + l].blk[tid + m*bc_t];
+        }
+    }
+}
+
 void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     uint a_offset, b_offset, d_offset;
     get_offsets(a_offset, b_offset, d_offset);
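The new helpers generalize the shared-memory fill to any workgroup size: a q6_K superblock's ql, qh and scales pack into 104 uint16 values, and fill_blkcache_its picks how many of them each thread copies (fbi) so that the 104/fbi participating threads always fit in the workgroup. A minimal C++ sketch of that arithmetic, assuming (as the first file's spec constants suggest) that the workgroup size is one of the power-of-two subgroup sizes 16..128:

// Standalone check, not part of the shader: mirror fill_blkcache_its and
// verify that fbi * (104 / fbi) covers the whole superblock and that the
// participating thread count never exceeds the workgroup size.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t fill_blkcache_its(uint32_t wg_size) {
    if (wg_size > 64)  return 1;
    if (wg_size == 64) return 2;
    if (wg_size == 32) return 4;
    return 8;
}

int main() {
    for (uint32_t wg = 16; wg <= 128; wg *= 2) {   // subgroup sizes are powers of 2
        const uint32_t fbi  = fill_blkcache_its(wg);
        const uint32_t bc_t = 104 / fbi;           // threads doing the coalesced copy
        assert(fbi * bc_t == 104);                 // whole superblock is covered
        assert(bc_t <= wg);                        // those threads actually exist
        printf("wg=%3u fbi=%u bc_t=%u\n", wg, fbi, bc_t);
    }
}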
@@ -24,6 +47,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     const uint tid = gl_LocalInvocationID.x;
     const uint itid = tid%16; // 0...15
     const uint ix = tid/16;
+    const uint fbi = fill_blkcache_its(gl_WorkGroupSize.x);
 
     const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128...
     const uint v_in = itid - 8*v_im; // 0...15 or 0...7
@@ -38,10 +62,8 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
     const uint bcs_offset = (itid%2 == 1) ? 8 : 0;
 
     FLOAT_TYPE temp[NUM_ROWS];
 
-    [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
+    [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i)
         temp[i] = FLOAT_TYPE(0);
-    }
 
     [[unroll]] for (uint i0 = 0; i0 < num_blocks_per_row; i0 += it_size) {
         uint i = i0 + ix; // 16 thread group specific counter
@ -55,33 +77,23 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint ibi = first_row*num_blocks_per_row; uint ibi = first_row*num_blocks_per_row;
[[unroll]] for (uint n = 0; n < num_rows; ++n) { [[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + ibi; const uint ib0 = a_offset / QUANT_K + ibi;
ibi += num_blocks_per_row; const int blim = min(int(num_blocks_per_row) - int(i0), int(it_size));
// cache full superblock into shared memory with coalesced reads // fill_blkcache is sensitive to unrolling with hardcoded it_size
// we assume 64 threads here! if (blim == it_size) {
const int blim = min(int(num_blocks_per_row) - int(i0), 4); fill_blkcache(int(it_size), ib0, i0, tid, fbi);
// this is required as this loop is super sensitive to unrolling with hardcoded 4
if (blim == 4) {
if (tid < 52) {
[[unroll]] for (int l = 0; l < 4; ++l) {
blkcache[l].blk[tid] = data_a_packed16[ib0 + i0 + l].blk[tid];
blkcache[l].blk[tid + 52] = data_a_packed16[ib0 + i0 + l].blk[tid + 52];
}
}
} else { } else {
if (tid < 52) { fill_blkcache(blim, ib0, i0, tid, fbi);
[[unroll]] for (int l = 0; l < blim; ++l) {
blkcache[l].blk[tid] = data_a_packed16[ib0 + i0 + l].blk[tid];
blkcache[l].blk[tid + 52] = data_a_packed16[ib0 + i0 + l].blk[tid + 52];
}
}
} }
sccache[ix][itid] = FLOAT_TYPE(int8_t(bitfieldExtract(blkcache[ix].blk[96 + itid/2], int(bcs_offset), 8))); sccache[ix][itid] = FLOAT_TYPE(int8_t(bitfieldExtract(blkcache[ix].blk[96 + itid/2], int(bcs_offset), 8)));
barrier(); barrier();
ibi += num_blocks_per_row;
if (i >= num_blocks_per_row) if (i >= num_blocks_per_row)
continue; continue;
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib0 + i].d);
uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16); uint32_t ql0_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 1]) << 16);
uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16); uint32_t ql32_u32 = uint32_t(blkcache[ix].blk[ql_offset / 2 + 16]) | (uint32_t(blkcache[ix].blk[ql_offset / 2 + 17]) << 16);
@@ -115,9 +127,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
                 sum[3] = fma(FLOAT_TYPE(by96[l]), FLOAT_TYPE(int8_t(q3[l]) - 32), sum[3]);
             }
 
-            [[unroll]] for (uint l = 0; l < 4; ++l)
-                sum[l] *= sccache[ix][s_offset + l*2];
-
-            temp[n] += (sum[0] + sum[1] + sum[2] + sum[3]) * d;
+            temp[n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[n]);
         }
     }
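The last hunk folds the per-scale multiply loop and the running-sum update into a single nested fma chain; the two forms are algebraically the same, differing only in fused-multiply-add rounding. A minimal C++ sketch with made-up values, where s[l] stands in for sccache[ix][s_offset + 2*l] and d for the block scale:

// Compare the old two-step reduction with the new nested-fma formulation.
#include <cmath>
#include <cstdio>

int main() {
    float sum[4] = {1.5f, -2.25f, 0.75f, 4.0f};
    float s[4]   = {0.5f,  3.0f, -1.0f, 2.0f};
    const float d = 0.125f;
    float temp_old = 0.0f, temp_new = 0.0f;

    // old: scale each partial sum, add them, then multiply by d
    float scaled[4];
    for (int l = 0; l < 4; ++l) scaled[l] = sum[l] * s[l];
    temp_old += (scaled[0] + scaled[1] + scaled[2] + scaled[3]) * d;

    // new: one nested fma chain, mirroring the rewritten shader line
    temp_new = std::fma(std::fma(sum[0], s[0],
                        std::fma(sum[1], s[1],
                        std::fma(sum[2], s[2], sum[3] * s[3]))), d, temp_new);

    printf("old=%f new=%f\n", temp_old, temp_new);
}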