more barriers

2025-01-12 17:21:57 -05:00 · 2025-01-12 17:21:57 -05:00 · 4ae3fc0155
commit 4ae3fc0155
parent ed1ad94c84
3 changed files with 5 additions and 0 deletions
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp
@@ -16,6 +16,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
        barrier();
        if (!all_threads) { // when we don't have enough blocks to use all threads
            if (i < num_blocks_per_row) {
                const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp
@@ -16,6 +16,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
        if (!all_threads) { // when we don't have enough blocks to use all threads
            barrier();
            if (i < num_blocks_per_row)
                sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
            barrier();
@@ -39,6 +40,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
        const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
        if (all_threads) {
            barrier();
            sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
            barrier();
        }
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
@@ -17,6 +17,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
        const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
        if (!all_threads) { // when we don't have enough blocks to use all threads
            barrier();
            if (i < num_blocks_per_row)
                sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
            barrier();
@@ -50,6 +51,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
        const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
        if (all_threads) {
            barrier();
            sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
            barrier();
        }