more barriers
This commit is contained in:
parent
ed1ad94c84
commit
4ae3fc0155
3 changed files with 5 additions and 0 deletions
|
@@ -16,6 +16,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||||
|
|
||||||
|
barrier();
|
||||||
if (!all_threads) { // when we don't have enough blocks to use all threads
|
if (!all_threads) { // when we don't have enough blocks to use all threads
|
||||||
if (i < num_blocks_per_row) {
|
if (i < num_blocks_per_row) {
|
||||||
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
|
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
|
||||||
|
|
|
@@ -16,6 +16,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
|
||||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||||
|
|
||||||
if (!all_threads) { // when we don't have enough blocks to use all threads
|
if (!all_threads) { // when we don't have enough blocks to use all threads
|
||||||
|
barrier();
|
||||||
if (i < num_blocks_per_row)
|
if (i < num_blocks_per_row)
|
||||||
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
||||||
barrier();
|
barrier();
|
||||||
|
@@ -39,6 +40,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, co
|
||||||
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
|
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
|
||||||
|
|
||||||
if (all_threads) {
|
if (all_threads) {
|
||||||
|
barrier();
|
||||||
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
|
|
|
@@ -17,6 +17,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||||
|
|
||||||
if (!all_threads) { // when we don't have enough blocks to use all threads
|
if (!all_threads) { // when we don't have enough blocks to use all threads
|
||||||
|
barrier();
|
||||||
if (i < num_blocks_per_row)
|
if (i < num_blocks_per_row)
|
||||||
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
||||||
barrier();
|
barrier();
|
||||||
|
@@ -50,6 +51,7 @@ void calc_superblock(const uint a_offset, const uint b_offset, const uint itid,
|
||||||
const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
|
const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
|
||||||
|
|
||||||
if (all_threads) {
|
if (all_threads) {
|
||||||
|
barrier();
|
||||||
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
|
||||||
barrier();
|
barrier();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue