force 16 sequential threads per block
This commit is contained in:
parent
97e0c686a3
commit
2bca812230
1 changed file with 4 additions and 4 deletions
|
@@ -23,11 +23,11 @@ void main() {
|
||||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||||
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
|
||||||
|
|
||||||
uint it_size = gl_WorkGroupSize.x/16;
|
// 16 threads are used to process each block
|
||||||
|
const uint it_size = gl_WorkGroupSize.x/16;
|
||||||
const uint tid = gl_LocalInvocationID.x;
|
const uint tid = gl_LocalInvocationID.x;
|
||||||
const uint itid = tid/it_size; // 0...16
|
const uint itid = tid%16; // 0...16
|
||||||
const uint ix = tid%it_size;
|
const uint ix = tid/16;
|
||||||
|
|
||||||
const uint step = 8;
|
const uint step = 8;
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue