use calc_superblock everywhere
This commit is contained in:
parent
d63497b3a3
commit
30eacad290
2 changed files with 196 additions and 190 deletions
|
@ -6,38 +6,9 @@
|
|||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid%16; // 0...15
|
||||
const uint ix = tid/16;
|
||||
|
||||
const uint il = itid/4; // 0...3
|
||||
const uint ir = itid - 4*il; // 0...3
|
||||
const uint n = 4;
|
||||
|
||||
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const uint v_in = il % 2;
|
||||
|
||||
const uint l0 = n * (2 * ir + v_in); // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 64*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
const uint y1_idx = i * QUANT_K + y_offset;
|
||||
const uint y2_idx = y1_idx + 128;
|
||||
|
||||
|
@ -113,7 +84,39 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
|||
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid%16; // 0...15
|
||||
const uint ix = tid/16;
|
||||
|
||||
const uint il = itid/4; // 0...3
|
||||
const uint ir = itid - 4*il; // 0...3
|
||||
const uint n = 4;
|
||||
|
||||
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const uint v_in = il % 2;
|
||||
|
||||
const uint l0 = n * (2 * ir + v_in); // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 64*v_im + l0;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size)
|
||||
calc_superblock(a_offset, b_offset, v_im, q_offset, y_offset, i, num_blocks_per_row, first_row, num_rows);
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
|
|
@ -6,37 +6,9 @@
|
|||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid%16; // 0...15
|
||||
const uint ix = tid/16;
|
||||
|
||||
const uint il = itid/4; // 0...3
|
||||
const uint ir = itid - 4*il; // 0...3
|
||||
|
||||
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const uint v_in = il % 2;
|
||||
|
||||
const uint l0 = 4*ir + 2*v_in; // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 64*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im, const uint l0, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
const uint y1_idx = i * QUANT_K + y_offset;
|
||||
const uint y2_idx = y1_idx + 128;
|
||||
|
||||
|
@ -144,7 +116,38 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
|||
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
uint a_offset, b_offset, d_offset;
|
||||
get_offsets(a_offset, b_offset, d_offset);
|
||||
|
||||
const uint num_blocks_per_row = p.ncols / QUANT_K;
|
||||
|
||||
// 16 threads are used to process each block
|
||||
const uint it_size = gl_WorkGroupSize.x/16;
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint itid = tid%16; // 0...15
|
||||
const uint ix = tid/16;
|
||||
|
||||
const uint il = itid/4; // 0...3
|
||||
const uint ir = itid - 4*il; // 0...3
|
||||
|
||||
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const uint v_in = il % 2;
|
||||
|
||||
const uint l0 = 4*ir + 2*v_in; // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint y_offset = 64*v_im + l0;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
|
||||
temp[j][i] = FLOAT_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
||||
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size)
|
||||
calc_superblock(a_offset, b_offset, v_im, l0, q_offset, y_offset, i, num_blocks_per_row, first_row, num_rows);
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue