better q4_k scales
This commit is contained in:
parent
b4ae7005e6
commit
cdf70cf27f
2 changed files with 43 additions and 42 deletions
|
@@ -25,7 +25,6 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
|||
|
||||
const uint l0 = 2*v_in; // 0...15
|
||||
const uint q_offset = 32*v_im + l0;
|
||||
const uint s_offset = 8*v_im;
|
||||
const uint y_offset = 128*v_im + l0;
|
||||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
|
|
@@ -43,55 +43,57 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
|||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
|
||||
f16vec2 d = data_a[ib0 + i].d;
|
||||
const f16vec2 d = data_a[ib0 + i].d;
|
||||
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
|
||||
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
|
||||
|
||||
uint16_t scale0_u16 = data_a_packed16[ib0 + i].scales[v_im ];
|
||||
uint16_t scale4_u16 = data_a_packed16[ib0 + i].scales[v_im + 2];
|
||||
uint16_t scale8_u16 = data_a_packed16[ib0 + i].scales[v_im + 4];
|
||||
uvec2 scale0 = uvec2(unpack8(scale0_u16));
|
||||
uvec2 scale4 = uvec2(unpack8(scale4_u16));
|
||||
uvec2 scale8 = uvec2(unpack8(scale8_u16));
|
||||
const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
|
||||
const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
|
||||
const uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
|
||||
|
||||
const uint32_t sc0 = ( scale0.x & 0x3f);
|
||||
const uint32_t sc1 = ( scale0.y & 0x3f);
|
||||
const uint32_t sc2 = ( scale4.x & 0x3f);
|
||||
const uint32_t sc3 = ( scale4.y & 0x3f);
|
||||
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
|
||||
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
|
||||
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
|
||||
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
|
||||
const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32;
|
||||
const uint32_t scale_0_4_h = (scale_0_4_l & 0xc0c0c0c0) >> 2;
|
||||
const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3f3f3f3f));
|
||||
const vec4 scale8_f = vec4(unpack8(((((scale8_u32 >> 4) << 16) | scale8_u32) & 0x0f0f0f0f) | scale_0_4_h));
|
||||
|
||||
uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
|
||||
uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
|
||||
const FLOAT_TYPE sc0 = scale_0_4_l_f.x;
|
||||
const FLOAT_TYPE sc1 = scale_0_4_l_f.y;
|
||||
const FLOAT_TYPE sc2 = scale_0_4_l_f.z;
|
||||
const FLOAT_TYPE sc3 = scale_0_4_l_f.w;
|
||||
const FLOAT_TYPE sc4 = scale8_f.x;
|
||||
const FLOAT_TYPE sc5 = scale8_f.y;
|
||||
const FLOAT_TYPE sc6 = scale8_f.z;
|
||||
const FLOAT_TYPE sc7 = scale8_f.w;
|
||||
|
||||
uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
|
||||
uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
|
||||
uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
|
||||
const uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
|
||||
const uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
|
||||
|
||||
uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4));
|
||||
uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4));
|
||||
uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4));
|
||||
uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4));
|
||||
const uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
|
||||
const uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
|
||||
const uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
|
||||
const uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
|
||||
|
||||
const uint32_t q4_0 = qs0_lo4.x;
|
||||
const uint32_t q4_1 = qs0_lo4.y;
|
||||
const uint32_t q4_2 = qs0_lo4.z;
|
||||
const uint32_t q4_3 = qs0_lo4.w;
|
||||
const uint32_t q4_4 = qs0_hi4.x;
|
||||
const uint32_t q4_5 = qs0_hi4.y;
|
||||
const uint32_t q4_6 = qs0_hi4.z;
|
||||
const uint32_t q4_7 = qs0_hi4.w;
|
||||
const uint32_t q4_8 = qs64_lo4.x;
|
||||
const uint32_t q4_9 = qs64_lo4.y;
|
||||
const uint32_t q4_10 = qs64_lo4.z;
|
||||
const uint32_t q4_11 = qs64_lo4.w;
|
||||
const uint32_t q4_12 = qs64_hi4.x;
|
||||
const uint32_t q4_13 = qs64_hi4.y;
|
||||
const uint32_t q4_14 = qs64_hi4.z;
|
||||
const uint32_t q4_15 = qs64_hi4.w;
|
||||
const vec4 qs0_lo4 = vec4(unpack8(qs0_u32_lo4));
|
||||
const vec4 qs64_lo4 = vec4(unpack8(qs64_u32_lo4));
|
||||
const vec4 qs0_hi4 = vec4(unpack8(qs0_u32_hi4));
|
||||
const vec4 qs64_hi4 = vec4(unpack8(qs64_u32_hi4));
|
||||
|
||||
const FLOAT_TYPE q4_0 = qs0_lo4.x;
|
||||
const FLOAT_TYPE q4_1 = qs0_lo4.y;
|
||||
const FLOAT_TYPE q4_2 = qs0_lo4.z;
|
||||
const FLOAT_TYPE q4_3 = qs0_lo4.w;
|
||||
const FLOAT_TYPE q4_4 = qs0_hi4.x;
|
||||
const FLOAT_TYPE q4_5 = qs0_hi4.y;
|
||||
const FLOAT_TYPE q4_6 = qs0_hi4.z;
|
||||
const FLOAT_TYPE q4_7 = qs0_hi4.w;
|
||||
const FLOAT_TYPE q4_8 = qs64_lo4.x;
|
||||
const FLOAT_TYPE q4_9 = qs64_lo4.y;
|
||||
const FLOAT_TYPE q4_10 = qs64_lo4.z;
|
||||
const FLOAT_TYPE q4_11 = qs64_lo4.w;
|
||||
const FLOAT_TYPE q4_12 = qs64_hi4.x;
|
||||
const FLOAT_TYPE q4_13 = qs64_hi4.y;
|
||||
const FLOAT_TYPE q4_14 = qs64_hi4.z;
|
||||
const FLOAT_TYPE q4_15 = qs64_hi4.w;
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
B_TYPE_VEC4 by10 = data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue