// Patch against ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp (a GLSL compute shader).
// NOTE(review): the hunk headers below declare multi-line hunks (e.g. "-368,23 +368,25"), but the
// patch text has had its line breaks collapsed; they presumably need restoring before this applies.
//
// Three hunks, one per function: dequantFuncIQ2_S, dequantFuncIQ3_XXS, dequantFuncIQ3_S.
// Each "+" body follows the same shape:
//   - lsb = coordInBlock[1] & 1 is saved, then idx is halved, so idx addresses a pair of elements;
//   - sign01 turns two sign bits into +1 / -1 factors via 1 - (2 & ...): component 0 uses bit 0 of
//     the shifted sign byte, component 1 uses bit 1;
//   - the pair is computed in float as db * vec2(sign01) * vec2(two grid bytes);
//   - only v[lsb] is returned, converted to float16_t.
// Differences from the "-" code that are visible in the hunks:
//   - IQ2_S: the scale nibble shift was ((idx & 0x10) >> 4), i.e. 0 or 1; it is now
//     2 * (ib8 & 2), i.e. 0 or 4.
//   - IQ3_XXS: the qs index was (idx & 0xFC) >> 4 (at most 15, although its comment said 0..63);
//     it is now (idx % 128) / 2 computed on the halved idx. The 32-bit sign/scale word is now
//     assembled with pack32(u8vec4(...)) from four bl.block.qs bytes instead of two reads through
//     decodeBufIQ3_XXS_packed16.
//   - IQ3_S: the scale byte index (iqs / 16) and nibble shift (4 * (iqh & 1)) are derived from iqs
//     rather than masked directly out of idx; signs are indexed by iqs / 2.
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp index 109a6b864..974efd3f9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp @@ -368,23 +368,25 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ2 float16_t dequantFuncIQ2_S(const in decodeBufIQ2_S bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { - const float16_t d = bl.block.d; - const uint idx = coordInBlock[1]; + uint idx = coordInBlock[1]; + uint lsb = idx & 1; + idx /= 2; - const uint ib32 = (idx & 0xE0) >> 5; // 0..7 - const uint ib8 = (idx & 0xF8) >> 3; // 0..31 - const uint qhshift = 2 * (ib8 % 4); + const uint ib8 = (idx % 128) / 4; // 0..31 + const uint ib32 = ib8 / 4; // 0..7 - const uint scale = (bl.block.scales[ib32] >> ((idx & 0x10) >> 4)) & 0xf; + const uint scale = (bl.block.scales[ib32] >> (2 * (ib8 & 2))) & 0xf; const uint qs = bl.block.qs[ib8]; const uint qh = bl.block.qh[ib32]; - const uint sign = bl.block.qs[QUANT_K / 8 + ib8]; + const uint qhshift = 2 * (ib8 % 4); + const uint sign = bl.block.qs[QUANT_K / 8 + ib8] >> (2 * (idx % 4)); - const float16_t dscale = bl.block.d * 0.25hf * (0.5hf + float16_t(scale)); - const uint8_t g = unpack8(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 4) >> 2])[idx & 3]; - - float16_t ret = dscale * float16_t(g) * ((sign & (1 << (idx & 7))) != 0 ? 
-1.0hf : 1.0hf); - return ret; + const float d = float(bl.block.d); + const float db = d * 0.25 * (0.5 + scale); + const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); + const uint16_t grid = unpack16(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 2) >> 1])[idx & 1]; + const vec2 v = db * vec2(sign01) * vec2(unpack8(grid)); + return float16_t(v[lsb]); } #endif @@ -399,25 +401,28 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3 float16_t dequantFuncIQ3_XXS(const in decodeBufIQ3_XXS bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { - decodeBufIQ3_XXS_packed16 bl16 = decodeBufIQ3_XXS_packed16(bl); - const float16_t d = bl.block.d; - const uint idx = coordInBlock[1]; + uint idx = coordInBlock[1]; + uint lsb = idx & 1; + idx /= 2; - const uint ib32 = (idx & 0xE0) >> 5; // 0..7 - const uint ib4 = (idx & 0xFC) >> 4; // 0..63 - const uint is16 = QUANT_K / 8 + 2 * ib32; // index in packed16 + const uint iqs = (idx % 128) / 2; // 0..63 + const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values - const uint8_t qs = bl.block.qs[ib4]; - const uint signscale = pack32(u16vec2(bl16.block.qs[is16], bl16.block.qs[is16+1])); - - const float16_t dscale = bl.block.d * 0.5hf * (0.5hf + float16_t(signscale >> 28)); - uint sign = bitfieldExtract(signscale, 7 * int(ib4 & 3), 7); - sign |= bitCount(sign) << 7; - - const uint8_t g = unpack8(iq3xxs_grid[qs])[idx & 3]; - - float16_t ret = dscale * float16_t(g) * ((sign & (1 << (idx & 7))) != 0 ? 
-1.0hf : 1.0hf); - return ret; + const float d = float(bl.block.d); + const uint qs = bl.block.qs[iqs]; + const uint signs = pack32(u8vec4( + bl.block.qs[is+0], + bl.block.qs[is+1], + bl.block.qs[is+2], + bl.block.qs[is+3] + )); + const float db = d * 0.5 * (0.5 + (signs >> 28)); + const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7); + const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); + const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); + const uint grid = iq3xxs_grid[qs] >> (16 * (idx & 1)); + const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); + return float16_t(v[lsb]); } #endif @@ -428,26 +433,24 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ3 float16_t dequantFuncIQ3_S(const in decodeBufIQ3_S bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { - const float16_t d = bl.block.d; - const uint idx = coordInBlock[1]; + uint idx = coordInBlock[1]; + uint lsb = idx & 1; + idx /= 2; - const uint iqs = (idx & 0xFC) >> 2; // 0..63 - const uint iqh = (idx & 0xE0) >> 5; // 0..7 - const uint qhbit = iqs & 7; - const uint isgn = (idx & 0xF8) >> 3; // 0..31 - const uint is = (idx & 0xC0) >> 6; // 0..3 - - const uint8_t scale = (bl.block.scales[is] >> ((idx & 0x20) >> 3)) & uint8_t(0xF); - const float16_t dscale = d * (1.0hf + float16_t(2 * scale)); + const uint iqs = (idx % 128) / 2; // 0..63 + const uint iqh = iqs / 8; + const float d = float(bl.block.d); const uint qs = bl.block.qs[iqs]; - const uint qh = (bl.block.qh[iqh] << (8 - qhbit)) & 0x100; - const uint8_t sign = bl.block.signs[isgn]; + const uint qh = bl.block.qh[iqh]; + const int8_t sign = int8_t(bl.block.signs[iqs / 2] >> (2 * (idx % 4))); + const uint scale = bl.block.scales[iqs / 16]; + const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign))); + const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf)); + const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 
256)] >> (16 * (idx % 2)); + const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); - const uint g = unpack8(iq3s_grid[qs | qh])[idx & 3]; - float16_t ret = dscale * float16_t(g) * ((sign & (1 << (idx & 7))) != 0 ? -1.0hf : 1.0hf); - - return ret; + return float16_t(v[lsb]); } #endif