diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 85ec3620c..a8aef09b9 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -296,6 +296,8 @@ class Model: )) if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: + # TODO: cleaner model-specific per-tensor types + # NOTE: Q1_3 is only relevant for BitNet 1.58b if self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 and not any( self.match_model_tensor_name(new_name, key, None) for key in [ diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index fd5d8a90a..9c680e3b1 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -1037,73 +1037,6 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512) 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, GGML_TABLE_END() -GGML_TABLE_BEGIN(uint32_t, q22_grid, 256) - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00010000, 0x01010000, 0x00010000, 0xff010000, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, - 0x00000100, 0x01000100, 0x00000100, 0xff000100, - 0x00010100, 0x01010100, 0x00010100, 0xff010100, - 0x00000100, 0x01000100, 0x00000100, 0xff000100, - 0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00010000, 0x01010000, 0x00010000, 0xff010000, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, - 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, - 0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00, - 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, - 0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00, - 0x00000001, 0x01000001, 0x00000001, 0xff000001, - 0x00010001, 0x01010001, 0x00010001, 0xff010001, - 0x00000001, 0x01000001, 0x00000001, 0xff000001, - 0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001, - 0x00000101, 0x01000101, 0x00000101, 0xff000101, - 0x00010101, 0x01010101, 0x00010101, 0xff010101, - 0x00000101, 0x01000101, 0x00000101, 
0xff000101, - 0x00ff0101, 0x01ff0101, 0x00ff0101, 0xffff0101, - 0x00000001, 0x01000001, 0x00000001, 0xff000001, - 0x00010001, 0x01010001, 0x00010001, 0xff010001, - 0x00000001, 0x01000001, 0x00000001, 0xff000001, - 0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001, - 0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01, - 0x0001ff01, 0x0101ff01, 0x0001ff01, 0xff01ff01, - 0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01, - 0x00ffff01, 0x01ffff01, 0x00ffff01, 0xffffff01, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00010000, 0x01010000, 0x00010000, 0xff010000, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, - 0x00000100, 0x01000100, 0x00000100, 0xff000100, - 0x00010100, 0x01010100, 0x00010100, 0xff010100, - 0x00000100, 0x01000100, 0x00000100, 0xff000100, - 0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00010000, 0x01010000, 0x00010000, 0xff010000, - 0x00000000, 0x01000000, 0x00000000, 0xff000000, - 0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000, - 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, - 0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00, - 0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00, - 0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00, - 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, - 0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff, - 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, - 0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff, - 0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff, - 0x000101ff, 0x010101ff, 0x000101ff, 0xff0101ff, - 0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff, - 0x00ff01ff, 0x01ff01ff, 0x00ff01ff, 0xffff01ff, - 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, - 0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff, - 0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff, - 0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff, - 0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff, - 0x0001ffff, 0x0101ffff, 0x0001ffff, 0xff01ffff, - 0x0000ffff, 0x0100ffff, 0x0000ffff, 
0xff00ffff, - 0x00ffffff, 0x01ffffff, 0x00ffffff, 0xffffffff, -GGML_TABLE_END() - GGML_TABLE_BEGIN(uint32_t, q1_3_grid, 256) 0xffffffff, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff, 0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff000000, 0xff000001, diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 050197545..4d7c6ff61 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -657,6 +657,35 @@ static inline __m128i packNibbles( __m256i bytes ) { } #endif //__loongarch_asx +void quantize_row_q2_2_reference(const float * restrict x, block_q2_2 * restrict y, int64_t k) { + static const int qk = QK2_2; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + + for (int j = 0; j < qk/4; ++j) { + int8_t x0 = (int8_t)x[i*qk + 0 + j]; + int8_t x1 = (int8_t)x[i*qk + 1*qk/4 + j]; + int8_t x2 = (int8_t)x[i*qk + 2*qk/4 + j]; + int8_t x3 = (int8_t)x[i*qk + 3*qk/4 + j]; + + const uint8_t xi0 = x0 < 0 ? 1 : x0 == 0 ? 2 : 3; + const uint8_t xi1 = x1 < 0 ? 1 : x1 == 0 ? 2 : 3; + const uint8_t xi2 = x2 < 0 ? 1 : x2 == 0 ? 2 : 3; + const uint8_t xi3 = x3 < 0 ? 1 : x3 == 0 ? 
2 : 3; + + y[i].qs[j] = 0; + y[i].qs[j] |= (xi0 << 0); + y[i].qs[j] |= (xi1 << 2); + y[i].qs[j] |= (xi2 << 4); + y[i].qs[j] |= (xi3 << 6); + } + } +} + // reference implementation for deterministic creation of model files void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) { static const int qk = QK4_0; @@ -1512,6 +1541,26 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) #endif } +void dequantize_row_q2_2(const block_q2_2 * restrict x, float * restrict y, int64_t k) { + static const int qk = QK2_2; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + + for (int j = 0; j < qk/4; ++j) { + const int8_t q = x[i].qs[j]; + + y[i*qk + j + 0 ] = (float) (((q >> 0) & 3) - 2); + y[i*qk + j + 1*qk/4] = (float) (((q >> 2) & 3) - 2); + y[i*qk + j + 2*qk/4] = (float) (((q >> 4) & 3) - 2); + y[i*qk + j + 3*qk/4] = (float) (((q >> 6) & 3) - 2); + } + } +} + void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) { static const int qk = QK4_0; @@ -3876,82 +3925,18 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r #if defined(__AVX2__) __m256 acc = _mm256_setzero_ps(); - int leftovers = nb % 2; - - for (int i = 0; i < nb - leftovers; i += 2) { - - const __m256 d0 = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i + 0].d) ); - const __m256 d1 = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i + 1].d) ); - - // assuming two consecutive blocks are contiguous AND aligned - __m128i xq16b = _mm_load_si128((const __m128i *) (x[i].qs)); - __m256i xq16 = MM256_SET_M128I(xq16b, xq16b); - __m256i xq8l0 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, - 4, -1, 4, -1, 4, -1, 4, -1, - 1, -1, 1, -1, 1, -1, 1, -1, - 0, -1, 0, -1, 0, -1, 0, -1)); - __m256i xq8h0 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1, - 6, -1, 6, -1, 6, -1, 6, -1, - 3, -1, 3, -1, 3, -1, 3, -1, - 2, -1, 2, -1, 2, -1, 2, -1)); 
- __m256i xq8l1 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(13, -1, 13, -1, 13, -1, 13, -1, - 12, -1, 12, -1, 12, -1, 12, -1, - 9, -1, 9, -1, 9, -1, 9, -1, - 8, -1, 8, -1, 8, -1, 8, -1)); - __m256i xq8h1 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(15, -1, 15, -1, 15, -1, 15, -1, - 14, -1, 14, -1, 14, -1, 14, -1, - 11, -1, 11, -1, 11, -1, 11, -1, - 10, -1, 10, -1, 10, -1, 10, -1)); - __m256i shift = _mm256_set_epi16(64, 16, 4, 1, - 64, 16, 4, 1, - 64, 16, 4, 1, - 64, 16, 4, 1); - xq8l0 = _mm256_mullo_epi16(xq8l0, shift); - xq8h0 = _mm256_mullo_epi16(xq8h0, shift); - xq8l1 = _mm256_mullo_epi16(xq8l1, shift); - xq8h1 = _mm256_mullo_epi16(xq8h1, shift); - xq8l0 = _mm256_srai_epi16(xq8l0, 14); - xq8h0 = _mm256_srai_epi16(xq8h0, 14); - xq8l1 = _mm256_srai_epi16(xq8l1, 14); - xq8h1 = _mm256_srai_epi16(xq8h1, 14); - __m256i xq8_0 = _mm256_packs_epi16(xq8l0, xq8h0); - __m256i xq8_1 = _mm256_packs_epi16(xq8l1, xq8h1); - - __m256i yq8_0 = _mm256_loadu_si256((const __m256i *) (y[i + 0].qs)); - __m256i yq8_1 = _mm256_loadu_si256((const __m256i *) (y[i + 1].qs)); - - const __m256 q0 = mul_sum_i8_pairs_float(xq8_0, yq8_0); - const __m256 q1 = mul_sum_i8_pairs_float(xq8_1, yq8_1); - - acc = _mm256_fmadd_ps( d0, q0, acc ); - acc = _mm256_fmadd_ps( d1, q1, acc ); - } - - for (int i = nb - leftovers; i < nb; ++i) { + for (int i = 0; i < nb; ++i) { const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i].d) ); - __m128i xq8b = _mm_loadu_si64(x[i].qs); - __m256i xq8 = MM256_SET_M128I(xq8b, xq8b); - __m256i xq8l = _mm256_shuffle_epi8(xq8, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1, - 4, -1, 4, -1, 4, -1, 4, -1, - 1, -1, 1, -1, 1, -1, 1, -1, - 0, -1, 0, -1, 0, -1, 0, -1)); - __m256i xq8h = _mm256_shuffle_epi8(xq8, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1, - 6, -1, 6, -1, 6, -1, 6, -1, - 3, -1, 3, -1, 3, -1, 3, -1, - 2, -1, 2, -1, 2, -1, 2, -1)); - __m256i shift = _mm256_set_epi16(64, 16, 4, 1, - 64, 16, 4, 1, - 64, 16, 4, 1, - 64, 16, 4, 1); - xq8l = _mm256_mullo_epi16(xq8l, shift); - 
xq8h = _mm256_mullo_epi16(xq8h, shift); - xq8l = _mm256_srai_epi16(xq8l, 14); - xq8h = _mm256_srai_epi16(xq8h, 14); - xq8 = _mm256_packs_epi16(xq8l, xq8h); + // assuming this is always aligned + __m256i xq8 = _mm256_set1_epi64x(*(const int64_t *) x[i].qs); + xq8 = _mm256_srlv_epi64(xq8, _mm256_set_epi64x(6, 4, 2, 0)); + xq8 = _mm256_and_si256(xq8, _mm256_set1_epi8(0x03)); + // strangely enough, this is much slower with 1 instead of 2 + xq8 = _mm256_sub_epi8(xq8, _mm256_set1_epi8(2)); - __m256i yq8 = _mm256_loadu_si256((const __m256i *) (y[i].qs)); + const __m256i yq8 = _mm256_loadu_si256((const __m256i *) (y[i].qs)); const __m256 q = mul_sum_i8_pairs_float(xq8, yq8); acc = _mm256_fmadd_ps( d, q, acc ); @@ -3964,11 +3949,11 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; i++) { int sumi = 0; for (int j = 0; j < qk / 4; j++) { - const int8_t* weight = (const int8_t *)(q22_grid + x[i].qs[j]); - sumi += (int)y[i].qs[4*j+0] * weight[0]; - sumi += (int)y[i].qs[4*j+1] * weight[1]; - sumi += (int)y[i].qs[4*j+2] * weight[2]; - sumi += (int)y[i].qs[4*j+3] * weight[3]; + const uint8_t weight = x[i].qs[j]; + sumi += (int)y[i].qs[j + 0*qk/4] * (((weight >> 0) & 3) - 2); + sumi += (int)y[i].qs[j + 1*qk/4] * (((weight >> 2) & 3) - 2); + sumi += (int)y[i].qs[j + 2*qk/4] * (((weight >> 4) & 3) - 2); + sumi += (int)y[i].qs[j + 3*qk/4] * (((weight >> 6) & 3) - 2); } sumf += (float)(sumi)*(GGML_FP16_TO_FP32(y[i].d)); }