From db6657eeaf21db05f25cd03f3556174dec59f423 Mon Sep 17 00:00:00 2001
From: Srihari-mcw
Date: Tue, 30 Jul 2024 09:46:22 -0700
Subject: [PATCH] Fix more conflicts in quantize.cpp

---
 ggml/include/ggml.h    |   9 ++-
 ggml/src/ggml-quants.c | 136 +++++++++++++++++++++++++++++++++++++++--
 ggml/src/ggml-quants.h |   2 +
 ggml/src/ggml.c        |  26 +++++++-
 include/llama.h        |   2 +
 src/llama.cpp          |   2 +
 6 files changed, 166 insertions(+), 11 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index fc25aa710..b55fd1b8b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -392,6 +392,8 @@ extern "C" {
         GGML_TYPE_Q4_0_4_4 = 31,
         GGML_TYPE_Q4_0_4_8 = 32,
         GGML_TYPE_Q4_0_8_8 = 33,
+        GGML_TYPE_Q4_0_B16 = 34,
+        GGML_TYPE_Q8_0_B16 = 35,
         GGML_TYPE_COUNT,
     };

@@ -433,14 +435,11 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
-<<<<<<< HEAD
         GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
-=======
-        GGML_FTYPE_MOSTLY_Q4_0_B16 = 25, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q8_0_B16 = 26, // except 1d tensors
->>>>>>> ed837022 (Introduce Q4_0 and Q8_0 quantizations with BF16 delta values)
+        GGML_FTYPE_MOSTLY_Q4_0_B16 = 28, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0_B16 = 29, // except 1d tensors
     };

 // available tensor operations:
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 674551da1..66db1e46d 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -700,7 +700,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
 }

 // reference implementation for deterministic creation of model files
-void quantize_row_q4_0_b16_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
+void quantize_row_q4_0_b16_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
     static const int qk = QK4_0;

     assert(k % qk == 0);
@@ -738,7 +738,7 @@ void quantize_row_q4_0_b16_reference(const float * restrict x, block_q4_0 * rest
 }

 void quantize_row_q4_0_b16(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q4_0_b16_reference(x, y, k);
+    quantize_row_q4_0_b16_ref(x, y, k);
 }


@@ -1190,6 +1190,132 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 #endif
 }

+void quantize_row_q8_0_b16_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < QK8_0; j++) {
+            const float v = x[i*QK8_0 + j];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = (GGML_FP32_TO_BF16(d)).bits;
+
+        for (int j = 0; j < QK8_0; ++j) {
+            const float x0 = x[i*QK8_0 + j]*id;
+
+            y[i].qs[j] = roundf(x0);
+        }
+    }
+}
+
+void quantize_row_q8_0_b16(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0 * restrict y = vy;
+
+#if defined(__AVX2__) || defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 127.f;
+
+        y[i].d = (GGML_FP32_TO_BF16(d)).bits;
+
+        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 );  // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 );  // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
+        // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 );  // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#else
+    GGML_UNUSED(nb);
+    // scalar
+    quantize_row_q8_0_b16_ref(x, y, k);
+#endif
+}
+
+
 // reference implementation for deterministic creation of model files
 void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
     assert(QK8_1 == 32);
@@ -3217,7 +3343,7 @@ static void quantize_row_q4_0_b16_impl(const float * restrict x, block_q4_0 * re
     static_assert(QK4_0 == 32, "QK4_0 must be 32");

     if (!quant_weights) {
-        quantize_row_q4_0_b16_reference(x, y, n_per_row);
+        quantize_row_q4_0_b16_ref(x, y, n_per_row);
         return;
     }

@@ -3258,7 +3384,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr

 size_t quantize_q4_0_b16(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     if (!quant_weights) {
-        quantize_row_q4_0_b16_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q4_0_b16_ref(src, dst, (int64_t)nrow*n_per_row);
         return nrow * ggml_row_size(GGML_TYPE_Q4_0_B16, n_per_row);
     }
     size_t row_size = ggml_row_size(GGML_TYPE_Q4_0_B16, n_per_row);
@@ -3433,7 +3559,7 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
 size_t quantize_q8_0_b16(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     (void)quant_weights; // not used
     const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0_B16, n_per_row);
-    quantize_row_q8_0_b16_reference(src, dst, (int64_t)nrow*n_per_row);
+    quantize_row_q8_0_b16_ref(src, dst, (int64_t)nrow*n_per_row);
     return nrow * row_size;
 }

diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index 23d21c42d..69e11fa22 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -13,10 +13,12 @@ extern "C" {

 // Quantization
 void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_0_b16_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0_b16_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);

 void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 60c96c258..b8f5075bb 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1033,7 +1033,31 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .ncols = 8,
         .gemv = ggml_gemv_q4_0_8x8_q8_0,
         .gemm = ggml_gemm_q4_0_8x8_q8_0,
-    }
+    },
+    [GGML_TYPE_Q4_0_B16] = {
+        .type_name = "q4_0_b16",
+        .blck_size = QK4_0,
+        .type_size = sizeof(block_q4_0),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_q4_0_b16,
+        .from_float = quantize_row_q4_0_b16,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_b16_ref,
+        .vec_dot = ggml_vec_dot_q4_0_b16_q8_0_b16,
+        .vec_dot_type = GGML_TYPE_Q8_0_B16,
+        .nrows = 1,
+    },
+    [GGML_TYPE_Q8_0_B16] = {
+        .type_name = "q8_0_b16",
+        .blck_size = QK8_0,
+        .type_size = sizeof(block_q8_0),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_q8_0_b16,
+        .from_float = quantize_row_q8_0_b16,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_b16_ref,
+        .vec_dot = ggml_vec_dot_q8_0_b16_q8_0_b16,
+        .vec_dot_type = GGML_TYPE_Q8_0_B16,
+        .nrows = 1,
+    },
 };

 // For internal test use
diff --git a/include/llama.h b/include/llama.h
index ce07f4fac..19d014c99 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -166,6 +166,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0_B16 = 36, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q8_0_B16 = 37, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
diff --git a/src/llama.cpp b/src/llama.cpp
index 10c96924d..5d1b87700 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3791,6 +3791,8 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
             case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
             case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
+            case GGML_TYPE_Q4_0_B16: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_B16; break;
+            case GGML_TYPE_Q8_0_B16: ftype = LLAMA_FTYPE_MOSTLY_Q8_0_B16; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
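
The only functional difference between the *_b16 types introduced above and the existing Q4_0/Q8_0 is how the per-block scale d is stored: GGML_FP32_TO_BF16(d).bits keeps the upper 16 bits of the FP32 scale (BF16) instead of rounding it to FP16. Below is a minimal standalone C sketch of what that means for decoding; block_q8_0_sketch, bf16_bits_to_f32 and the example scale value are illustrative stand-ins, not ggml's own definitions.

/* Illustrative sketch (not part of the patch): decode the BF16 block scale that
 * quantize_row_q8_0_b16_ref() stores in y[i].d and dequantize a couple of values.
 * block_q8_0_sketch stands in for ggml's block_q8_0, and bf16_bits_to_f32 stands in
 * for GGML_BF16_TO_FP32, so the sketch compiles on its own. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define QK8_0 32

typedef struct {
    uint16_t d;         /* per-block scale, stored as raw BF16 bits by the *_b16 path */
    int8_t   qs[QK8_0]; /* 32 signed 8-bit quants */
} block_q8_0_sketch;

static float bf16_bits_to_f32(uint16_t bits) {
    /* BF16 is the top half of an IEEE-754 float: widen and zero the low mantissa bits */
    uint32_t u = (uint32_t) bits << 16;
    float f;
    memcpy(&f, &u, sizeof(f));
    return f;
}

int main(void) {
    block_q8_0_sketch b = { .d = 0x3DCC /* ~0.0996 */, .qs = { 127, -64 } };
    /* dequantization stays q * d, exactly as in the FP16-delta Q8_0 path */
    printf("%f %f\n", bf16_bits_to_f32(b.d) * b.qs[0], bf16_bits_to_f32(b.d) * b.qs[1]);
    return 0;
}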