From db6657eeaf21db05f25cd03f3556174dec59f423 Mon Sep 17 00:00:00 2001
From: Srihari-mcw
Date: Tue, 30 Jul 2024 09:46:22 -0700
Subject: [PATCH] Fix more conflicts in quantize.cpp

---
 ggml/include/ggml.h    |   9 ++-
 ggml/src/ggml-quants.c | 136 +++++++++++++++++++++++++++++++++++++++--
 ggml/src/ggml-quants.h |   2 +
 ggml/src/ggml.c        |  26 +++++++-
 include/llama.h        |   2 +
 src/llama.cpp          |   2 +
 6 files changed, 166 insertions(+), 11 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index fc25aa710..b55fd1b8b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -392,6 +392,8 @@ extern "C" {
         GGML_TYPE_Q4_0_4_4 = 31,
         GGML_TYPE_Q4_0_4_8 = 32,
         GGML_TYPE_Q4_0_8_8 = 33,
+        GGML_TYPE_Q4_0_B16 = 34,
+        GGML_TYPE_Q8_0_B16 = 35,
         GGML_TYPE_COUNT,
     };

@@ -433,14 +435,11 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
-<<<<<<< HEAD
         GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
-=======
-        GGML_FTYPE_MOSTLY_Q4_0_B16 = 25, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q8_0_B16 = 26, // except 1d tensors
->>>>>>> ed837022 (Introduce Q4_0 and Q8_0 quantizations with BF16 delta values)
+        GGML_FTYPE_MOSTLY_Q4_0_B16 = 28, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0_B16 = 29, // except 1d tensors
     };

 // available tensor operations:
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 674551da1..66db1e46d 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -700,7 +700,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
 }

 // reference implementation for deterministic creation of model files
-void quantize_row_q4_0_b16_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
+void quantize_row_q4_0_b16_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
     static const int qk = QK4_0;

     assert(k % qk == 0);
@@ -738,7 +738,7 @@ void quantize_row_q4_0_b16_reference(const float * restrict x, block_q4_0 * rest
 }

 void quantize_row_q4_0_b16(const float * restrict x, void * restrict y, int64_t k) {
-    quantize_row_q4_0_b16_reference(x, y, k);
+    quantize_row_q4_0_b16_ref(x, y, k);
 }


@@ -1190,6 +1190,132 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 #endif
 }

+void quantize_row_q8_0_b16_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < QK8_0; j++) {
+            const float v = x[i*QK8_0 + j];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = (GGML_FP32_TO_BF16(d)).bits;
+
+        for (int j = 0; j < QK8_0; ++j) {
+            const float x0 = x[i*QK8_0 + j]*id;
+
+            y[i].qs[j] = roundf(x0);
+        }
+    }
+}
+
+void quantize_row_q8_0_b16(const float * restrict x, void * restrict vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    block_q8_0 * restrict y = vy;
+
+#if defined(__AVX2__) || defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 127.f;
+
+        y[i].d = (GGML_FP32_TO_BF16(d)).bits;
+
+        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 );  // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 );  // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
+        // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 );  // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#else
+    GGML_UNUSED(nb);
+    // scalar
+    quantize_row_q8_0_b16_ref(x, y, k);
+#endif
+}
+
+
 // reference implementation for deterministic creation of model files
 void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
     assert(QK8_1 == 32);
@@ -3217,7 +3343,7 @@ static void quantize_row_q4_0_b16_impl(const float * restrict x, block_q4_0 * re
     static_assert(QK4_0 == 32, "QK4_0 must be 32");

     if (!quant_weights) {
-        quantize_row_q4_0_b16_reference(x, y, n_per_row);
+        quantize_row_q4_0_b16_ref(x, y, n_per_row);
         return;
     }

@@ -3258,7 +3384,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr

 size_t quantize_q4_0_b16(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     if (!quant_weights) {
-        quantize_row_q4_0_b16_reference(src, dst, (int64_t)nrow*n_per_row);
+        quantize_row_q4_0_b16_ref(src, dst, (int64_t)nrow*n_per_row);
         return nrow * ggml_row_size(GGML_TYPE_Q4_0_B16, n_per_row);
     }
     size_t row_size = ggml_row_size(GGML_TYPE_Q4_0_B16, n_per_row);
@@ -3433,7 +3559,7 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr
 size_t quantize_q8_0_b16(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
     (void)quant_weights; // not used
     const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0_B16, n_per_row);
-    quantize_row_q8_0_b16_reference(src, dst, (int64_t)nrow*n_per_row);
+    quantize_row_q8_0_b16_ref(src, dst, (int64_t)nrow*n_per_row);
     return nrow * row_size;
 }

diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index 23d21c42d..69e11fa22 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -13,10 +13,12 @@ extern "C" {

 // Quantization
 void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_0_b16_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0_b16_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
 void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);

 void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 60c96c258..b8f5075bb 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1033,7 +1033,31 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .ncols = 8,
         .gemv = ggml_gemv_q4_0_8x8_q8_0,
         .gemm = ggml_gemm_q4_0_8x8_q8_0,
-    }
+    },
+    [GGML_TYPE_Q4_0_B16] = {
+        .type_name = "q4_0_b16",
+        .blck_size = QK4_0,
+        .type_size = sizeof(block_q4_0),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_q4_0_b16,
+        .from_float = quantize_row_q4_0_b16,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_b16_ref,
+        .vec_dot = ggml_vec_dot_q4_0_b16_q8_0_b16,
+        .vec_dot_type = GGML_TYPE_Q8_0_B16,
+        .nrows = 1,
+    },
+    [GGML_TYPE_Q8_0_B16] = {
+        .type_name = "q8_0_b16",
+        .blck_size = QK8_0,
+        .type_size = sizeof(block_q8_0),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_q8_0_b16,
+        .from_float = quantize_row_q8_0_b16,
+        .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_b16_ref,
+        .vec_dot = ggml_vec_dot_q8_0_b16_q8_0_b16,
+        .vec_dot_type = GGML_TYPE_Q8_0_B16,
+        .nrows = 1,
+    },
 };

 // For internal test use
diff --git a/include/llama.h b/include/llama.h
index ce07f4fac..19d014c99 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -166,6 +166,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0_B16 = 36, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q8_0_B16 = 37, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
diff --git a/src/llama.cpp b/src/llama.cpp
index 10c96924d..5d1b87700 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3791,6 +3791,8 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
             case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
             case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
+            case GGML_TYPE_Q4_0_B16: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_B16; break;
+            case GGML_TYPE_Q8_0_B16: ftype = LLAMA_FTYPE_MOSTLY_Q8_0_B16; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
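
The only functional difference between the *_b16 types introduced above and the existing Q4_0/Q8_0 is how the per-block scale d is stored: GGML_FP32_TO_BF16(d).bits keeps the upper 16 bits of the FP32 scale (BF16) instead of rounding it to FP16. Below is a minimal standalone C sketch of what that means for decoding; block_q8_0_sketch, bf16_bits_to_f32 and the example scale value are illustrative stand-ins, not ggml's own definitions.

/* Illustrative sketch (not part of the patch): decode the BF16 block scale that
 * quantize_row_q8_0_b16_ref() stores in y[i].d and dequantize a couple of values.
 * block_q8_0_sketch stands in for ggml's block_q8_0, and bf16_bits_to_f32 stands in
 * for GGML_BF16_TO_FP32, so the sketch compiles on its own. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define QK8_0 32

typedef struct {
    uint16_t d;         /* per-block scale, stored as raw BF16 bits by the *_b16 path */
    int8_t   qs[QK8_0]; /* 32 signed 8-bit quants */
} block_q8_0_sketch;

static float bf16_bits_to_f32(uint16_t bits) {
    /* BF16 is the top half of an IEEE-754 float: widen and zero the low mantissa bits */
    uint32_t u = (uint32_t) bits << 16;
    float f;
    memcpy(&f, &u, sizeof(f));
    return f;
}

int main(void) {
    block_q8_0_sketch b = { .d = 0x3DCC /* ~0.0996 */, .qs = { 127, -64 } };
    /* dequantization stays q * d, exactly as in the FP16-delta Q8_0 path */
    printf("%f %f\n", bf16_bits_to_f32(b.d) * b.qs[0], bf16_bits_to_f32(b.d) * b.qs[1]);
    return 0;
}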