From 8150be09d9d9bcd558dd2c75f9ff03f0017f1230 Mon Sep 17 00:00:00 2001
From: Ariadne <wyn2000330@126.com>
Date: Fri, 1 Mar 2024 03:12:56 +0800
Subject: [PATCH] first commit

---
 bigdl.h       | 10 ++++++++++
 ggml-quants.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
 ggml-quants.h |  6 +++++-
 ggml-sycl.cpp | 32 +++++++++++++++++---------------
 4 files changed, 75 insertions(+), 17 deletions(-)
 create mode 100644 bigdl.h

diff --git a/bigdl.h b/bigdl.h
new file mode 100644
index 000000000..83e778b4f
--- /dev/null
+++ b/bigdl.h
@@ -0,0 +1,10 @@
+#pragma once
+#include <sycl/sycl.hpp>
+
+extern void mul_mat_q4_0_sycl(
+    const uint8_t* weight,
+    const float* input, // TODO: consider fp16 later
+    float* output,
+    const int state_size,
+    const int output_size,
+    sycl::queue & queue);
diff --git a/ggml-quants.c b/ggml-quants.c
index 101d3e783..d65917d22 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -511,6 +511,48 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
     }
 }
 
+// below code moves scales to be compatiable with bigdl.llm
+// void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
+//     static const int qk = QK4_0;
+
+//     assert(k % qk == 0);
+
+//     const int nb = k / qk;
+//     const int scales_offset = nb * sizeof(block_q4_0_qs);
+//     block_q4_0_qs * restrict y_qs = (block_q4_0_qs *) y;
+//     ggml_fp16_t * restrict scales = (ggml_fp16_t *) ((char*)y + scales_offset);
+
+//     for (int i = 0; i < nb; i++) {
+//         float amax = 0.0f; // absolute max
+//         float max  = 0.0f;
+
+//         for (int j = 0; j < qk; j++) {
+//             const float v = x[i*qk + j];
+//             if (amax < fabsf(v)) {
+//                 amax = fabsf(v);
+//                 max  = v;
+//             }
+//         }
+
+//         const float d  = max / -8;
+//         const float id = d ? 1.0f/d : 0.0f;
+
+//         // y_qs[i].d = GGML_FP32_TO_FP16(d);
+//         scales[i] = GGML_FP32_TO_FP16(d);
+
+//         for (int j = 0; j < qk/2; ++j) {
+//             const float x0 = x[i*qk + 0    + j]*id;
+//             const float x1 = x[i*qk + qk/2 + j]*id;
+
+//             const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
+//             const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
+
+//             y_qs[i].qs[j]  = xi0;
+//             y_qs[i].qs[j] |= xi1 << 4;
+//         }
+//     }
+// }
+
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     quantize_row_q4_0_reference(x, y, k);
 }
@@ -3042,7 +3084,7 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
 }
 
 static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
-    static_assert(QK4_0 == 32, "QK4_0 must be 32");
+    // static_assert(QK4_0 == 32, "QK4_0 must be 32");
 
     if (!quant_weights) {
         quantize_row_q4_0_reference(x, y, n_per_row);
diff --git a/ggml-quants.h b/ggml-quants.h
index bfdf3c997..5928b87b6 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -7,13 +7,17 @@
 #include <stdint.h>
 #include <stddef.h>
 
-#define QK4_0 32
+#define QK4_0 64
 typedef struct {
     ggml_fp16_t d;          // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
 } block_q4_0;
 static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
 
+typedef struct {
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0_qs;
+
 #define QK4_1 32
 typedef struct {
     ggml_fp16_t d;          // delta
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index a03df4c65..4eeecf02f 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -35,6 +35,7 @@
 #include "ggml-sycl.h"
 #include "ggml.h"
 #include "ggml-backend-impl.h"
+#include "bigdl.h"
 
 /*
 Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
@@ -3020,7 +3021,7 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
 // QR = QK / number of values before dequantization
 // QI = number of 32 bit integers before dequantization
 
-#define QK4_0 32
+#define QK4_0 64
 #define QR4_0 2
 #define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct dpct_type_471834 {
@@ -8989,21 +8990,22 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
+    // const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    // // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    // const sycl::range<3> block_nums(1, 1, block_num_y);
+    // const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    // {
+    //     dpct::has_capability_or_fail(stream->get_device(),
+    //                                  {sycl::aspect::fp16});
 
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
+    //     stream->parallel_for(
+    //         sycl::nd_range<3>(block_nums * block_dims, block_dims),
+    //         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+    //             dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
+    //                 vx, y, dst, ncols, nrows, item_ct1);
+    //         });
+    // }
+    mul_mat_q4_0_sycl((const uint8_t*)vx, y, dst, ncols, nrows, *stream);
 }
 
 static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,