first commit

2024-03-01 03:12:56 +08:00 · 2024-03-01 03:12:56 +08:00 · 8150be09d9
commit 8150be09d9
parent ff4ff05c5f
4 changed files with 75 additions and 17 deletions
--- a/bigdl.h
+++ b/bigdl.h
@ -0,0 +1,10 @@
+#pragma once
+#include <sycl/sycl.hpp>
+
+extern void mul_mat_q4_0_sycl(
+    const uint8_t* weight,
+    const float* input, // TODO: consider fp16 later
+    float* output,
+    const int state_size,
+    const int output_size,
+    sycl::queue & queue);
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -511,6 +511,48 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
    }
 }

+// below code moves scales to be compatiable with bigdl.llm
+// void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
+//     static const int qk = QK4_0;
+
+//     assert(k % qk == 0);
+
+//     const int nb = k / qk;
+//     const int scales_offset = nb * sizeof(block_q4_0_qs);
+//     block_q4_0_qs * restrict y_qs = (block_q4_0_qs *) y;
+//     ggml_fp16_t * restrict scales = (ggml_fp16_t *) ((char*)y + scales_offset);
+
+//     for (int i = 0; i < nb; i++) {
+//         float amax = 0.0f; // absolute max
+//         float max  = 0.0f;
+
+//         for (int j = 0; j < qk; j++) {
+//             const float v = x[i*qk + j];
+//             if (amax < fabsf(v)) {
+//                 amax = fabsf(v);
+//                 max  = v;
+//             }
+//         }
+
+//         const float d  = max / -8;
+//         const float id = d ? 1.0f/d : 0.0f;
+
+//         // y_qs[i].d = GGML_FP32_TO_FP16(d);
+//         scales[i] = GGML_FP32_TO_FP16(d);
+
+//         for (int j = 0; j < qk/2; ++j) {
+//             const float x0 = x[i*qk + 0    + j]*id;
+//             const float x1 = x[i*qk + qk/2 + j]*id;
+
+//             const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
+//             const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
+
+//             y_qs[i].qs[j]  = xi0;
+//             y_qs[i].qs[j] |= xi1 << 4;
+//         }
+//     }
+// }
+
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
    quantize_row_q4_0_reference(x, y, k);
 }
@ -3042,7 +3084,7 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
 }

 static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
-    static_assert(QK4_0 == 32, "QK4_0 must be 32");
+    // static_assert(QK4_0 == 32, "QK4_0 must be 32");

    if (!quant_weights) {
        quantize_row_q4_0_reference(x, y, n_per_row);
--- a/ggml-quants.h
+++ b/ggml-quants.h
@ -7,13 +7,17 @@
 #include <stdint.h>
 #include <stddef.h>

-#define QK4_0 32
+#define QK4_0 64
 typedef struct {
    ggml_fp16_t d;          // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
 } block_q4_0;
 static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");

+typedef struct {
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0_qs;
+
 #define QK4_1 32
 typedef struct {
    ggml_fp16_t d;          // delta
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -35,6 +35,7 @@
 #include "ggml-sycl.h"
 #include "ggml.h"
 #include "ggml-backend-impl.h"
+#include "bigdl.h"

 /*
 Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
@ -3020,7 +3021,7 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
 // QR = QK / number of values before dequantization
 // QI = number of 32 bit integers before dequantization

-#define QK4_0 32
+#define QK4_0 64
 #define QR4_0 2
 #define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct dpct_type_471834 {
@ -8989,21 +8990,22 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
                                             const int nrows,
                                             dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
+    // const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    // // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    // const sycl::range<3> block_nums(1, 1, block_num_y);
+    // const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    // {
+    //     dpct::has_capability_or_fail(stream->get_device(),
+    //                                  {sycl::aspect::fp16});

-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
+    //     stream->parallel_for(
+    //         sycl::nd_range<3>(block_nums * block_dims, block_dims),
+    //         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+    //             dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
+    //                 vx, y, dst, ncols, nrows, item_ct1);
+    //         });
+    // }
+    mul_mat_q4_0_sycl((const uint8_t*)vx, y, dst, ncols, nrows, *stream);
 }

 static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,