From 8150be09d9d9bcd558dd2c75f9ff03f0017f1230 Mon Sep 17 00:00:00 2001 From: Ariadne Date: Fri, 1 Mar 2024 03:12:56 +0800 Subject: [PATCH] first commit --- bigdl.h | 10 ++++++++++ ggml-quants.c | 44 +++++++++++++++++++++++++++++++++++++++++++- ggml-quants.h | 6 +++++- ggml-sycl.cpp | 32 +++++++++++++++++--------------- 4 files changed, 75 insertions(+), 17 deletions(-) create mode 100644 bigdl.h diff --git a/bigdl.h b/bigdl.h new file mode 100644 index 000000000..83e778b4f --- /dev/null +++ b/bigdl.h @@ -0,0 +1,10 @@ +#pragma once +#include + +extern void mul_mat_q4_0_sycl( + const uint8_t* weight, + const float* input, // TODO: consider fp16 later + float* output, + const int state_size, + const int output_size, + sycl::queue & queue); diff --git a/ggml-quants.c b/ggml-quants.c index 101d3e783..d65917d22 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -511,6 +511,48 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict } } +// below code moves scales to be compatiable with bigdl.llm +// void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) { +// static const int qk = QK4_0; + +// assert(k % qk == 0); + +// const int nb = k / qk; +// const int scales_offset = nb * sizeof(block_q4_0_qs); +// block_q4_0_qs * restrict y_qs = (block_q4_0_qs *) y; +// ggml_fp16_t * restrict scales = (ggml_fp16_t *) ((char*)y + scales_offset); + +// for (int i = 0; i < nb; i++) { +// float amax = 0.0f; // absolute max +// float max = 0.0f; + +// for (int j = 0; j < qk; j++) { +// const float v = x[i*qk + j]; +// if (amax < fabsf(v)) { +// amax = fabsf(v); +// max = v; +// } +// } + +// const float d = max / -8; +// const float id = d ? 1.0f/d : 0.0f; + +// // y_qs[i].d = GGML_FP32_TO_FP16(d); +// scales[i] = GGML_FP32_TO_FP16(d); + +// for (int j = 0; j < qk/2; ++j) { +// const float x0 = x[i*qk + 0 + j]*id; +// const float x1 = x[i*qk + qk/2 + j]*id; + +// const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); +// const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + +// y_qs[i].qs[j] = xi0; +// y_qs[i].qs[j] |= xi1 << 4; +// } +// } +// } + void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { quantize_row_q4_0_reference(x, y, k); } @@ -3042,7 +3084,7 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int } static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) { - static_assert(QK4_0 == 32, "QK4_0 must be 32"); + // static_assert(QK4_0 == 32, "QK4_0 must be 32"); if (!quant_weights) { quantize_row_q4_0_reference(x, y, n_per_row); diff --git a/ggml-quants.h b/ggml-quants.h index bfdf3c997..5928b87b6 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -7,13 +7,17 @@ #include #include -#define QK4_0 32 +#define QK4_0 64 typedef struct { ggml_fp16_t d; // delta uint8_t qs[QK4_0 / 2]; // nibbles / quants } block_q4_0; static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); +typedef struct { + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0_qs; + #define QK4_1 32 typedef struct { ggml_fp16_t d; // delta diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index a03df4c65..4eeecf02f 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -35,6 +35,7 @@ #include "ggml-sycl.h" #include "ggml.h" #include "ggml-backend-impl.h" +#include "bigdl.h" /* Following definition copied from DPCT head files, which are used by ggml-sycl.cpp @@ -3020,7 +3021,7 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0, // QR = QK / number of values before dequantization // QI = number of 32 bit integers before dequantization -#define QK4_0 32 +#define QK4_0 64 #define QR4_0 2 #define QI4_0 (QK4_0 / (4 * QR4_0)) typedef struct dpct_type_471834 { @@ -8989,21 +8990,22 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); + // const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + // // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + // const sycl::range<3> block_nums(1, 1, block_num_y); + // const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + // { + // dpct::has_capability_or_fail(stream->get_device(), + // {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } + // stream->parallel_for( + // sycl::nd_range<3>(block_nums * block_dims, block_dims), + // [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + // dequantize_mul_mat_vec( + // vx, y, dst, ncols, nrows, item_ct1); + // }); + // } + mul_mat_q4_0_sycl((const uint8_t*)vx, y, dst, ncols, nrows, *stream); } static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,