first commit

This commit is contained in:
Ariadne 2024-03-01 03:12:56 +08:00
parent ff4ff05c5f
commit 8150be09d9
4 changed files with 75 additions and 17 deletions

10
bigdl.h Normal file
View file

@ -0,0 +1,10 @@
#pragma once
#include <sycl/sycl.hpp>
extern void mul_mat_q4_0_sycl(
const uint8_t* weight,
const float* input, // TODO: consider fp16 later
float* output,
const int state_size,
const int output_size,
sycl::queue & queue);

View file

@ -511,6 +511,48 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
}
}
// below code moves scales to be compatiable with bigdl.llm
// void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
// static const int qk = QK4_0;
// assert(k % qk == 0);
// const int nb = k / qk;
// const int scales_offset = nb * sizeof(block_q4_0_qs);
// block_q4_0_qs * restrict y_qs = (block_q4_0_qs *) y;
// ggml_fp16_t * restrict scales = (ggml_fp16_t *) ((char*)y + scales_offset);
// for (int i = 0; i < nb; i++) {
// float amax = 0.0f; // absolute max
// float max = 0.0f;
// for (int j = 0; j < qk; j++) {
// const float v = x[i*qk + j];
// if (amax < fabsf(v)) {
// amax = fabsf(v);
// max = v;
// }
// }
// const float d = max / -8;
// const float id = d ? 1.0f/d : 0.0f;
// // y_qs[i].d = GGML_FP32_TO_FP16(d);
// scales[i] = GGML_FP32_TO_FP16(d);
// for (int j = 0; j < qk/2; ++j) {
// const float x0 = x[i*qk + 0 + j]*id;
// const float x1 = x[i*qk + qk/2 + j]*id;
// const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
// const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
// y_qs[i].qs[j] = xi0;
// y_qs[i].qs[j] |= xi1 << 4;
// }
// }
// }
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
quantize_row_q4_0_reference(x, y, k);
}
@ -3042,7 +3084,7 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
}
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
static_assert(QK4_0 == 32, "QK4_0 must be 32");
// static_assert(QK4_0 == 32, "QK4_0 must be 32");
if (!quant_weights) {
quantize_row_q4_0_reference(x, y, n_per_row);

View file

@ -7,13 +7,17 @@
#include <stdint.h>
#include <stddef.h>
#define QK4_0 32
#define QK4_0 64
typedef struct {
ggml_fp16_t d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
typedef struct {
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0_qs;
#define QK4_1 32
typedef struct {
ggml_fp16_t d; // delta

View file

@ -35,6 +35,7 @@
#include "ggml-sycl.h"
#include "ggml.h"
#include "ggml-backend-impl.h"
#include "bigdl.h"
/*
Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
@ -3020,7 +3021,7 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
// QR = QK / number of values before dequantization
// QI = number of 32 bit integers before dequantization
#define QK4_0 32
#define QK4_0 64
#define QR4_0 2
#define QI4_0 (QK4_0 / (4 * QR4_0))
typedef struct dpct_type_471834 {
@ -8989,21 +8990,22 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
// const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
// // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
// const sycl::range<3> block_nums(1, 1, block_num_y);
// const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
// {
// dpct::has_capability_or_fail(stream->get_device(),
// {sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
vx, y, dst, ncols, nrows, item_ct1);
});
}
// stream->parallel_for(
// sycl::nd_range<3>(block_nums * block_dims, block_dims),
// [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
// dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
// vx, y, dst, ncols, nrows, item_ct1);
// });
// }
mul_mat_q4_0_sycl((const uint8_t*)vx, y, dst, ncols, nrows, *stream);
}
static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,