first commit
This commit is contained in:
parent
ff4ff05c5f
commit
8150be09d9
4 changed files with 75 additions and 17 deletions
10
bigdl.h
Normal file
10
bigdl.h
Normal file
|
@ -0,0 +1,10 @@
|
|||
#pragma once
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
extern void mul_mat_q4_0_sycl(
|
||||
const uint8_t* weight,
|
||||
const float* input, // TODO: consider fp16 later
|
||||
float* output,
|
||||
const int state_size,
|
||||
const int output_size,
|
||||
sycl::queue & queue);
|
|
@ -511,6 +511,48 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
|
|||
}
|
||||
}
|
||||
|
||||
// below code moves scales to be compatiable with bigdl.llm
|
||||
// void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
|
||||
// static const int qk = QK4_0;
|
||||
|
||||
// assert(k % qk == 0);
|
||||
|
||||
// const int nb = k / qk;
|
||||
// const int scales_offset = nb * sizeof(block_q4_0_qs);
|
||||
// block_q4_0_qs * restrict y_qs = (block_q4_0_qs *) y;
|
||||
// ggml_fp16_t * restrict scales = (ggml_fp16_t *) ((char*)y + scales_offset);
|
||||
|
||||
// for (int i = 0; i < nb; i++) {
|
||||
// float amax = 0.0f; // absolute max
|
||||
// float max = 0.0f;
|
||||
|
||||
// for (int j = 0; j < qk; j++) {
|
||||
// const float v = x[i*qk + j];
|
||||
// if (amax < fabsf(v)) {
|
||||
// amax = fabsf(v);
|
||||
// max = v;
|
||||
// }
|
||||
// }
|
||||
|
||||
// const float d = max / -8;
|
||||
// const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
// // y_qs[i].d = GGML_FP32_TO_FP16(d);
|
||||
// scales[i] = GGML_FP32_TO_FP16(d);
|
||||
|
||||
// for (int j = 0; j < qk/2; ++j) {
|
||||
// const float x0 = x[i*qk + 0 + j]*id;
|
||||
// const float x1 = x[i*qk + qk/2 + j]*id;
|
||||
|
||||
// const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
|
||||
// const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
|
||||
|
||||
// y_qs[i].qs[j] = xi0;
|
||||
// y_qs[i].qs[j] |= xi1 << 4;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
|
||||
quantize_row_q4_0_reference(x, y, k);
|
||||
}
|
||||
|
@ -3042,7 +3084,7 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
|
|||
}
|
||||
|
||||
static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
|
||||
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
||||
// static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
||||
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_0_reference(x, y, n_per_row);
|
||||
|
|
|
@ -7,13 +7,17 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#define QK4_0 32
|
||||
#define QK4_0 64
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
||||
} block_q4_0;
|
||||
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
||||
} block_q4_0_qs;
|
||||
|
||||
#define QK4_1 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
|
|
|
@ -35,6 +35,7 @@
|
|||
#include "ggml-sycl.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "bigdl.h"
|
||||
|
||||
/*
|
||||
Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
|
||||
|
@ -3020,7 +3021,7 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
|
|||
// QR = QK / number of values before dequantization
|
||||
// QI = number of 32 bit integers before dequantization
|
||||
|
||||
#define QK4_0 32
|
||||
#define QK4_0 64
|
||||
#define QR4_0 2
|
||||
#define QI4_0 (QK4_0 / (4 * QR4_0))
|
||||
typedef struct dpct_type_471834 {
|
||||
|
@ -8989,21 +8990,22 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
|
|||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
// const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
// // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
||||
// const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
// const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
// {
|
||||
// dpct::has_capability_or_fail(stream->get_device(),
|
||||
// {sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
// stream->parallel_for(
|
||||
// sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
// [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
// dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
|
||||
// vx, y, dst, ncols, nrows, item_ct1);
|
||||
// });
|
||||
// }
|
||||
mul_mat_q4_0_sycl((const uint8_t*)vx, y, dst, ncols, nrows, *stream);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue