implemented compile-time q4_0 group-size variants - for cpu
This commit is contained in:
parent 9201de2b49
commit a4ee5ca8f5

5 changed files with 68 additions and 9 deletions
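In ggml, a q4_0 block stores one fp16 scale plus QK4_0 packed 4-bit weights, and this change turns that group size into a compile-time parameter instead of the fixed 32. A minimal standalone sketch (stand-in half type half_t, not the project header) of how the block footprint and the effective bits per weight scale with the chosen size:

    // Sketch only: how the q4_0 block footprint changes with the group size.
    #include <stdint.h>
    #include <stdio.h>

    #ifndef QK4_0
    #define QK4_0 32            // try compiling this sketch with -DQK4_0=64/128/256
    #endif

    typedef uint16_t half_t;    // stand-in for ggml_half (fp16)

    typedef struct {
        half_t  d;              // delta (per-group scale)
        uint8_t qs[QK4_0 / 2];  // 4-bit quants, two per byte
    } block_q4_0;

    int main(void) {
        // 2 bytes of scale amortized over QK4_0 weights, plus 4 bits per weight
        printf("QK4_0 = %d: %zu bytes per block, %.3f bits per weight\n",
               QK4_0, sizeof(block_q4_0), 8.0 * sizeof(block_q4_0) / QK4_0);
        return 0;
    }

Larger groups amortize the 2-byte scale over more weights (4.5 bits per weight at 32, about 4.06 at 256), at the cost of coarser scaling and, as the kernels below show, of the SIMD paths that assume 32-wide blocks.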
.gitignore (vendored): 6 additions

@@ -1,2 +1,8 @@
 build/**
 .build/**
+
+models/**
+
+.vscode/**
+
+**/__pycache__/**
@@ -1,5 +1,13 @@
 include(CheckCXXCompilerFlag)
 
+option(CUSTOM_QK4_0 "Quantization block size for Q4_0 (32, 64, 128, 256)" 32)
+
+if (NOT CUSTOM_QK4_0 MATCHES "^(32|64|128|256)$")
+    message(FATAL_ERROR "Invalid CUSTOM_QK4_0 value: Must be one of {32, 64, 128, 256}")
+endif()
+
+add_compile_definitions(CUSTOM_QK4_0=${CUSTOM_QK4_0})
+
 add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
 
 # enable libstdc++ assertions for debug builds
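With the option in place, the group size is chosen at configure time, e.g. cmake -B build -DCUSTOM_QK4_0=128 (a normal CMake cache override). The MATCHES check rejects any value outside {32, 64, 128, 256} before anything is compiled, and add_compile_definitions propagates CUSTOM_QK4_0 to every C/C++ translation unit so the headers below all agree on one block size.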
@@ -141,7 +141,16 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
-#define QK4_0 32
+#ifdef CUSTOM_QK4_0
+#define QK4_0 CUSTOM_QK4_0
+#else
+#define QK4_0 32 // Default value for QK4_0
+#endif
+
+#if (QK4_0 != 32 && QK4_0 != 64 && QK4_0 != 128 && QK4_0 != 256)
+#error "Invalid QK4_0 value: QK4_0 must be one of {32, 64, 128, 256}"
+#endif
+
 typedef struct {
     ggml_half d;           // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
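Since QK4_0 sizes the qs array, every translation unit has to see the same CUSTOM_QK4_0, which the project-wide add_compile_definitions above takes care of. A standalone illustration of the same select-and-validate pattern, together with the kind of C11 size check that can sit next to the struct (qk_check.c and block_q4_0_sketch are made-up names for this sketch, not project code):

    /* Compile as e.g.  cc -std=c11 -DCUSTOM_QK4_0=64 -c qk_check.c
     * An unsupported value such as -DCUSTOM_QK4_0=48 stops the build at the #error. */
    #include <assert.h>
    #include <stdint.h>

    #ifdef CUSTOM_QK4_0
    #define QK4_0 CUSTOM_QK4_0
    #else
    #define QK4_0 32
    #endif

    #if (QK4_0 != 32 && QK4_0 != 64 && QK4_0 != 128 && QK4_0 != 256)
    #error "Invalid QK4_0 value: QK4_0 must be one of {32, 64, 128, 256}"
    #endif

    typedef struct {
        uint16_t d;             // stand-in for the fp16 scale
        uint8_t  qs[QK4_0 / 2]; // packed 4-bit quants
    } block_q4_0_sketch;

    // all supported sizes give an even qs length, so the struct has no tail padding
    static_assert(sizeof(block_q4_0_sketch) == 2 + QK4_0 / 2, "unexpected q4_0 block padding");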
@@ -183,7 +192,16 @@ typedef struct {
 } block_q5_1;
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
 
-#define QK8_0 32
+#ifdef CUSTOM_QK4_0
+#define QK8_0 CUSTOM_QK4_0 // keep the q8_0 block size in sync with q4_0
+#else
+#define QK8_0 32
+#endif
+
+#if (QK8_0 != 32 && QK8_0 != 64 && QK8_0 != 128 && QK8_0 != 256)
+#error "Invalid QK8_0 value: QK8_0 must be one of {32, 64, 128, 256}"
+#endif
+
 typedef struct {
     ggml_half d;       // delta
     int8_t  qs[QK8_0]; // quants
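The q8_0 group size is tied to the q4_0 choice because ggml_vec_dot_q4_0_q8_0 below walks one q4_0 block and one q8_0 block with a single qk: qk/2 packed nibbles on one side, qk int8 values on the other. If QK8_0 stayed at 32 while QK4_0 grew, the y[ib].qs[j + qk/2] accesses would run past the end of the activation block, so the two sizes have to move together.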
@@ -403,7 +421,7 @@ typedef union {
 } iq1m_scale_t;
 
 // Non-linear quants
-#define QK4_NL 32
+#define QK4_NL 128
 typedef struct {
     ggml_half d;
     uint8_t qs[QK4_NL/2];
@@ -707,12 +707,17 @@ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
 }
 
 void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(QK8_0 == 32);
+    assert(QK8_0 == 32 || QK8_0 == 64 || QK8_0 == 128 || QK8_0 == 256);
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;
 
     block_q8_0 * restrict y = vy;
 
+#if defined(CUSTOM_QK4_0) && (CUSTOM_QK4_0 != 32)
+    GGML_UNUSED(nb);
+    // scalar
+    quantize_row_q8_0_ref(x, y, k);
+#else
 #if defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
         float32x4_t srcv [8];
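For any non-default group size the SIMD bodies of quantize_row_q8_0 (the __ARM_NEON path shown here and the other architecture-specific ones), which are written for 32-wide blocks, are skipped and the reference scalar routine handles the whole row. Roughly, a scalar absmax q8_0 quantizer for an arbitrary block size looks like the sketch below (float scale instead of fp16, _sketch suffixes to mark it as an illustration rather than the project's quantize_row_q8_0_ref):

    // Sketch only: absmax scaling to int8, one scale per block of QK8_0 values.
    #include <math.h>
    #include <stdint.h>

    #define QK8_0 128 // illustrative; the real value comes from the build

    typedef struct {
        float  d;         // scale (ggml stores this as fp16; float here for simplicity)
        int8_t qs[QK8_0]; // quants
    } block_q8_0_sketch;

    void quantize_row_q8_0_sketch(const float * x, block_q8_0_sketch * y, int64_t k) {
        const int64_t nb = k / QK8_0; // caller guarantees k % QK8_0 == 0
        for (int64_t i = 0; i < nb; i++) {
            float amax = 0.0f; // absolute max of the block
            for (int j = 0; j < QK8_0; j++) {
                const float v = fabsf(x[i*QK8_0 + j]);
                if (v > amax) amax = v;
            }
            const float d  = amax / 127.0f;
            const float id = d != 0.0f ? 1.0f/d : 0.0f;
            y[i].d = d;
            for (int j = 0; j < QK8_0; j++) {
                y[i].qs[j] = (int8_t) roundf(x[i*QK8_0 + j] * id);
            }
        }
    }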
@@ -989,6 +994,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
     // scalar
     quantize_row_q8_0_ref(x, y, k);
 #endif
+#endif
 }
 
 void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
@@ -1735,7 +1741,7 @@ static inline __m128i get_scale_shuffle(int i) {
 #endif
 
 void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_0;
+    const int qk = QK4_0; // == QK8_0; both block sizes follow CUSTOM_QK4_0
     const int nb = n / qk;
 
     assert(n % qk == 0);
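Expressing qk through the block-size macro rather than a hard-coded literal matters here: the kernel pairs one q4_0 block with exactly one q8_0 block, so qk has to equal both QK4_0 and QK8_0, which the header changes above guarantee for every supported size.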
@@ -1825,6 +1831,26 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     int ib = 0;
     float sumf = 0;
 
+
+#if defined(CUSTOM_QK4_0) && (CUSTOM_QK4_0 != 32)
+    // Use only the basic implementation when CUSTOM_QK4_0 is defined and not 32
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
+            const int v1 = (x[ib].qs[j] >>   4) - 8;
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+    }
+#else
+    // All the SIMD implementations
 #if defined(__ARM_FEATURE_SVE)
     svfloat32_t sumv0 = svdup_n_f32(0.0f);
     svfloat32_t sumv1 = svdup_n_f32(0.0f);
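For any group size other than 32 the whole SIMD ladder is bypassed and the dot product reduces to the plain nibble-unpacking loop added above. Pulled out of context, with float scales and a fixed illustrative QK (names are made up for this sketch), one block pair looks like:

    // Sketch only: scalar q4_0 x q8_0 dot product for one pair of blocks.
    #include <stdint.h>

    #define QK 128 // one q4_0 block and one q8_0 block of the same length

    typedef struct { float d; uint8_t qs[QK/2]; } blk_q4_0; // 4-bit weights, two per byte
    typedef struct { float d; int8_t  qs[QK];   } blk_q8_0; // 8-bit activations

    float dot_q4_0_q8_0_block(const blk_q4_0 * x, const blk_q8_0 * y) {
        int sumi0 = 0;
        int sumi1 = 0;
        for (int j = 0; j < QK/2; ++j) {
            const int v0 = (x->qs[j] & 0x0F) - 8; // low nibble  -> [-8, 7]
            const int v1 = (x->qs[j] >>   4) - 8; // high nibble -> [-8, 7]
            sumi0 += v0 * y->qs[j];        // low nibbles pair with the first half of y
            sumi1 += v1 * y->qs[j + QK/2]; // high nibbles with the second half
        }
        return (sumi0 + sumi1) * x->d * y->d; // rescale the integer accumulator
    }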
@@ -2291,7 +2317,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
-#endif
+#else
     for (; ib < nb; ++ib) {
         int sumi0 = 0;
         int sumi1 = 0;
@@ -2307,7 +2333,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         int sumi = sumi0 + sumi1;
         sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
     }
-
+#endif
+#endif
     *s = sumf;
 }
 
@@ -1819,7 +1819,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr
 }
 
 static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
-    static_assert(QK4_0 == 32, "QK4_0 must be 32");
+    static_assert(QK4_0 == 32 || QK4_0 == 64 || QK4_0 == 128 || QK4_0 == 256, "QK4_0 must be one of {32, 64, 128, 256}");
 
     if (!quant_weights) {
         quantize_row_q4_0_ref(x, y, n_per_row);