diff --git a/.gitignore b/.gitignore index da01a6b42..9bfe49fc2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,8 @@ build/** -.build/** \ No newline at end of file +.build/** + +models/** + +.vscode/** + +**/__pycache__/** diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 71934c679..03ee6904b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1,5 +1,13 @@ include(CheckCXXCompilerFlag) +option(CUSTOM_QK4_0 "Quantization block size for Q4_0 (32, 64, 128, 256)" 32) + +if (NOT CUSTOM_QK4_0 MATCHES "^(32|64|128|256)$") + message(FATAL_ERROR "Invalid CUSTOM_QK4_0 value: Must be one of {32, 64, 128, 256}") +endif() + +add_compile_definitions(CUSTOM_QK4_0=${CUSTOM_QK4_0}) + add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) # enable libstdc++ assertions for debug builds diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 050161393..4b2bc6e1a 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -141,7 +141,16 @@ typedef sycl::half2 ggml_half2; #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP -#define QK4_0 32 +#ifdef CUSTOM_QK4_0 + #define QK4_0 CUSTOM_QK4_0 +#else + #define QK4_0 32 // Default value for QK4_0 +#endif + +#if (QK4_0 != 32 && QK4_0 != 64 && QK4_0 != 128 && QK4_0 != 256) + #error "Invalid QK4_0 value: QK4_0 must be one of {32, 64, 128, 256}" +#endif + typedef struct { ggml_half d; // delta uint8_t qs[QK4_0 / 2]; // nibbles / quants @@ -183,7 +192,16 @@ typedef struct { } block_q5_1; static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); -#define QK8_0 32 +#ifdef CUSTOM_QK8_0 + #define QK8_0 CUSTOM_QK4_0 +#else + #define QK8_0 32 +#endif + +#if (QK8_0 != 32 && QK8_0 != 64 && QK8_0 != 128 && QK8_0 != 256) + #error "Invalid QK8_0 value: QK8_0 must be one of {32, 64, 128, 256}" +#endif + typedef struct { ggml_half d; // delta int8_t qs[QK8_0]; // quants @@ -403,7 +421,7 @@ typedef union { } iq1m_scale_t; // Non-linear quants -#define QK4_NL 32 +#define QK4_NL 128 typedef struct { ggml_half d; uint8_t qs[QK4_NL/2]; diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index f0e276b69..fbd9513a8 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -707,12 +707,17 @@ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) { } void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) { - assert(QK8_0 == 32); + assert(QK8_0 == 32 || QK8_0 == 64 || QK8_0 == 128 || QK8_0 == 256); assert(k % QK8_0 == 0); const int nb = k / QK8_0; block_q8_0 * restrict y = vy; +#if defined(CUSTOM_QK4_0) && (CUSTOM_QK4_0 != 32) + GGML_UNUSED(nb); + // scalar + quantize_row_q8_0_ref(x, y, k); +#else #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { float32x4_t srcv [8]; @@ -989,6 +994,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) // scalar quantize_row_q8_0_ref(x, y, k); #endif +#endif } void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) { @@ -1735,7 +1741,7 @@ static inline __m128i get_scale_shuffle(int i) { #endif void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - const int qk = QK8_0; + const int qk = 128; const int nb = n / qk; assert(n % qk == 0); @@ -1825,6 +1831,26 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r int ib = 0; float sumf = 0; + +#if defined(CUSTOM_QK4_0) && (CUSTOM_QK4_0 != 32) + // Use only the basic implementation when CUSTOM_QK4_0 is defined and not 32 + for (; ib < nb; ++ib) { + int sumi0 = 0; + int sumi1 = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[ib].qs[j] & 0x0F) - 8; + const int v1 = (x[ib].qs[j] >> 4) - 8; + + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); + } + + int sumi = sumi0 + sumi1; + sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + } +#else + // All the SIMD implementations #if defined(__ARM_FEATURE_SVE) svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f); @@ -2291,7 +2317,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r } sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); -#endif +#else for (; ib < nb; ++ib) { int sumi0 = 0; int sumi1 = 0; @@ -2307,7 +2333,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r int sumi = sumi0 + sumi1; sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); } - +#endif +#endif *s = sumf; } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7301a9c6c..838374028 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -1819,7 +1819,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr } static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) { - static_assert(QK4_0 == 32, "QK4_0 must be 32"); + static_assert(QK4_0 == 32 || QK4_0 == 64 || QK4_0 == 128 || QK4_0 == 256, "QK4_0 must be one of {32, 64, 128, 256}"); if (!quant_weights) { quantize_row_q4_0_ref(x, y, n_per_row);