diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 03ee6904b..24a8f334e 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1,12 +1,17 @@ include(CheckCXXCompilerFlag) -option(CUSTOM_QK4_0 "Quantization block size for Q4_0 (32, 64, 128, 256)" 32) +# Set default value for QK4_0 +set(QK4_0 "32" CACHE STRING "Quantization block size for Q4_0 (32, 64, 128, 256)") -if (NOT CUSTOM_QK4_0 MATCHES "^(32|64|128|256)$") - message(FATAL_ERROR "Invalid CUSTOM_QK4_0 value: Must be one of {32, 64, 128, 256}") +# Validate and export the value only when it differs from the default of 32 +if (DEFINED QK4_0 AND NOT QK4_0 STREQUAL "32") + if (NOT QK4_0 MATCHES "^(32|64|128|256)$") + message(FATAL_ERROR "Invalid QK4_0 value: Must be one of {32, 64, 128, 256}") + endif() + add_compile_definitions(QK4_0=${QK4_0}) endif() -add_compile_definitions(CUSTOM_QK4_0=${CUSTOM_QK4_0}) +message(STATUS "QK4_0 is set to: ${QK4_0}") add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 4b2bc6e1a..77624743e 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -141,10 +141,8 @@ typedef sycl::half2 ggml_half2; #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP -#ifdef CUSTOM_QK4_0 - #define QK4_0 CUSTOM_QK4_0 -#else - #define QK4_0 32 // Default value for QK4_0 +#ifndef QK4_0 + #define QK4_0 32 #endif #if (QK4_0 != 32 && QK4_0 != 64 && QK4_0 != 128 && QK4_0 != 256) @@ -192,16 +190,12 @@ typedef struct { } block_q5_1; static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); -#ifdef CUSTOM_QK8_0 - #define QK8_0 CUSTOM_QK4_0 +#if QK4_0 != 32 + #define QK8_0 QK4_0 #else #define QK8_0 32 #endif -#if (QK8_0 != 32 && QK8_0 != 64 && QK8_0 != 128 && QK8_0 != 256) - #error "Invalid QK8_0 value: QK8_0 must be one of {32, 64, 128, 256}" -#endif - typedef struct { ggml_half d; // delta int8_t qs[QK8_0]; // quants @@ -421,7 
+415,8 @@ typedef union { } iq1m_scale_t; // Non-linear quants -#define QK4_NL 128 +#define QK4_NL QK8_0 + typedef struct { ggml_half d; uint8_t qs[QK4_NL/2]; diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index fbd9513a8..568c22ed6 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -713,7 +713,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) block_q8_0 * restrict y = vy; -#if defined(CUSTOM_QK4_0) && (CUSTOM_QK4_0 != 32) +#if (QK8_0 != 32) GGML_UNUSED(nb); // scalar quantize_row_q8_0_ref(x, y, k); @@ -1832,8 +1832,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; -#if defined(CUSTOM_QK4_0) && (CUSTOM_QK4_0 != 32) - // Use only the basic implementation when CUSTOM_QK4_0 is defined and not 32 +#if (QK4_0 != 32) + // Use only the basic implementation when QK4_0 is not 32 for (; ib < nb; ++ib) { int sumi0 = 0; int sumi1 = 0; @@ -2317,7 +2317,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r } sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); -#else +#endif for (; ib < nb; ++ib) { int sumi0 = 0; int sumi1 = 0; @@ -2333,7 +2333,6 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r int sumi = sumi0 + sumi1; sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); } -#endif #endif *s = sumf; }