diff --git a/docs/build.md b/docs/build.md
index a4964cbd1..26e673788 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -55,7 +55,7 @@ cmake --build build --config Release
     cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
     cmake --build build-arm64-windows-llvm-release
     ```

-  Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
+  Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.

 ## BLAS Build
diff --git a/examples/quantize/README.md b/examples/quantize/README.md
index 704f0d56b..5d1e11c67 100644
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis

 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

-The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
-
 *(outdated)*

 | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index b98993210..8d47b17b6 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -48,9 +48,6 @@ static const std::vector QUANT_OPTIONS = {
     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
     { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
     { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
-    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
     { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 74af03639..386d5a15d 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -392,7 +392,7 @@ extern "C" {
         // GGML_TYPE_IQ4_NL_4_4 = 36,
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
-        GGML_TYPE_COUNT,
+        GGML_TYPE_COUNT = 39,
     };

     // precision
@@ -2202,11 +2202,19 @@ extern "C" {
     GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
     GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);

-#ifdef __cplusplus
-// restrict not standard in C++
-#define GGML_RESTRICT
+#ifdef __cplusplus
+    // restrict not standard in C++
+#    if defined(__GNUC__)
+#        define GGML_RESTRICT __restrict__
+#    elif defined(__clang__)
+#        define GGML_RESTRICT __restrict
+#    elif defined(_MSC_VER)
+#        define GGML_RESTRICT __restrict
+#    else
+#        define GGML_RESTRICT
+#    endif
 #else
-#define GGML_RESTRICT restrict
+#    define GGML_RESTRICT restrict
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
diff --git a/ggml/src/ggml-cpu/amx/amx.h b/ggml/src/ggml-cpu/amx/amx.h
index e550cb0eb..5b65d76bd 100644
--- a/ggml/src/ggml-cpu/amx/amx.h
+++ b/ggml/src/ggml-cpu/amx/amx.h
@@ -3,7 +3,6 @@

 // GGML internal header

-// if defined(GGML_USE_CPU_AMX) ?
 #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
 ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
 #endif
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
index d34a90d27..0c47b3bb8 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -222,12 +222,12 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y)

 static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

-static void quantize_q8_0_4x4(const float * x, void * vy, int64_t k) {
+static void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;

-    block_q8_0x4 * y = (block_q8_0x4 *) vy;
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

 #if defined(__ARM_NEON)
     float32x4_t srcv[4][8];
@@ -316,12 +316,12 @@ static void quantize_q8_0_4x4(const float * x, void * vy, int64_t k) {
 #endif
 }

-static void quantize_q8_0_4x8(const float * x, void * vy, int64_t k) {
+static void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;

-    block_q8_0x4 * y = (block_q8_0x4 *) vy;
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

 #if defined(__ARM_NEON)
     float32x4_t srcv[4][8];
@@ -531,7 +531,7 @@ static void quantize_q8_0_4x8(const float * x, void * vy, int64_t k) {
 #endif
 }

-static void quantize_mat_q8_0(const float * x, void * vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
+static void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
     assert(nrow == 4);
     UNUSED(nrow);
     if (blck_size_interleave == 4) {
@@ -543,7 +543,7 @@ static void quantize_mat_q8_0(const float * x, void * vy, int64_t nrow, int64_t
     }
 }

-static void ggml_gemv_q4_0_4x4_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -628,7 +628,7 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemv_q4_0_4x8_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -738,7 +738,7 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemv_q4_0_8x8_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 8;
@@ -1011,7 +1011,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -1107,7 +1107,7 @@ static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * s, size_t bs, const void *
     }
 }

-static void ggml_gemm_q4_0_4x4_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -1623,7 +1623,7 @@ static void ggml_gemm_q4_0_4x4_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemm_q4_0_4x8_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -2077,7 +2077,7 @@ static void ggml_gemm_q4_0_4x8_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemm_q4_0_8x8_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 8;
@@ -3497,7 +3497,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -3677,7 +3677,7 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in
     return out;
 }

-static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);

@@ -3708,7 +3708,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
     GGML_UNUSED(data_size);
 }

-static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 8);

@@ -3772,7 +3772,7 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
     return out;
 }

-static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
+static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);

@@ -3971,8 +3971,7 @@ static const tensor_traits iq4_nl_4x4_q8_0;

 static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
     if (cur->type == GGML_TYPE_Q4_0) {
-        // TODO: enable for AVX2 - currently disabled due to bad gemv performance
-        if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
             return &ggml::cpu::aarch64::q4_0_8x8_q8_0;
         }
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.h b/ggml/src/ggml-cpu/ggml-cpu-traits.h
index 3b8cfac43..99a6186b1 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-traits.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-traits.h
@@ -19,14 +19,14 @@ namespace ggml::cpu {
 // register in tensor->extra
 class tensor_traits {
   public:
-    ~tensor_traits();
+    virtual ~tensor_traits();
     virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
     virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
 };

 class extra_buffer_type {
   public:
-    ~extra_buffer_type();
+    virtual ~extra_buffer_type();
     virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
     virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
 };
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 4c9f47954..058941c7a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -791,6 +791,24 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
         .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
     },
+    [31] = { // GGML_TYPE_Q4_0_4_4
+        .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
+    [32] = { // GGML_TYPE_Q4_0_4_8
+        .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
+    [33] = { // GGML_TYPE_Q4_0_8_8
+        .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
     [GGML_TYPE_TQ1_0] = {
         .type_name = "tq1_0",
         .blck_size = QK_K,
@@ -807,6 +825,24 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
+    [36] = { // GGML_TYPE_IQ4_NL_4_4
+        .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
+    [37] = { // GGML_TYPE_IQ4_NL_4_8
+        .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
+    [38] = { // GGML_TYPE_IQ4_NL_8_8
+        .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
 };

 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -6799,7 +6835,16 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                 (int64_t) info->ne[2] *
                 (int64_t) info->ne[3];

-            if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
+            if (ggml_blck_size(info->type) == 0) {
+                // support for this tensor type has been removed:
+                fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
+                        __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            if (ne % ggml_blck_size(info->type) != 0) {
                 fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                         __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
                 fclose(file);
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 703199fcb..66247b803 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -1432,9 +1432,6 @@ class GGMLQuantizationType(IntEnum):
     F64 = 28
     IQ1_M = 29
     BF16 = 30
-    Q4_0_4_4 = 31
-    Q4_0_4_8 = 32
-    Q4_0_8_8 = 33
     TQ1_0 = 34
     TQ2_0 = 35

@@ -1478,9 +1475,9 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ4_XS = 30  # except 1d tensors
     MOSTLY_IQ1_M = 31  # except 1d tensors
     MOSTLY_BF16 = 32  # except 1d tensors
-    MOSTLY_Q4_0_4_4 = 33  # except 1d tensors
-    MOSTLY_Q4_0_4_8 = 34  # except 1d tensors
-    MOSTLY_Q4_0_8_8 = 35  # except 1d tensors
+    # MOSTLY_Q4_0_4_4 = 33  # removed from gguf files, use Q4_0 and runtime repack
+    # MOSTLY_Q4_0_4_8 = 34  # removed from gguf files, use Q4_0 and runtime repack
+    # MOSTLY_Q4_0_8_8 = 35  # removed from gguf files, use Q4_0 and runtime repack
     MOSTLY_TQ1_0 = 36  # except 1d tensors
     MOSTLY_TQ2_0 = 37  # except 1d tensors

@@ -1556,9 +1553,6 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.F64: (1, 8),
     GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
     GGMLQuantizationType.BF16: (1, 2),
-    GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16),
-    GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16),
-    GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16),
     GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
     GGMLQuantizationType.TQ2_0: (256, 2 + 64),
 }
diff --git a/include/llama.h b/include/llama.h
index d121354c1..36945cde3 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -172,9 +172,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+        //LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
+        //LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
+        //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors

diff --git a/src/llama.cpp b/src/llama.cpp
index edcbd4034..cae3f76ad 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5341,9 +5341,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";

         default: return "unknown, may not work";
     }