ggml : quantization refactoring (#3833)
* ggml : factor all quantization code in ggml-quants

ggml-ci

* ggml-quants : fix Zig and Swift builds + quantize tool

ggml-ci

* quantize : --pure option for disabling k-quant mixtures

---------

Co-authored-by: cebtenzzre <cebtenzzre@gmail.com>
This commit is contained in:
		
							parent
							
								
									ff3bad83e2
								
							
						
					
					
						commit
						d69d777c02
					
				
					 11 changed files with 2372 additions and 2385 deletions
				
			
		|  | @ -94,7 +94,6 @@ option(LLAMA_CLBLAST                         "llama: use CLBlast" | ||||||
| option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT}) | option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT}) | ||||||
| option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF) | option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF) | ||||||
| option(LLAMA_MPI                             "llama: use MPI"                                   OFF) | option(LLAMA_MPI                             "llama: use MPI"                                   OFF) | ||||||
| option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON) |  | ||||||
| option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF) | option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF) | ||||||
| 
 | 
 | ||||||
| option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALONE}) | option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALONE}) | ||||||
|  | @ -278,13 +277,8 @@ if (LLAMA_BLAS) | ||||||
|     endif() |     endif() | ||||||
| endif() | endif() | ||||||
| 
 | 
 | ||||||
| if (LLAMA_K_QUANTS) | if (LLAMA_QKK_64) | ||||||
|     set(GGML_HEADERS_EXTRA k_quants.h) |     add_compile_definitions(GGML_QKK_64) | ||||||
|     set(GGML_SOURCES_EXTRA k_quants.c) |  | ||||||
|     add_compile_definitions(GGML_USE_K_QUANTS) |  | ||||||
|     if (LLAMA_QKK_64) |  | ||||||
|         add_compile_definitions(GGML_QKK_64) |  | ||||||
|     endif() |  | ||||||
| endif() | endif() | ||||||
| 
 | 
 | ||||||
| if (LLAMA_CUBLAS) | if (LLAMA_CUBLAS) | ||||||
|  | @ -673,6 +667,8 @@ add_library(ggml OBJECT | ||||||
|             ggml-alloc.h |             ggml-alloc.h | ||||||
|             ggml-backend.c |             ggml-backend.c | ||||||
|             ggml-backend.h |             ggml-backend.h | ||||||
|  |             ggml-quants.c | ||||||
|  |             ggml-quants.h | ||||||
|             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} |             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} | ||||||
|             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} |             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} | ||||||
|             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} |             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} | ||||||
|  |  | ||||||
							
								
								
									
										18
									
								
								Makefile
									
										
									
									
									
								
							
							
						
						
									
										18
									
								
								Makefile
									
										
									
									
									
								
							|  | @ -342,13 +342,9 @@ else | ||||||
| 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d | 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d | ||||||
| endif | endif | ||||||
| 
 | 
 | ||||||
| ifndef LLAMA_NO_K_QUANTS |  | ||||||
| 	MK_CPPFLAGS += -DGGML_USE_K_QUANTS |  | ||||||
| 	OBJS     += k_quants.o |  | ||||||
| ifdef LLAMA_QKK_64 | ifdef LLAMA_QKK_64 | ||||||
| 	MK_CPPFLAGS += -DGGML_QKK_64 | 	MK_CPPFLAGS += -DGGML_QKK_64 | ||||||
| endif | endif | ||||||
| endif |  | ||||||
| 
 | 
 | ||||||
| ifndef LLAMA_NO_ACCELERATE | ifndef LLAMA_NO_ACCELERATE | ||||||
| 	# Mac OS - include Accelerate framework. | 	# Mac OS - include Accelerate framework. | ||||||
|  | @ -365,7 +361,7 @@ ifdef LLAMA_MPI | ||||||
| 	MK_CPPFLAGS += -DGGML_USE_MPI | 	MK_CPPFLAGS += -DGGML_USE_MPI | ||||||
| 	MK_CFLAGS   += -Wno-cast-qual | 	MK_CFLAGS   += -Wno-cast-qual | ||||||
| 	MK_CXXFLAGS += -Wno-cast-qual | 	MK_CXXFLAGS += -Wno-cast-qual | ||||||
| 	OBJS     += ggml-mpi.o | 	OBJS        += ggml-mpi.o | ||||||
| endif # LLAMA_MPI
 | endif # LLAMA_MPI
 | ||||||
| 
 | 
 | ||||||
| ifdef LLAMA_OPENBLAS | ifdef LLAMA_OPENBLAS | ||||||
|  | @ -382,7 +378,7 @@ endif # LLAMA_BLIS | ||||||
| ifdef LLAMA_CUBLAS | ifdef LLAMA_CUBLAS | ||||||
| 	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include | 	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include | ||||||
| 	MK_LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib | 	MK_LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib | ||||||
| 	OBJS      += ggml-cuda.o | 	OBJS         += ggml-cuda.o | ||||||
| 	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math | 	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math | ||||||
| ifdef LLAMA_CUDA_NVCC | ifdef LLAMA_CUDA_NVCC | ||||||
| 	NVCC = $(LLAMA_CUDA_NVCC) | 	NVCC = $(LLAMA_CUDA_NVCC) | ||||||
|  | @ -497,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h | ||||||
| 	$(CC) $(CFLAGS) -c $< -o $@ | 	$(CC) $(CFLAGS) -c $< -o $@ | ||||||
| endif # LLAMA_MPI
 | endif # LLAMA_MPI
 | ||||||
| 
 | 
 | ||||||
| ifndef LLAMA_NO_K_QUANTS |  | ||||||
| k_quants.o: k_quants.c k_quants.h |  | ||||||
| 	$(CC) $(CFLAGS) -c $< -o $@ |  | ||||||
| endif # LLAMA_NO_K_QUANTS
 |  | ||||||
| 
 |  | ||||||
| # combine build flags with cmdline overrides
 | # combine build flags with cmdline overrides
 | ||||||
| override CFLAGS        := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) | override CFLAGS        := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS) | ||||||
| override CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) | override CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) | ||||||
|  | @ -542,7 +533,10 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h | ||||||
| ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h | ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h | ||||||
| 	$(CC)  $(CFLAGS)   -c $< -o $@ | 	$(CC)  $(CFLAGS)   -c $< -o $@ | ||||||
| 
 | 
 | ||||||
| OBJS += ggml-alloc.o ggml-backend.o | ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h | ||||||
|  | 	$(CC) $(CFLAGS)    -c $< -o $@ | ||||||
|  | 
 | ||||||
|  | OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o | ||||||
| 
 | 
 | ||||||
| llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h | llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h | ||||||
| 	$(CXX) $(CXXFLAGS) -c $< -o $@ | 	$(CXX) $(CXXFLAGS) -c $< -o $@ | ||||||
|  |  | ||||||
|  | @ -42,13 +42,12 @@ let package = Package( | ||||||
|                 "llama.cpp", |                 "llama.cpp", | ||||||
|                 "ggml-alloc.c", |                 "ggml-alloc.c", | ||||||
|                 "ggml-backend.c", |                 "ggml-backend.c", | ||||||
|                 "k_quants.c", |                 "ggml-quants.c", | ||||||
|             ] + additionalSources, |             ] + additionalSources, | ||||||
|             resources: resources, |             resources: resources, | ||||||
|             publicHeadersPath: "spm-headers", |             publicHeadersPath: "spm-headers", | ||||||
|             cSettings: [ |             cSettings: [ | ||||||
|                 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), |                 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), | ||||||
|                 .define("GGML_USE_K_QUANTS"), |  | ||||||
|                 .define("GGML_USE_ACCELERATE") |                 .define("GGML_USE_ACCELERATE") | ||||||
|                 // NOTE: NEW_LAPACK will required iOS version 16.4+ |                 // NOTE: NEW_LAPACK will required iOS version 16.4+ | ||||||
|                 // We should consider add this in the future when we drop support for iOS 14 |                 // We should consider add this in the future when we drop support for iOS 14 | ||||||
|  |  | ||||||
							
								
								
									
										21
									
								
								build.zig
									
										
									
									
									
								
							
							
						
						
									
										21
									
								
								build.zig
									
										
									
									
									
								
							|  | @ -116,15 +116,10 @@ pub fn build(b: *std.build.Builder) !void { | ||||||
|     var make = try Maker.init(b); |     var make = try Maker.init(b); | ||||||
|     make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; |     make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; | ||||||
| 
 | 
 | ||||||
|     if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) { |  | ||||||
|         try make.addFlag("-DGGML_USE_K_QUANTS"); |  | ||||||
|         const k_quants = make.obj("k_quants", "k_quants.c"); |  | ||||||
|         try make.objs.append(k_quants); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     const ggml = make.obj("ggml", "ggml.c"); |     const ggml = make.obj("ggml", "ggml.c"); | ||||||
|     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); |     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); | ||||||
|     const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); |     const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); | ||||||
|  |     const ggml_quants = make.obj("ggml-quants", "ggml-quants.c"); | ||||||
|     const llama = make.obj("llama", "llama.cpp"); |     const llama = make.obj("llama", "llama.cpp"); | ||||||
|     const common = make.obj("common", "common/common.cpp"); |     const common = make.obj("common", "common/common.cpp"); | ||||||
|     const console = make.obj("console", "common/console.cpp"); |     const console = make.obj("console", "common/console.cpp"); | ||||||
|  | @ -133,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void { | ||||||
|     const train = make.obj("train", "common/train.cpp"); |     const train = make.obj("train", "common/train.cpp"); | ||||||
|     const clip = make.obj("clip", "examples/llava/clip.cpp"); |     const clip = make.obj("clip", "examples/llava/clip.cpp"); | ||||||
| 
 | 
 | ||||||
|     _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser }); |     _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser }); | ||||||
|     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); |     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common }); | ||||||
|     _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); |     _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common }); | ||||||
|     _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); |     _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common }); | ||||||
|     _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); |     _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train }); | ||||||
|     _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); |     _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train }); | ||||||
| 
 | 
 | ||||||
|     const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip }); |     const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip }); | ||||||
|     if (server.target.isWindows()) { |     if (server.target.isWindows()) { | ||||||
|         server.linkSystemLibrary("ws2_32"); |         server.linkSystemLibrary("ws2_32"); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  | @ -18,7 +18,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = { | ||||||
|     { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, |     { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, | ||||||
|     { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, |     { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, | ||||||
|     { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, |     { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, | ||||||
| #ifdef GGML_USE_K_QUANTS |  | ||||||
|     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, |     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, | ||||||
|     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, |     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, | ||||||
|     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, |     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, | ||||||
|  | @ -31,7 +30,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = { | ||||||
|     { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, |     { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, | ||||||
|     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, |     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, | ||||||
|     { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", }, |     { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", }, | ||||||
| #endif |  | ||||||
|     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, |     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, | ||||||
|     { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", }, |     { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", }, | ||||||
|     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", }, |     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", }, | ||||||
|  | @ -70,13 +68,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // usage:
 | // usage:
 | ||||||
| //  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 | //  ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 | ||||||
| //
 | //
 | ||||||
| [[noreturn]] | [[noreturn]] | ||||||
| static void usage(const char * executable) { | static void usage(const char * executable) { | ||||||
|     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); |     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); | ||||||
|     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); |     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); | ||||||
|     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); |     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); | ||||||
|  |     printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); | ||||||
|     printf("\nAllowed quantization types:\n"); |     printf("\nAllowed quantization types:\n"); | ||||||
|     for (auto & it : QUANT_OPTIONS) { |     for (auto & it : QUANT_OPTIONS) { | ||||||
|         if (it.name != "COPY") { |         if (it.name != "COPY") { | ||||||
|  | @ -103,6 +102,8 @@ int main(int argc, char ** argv) { | ||||||
|             params.quantize_output_tensor = false; |             params.quantize_output_tensor = false; | ||||||
|         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { |         } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { | ||||||
|             params.allow_requantize = true; |             params.allow_requantize = true; | ||||||
|  |         } else if (strcmp(argv[arg_idx], "--pure") == 0) { | ||||||
|  |             params.pure = true; | ||||||
|         } else { |         } else { | ||||||
|             usage(argv[0]); |             usage(argv[0]); | ||||||
|         } |         } | ||||||
|  |  | ||||||
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -1,20 +1,14 @@ | ||||||
| #pragma once | #pragma once | ||||||
| 
 | 
 | ||||||
|  | // This is a private API for quantization and dequantization
 | ||||||
|  | // Should not be used directly, use ggml.h instead
 | ||||||
|  | 
 | ||||||
| #include "ggml.h" | #include "ggml.h" | ||||||
| 
 | 
 | ||||||
| #include <stdint.h> | #include <stdint.h> | ||||||
| #include <assert.h> | #include <assert.h> | ||||||
| #include <stddef.h> | #include <stddef.h> | ||||||
| 
 | 
 | ||||||
| // Super-block size
 |  | ||||||
| #ifdef GGML_QKK_64 |  | ||||||
| #define QK_K 64 |  | ||||||
| #define K_SCALE_SIZE 4 |  | ||||||
| #else |  | ||||||
| #define QK_K 256 |  | ||||||
| #define K_SCALE_SIZE 12 |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
| #ifndef static_assert | #ifndef static_assert | ||||||
| #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) | ||||||
| #define static_assert(cond, msg) _Static_assert(cond, msg) | #define static_assert(cond, msg) _Static_assert(cond, msg) | ||||||
|  | @ -23,10 +17,66 @@ | ||||||
| #endif | #endif | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | #define QK4_0 32 | ||||||
|  | typedef struct { | ||||||
|  |     ggml_fp16_t d;          // delta
 | ||||||
|  |     uint8_t qs[QK4_0 / 2];  // nibbles / quants
 | ||||||
|  | } block_q4_0; | ||||||
|  | static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); | ||||||
|  | 
 | ||||||
|  | #define QK4_1 32 | ||||||
|  | typedef struct { | ||||||
|  |     ggml_fp16_t d;          // delta
 | ||||||
|  |     ggml_fp16_t m;          // min
 | ||||||
|  |     uint8_t qs[QK4_1 / 2];  // nibbles / quants
 | ||||||
|  | } block_q4_1; | ||||||
|  | static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); | ||||||
|  | 
 | ||||||
|  | #define QK5_0 32 | ||||||
|  | typedef struct { | ||||||
|  |     ggml_fp16_t d;         // delta
 | ||||||
|  |     uint8_t qh[4];         // 5-th bit of quants
 | ||||||
|  |     uint8_t qs[QK5_0 / 2]; // nibbles / quants
 | ||||||
|  | } block_q5_0; | ||||||
|  | static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); | ||||||
|  | 
 | ||||||
|  | #define QK5_1 32 | ||||||
|  | typedef struct { | ||||||
|  |     ggml_fp16_t d;         // delta
 | ||||||
|  |     ggml_fp16_t m;         // min
 | ||||||
|  |     uint8_t qh[4];         // 5-th bit of quants
 | ||||||
|  |     uint8_t qs[QK5_1 / 2]; // nibbles / quants
 | ||||||
|  | } block_q5_1; | ||||||
|  | static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); | ||||||
|  | 
 | ||||||
|  | #define QK8_0 32 | ||||||
|  | typedef struct { | ||||||
|  |     ggml_fp16_t d;         // delta
 | ||||||
|  |     int8_t  qs[QK8_0];     // quants
 | ||||||
|  | } block_q8_0; | ||||||
|  | static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); | ||||||
|  | 
 | ||||||
|  | #define QK8_1 32 | ||||||
|  | typedef struct { | ||||||
|  |     float d;               // delta
 | ||||||
|  |     float s;               // d * sum(qs[i])
 | ||||||
|  |     int8_t  qs[QK8_1];     // quants
 | ||||||
|  | } block_q8_1; | ||||||
|  | static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); | ||||||
|  | 
 | ||||||
| //
 | //
 | ||||||
| // Super-block quantization structures
 | // Super-block quantization structures
 | ||||||
| //
 | //
 | ||||||
| 
 | 
 | ||||||
|  | // Super-block size
 | ||||||
|  | #ifdef GGML_QKK_64 | ||||||
|  | #define QK_K 64 | ||||||
|  | #define K_SCALE_SIZE 4 | ||||||
|  | #else | ||||||
|  | #define QK_K 256 | ||||||
|  | #define K_SCALE_SIZE 12 | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| // 2-bit quantization
 | // 2-bit quantization
 | ||||||
| // weight is represented as x = a * q + b
 | // weight is represented as x = a * q + b
 | ||||||
| // 16 blocks of 16 elements each
 | // 16 blocks of 16 elements each
 | ||||||
|  | @ -127,6 +177,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_ | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| // Quantization
 | // Quantization
 | ||||||
|  | void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); | ||||||
|  | void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); | ||||||
|  | void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); | ||||||
|  | void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); | ||||||
|  | void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); | ||||||
|  | void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); | ||||||
|  | 
 | ||||||
| void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); | void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); | ||||||
| void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); | void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); | ||||||
| void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); | void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); | ||||||
|  | @ -134,6 +191,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict | ||||||
| void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); | void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); | ||||||
| void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); | void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); | ||||||
| 
 | 
 | ||||||
|  | void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); | ||||||
|  | void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); | ||||||
|  | void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); | ||||||
|  | void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); | ||||||
|  | void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); | ||||||
|  | void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); | ||||||
|  | 
 | ||||||
| void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); | void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); | ||||||
| void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); | void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); | ||||||
| void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); | void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); | ||||||
|  | @ -142,6 +206,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); | ||||||
| void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); | void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); | ||||||
| 
 | 
 | ||||||
| // Dequantization
 | // Dequantization
 | ||||||
|  | void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); | ||||||
|  | void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); | ||||||
|  | void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); | ||||||
|  | void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); | ||||||
|  | void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); | ||||||
|  | //void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
 | ||||||
|  | 
 | ||||||
| void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); | void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); | ||||||
| void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); | void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); | ||||||
| void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); | void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); | ||||||
|  | @ -150,16 +221,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int | ||||||
| void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); | void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); | ||||||
| 
 | 
 | ||||||
| // Dot product
 | // Dot product
 | ||||||
|  | void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
|  | void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
|  | void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
|  | void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
|  | void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
|  | 
 | ||||||
| void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
| void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
| void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
| void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
| void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); | ||||||
| 
 |  | ||||||
| // Quantization with histogram collection
 |  | ||||||
| size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); |  | ||||||
| size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); |  | ||||||
| size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); |  | ||||||
| size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); |  | ||||||
| size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); |  | ||||||
| 
 |  | ||||||
							
								
								
									
										7
									
								
								ggml.h
									
										
									
									
									
								
							
							
						
						
									
										7
									
								
								ggml.h
									
										
									
									
									
								
							|  | @ -1930,12 +1930,19 @@ extern "C" { | ||||||
|     // quantization
 |     // quantization
 | ||||||
|     //
 |     //
 | ||||||
| 
 | 
 | ||||||
|  |     // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
 | ||||||
|     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); |     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); |     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); |     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); |     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); |     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
| 
 | 
 | ||||||
|  |     GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|  |     GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|  |     GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|  |     GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|  |     GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); | ||||||
|  | 
 | ||||||
|     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); |     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); | ||||||
| 
 | 
 | ||||||
|     //
 |     //
 | ||||||
|  |  | ||||||
							
								
								
									
										34
									
								
								llama.cpp
									
										
									
									
									
								
							
							
						
						
									
										34
									
								
								llama.cpp
									
										
									
									
									
								
							|  | @ -19,13 +19,11 @@ | ||||||
| #ifdef GGML_USE_MPI | #ifdef GGML_USE_MPI | ||||||
| #  include "ggml-mpi.h" | #  include "ggml-mpi.h" | ||||||
| #endif | #endif | ||||||
| #ifdef GGML_USE_K_QUANTS | #ifndef QK_K | ||||||
| #  ifndef QK_K | #  ifdef GGML_QKK_64 | ||||||
| #    ifdef GGML_QKK_64 | #    define QK_K 64 | ||||||
| #      define QK_K 64 | #  else | ||||||
| #    else | #    define QK_K 256 | ||||||
| #      define QK_K 256 |  | ||||||
| #    endif |  | ||||||
| #  endif | #  endif | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | @ -8052,7 +8050,7 @@ struct no_init { | ||||||
| struct quantize_state_internal { | struct quantize_state_internal { | ||||||
|     const llama_model                 & model; |     const llama_model                 & model; | ||||||
|     const llama_model_quantize_params * params; |     const llama_model_quantize_params * params; | ||||||
| #ifdef GGML_USE_K_QUANTS | 
 | ||||||
|     int n_attention_wv    = 0; |     int n_attention_wv    = 0; | ||||||
|     int n_feed_forward_w2 = 0; |     int n_feed_forward_w2 = 0; | ||||||
|     int i_attention_wv    = 0; |     int i_attention_wv    = 0; | ||||||
|  | @ -8060,7 +8058,7 @@ struct quantize_state_internal { | ||||||
| 
 | 
 | ||||||
|     int n_k_quantized     = 0; |     int n_k_quantized     = 0; | ||||||
|     int n_fallback        = 0; |     int n_fallback        = 0; | ||||||
| #endif | 
 | ||||||
|     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) |     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) | ||||||
|         : model(model) |         : model(model) | ||||||
|         , params(params) |         , params(params) | ||||||
|  | @ -8125,7 +8123,6 @@ static void llama_convert_tensor_internal( | ||||||
|     workers.clear(); |     workers.clear(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| #ifdef GGML_USE_K_QUANTS |  | ||||||
| static ggml_type get_k_quant_type( | static ggml_type get_k_quant_type( | ||||||
|     quantize_state_internal & qs, |     quantize_state_internal & qs, | ||||||
|     ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype |     ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype | ||||||
|  | @ -8237,7 +8234,6 @@ static ggml_type get_k_quant_type( | ||||||
| 
 | 
 | ||||||
|     return new_type; |     return new_type; | ||||||
| } | } | ||||||
| #endif |  | ||||||
| 
 | 
 | ||||||
| static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { | static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { | ||||||
|     ggml_type quantized_type; |     ggml_type quantized_type; | ||||||
|  | @ -8252,7 +8248,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | ||||||
|         case LLAMA_FTYPE_MOSTLY_F16:  quantized_type = GGML_TYPE_F16;  break; |         case LLAMA_FTYPE_MOSTLY_F16:  quantized_type = GGML_TYPE_F16;  break; | ||||||
|         case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break; |         case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break; | ||||||
| 
 | 
 | ||||||
| #ifdef GGML_USE_K_QUANTS |  | ||||||
|         // K-quants
 |         // K-quants
 | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break; |         case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q3_K_S: |         case LLAMA_FTYPE_MOSTLY_Q3_K_S: | ||||||
|  | @ -8263,7 +8258,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q5_K_S: |         case LLAMA_FTYPE_MOSTLY_Q5_K_S: | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; |         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; | ||||||
|         case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break; |         case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break; | ||||||
| #endif | 
 | ||||||
|         default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); |         default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  | @ -8304,7 +8299,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | ||||||
|     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); |     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); | ||||||
|     gguf_set_val_u32(ctx_out, "general.file_type", ftype); |     gguf_set_val_u32(ctx_out, "general.file_type", ftype); | ||||||
| 
 | 
 | ||||||
| #ifdef GGML_USE_K_QUANTS |  | ||||||
|     for (int i = 0; i < ml.n_tensors; ++i) { |     for (int i = 0; i < ml.n_tensors; ++i) { | ||||||
|         struct ggml_tensor * meta = ml.get_tensor_meta(i); |         struct ggml_tensor * meta = ml.get_tensor_meta(i); | ||||||
| 
 | 
 | ||||||
|  | @ -8322,7 +8316,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | ||||||
|         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", |         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", | ||||||
|                 __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer); |                 __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer); | ||||||
|     } |     } | ||||||
| #endif |  | ||||||
| 
 | 
 | ||||||
|     size_t total_size_org = 0; |     size_t total_size_org = 0; | ||||||
|     size_t total_size_new = 0; |     size_t total_size_new = 0; | ||||||
|  | @ -8387,9 +8380,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | ||||||
| 
 | 
 | ||||||
|         if (quantize) { |         if (quantize) { | ||||||
|             new_type = quantized_type; |             new_type = quantized_type; | ||||||
| #ifdef GGML_USE_K_QUANTS |             if (!params->pure) { | ||||||
|             new_type = get_k_quant_type(qs, new_type, tensor, ftype); |                 new_type = get_k_quant_type(qs, new_type, tensor, ftype); | ||||||
| #endif |             } | ||||||
|  | 
 | ||||||
|             // If we've decided to quantize to the same type the tensor is already
 |             // If we've decided to quantize to the same type the tensor is already
 | ||||||
|             // in then there's nothing to do.
 |             // in then there's nothing to do.
 | ||||||
|             quantize = tensor->type != new_type; |             quantize = tensor->type != new_type; | ||||||
|  | @ -8514,12 +8508,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | ||||||
|             LLAMA_LOG_INFO("\n"); |             LLAMA_LOG_INFO("\n"); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| #ifdef GGML_USE_K_QUANTS | 
 | ||||||
|     if (qs.n_fallback > 0) { |     if (qs.n_fallback > 0) { | ||||||
|         LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n", |         LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n", | ||||||
|                 __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); |                 __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); | ||||||
|     } |     } | ||||||
| #endif |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int llama_apply_lora_from_file_internal( | static int llama_apply_lora_from_file_internal( | ||||||
|  | @ -8844,6 +8837,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { | ||||||
|         /*.allow_requantize            =*/ false, |         /*.allow_requantize            =*/ false, | ||||||
|         /*.quantize_output_tensor      =*/ true, |         /*.quantize_output_tensor      =*/ true, | ||||||
|         /*.only_copy                   =*/ false, |         /*.only_copy                   =*/ false, | ||||||
|  |         /*.pure                        =*/ false, | ||||||
|     }; |     }; | ||||||
| 
 | 
 | ||||||
|     return result; |     return result; | ||||||
|  |  | ||||||
							
								
								
									
										1
									
								
								llama.h
									
										
									
									
									
								
							
							
						
						
									
										1
									
								
								llama.h
									
										
									
									
									
								
							|  | @ -191,6 +191,7 @@ extern "C" { | ||||||
|         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
 |         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
 | ||||||
|         bool quantize_output_tensor; // quantize output.weight
 |         bool quantize_output_tensor; // quantize output.weight
 | ||||||
|         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 |         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 | ||||||
|  |         bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
 | ||||||
|     } llama_model_quantize_params; |     } llama_model_quantize_params; | ||||||
| 
 | 
 | ||||||
|     // grammar types
 |     // grammar types
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue