From 2a4e41a086ce80da68c402457c75c77e52dcc698 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 6 Jun 2023 22:41:53 +0300 Subject: [PATCH 01/10] llama : fix compile warnings --- ggml.c | 22 +++++++++++----------- llama.cpp | 15 ++++++++------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/ggml.c b/ggml.c index 05889d154..045768faf 100644 --- a/ggml.c +++ b/ggml.c @@ -14720,12 +14720,12 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-12s %8d %8jd %jd %jd %jd %16zu %16zu %16zu %16zu %16p %32s\n", + fprintf(fout, "%-6s %-12s %8d %8d %d %d %d %16zu %16zu %16zu %16zu %16p %32s\n", ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, - ne[0], ne[1], ne[2], ne[3], - nb[0], nb[1], nb[2], nb[3], + (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3], + nb[0], nb[1], nb[2], nb[3], tensor->data, tensor->name); } @@ -14734,13 +14734,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %jd %jd %jd %jd %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %d %d %d %d %16zu %16zu %16zu %16zu %8d %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, - ne[0], ne[1], ne[2], ne[3], - nb[0], nb[1], nb[2], nb[3], + (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3], + nb[0], nb[1], nb[2], nb[3], tensor->n_tasks, tensor->data, tensor->name); @@ -14763,11 +14763,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { FILE * fout = stdout; fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); - fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); - fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); - fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); - fprintf(fout, "%-16s %8ju\n", "eval", size_eval); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %8d\n", "eval", (int) size_eval); // header fprintf(fout, "\n"); diff --git a/llama.cpp b/llama.cpp index b992321e4..cf512ccdd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1059,23 +1059,23 @@ static void llama_model_load_internal( } } + (void) main_gpu; #if defined(GGML_USE_CUBLAS) fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__); ggml_cuda_set_main_device(main_gpu); -#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT #elif defined(GGML_USE_CLBLAST) fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__); -#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU #else -#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU #endif // prepare memory for the weights size_t vram_weights = 0; - size_t vram_scratch = 0; { const uint32_t n_embd = hparams.n_embd; const uint32_t n_layer = hparams.n_layer; @@ -1152,10 +1152,8 @@ static void llama_model_load_internal( fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per 
state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - #ifdef GGML_USE_CUBLAS - vram_scratch = n_batch * MB; + const size_t vram_scratch = n_batch * MB; ggml_cuda_set_scratch_size(vram_scratch); if (n_gpu_layers > 0) { fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n", @@ -1163,6 +1161,8 @@ static void llama_model_load_internal( } #endif // GGML_USE_CUBLAS #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu); if (n_gpu_layers > (int) hparams.n_layer) { fprintf(stderr, "%s: offloading output layer to GPU\n", __func__); @@ -1331,6 +1331,7 @@ static bool llama_eval_internal( struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; for (int il = 0; il < n_layer; ++il) { offload_func_t offload_func = llama_nop; From 2d7bf110edd8c49209401a16132052cba706ffd0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 6 Jun 2023 22:54:39 +0300 Subject: [PATCH 02/10] llama : fix vram_scratch var --- llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index cf512ccdd..16d6f6ef1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1076,6 +1076,7 @@ static void llama_model_load_internal( // prepare memory for the weights size_t vram_weights = 0; + size_t vram_scratch = 0; { const uint32_t n_embd = hparams.n_embd; const uint32_t n_layer = hparams.n_layer; @@ -1152,8 +1153,9 @@ static void llama_model_load_internal( fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); + (void) vram_scratch; #ifdef GGML_USE_CUBLAS - const size_t vram_scratch = n_batch * MB; + vram_scratch = n_batch * MB; ggml_cuda_set_scratch_size(vram_scratch); if (n_gpu_layers > 0) { fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n", From 35a84916fb029905c44746127026079268216e7a Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 7 Jun 2023 04:10:17 +0200 Subject: [PATCH 03/10] main: add the possibility to open the prompt cache read-only (#1640) The prompt cache constitutes a nice speed up when using the same prompt prefix across multiple evaluations, but when using it, it will also be updated, which is not always desirable. One use case is to have a large prompt containing some context and usage rules, and a second part containing variable data of the problem being studied. In this case it's desirable to be able to save the first part once, and to always reuse it as-is without updating it with the second part. The new argument --prompt-cache-ro enables this read-only mode on the prompt cache. The prompt's contents that match the cache are loaded from the cache but the rest is not modified. This allowed to reduce a total analysis time from 112s to 49.7s here, without having to backup and restore a copy of the prompt, which takes significant time at 500 MB. 
Signed-off-by: Willy Tarreau --- examples/common.cpp | 3 +++ examples/common.h | 1 + examples/main/main.cpp | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index c37346214..f5d886acf 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -132,6 +132,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.path_prompt_cache = argv[i]; } else if (arg == "--prompt-cache-all") { params.prompt_cache_all = true; + } else if (arg == "--prompt-cache-ro") { + params.prompt_cache_ro = true; } else if (arg == "-f" || arg == "--file") { if (++i >= argc) { invalid_param = true; @@ -432,6 +434,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); fprintf(stderr, " not supported with --interactive or other interactive options\n"); + fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); diff --git a/examples/common.h b/examples/common.h index 12b497349..826e2ae59 100644 --- a/examples/common.h +++ b/examples/common.h @@ -62,6 +62,7 @@ struct gpt_params { bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode bool prompt_cache_all = false; // save user input and generations to prompt cache + bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it bool embedding = false; // get only sentence embedding bool interactive_first = false; // wait for user input immediately diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b4d129393..de63faa3e 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -417,7 +417,7 @@ int main(int argc, char ** argv) { const bool penalize_nl = params.penalize_nl; // optionally save the session on first sample (for faster prompt loading next time) - if (!path_session.empty() && need_to_save_session) { + if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { need_to_save_session = false; llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } @@ -630,7 +630,7 @@ int main(int argc, char ** argv) { } } - if (!path_session.empty() && params.prompt_cache_all) { + if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } From 4dc62c545df0af60635d579e9e4dd91bc5afff51 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 7 Jun 2023 07:15:08 +0300 Subject: [PATCH 04/10] readme : add June roadmap --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 28842e968..0c87af6ee 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729 - GPU support 
with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642 - High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684 - Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607 From 5b57a5b72676540b6a45a3f527126299969ad241 Mon Sep 17 00:00:00 2001 From: jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> Date: Wed, 7 Jun 2023 00:15:31 -0400 Subject: [PATCH 05/10] flake : update to support metal on m1/m2 (#1724) --- flake.lock | 30 ++++++++++++++++++++++++------ flake.nix | 26 ++++++++++++++++++-------- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/flake.lock b/flake.lock index 343996da1..33164e096 100644 --- a/flake.lock +++ b/flake.lock @@ -1,12 +1,15 @@ { "nodes": { "flake-utils": { + "inputs": { + "systems": "systems" + }, "locked": { - "lastModified": 1676283394, - "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=", + "lastModified": 1685518550, + "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=", "owner": "numtide", "repo": "flake-utils", - "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073", + "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef", "type": "github" }, "original": { @@ -17,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1678470307, - "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=", + "lastModified": 1685931219, + "narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f", + "rev": "7409480d5c8584a1a83c422530419efe4afb0d19", "type": "github" }, "original": { @@ -36,6 +39,21 @@ "flake-utils": "flake-utils", "nixpkgs": "nixpkgs" } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } } }, "root": "root", diff --git a/flake.nix b/flake.nix index 2c9edbb6a..619100449 100644 --- a/flake.nix +++ b/flake.nix @@ -6,6 +6,13 @@ outputs = { self, nixpkgs, flake-utils }: flake-utils.lib.eachDefaultSystem (system: let + inherit (pkgs.stdenv) isAarch64 isDarwin; + inherit (pkgs.lib) optionals; + isM1 = isAarch64 && isDarwin; + osSpecific = + if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ] + else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ] + else [ ]; pkgs = import nixpkgs { inherit system; }; @@ -18,17 +25,22 @@ packages.default = pkgs.stdenv.mkDerivation { name = "llama.cpp"; src = ./.; + postPatch = + if isM1 then '' + substituteInPlace ./ggml-metal.m \ + --replace '[[NSBundle mainBundle] pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";" + '' else ""; nativeBuildInputs = with pkgs; [ cmake ]; - buildInputs = with pkgs; lib.optionals stdenv.isDarwin [ - darwin.apple_sdk.frameworks.Accelerate - ]; - cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [ + buildInputs = osSpecific; + cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [ "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" - ]; + "-DLLAMA_METAL=ON" + ]); installPhase = '' mkdir -p $out/bin mv bin/* $out/bin/ mv $out/bin/main $out/bin/llama + mv $out/bin/server $out/bin/llama-server echo "#!${llama-python}/bin/python" > 
$out/bin/convert.py cat ${./convert.py} >> $out/bin/convert.py @@ -40,9 +52,7 @@ packages = with pkgs; [ cmake llama-python - ] ++ lib.optionals stdenv.isDarwin [ - darwin.apple_sdk.frameworks.Accelerate - ]; + ] ++ osSpecific; }; } ); From 5c64a0952ee58b2d742ee84e8e3d43cce5d366db Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 7 Jun 2023 10:59:52 +0300 Subject: [PATCH 06/10] k-quants : allow to optionally disable at compile time (#1734) * k-quants : put behind optional compile flag LLAMA_K_QUANTS * build : enable k-quants by default --- CMakeLists.txt | 8 ++- Makefile | 37 ++++++---- ggml-cuda.cu | 120 ++++++++++++++++---------------- ggml.c | 107 ++++++++++++++++------------- ggml-quants-k.c => k_quants.c | 126 +++++++++++++++++----------------- ggml-quants-k.h => k_quants.h | 80 ++++++++++----------- 6 files changed, 250 insertions(+), 228 deletions(-) rename ggml-quants-k.c => k_quants.c (95%) rename ggml-quants-k.h => k_quants.h (52%) diff --git a/CMakeLists.txt b/CMakeLists.txt index da5913d69..456875f90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" OFF) +option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -226,6 +227,10 @@ if (LLAMA_METAL) ) endif() +if (LLAMA_K_QUANTS) + set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) +endif() + if (LLAMA_CLBLAST) find_package(CLBlast) if (CLBlast_FOUND) @@ -396,11 +401,10 @@ endif() add_library(ggml OBJECT ggml.c ggml.h - ggml-quants-k.h - ggml-quants-k.c ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_SOURCES_METAL} + ${GGML_SOURCES_EXTRA} ) target_include_directories(ggml PUBLIC .) diff --git a/Makefile b/Makefile index 0205f1959..39265164b 100644 --- a/Makefile +++ b/Makefile @@ -121,6 +121,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),) endif endif +ifndef LLAMA_NO_K_QUANTS + CFLAGS += -DGGML_USE_K_QUANTS + OBJS += k_quants.o +endif + ifndef LLAMA_NO_ACCELERATE # Mac M1 - include Accelerate framework. # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). 
@@ -140,7 +145,7 @@ ifdef LLAMA_OPENBLAS endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS - CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis + CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis LDFLAGS += -lblis -L/usr/local/lib endif # LLAMA_BLIS @@ -212,6 +217,11 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif +ifdef LLAMA_NO_K_QUANTS +k_quants.o: k_quants.c k_quants.h + $(CC) $(CFLAGS) -c $< -o $@ +endif # LLAMA_NO_K_QUANTS + # # Print build information # @@ -231,10 +241,7 @@ $(info ) # Build library # -ggml.o: ggml.c ggml.h ggml-cuda.h ggml-quants-k.h - $(CC) $(CFLAGS) -c $< -o $@ - -ggml-quants-k.o: ggml-quants-k.c ggml-quants-k.h ggml.h ggml-cuda.h +ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h @@ -243,7 +250,7 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ -libllama.so: llama.o ggml.o ggml-quants-k.o $(OBJS) +libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: @@ -253,28 +260,28 @@ clean: # Examples # -main: examples/main/main.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) +main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: examples/quantize/quantize.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS) +quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o ggml-quants-k.o llama.o $(OBJS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) +perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) +embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) +save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o ggml-quants-k.o llama.o common.o $(OBJS) +server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) build-info.h: $(wildcard .git/index) scripts/build-info.sh @@ -289,11 +296,11 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh # Tests # -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o ggml-quants-k.o $(OBJS) +benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out 
%.h,$^) -o $@ $(LDFLAGS) ./$@ -vdot: pocs/vdot/vdot.cpp ggml.o ggml-quants-k.o $(OBJS) +vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) .PHONY: tests clean diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c7008905e..b1e513bc9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -110,24 +110,24 @@ typedef struct { uint8_t qs[QK_K/4]; // quants half d; // super-block scale for quantized scales half dmin; // super-block scale for quantized mins -} block_q2_k; -static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding"); +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); typedef struct { uint8_t hmask[QK_K/8]; uint8_t qs[QK_K/4]; // nibbles / quants uint8_t scales[3*QK_K/64]; half d; -} block_q3_k; -static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding"); +} block_q3_K; +static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding"); typedef struct { half d; // super-block scale for quantized scales half dmin; // super-block scale for quantized mins uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_k; -static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding"); +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); typedef struct { half d; // super-block scale for quantized scales @@ -135,16 +135,16 @@ typedef struct { uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_k; -static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding"); +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); typedef struct { uint8_t ql[QK_K/2]; // quants, lower 4 bits uint8_t qh[QK_K/4]; // quants, upper 2 bits int8_t scales[QK_K/16]; // scales half d; // delta -} block_q6_k; -static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding"); +} block_q6_K; +static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); #define WARP_SIZE 32 @@ -299,7 +299,7 @@ static __device__ void dequantize_q8_0(const void * vx, const int ib, const int //================================== k-quants -static __global__ void dequantize_block_q2_k(const void * vx, float * yy) { +static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { const int i = blockIdx.x; const int tid = threadIdx.x; @@ -307,7 +307,7 @@ static __global__ void dequantize_block_q2_k(const void * vx, float * yy) { const int l = tid - 32*n; const int is = 8*n + l/16; - const block_q2_k * x = (const block_q2_k *) vx; + const block_q2_K * x = (const block_q2_K *) vx; const uint8_t q = x[i].qs[32*n + l]; float * y = yy + i*QK_K + 128*n; @@ -321,9 +321,9 @@ static __global__ void dequantize_block_q2_k(const void * vx, float * yy) { } -static __device__ void vec_dot_q2_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const 
block_q2_k * x = (const block_q2_k *) vx; + const block_q2_K * x = (const block_q2_K *) vx; // if n is 0, we want to do the lower 128, else the upper 128, // covering y[l+0], y[l+32], y[l+64], y[l+96] and @@ -352,7 +352,7 @@ static __device__ void vec_dot_q2_k(const void * vx, const int ib, const int iqs } -static __global__ void dequantize_block_q3_k(const void * vx, float * yy) { +static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { int r = threadIdx.x/4; int i = blockIdx.x; @@ -362,7 +362,7 @@ static __global__ void dequantize_block_q3_k(const void * vx, float * yy) { int n = tid / 4; int j = tid - 4*n; - const block_q3_k * x = (const block_q3_k *) vx; + const block_q3_K * x = (const block_q3_K *) vx; uint8_t m = 1 << (4*n + j); int is = 8*n + 2*j + is0; @@ -383,9 +383,9 @@ static __global__ void dequantize_block_q3_k(const void * vx, float * yy) { } -static __device__ void vec_dot_q3_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q3_k * x = (const block_q3_k *) vx; + const block_q3_K * x = (const block_q3_K *) vx; const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; @@ -437,8 +437,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t } } -static __global__ void dequantize_block_q4_k(const void * vx, float * yy) { - const block_q4_k * x = (const block_q4_k *) vx; +static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { + const block_q4_K * x = (const block_q4_K *) vx; const int i = blockIdx.x; @@ -474,9 +474,9 @@ static __global__ void dequantize_block_q4_k(const void * vx, float * yy) { } } -static __device__ void vec_dot_q4_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q4_k * x = (const block_q4_k *) vx; + const block_q4_K * x = (const block_q4_K *) vx; // iqs is in 0...248 in steps of 8 => const int j = iqs / 64; // j is in 0...3 @@ -506,8 +506,8 @@ static __device__ void vec_dot_q4_k(const void * vx, const int ib, const int iqs } -static __global__ void dequantize_block_q5_k(const void * vx, float * yy) { - const block_q5_k * x = (const block_q5_k *) vx; +static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { + const block_q5_K * x = (const block_q5_K *) vx; const int i = blockIdx.x; @@ -539,9 +539,9 @@ static __global__ void dequantize_block_q5_k(const void * vx, float * yy) { y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 
16 : 0)) - m2; } -static __device__ void vec_dot_q5_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q5_k * x = (const block_q5_k *) vx; + const block_q5_K * x = (const block_q5_K *) vx; // iqs is in 0...248 in steps of 8 => const int j = iqs / 64; // j is in 0...3 @@ -576,8 +576,8 @@ static __device__ void vec_dot_q5_k(const void * vx, const int ib, const int iqs } -static __global__ void dequantize_block_q6_k(const void * vx, float * yy) { - const block_q6_k * x = (const block_q6_k *) vx; +static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { + const block_q6_K * x = (const block_q6_K *) vx; const int i = blockIdx.x; @@ -601,9 +601,9 @@ static __global__ void dequantize_block_q6_k(const void * vx, float * yy) { y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); } -static __device__ void vec_dot_q6_k(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - const block_q6_k * x = (const block_q6_k *) vx; + const block_q6_K * x = (const block_q6_K *) vx; const int ip = iqs / 128; // 0 or 1 const int il = (iqs - 128*ip)/8; // 0...15 @@ -804,29 +804,29 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q2_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q2_k<<>>(vx, y); + dequantize_block_q2_K<<>>(vx, y); } -static void dequantize_row_q3_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q3_k<<>>(vx, y); + dequantize_block_q3_K<<>>(vx, y); } -static void dequantize_row_q4_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q4_k<<>>(vx, y); + dequantize_block_q4_K<<>>(vx, y); } -static void dequantize_row_q5_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q5_k<<>>(vx, y); + dequantize_block_q5_K<<>>(vx, y); } -static void dequantize_row_q6_k_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; - dequantize_block_q6_k<<>>(vx, y); + dequantize_block_q6_K<<>>(vx, y); } static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { @@ -869,35 +869,35 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f <<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q2_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, 
const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int ny = 2; const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q2_k><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q3_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q3_k><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q4_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q4_k><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q5_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q5_k><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<>>(vx, y, dst, ncols); } -static void dequantize_mul_mat_vec_q6_k_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q6_k><<>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<>>(vx, y, dst, ncols); } static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -926,15 +926,15 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { case GGML_TYPE_Q8_0: return dequantize_row_q8_0_cuda; case GGML_TYPE_Q2_K: - return dequantize_row_q2_k_cuda; + return dequantize_row_q2_K_cuda; case GGML_TYPE_Q3_K: - return dequantize_row_q3_k_cuda; + return dequantize_row_q3_K_cuda; case GGML_TYPE_Q4_K: - return dequantize_row_q4_k_cuda; + return dequantize_row_q4_K_cuda; case GGML_TYPE_Q5_K: - return dequantize_row_q5_k_cuda; + return dequantize_row_q5_K_cuda; case GGML_TYPE_Q6_K: - return dequantize_row_q6_k_cuda; + return dequantize_row_q6_K_cuda; case GGML_TYPE_F16: return convert_fp16_to_fp32_cuda; default: @@ -1277,19 +1277,19 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q2_K: - dequantize_mul_mat_vec_q2_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, 
nrows, cudaStream_main); break; case GGML_TYPE_Q3_K: - dequantize_mul_mat_vec_q3_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q4_K: - dequantize_mul_mat_vec_q4_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q5_K: - dequantize_mul_mat_vec_q5_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q6_K: - dequantize_mul_mat_vec_q6_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_F16: convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); diff --git a/ggml.c b/ggml.c index 045768faf..34212b841 100644 --- a/ggml.c +++ b/ggml.c @@ -2,7 +2,10 @@ #define _GNU_SOURCE #include "ggml.h" -#include "ggml-quants-k.h" + +#ifdef GGML_USE_K_QUANTS +#include "k_quants.h" +#endif #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -1580,46 +1583,48 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_q = NULL, // TODO .vec_dot_type = GGML_TYPE_Q8_1, }, +#ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_k, - .quantize_row_q = quantize_row_q2_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q2_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K, + .quantize_row_q = quantize_row_q2_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q3_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_k, - .quantize_row_q = quantize_row_q3_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q3_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K, + .quantize_row_q = quantize_row_q3_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q4_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_k, - .quantize_row_q = quantize_row_q4_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q4_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K, + .quantize_row_q = quantize_row_q4_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q5_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_k, - .quantize_row_q = quantize_row_q5_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - 
.vec_dot_q = ggml_vec_dot_q5_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K, + .quantize_row_q = quantize_row_q5_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q6_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_k, - .quantize_row_q = quantize_row_q6_k, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_k_reference, - .quantize_row_q_dot = quantize_row_q8_k, - .vec_dot_q = ggml_vec_dot_q6_k_q8_k, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K, + .quantize_row_q = quantize_row_q6_K, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference, + .quantize_row_q_dot = quantize_row_q8_K, + .vec_dot_q = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, +#endif }; // For internal test use @@ -3499,12 +3504,14 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = QK5_1, [GGML_TYPE_Q8_0] = QK8_0, [GGML_TYPE_Q8_1] = QK8_1, +#ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = QK_K, [GGML_TYPE_Q3_K] = QK_K, [GGML_TYPE_Q4_K] = QK_K, [GGML_TYPE_Q5_K] = QK_K, [GGML_TYPE_Q6_K] = QK_K, [GGML_TYPE_Q8_K] = QK_K, +#endif [GGML_TYPE_I8] = 1, [GGML_TYPE_I16] = 1, [GGML_TYPE_I32] = 1, @@ -3520,12 +3527,14 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = sizeof(block_q5_1), [GGML_TYPE_Q8_0] = sizeof(block_q8_0), [GGML_TYPE_Q8_1] = sizeof(block_q8_1), - [GGML_TYPE_Q2_K] = sizeof(block_q2_k), - [GGML_TYPE_Q3_K] = sizeof(block_q3_k), - [GGML_TYPE_Q4_K] = sizeof(block_q4_k), - [GGML_TYPE_Q5_K] = sizeof(block_q5_k), - [GGML_TYPE_Q6_K] = sizeof(block_q6_k), - [GGML_TYPE_Q8_K] = sizeof(block_q8_k), +#ifdef GGML_USE_K_QUANTS + [GGML_TYPE_Q2_K] = sizeof(block_q2_K), + [GGML_TYPE_Q3_K] = sizeof(block_q3_K), + [GGML_TYPE_Q4_K] = sizeof(block_q4_K), + [GGML_TYPE_Q5_K] = sizeof(block_q5_K), + [GGML_TYPE_Q6_K] = sizeof(block_q6_K), + [GGML_TYPE_Q8_K] = sizeof(block_q8_K), +#endif [GGML_TYPE_I8] = sizeof(int8_t), [GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I32] = sizeof(int32_t), @@ -3542,12 +3551,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_Q5_1] = "q5_1", [GGML_TYPE_Q8_0] = "q8_0", [GGML_TYPE_Q8_1] = "q8_1", - [GGML_TYPE_Q2_K] = "q2_k", - [GGML_TYPE_Q3_K] = "q3_k", - [GGML_TYPE_Q4_K] = "q4_k", - [GGML_TYPE_Q5_K] = "q5_k", - [GGML_TYPE_Q6_K] = "q6_k", - [GGML_TYPE_Q8_K] = "q8_k", + [GGML_TYPE_Q2_K] = "q2_K", + [GGML_TYPE_Q3_K] = "q3_K", + [GGML_TYPE_Q4_K] = "q4_K", + [GGML_TYPE_Q5_K] = "q5_K", + [GGML_TYPE_Q6_K] = "q6_K", + [GGML_TYPE_Q8_K] = "q8_K", [GGML_TYPE_I8] = "i8", [GGML_TYPE_I16] = "i16", [GGML_TYPE_I32] = "i32", @@ -16249,36 +16258,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; result = ggml_quantize_q8_0(src + start, block, n, n, hist); } break; +#ifdef GGML_USE_K_QUANTS case GGML_TYPE_Q2_K: { GGML_ASSERT(start % QK_K == 0); - block_q2_k * block = (block_q2_k*)dst + start / QK_K; - result = ggml_quantize_q2_k(src + start, block, n, n, hist); + block_q2_K * block = (block_q2_K*)dst + start / QK_K; + result = ggml_quantize_q2_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q3_K: { GGML_ASSERT(start % QK_K == 0); - block_q3_k * block = (block_q3_k*)dst + start / QK_K; - result = ggml_quantize_q3_k(src + start, block, n, n, hist); + block_q3_K * block = (block_q3_K*)dst + start / QK_K; + 
result = ggml_quantize_q3_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q4_K: { GGML_ASSERT(start % QK_K == 0); - block_q4_k * block = (block_q4_k*)dst + start / QK_K; - result = ggml_quantize_q4_k(src + start, block, n, n, hist); + block_q4_K * block = (block_q4_K*)dst + start / QK_K; + result = ggml_quantize_q4_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q5_K: { GGML_ASSERT(start % QK_K == 0); - block_q5_k * block = (block_q5_k*)dst + start / QK_K; - result = ggml_quantize_q5_k(src + start, block, n, n, hist); + block_q5_K * block = (block_q5_K*)dst + start / QK_K; + result = ggml_quantize_q5_K(src + start, block, n, n, hist); } break; case GGML_TYPE_Q6_K: { GGML_ASSERT(start % QK_K == 0); - block_q6_k * block = (block_q6_k*)dst + start / QK_K; - result = ggml_quantize_q6_k(src + start, block, n, n, hist); + block_q6_K * block = (block_q6_K*)dst + start / QK_K; + result = ggml_quantize_q6_K(src + start, block, n, n, hist); } break; +#endif default: assert(false); } diff --git a/ggml-quants-k.c b/k_quants.c similarity index 95% rename from ggml-quants-k.c rename to k_quants.c index dec00d371..4d524494d 100644 --- a/ggml-quants-k.c +++ b/k_quants.c @@ -1,4 +1,4 @@ -#include "ggml-quants-k.h" +#include "k_quants.h" #include "ggml.h" #include @@ -272,7 +272,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k) { +void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -341,7 +341,7 @@ void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict } } -void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k) { +void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -374,26 +374,26 @@ void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int } } -void quantize_row_q2_k(const float * restrict x, void * restrict vy, int k) { - quantize_row_q2_k_reference(x, vy, k); +void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { + quantize_row_q2_K_reference(x, vy, k); } -size_t ggml_quantize_q2_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { const int nb = k / QK_K; // TODO - collect histograms - although, at a second thought, I don't really care about them (void)hist; for (int j = 0; j < nb; j += k) { - block_q2_k * restrict y = (block_q2_k *)dst + j/QK_K; - quantize_row_q2_k_reference(src + j, y, k); + block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + quantize_row_q2_K_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q2_k)); + return (n/QK_K*sizeof(block_q2_K)); } //========================= 3-bit (de)-quantization -void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k) { +void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -469,7 +469,7 @@ void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict } } -void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K 
* restrict x, float * restrict y, int k) { assert(k % QK_K == 0); assert(QK_K == 256); const int nb = k / QK_K; @@ -520,26 +520,26 @@ void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int } } -void quantize_row_q3_k(const float * restrict x, void * restrict vy, int k) { - quantize_row_q3_k_reference(x, vy, k); +void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { + quantize_row_q3_K_reference(x, vy, k); } -size_t ggml_quantize_q3_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { const int nb = k / QK_K; // TODO - collect histograms - although, at a second thought, I don't really care about them (void)hist; for (int j = 0; j < nb; j += k) { - block_q3_k * restrict y = (block_q3_k *)dst + j/QK_K; - quantize_row_q3_k_reference(src + j, y, k); + block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + quantize_row_q3_K_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q3_k)); + return (n/QK_K*sizeof(block_q3_K)); } // ====================== 4-bit (de)-quantization -void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k) { +void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -604,7 +604,7 @@ void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict } } -void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k) { +void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -630,26 +630,26 @@ void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int } } -void quantize_row_q4_k(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { assert(k % QK_K == 0); - block_q4_k * restrict y = vy; - quantize_row_q4_k_reference(x, y, k); + block_q4_K * restrict y = vy; + quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { assert(k % QK_K == 0); const int nb = k / QK_K; (void)hist; // TODO: collect histograms for (int j = 0; j < nb; j += k) { - block_q4_k * restrict y = (block_q4_k *)dst + j/QK_K; - quantize_row_q4_k_reference(src + j, y, k); + block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + quantize_row_q4_K_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q4_k)); + return (n/QK_K*sizeof(block_q4_K)); } // ====================== 5-bit (de)-quantization -void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k) { +void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -731,7 +731,7 @@ void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict } } -void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k) { +void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -759,26 +759,26 @@ void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int } } -void 
quantize_row_q5_k(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { assert(k % QK_K == 0); - block_q5_k * restrict y = vy; - quantize_row_q5_k_reference(x, y, k); + block_q5_K * restrict y = vy; + quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_k(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { assert(k % QK_K == 0); const int nb = k / QK_K; (void)hist; for (int j = 0; j < nb; j += k) { - block_q5_k * restrict y = (block_q5_k *)dst + j/QK_K; - quantize_row_q5_k_reference(src + j, y, k); + block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + quantize_row_q5_K_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q5_k)); + return (n/QK_K*sizeof(block_q5_K)); } // ====================== 6-bit (de)-quantization -void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k) { +void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -842,7 +842,7 @@ void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict } } -void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k) { +void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -875,28 +875,28 @@ void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int } } -void quantize_row_q6_k(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { assert(k % QK_K == 0); - block_q6_k * restrict y = vy; - quantize_row_q6_k_reference(x, y, k); + block_q6_K * restrict y = vy; + quantize_row_q6_K_reference(x, y, k); } -size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK_K == 0); const int nb = k / QK_K; (void)hist; // TODO for (int j = 0; j < nb; j += k) { - block_q6_k * restrict y = (block_q6_k *)dst + j/QK_K; - quantize_row_q6_k_reference(src + j, y, k); + block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + quantize_row_q6_K_reference(src + j, y, k); } - return (n/QK_K*sizeof(block_q6_k)); + return (n/QK_K*sizeof(block_q6_K)); } //===================================== Q8_K ============================================== -void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k) { +void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -933,7 +933,7 @@ void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict } } -void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -944,8 +944,8 @@ void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int } } -void quantize_row_q8_k(const float * restrict x, void * restrict y, int k) { - quantize_row_q8_k_reference(x, y, k); +void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { + quantize_row_q8_K_reference(x, y, k); } 
//===================================== Dot ptoducts ================================= @@ -1002,10 +1002,10 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const block_q2_k * restrict x = vx; - const block_q8_k * restrict y = vy; + const block_q2_K * restrict x = vx; + const block_q8_K * restrict y = vy; const int nb = n / QK_K; @@ -1201,14 +1201,14 @@ void ggml_vec_dot_q2_k_q8_k(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_k * restrict x = vx; - const block_q8_k * restrict y = vy; + const block_q3_K * restrict x = vx; + const block_q8_K * restrict y = vy; const int nb = n / QK_K; @@ -1501,11 +1501,11 @@ void ggml_vec_dot_q3_k_q8_k(const int n, float * restrict s, const void * restri } -void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); - const block_q4_k * restrict x = vx; - const block_q8_k * restrict y = vy; + const block_q4_K * restrict x = vx; + const block_q8_K * restrict y = vy; const int nb = n / QK_K; @@ -1727,11 +1727,11 @@ void ggml_vec_dot_q4_k_q8_k(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); - const block_q5_k * restrict x = vx; - const block_q8_k * restrict y = vy; + const block_q5_K * restrict x = vx; + const block_q8_K * restrict y = vy; const int nb = n / QK_K; @@ -1974,11 +1974,11 @@ void ggml_vec_dot_q5_k_q8_k(const int n, float * restrict s, const void * restri -void ggml_vec_dot_q6_k_q8_k(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); - const block_q6_k * restrict x = vx; - const block_q8_k * restrict y = vy; + const block_q6_K * restrict x = vx; + const block_q8_K * restrict y = vy; const int nb = n / QK_K; diff --git a/ggml-quants-k.h b/k_quants.h similarity index 52% rename from ggml-quants-k.h rename to k_quants.h index d6f06013b..10a0baac7 100644 --- a/ggml-quants-k.h +++ b/k_quants.h @@ -22,8 +22,8 @@ typedef struct { uint8_t qs[QK_K/4]; // quants ggml_fp16_t d; // super-block scale for quantized scales ggml_fp16_t dmin; // super-block scale for quantized mins -} block_q2_k; -static_assert(sizeof(block_q2_k) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_k block size/padding"); +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); // 3-bit quantization // weight is represented as x = a * q @@ -34,8 +34,8 @@ typedef struct { uint8_t qs[QK_K/4]; 
// quants - low 2 bits uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits ggml_fp16_t d; // super-block scale -} block_q3_k; -static_assert(sizeof(block_q3_k) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_k block size/padding"); +} block_q3_K; +static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding"); // 4-bit quantization // 16 blocks of 32 elements each @@ -46,8 +46,8 @@ typedef struct { ggml_fp16_t dmin; // super-block scale for quantized mins uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_k; -static_assert(sizeof(block_q4_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_k block size/padding"); +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); // 5-bit quantization // 16 blocks of 32 elements each @@ -59,8 +59,8 @@ typedef struct { uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_k; -static_assert(sizeof(block_q5_k) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_k block size/padding"); +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); // 6-bit quantization // weight is represented as x = a * q @@ -71,52 +71,52 @@ typedef struct { uint8_t qh[QK_K/4]; // quants, upper 2 bits int8_t scales[QK_K/16]; // scales, quantized with 8 bits ggml_fp16_t d; // super-block scale -} block_q6_k; -static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_k block size/padding"); +} block_q6_K; +static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding"); // This is only used for intermediate quantization and dot products typedef struct { float d; // delta int8_t qs[QK_K]; // quants int16_t bsums[QK_K/16]; // sum of quants in groups of 16 -} block_q8_k; -static_assert(sizeof(block_q8_k) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_k block size/padding"); +} block_q8_K; +static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); // Quantization -void quantize_row_q2_k_reference(const float * restrict x, block_q2_k * restrict y, int k); -void quantize_row_q3_k_reference(const float * restrict x, block_q3_k * restrict y, int k); -void quantize_row_q4_k_reference(const float * restrict x, block_q4_k * restrict y, int k); -void quantize_row_q5_k_reference(const float * restrict x, block_q5_k * restrict y, int k); -void quantize_row_q6_k_reference(const float * restrict x, block_q6_k * restrict y, int k); -void quantize_row_q8_k_reference(const float * restrict x, block_q8_k * restrict y, int k); +void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); +void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); +void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); +void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); +void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); +void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); -void quantize_row_q2_k(const float * restrict x, 
void * restrict y, int k); -void quantize_row_q3_k(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_k(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_k(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_k(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_k(const float * restrict x, void * restrict y, int k); +void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); // Dequantization -void dequantize_row_q2_k(const block_q2_k * restrict x, float * restrict y, int k); -void dequantize_row_q3_k(const block_q3_k * restrict x, float * restrict y, int k); -void dequantize_row_q4_k(const block_q4_k * restrict x, float * restrict y, int k); -void dequantize_row_q5_k(const block_q5_k * restrict x, float * restrict y, int k); -void dequantize_row_q6_k(const block_q6_k * restrict x, float * restrict y, int k); -void dequantize_row_q8_k(const block_q8_k * restrict x, float * restrict y, int k); +void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); +void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); +void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); +void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); +void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); +void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); // Dot product -void ggml_vec_dot_q2_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q3_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q6_k_q8_k(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); // Quantization with histogram collection -size_t ggml_quantize_q2_k(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q3_k(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_k(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q5_k(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q6_k(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q2_K(const float * src, void 
* dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); From 0035858273ebe0694926bf4414d279f3e1cd109d Mon Sep 17 00:00:00 2001 From: johnson442 <56517414+johnson442@users.noreply.github.com> Date: Thu, 8 Jun 2023 08:02:48 +0100 Subject: [PATCH 07/10] k-quants : add missing compile definition to CMakeLists (#1748) --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 456875f90..41f5bb737 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,6 +229,7 @@ endif() if (LLAMA_K_QUANTS) set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) + add_compile_definitions(GGML_USE_K_QUANTS) endif() if (LLAMA_CLBLAST) From 4161bdc04debb70bf5f275492b4d89fd9330087c Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Thu, 8 Jun 2023 10:08:23 +0300 Subject: [PATCH 08/10] metal : add Q4_K implementation (#1733) * Metal implementation for Q4_K Very slow for now: 42 ms / token, Q4_0 runs in 28 ms/token on my 30-core M2 Max GPU. * Optimizing Q4_K on metal The first token always takes longer, I guess because the metal kernel is being jit-compiled. So, using n = 128 to measure time. At this point Q4_K takes 29.5 ms / token compared to 27.2 ms / token for Q4_0. Quite a bit better than the initial attempt, but still not good enough. * Optimizing q4_K metal dot some more For n = 256 it is now 28.1 ms/token compared to 27 ms/token for q4_0. 
* Fix after merge with master --------- Co-authored-by: Iwan Kawrakow --- .clang-tidy | 18 ------ ggml-metal.m | 23 ++++++- ggml-metal.metal | 162 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 184 insertions(+), 19 deletions(-) delete mode 100644 .clang-tidy diff --git a/.clang-tidy b/.clang-tidy deleted file mode 100644 index 1a42b9abc..000000000 --- a/.clang-tidy +++ /dev/null @@ -1,18 +0,0 @@ ---- -Checks: > - bugprone-*, - -bugprone-easily-swappable-parameters, - -bugprone-implicit-widening-of-multiplication-result, - -bugprone-narrowing-conversions, - readability-*, - -readability-avoid-unconditional-preprocessor-if, - -readability-function-cognitive-complexity, - -readability-identifier-length, - -readability-implicit-bool-conversion, - -readability-magic-numbers, - -readability-uppercase-literal-suffix, - clang-analyzer-*, - -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, - performance-*, - portability-*, -FormatStyle: none diff --git a/ggml-metal.m b/ggml-metal.m index 0953af6a4..f2a637b7a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -49,9 +49,11 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(diag_mask_inf); GGML_METAL_DECL_KERNEL(get_rows_f16); GGML_METAL_DECL_KERNEL(get_rows_q4_0); + GGML_METAL_DECL_KERNEL(get_rows_q4_k); GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32); GGML_METAL_DECL_KERNEL(rope); GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); @@ -133,9 +135,11 @@ struct ggml_metal_context * ggml_metal_init(void) { GGML_METAL_ADD_KERNEL(diag_mask_inf); GGML_METAL_ADD_KERNEL(get_rows_f16); GGML_METAL_ADD_KERNEL(get_rows_q4_0); + GGML_METAL_ADD_KERNEL(get_rows_q4_k); GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32); GGML_METAL_ADD_KERNEL(rope); GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); @@ -517,7 +521,20 @@ void ggml_metal_graph_compute( nth1 = 4; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; } break; - default: GGML_ASSERT(false && "not implemented"); + case GGML_TYPE_Q4_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); + + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32]; + } break; + default: + { + fprintf(stderr, "Asserting on type %d\n",(int)src0t); + GGML_ASSERT(false && "not implemented"); + } }; @@ -540,6 +557,9 @@ void ggml_metal_graph_compute( if (src0t == GGML_TYPE_Q4_0) { [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src0t == GGML_TYPE_Q4_K) { + [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; @@ -555,6 +575,7 @@ void ggml_metal_graph_compute( switch (src0->type) { case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; + case GGML_TYPE_Q4_K: [encoder 
setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
             default: GGML_ASSERT(false && "not implemented");
         }

diff --git a/ggml-metal.metal b/ggml-metal.metal
index a359bebe2..cbcd59ad4 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -503,3 +503,165 @@ kernel void kernel_cpy_f32_f32(
         dst_data[i00] = src[0];
     }
 }
+
+//============================================ k-quants ======================================================
+
+#define QK_K 256
+
+typedef struct {
+    half d;                    // super-block scale for quantized scales
+    half dmin;                 // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4-bit quants
+} block_q4_k;
+
+static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
+    uchar4 r;
+    if (j < 4) {
+        r[0] = q[j+0] & 63; r[1] = q[j+4] & 63;
+        r[2] = q[j+1] & 63; r[3] = q[j+5] & 63;
+    } else {
+        r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
+        r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4);
+        r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4);
+    }
+    return r;
+}
+
+static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = x[i].d;
+        const float min = x[i].dmin;
+
+        device const uint8_t * q = x[i].qs;
+        device const uint8_t * scales = x[i].scales;
+
+        int is = 0;
+        for (int j = 0; j < QK_K; j += 64) {
+            const uchar4 sc = get_scale_min_k4(is, scales);
+            const float d1 = d * sc[0]; const float m1 = min * sc[1];
+            const float d2 = d * sc[2]; const float m2 = min * sc[3];
+            for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1;
+            for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2;
+            q += 32; is += 2;
+        }
+
+    }
+}
+
+kernel void kernel_get_rows_q4_k(
+    device const void * src0,
+    device const int * src1,
+    device float * dst,
+    constant int64_t & ne00,
+    constant uint64_t & nb01,
+    constant uint64_t & nb1,
+    uint tpig[[thread_position_in_grid]]) {
+    const int i = tpig;
+    const int r = ((device int32_t *) src1)[i];
+
+    dequantize_row_q4_k(
+        (device const block_q4_k *) ((device char *) src0 + r*nb01),
+        (device float *) ((device char *) dst + i*nb1), ne00);
+}
+
+kernel void kernel_mul_mat_q4_k_f32(
+    device const void * src0,
+    device const float * src1,
+    device float * dst,
+    constant int64_t & ne00,
+    constant int64_t & ne01,
+    constant uint64_t & nb00,
+    constant uint64_t & nb01,
+    constant uint64_t & nb02,
+    constant int64_t & ne10,
+    constant int64_t & ne11,
+    constant uint64_t & nb10,
+    constant uint64_t & nb11,
+    constant uint64_t & nb12,
+    constant int64_t & ne0,
+    constant int64_t & ne1,
+    threadgroup float * sum [[threadgroup(0)]],
+    uint2 tgpig[[threadgroup_position_in_grid]],
+    uint2 tpig[[thread_position_in_grid]], // we don't use this for now
+    uint2 tpitg[[thread_position_in_threadgroup]],
+    uint2 tptg[[threads_per_threadgroup]]) {
+
+    const int nb = ne00/QK_K;
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+
+    device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb;
+    device const float * yy = (device const float *) src1 + r1*ne10;
+
+    const uint nth = tptg.x*tptg.y;
+    const uint ith = tptg.y*tpitg.x + tpitg.y;
+
+    const int tid = tpitg.y; // 0...15
+    const int il = tid/4;    // 0...3
+    const int ir = tid%4;    // 0...3
+    const int n = 8;
+    const int is = 2*il;
+
+    sum[ith] = 0.0f;
+
+    float sumf = 0;
+    for (int i = tpitg.x; i < nb; i += tptg.x) {
+
+        device const uint8_t *
q = (x + i)->qs + 32*il + n*ir; + device const float * y = yy + i*QK_K + 64*il + n*ir; + device const uint8_t * scales = (x + i)->scales; + + const float dall = (float)((x + i)->d); + const float dmin = (float)((x + i)->dmin); + + const uchar4 sc = get_scale_min_k4(is, scales); + + float4 s = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < n; ++l) { + s[0] += y[l+ 0] * (q[l] & 0xF); s[1] += y[l+ 0]; + s[2] += y[l+32] * (q[l] >> 4); s[3] += y[l+32]; + } + sumf += dall * (s[0] * sc[0] + s[2] * sc[2]) - dmin * (s[1] * sc[1] + s[3] * sc[3]); + + } + sum[ith] = sumf; + + // + // Accumulate the sum from all threads in the threadgroup + // This version is slightly faster than the commented out one below, + // which I copy-pasted from ggerganov's q4_0 dot product for metal. + // + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith%4 == 0) { + for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith%16 == 0) { + for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith == 0) { + for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; + dst[r1*ne0 + r0] = sum[0]; + } + + //// accumulate the sum from all threads in the threadgroup + //threadgroup_barrier(mem_flags::mem_threadgroup); + //for (uint i = nth/2; i > 0; i /= 2) { + // if (ith < i) { + // sum[ith] += sum[ith + i]; + // } + // threadgroup_barrier(mem_flags::mem_threadgroup); + //} + + //if (ith == 0) { + // dst[r1*ne0 + r0] = sum[0]; + //} +} From 53aba3f393f2e02a78ddaba2e934893a8bbf3246 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 8 Jun 2023 10:09:08 +0300 Subject: [PATCH 09/10] clang-tidy : restore dot file from accidental deletion --- .clang-tidy | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .clang-tidy diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..1a42b9abc --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,18 @@ +--- +Checks: > + bugprone-*, + -bugprone-easily-swappable-parameters, + -bugprone-implicit-widening-of-multiplication-result, + -bugprone-narrowing-conversions, + readability-*, + -readability-avoid-unconditional-preprocessor-if, + -readability-function-cognitive-complexity, + -readability-identifier-length, + -readability-implicit-bool-conversion, + -readability-magic-numbers, + -readability-uppercase-literal-suffix, + clang-analyzer-*, + -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, + performance-*, + portability-*, +FormatStyle: none From b50b570ed9d699d3d126d72fc02de92926bcd937 Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Thu, 8 Jun 2023 00:12:28 -0700 Subject: [PATCH 10/10] ggml : fix fprintf warnings (#1720) --- ggml.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/ggml.c b/ggml.c index 34212b841..567dbc1e1 100644 --- a/ggml.c +++ b/ggml.c @@ -14729,12 +14729,12 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-12s %8d %8d %d %d %d %16zu %16zu %16zu %16zu %16p %32s\n", + fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, - (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3], - nb[0], nb[1], nb[2], nb[3], + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], tensor->data, tensor->name); } @@ 
-14743,13 +14743,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %d %d %d %d %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, - (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3], - nb[0], nb[1], nb[2], nb[3], + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], tensor->n_tasks, tensor->data, tensor->name); @@ -14772,11 +14772,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { FILE * fout = stdout; fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); - fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); - fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); - fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); - fprintf(fout, "%-16s %8d\n", "eval", (int) size_eval); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval); // header fprintf(fout, "\n");
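
The k-quants rename above settles the public surface of k_quants.h: each 2- to 6-bit super-block weight format is paired with the intermediate 8-bit q8_K format used for dot products. A minimal standalone sketch of how the renamed entry points fit together, assuming the patched k_quants.h/k_quants.c are on the include and link path (the test data and error metric are illustrative only; build with something like cc -std=c99 sketch.c k_quants.c -lm):

// Round-trip one super-block of 256 floats through the q4_K path and
// compare the quantized dot product against the float reference.
#include <math.h>
#include <stdio.h>

#include "k_quants.h"

int main(void) {
    float x[QK_K], y[QK_K], x_dq[QK_K];
    for (int i = 0; i < QK_K; ++i) {
        x[i] = sinf(0.10f*i); // arbitrary smooth test signal
        y[i] = cosf(0.07f*i);
    }

    // weights go to 4-bit super-blocks, activations to the
    // intermediate q8_K format declared in k_quants.h
    block_q4_K xq[1];
    block_q8_K yq[1];
    quantize_row_q4_K(x, xq, QK_K);
    quantize_row_q8_K(y, yq, QK_K);

    // quantized dot product vs float reference
    float dot_q = 0.0f, dot_f = 0.0f;
    ggml_vec_dot_q4_K_q8_K(QK_K, &dot_q, xq, yq);
    for (int i = 0; i < QK_K; ++i) dot_f += x[i]*y[i];

    // reconstruction error of the 4-bit representation
    dequantize_row_q4_K(xq, x_dq, QK_K);
    float err = 0.0f;
    for (int i = 0; i < QK_K; ++i) err += (x[i] - x_dq[i])*(x[i] - x_dq[i]);

    printf("dot: quantized %.4f vs float %.4f, rmse %.5f\n",
           dot_q, dot_f, sqrtf(err/QK_K));
    return 0;
}

Quantizing activations once into q8_K amortizes that cost across every quantized weight row they are dotted against, which is why a single intermediate format serves all five weight formats.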
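PATCH 07 matters because the k-quant code paths in ggml are compiled only when GGML_USE_K_QUANTS is defined: building k_quants.c without also defining the symbol leaves the rest of the build silently unaware of the new types. A toy illustration of that failure mode (guard_demo.c is hypothetical, not part of the tree):

// guard_demo.c - compile with and without -DGGML_USE_K_QUANTS to see
// how an #ifdef-guarded feature disappears when the build system
// forgets the definition, with no compiler diagnostic at all.
#include <stdio.h>

int main(void) {
#ifdef GGML_USE_K_QUANTS
    printf("k-quants compiled in\n");
#else
    printf("k-quants silently disabled\n");
#endif
    return 0;
}

The add_compile_definitions(GGML_USE_K_QUANTS) line is CMake's equivalent of passing -DGGML_USE_K_QUANTS on every compiler invocation; the Makefile build presumably already did so, which is how the omission went unnoticed.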
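The least obvious piece of the Q4_K Metal port in PATCH 08 is get_scale_min_k4, which undoes the 6-bit packing of the per-block scales and mins. Judging from the unpacking code, the 12 scales[] bytes hold the low 6 bits of scales 0-3 in bytes 0-3 and of mins 0-3 in bytes 4-7, the low nibbles of scales/mins 4-7 in bytes 8-11, and the remaining high 2 bits of scales/mins 4-7 in the top bits of bytes 0-7. A plain-C round-trip check of that reading (the packing helper is a hypothetical reference written only to validate the unpacking logic, not code from the tree):

// Pack 8 six-bit scales and 8 six-bit mins into 12 bytes, then unpack
// them with the same arithmetic as the Metal get_scale_min_k4 above.
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void pack_scales_mins(const uint8_t sc[8], const uint8_t mn[8], uint8_t q[12]) {
    memset(q, 0, 12);
    for (int j = 0; j < 4; ++j) {
        q[j + 0] = (sc[j] & 63) | ((sc[j + 4] >> 4) << 6);        // low 6 bits + high 2 bits
        q[j + 4] = (mn[j] & 63) | ((mn[j + 4] >> 4) << 6);
        q[j + 8] = (sc[j + 4] & 0xF) | ((mn[j + 4] & 0xF) << 4);  // low nibbles of pairs 4-7
    }
}

// scalar transcription of the Metal unpacking, for blocks j and j+1
static void get_scale_min(int j, const uint8_t * q, uint8_t * sc, uint8_t * mn) {
    if (j < 4) {
        sc[0] = q[j+0] & 63; mn[0] = q[j+4] & 63;
        sc[1] = q[j+1] & 63; mn[1] = q[j+5] & 63;
    } else {
        sc[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
        mn[0] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
        sc[1] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4);
        mn[1] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4);
    }
}

int main(void) {
    const uint8_t sc[8] = {1, 9, 17, 25, 33, 41, 49, 63}; // any values < 64
    const uint8_t mn[8] = {2, 6, 10, 14, 18, 22, 26, 30};
    uint8_t q[12], s[2], m[2];

    pack_scales_mins(sc, mn, q);
    for (int j = 0; j < 8; j += 2) {
        get_scale_min(j, q, s, m);
        assert(s[0] == sc[j] && s[1] == sc[j+1]);
        assert(m[0] == mn[j] && m[1] == mn[j+1]);
    }
    printf("scale/min packing round-trips\n");
    return 0;
}

The split layout keeps the hot path cheap: the first four scale/min pairs decode with a single mask, and only the second four need the two-byte reassembly.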
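PATCH 10 supersedes the cast-to-int workaround from PATCH 01: rather than truncating int64_t values for fprintf, it uses the C99 <inttypes.h> format macros, which expand to the correct conversion specifier for the platform's 64-bit type. The %jd/%ju specifiers that PATCH 01 removed are standard C99 as well, but they expect intmax_t/uintmax_t arguments and are not reliably supported by every C runtime, which is presumably what the original warnings were about. A minimal sketch contrasting the two approaches (the tensor shape and strides are made up, roughly a 4096x4096 f16 tensor):

// fmt_demo.c - why PRId64 beats casting to int for tensor dimensions
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne[4] = {4096, 4096, 1, 1};            // elements per dim
    const size_t  nb[4] = {2, 8192, 33554432, 33554432}; // strides in bytes

    // PATCH 01 style: silences the warning but truncates any
    // dimension above INT_MAX (~2.1e9 elements)
    printf("%8d %8d %8d %8d\n", (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3]);

    // PATCH 10 style: full 64-bit range, portable across LP64 (Linux,
    // macOS) and LLP64 (Windows) data models
    printf("%8" PRId64 " %8" PRId64 " %8" PRId64 " %8" PRId64 "\n",
           ne[0], ne[1], ne[2], ne[3]);

    // size_t has had its own conversion specifier since C99
    printf("%8zu %8zu %8zu %8zu\n", nb[0], nb[1], nb[2], nb[3]);
    return 0;
}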