From 18b35625c3c19c64b7818a12460ba5ddb006dfdc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 19 Jun 2023 20:43:30 +0300 Subject: [PATCH 01/29] ggml : fix bug in LBFGS optimizer (found by ggml tests) --- ggml.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml.c b/ggml.c index 14e08f9d6..4319683f5 100644 --- a/ggml.c +++ b/ggml.c @@ -18237,7 +18237,6 @@ GGML_API void ggml_opt_init( ggml_set_zero(opt->lbfgs.g); ggml_set_zero(opt->lbfgs.gp); ggml_set_zero(opt->lbfgs.d); - ggml_set_zero(opt->lbfgs.pf); if (opt->lbfgs.pf) { ggml_set_zero(opt->lbfgs.pf); } From 20568fe60f00155fa25e92eb3a7f6b911d557967 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 20 Jun 2023 01:12:39 +0300 Subject: [PATCH 02/29] [Fix] Reenable server embedding endpoint (#1937) * Add back embedding feature * Update README --- examples/server/README.md | 13 +++++++++-- examples/server/server.cpp | 44 +++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 474a28b20..fa95c0044 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -21,6 +21,7 @@ Command line options: - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. - `--port`: Set the port to listen. Default: `8080`. +- `--embedding`: Enable embedding extraction, Default: disabled. ## Build @@ -119,14 +120,14 @@ node . `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). - `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity). + `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity). `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. - `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. + `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does. `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). @@ -163,6 +164,14 @@ node . `content`: Set the text to tokenize. + Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`. + +- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. + + *Options:* + + `content`: Set the text to process. + ## More examples ### Interactive mode diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 12d4e2fa4..c0984aadb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -254,6 +254,11 @@ struct llama_server_context { n_past += n_eval; } + if (params.n_predict == 0) { + has_next_token = false; + return llama_token_eos(); + } + // out of user input, sample next token const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; @@ -419,6 +424,19 @@ struct llama_server_context { return token_text; } + + std::vector getEmbedding() { + static const int n_embd = llama_n_embd(ctx); + if (!params.embedding) { + LOG_WARNING("embedding disabled", { + { "params.embedding", params.embedding }, + }); + return std::vector(n_embd, 0.0f); + } + const float * data = llama_get_embeddings(ctx); + std::vector embedding(data, data + n_embd); + return embedding; + } }; static void server_print_usage(const char * argv0, const gpt_params & params, @@ -457,6 +475,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); fprintf(stderr, "\n"); } @@ -603,6 +622,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, params.use_mlock = true; } else if (arg == "--no-mmap") { params.use_mmap = false; + } else if (arg == "--embedding") { + params.embedding = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); @@ -646,6 +667,12 @@ static json format_generation_settings(llama_server_context & llama) { }; } +static json format_embedding_response(llama_server_context & llama) { + return json { + { "embedding", llama.getEmbedding() }, + }; +} + static json format_final_response(llama_server_context & llama, const std::string & content) { return json { { "content", content }, @@ -881,12 +908,27 @@ int main(int argc, char ** argv) { svr.Post("/tokenize", [&llama](const Request & req, Response & res) { const json body = json::parse(req.body); - const std::string content = body["content"].get(); + const std::string content = body.value("content", ""); const std::vector tokens = llama_tokenize(llama.ctx, content, false); const json data = format_tokenizer_response(tokens); return res.set_content(data.dump(), "application/json"); }); + svr.Post("/embedding", [&llama](const Request & req, Response & res) { + const json body = json::parse(req.body); + + llama.rewind(); + llama_reset_timings(llama.ctx); + llama.params.prompt = body.value("content", ""); + llama.params.n_predict = 0; + llama.loadPrompt(); + llama.beginCompletion(); + llama.doCompletion(); + + const json data = format_embedding_response(llama); + return res.set_content(data.dump(), "application/json"); + }); + svr.set_logger(log_server_request); svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) { From aacdbd40562684665b6f7b8ba6695b7a2088bbb0 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 Jun 2023 03:24:39 +0200 Subject: [PATCH 03/29] llama : fix params struct slignment (#1936) * Workaround struct misalignment during value-copy Signed-off-by: mudler * Move booleans at the bottom of the structure Signed-off-by: mudler * Add comment Signed-off-by: mudler --------- Signed-off-by: mudler --- llama.cpp | 6 +++--- llama.h | 17 ++++++++--------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4a7d01b32..e597f5048 100644 --- a/llama.cpp +++ b/llama.cpp @@ -925,21 +925,21 @@ static bool kv_cache_init( struct llama_context_params llama_context_default_params() { struct llama_context_params result = { + /*.seed =*/ -1, /*.n_ctx =*/ 512, /*.n_batch =*/ 512, /*.gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ {0}, + /*.progress_callback =*/ nullptr, + /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, - /*.seed =*/ -1, /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.vocab_only =*/ false, /*.use_mmap =*/ true, /*.use_mlock =*/ false, /*.embedding =*/ false, - /*.progress_callback =*/ nullptr, - /*.progress_callback_user_data =*/ nullptr, }; return result; diff --git a/llama.h b/llama.h index 1241ba6c0..0de530d45 100644 --- a/llama.h +++ b/llama.h @@ -71,28 +71,27 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); - struct llama_context_params { + struct llama_context_params { + int seed; // RNG seed, -1 for random int n_ctx; // text context int n_batch; // prompt processing batch size int n_gpu_layers; // number of layers to store in VRAM int main_gpu; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs - bool low_vram; // if true, reduce VRAM usage at the cost of performance - int seed; // RNG seed, -1 for random + // called with a progress value between 0 and 1, pass NULL to disable + llama_progress_callback progress_callback; + // context pointer passed to the progress callback + void * progress_callback_user_data; + // Keep the booleans together to avoid misalignment during copy-by-value. + bool low_vram; // if true, reduce VRAM usage at the cost of performance bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only - - // called with a progress value between 0 and 1, pass NULL to disable - llama_progress_callback progress_callback; - // context pointer passed to the progress callback - void * progress_callback_user_data; }; - // model file types enum llama_ftype { LLAMA_FTYPE_ALL_F32 = 0, From 2322ec223a21625dfe9bd73ee677444a98a24ac9 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Tue, 20 Jun 2023 05:42:40 -0700 Subject: [PATCH 04/29] Fix typo (#1949) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d05de333..8136e7064 100644 --- a/README.md +++ b/README.md @@ -378,7 +378,7 @@ Building the program with BLAS support may lead to some performance improvements ```sh git clone https://github.com/CNugteren/CLBlast.git mkdir CLBlast/build - cd CLBLast/build + cd CLBlast/build cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF cmake --build . --config Release cmake --install . --prefix /some/path From 049aa16b8c5c6d086246e4e6b9feb18de4fbd663 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 20 Jun 2023 19:05:54 +0300 Subject: [PATCH 05/29] readme : add link to p1 --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 8136e7064..67012adab 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1 - Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729 -- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642 -- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684 -- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607 -- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652 -- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
Table of Contents From fb98254f99d769fcbbf20966ef386abdb48ef601 Mon Sep 17 00:00:00 2001 From: Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com> Date: Thu, 22 Jun 2023 03:18:43 +0530 Subject: [PATCH 06/29] Fix typo in README.md (#1961) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 67012adab..ace588606 100644 --- a/README.md +++ b/README.md @@ -340,7 +340,7 @@ Building the program with BLAS support may lead to some performance improvements | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast From bbca06e26949686d61a5126332680ba3cccf235c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 21 Jun 2023 23:49:25 +0200 Subject: [PATCH 07/29] cmake: revert CUDA arch default to 52, 61 if f16 (#1959) --- CMakeLists.txt | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2846d9b94..cc7560a7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -250,6 +250,15 @@ if (LLAMA_CUBLAS) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + if (LLAMA_CUDA_DMMV_F16) + set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics + else() + set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard + endif() + endif() + message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") + else() message(WARNING "cuBLAS not found") endif() @@ -493,22 +502,6 @@ if (BUILD_SHARED_LIBS) endif() endif() -if (GGML_SOURCES_CUDA) - message(STATUS "GGML CUDA sources found, configuring CUDA architecture") - set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - - set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - - if (BUILD_SHARED_LIBS) - set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - endif() - - set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native") -endif() - # # programs, examples and tests From 7487137227eb32ed9b12156338b865cb29b2dfd1 Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Thu, 22 Jun 2023 14:20:47 +0200 Subject: [PATCH 08/29] rework convert.py to read hyper-parameters from config.json (#1958) * Read hyper-parameters from HuggingFace-transformer config.json, if they exist, and fall back to guessing, like before otherwise. This allows converting open_llama 3B and other non-standard model designs. --- convert.py | 91 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/convert.py b/convert.py index 265c41fa0..de6c39c67 100644 --- a/convert.py +++ b/convert.py @@ -130,6 +130,14 @@ TENSORS_LIST = make_tensors_list() TENSORS_SET = set(TENSORS_LIST) +def find_n_mult(n_ff: int, n_embd: int) -> int: + # hardcoded magic range + for n_mult in range(256, 1, -1): + calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult + if calc_ff == n_ff: + return n_mult + return 1 + @dataclass class Params: n_vocab: int @@ -137,21 +145,61 @@ class Params: n_mult: int n_head: int n_layer: int - file_type: GGMLFileType @staticmethod - def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params': - n_vocab, n_embd = model["tok_embeddings.weight"].shape + def guessed(model: 'LazyModel') -> 'Params': + # try transformer naming first + n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape + + # try transformer naming first + if "model.layers.0.self_attn.q_proj.weight" in model: + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + else: + n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + + n_head=n_embd // 128 # guessed return Params( n_vocab=n_vocab, n_embd=n_embd, n_mult=256, - n_head=n_embd // 128, - n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model), - file_type=file_type, + n_head=n_head, + n_layer=n_layer, ) + @staticmethod + def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + config = json.load(open(config_path)) + + n_vocab = config["vocab_size"]; + n_embd = config["hidden_size"]; + n_head = config["num_attention_heads"]; + n_layer = config["num_hidden_layers"]; + n_ff = config["intermediate_size"]; + + n_mult = find_n_mult(n_ff, n_embd); + + return Params( + n_vocab=n_vocab, + n_embd=n_embd, + n_mult=n_mult, + n_head=n_head, + n_layer=n_layer, + ) + + @staticmethod + def load(model_plus: 'ModelPlus') -> 'Params': + orig_config_path = model_plus.paths[0].parent / "params.json" + hf_transformer_config_path = model_plus.paths[0].parent / "config.json" + + if hf_transformer_config_path.exists(): + params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path) + else: + params = Params.guessed(model_plus.model) + + print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}') + return params + class SentencePieceVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: @@ -595,18 +643,17 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor: return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) -def convert_transformers_to_orig(model: LazyModel) -> LazyModel: +def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: out: LazyModel = {} out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] out["norm.weight"] = model["model.norm.weight"] out["output.weight"] = model["lm_head.weight"] - n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128 for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" not in model: break - out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head) - out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head) + out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head) out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] @@ -920,7 +967,7 @@ class OutputFile: def __init__(self, fname_out: Path) -> None: self.fout = open(fname_out, "wb") - def write_file_header(self, params: Params) -> None: + def write_file_header(self, params: Params, file_type: GGMLFileType) -> None: self.fout.write(b"ggjt"[::-1]) # magic values = [ 1, # file version @@ -930,7 +977,7 @@ class OutputFile: params.n_head, params.n_layer, params.n_embd // params.n_head, # rot (obsolete) - params.file_type.value, + file_type.value, ] self.fout.write(struct.pack("i" * len(values), *values)) @@ -958,10 +1005,10 @@ class OutputFile: of.fout.close() @staticmethod - def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None: + def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out) - of.write_file_header(params) + of.write_file_header(params, file_type) print("Writing vocab...") of.write_vocab(vocab) @@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi raise Exception(f"Unexpected combination of types: {name_to_type}") -def do_necessary_conversions(model: LazyModel) -> LazyModel: +def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel: model = handle_quantization(model) if "lm_head.weight" in model: - model = convert_transformers_to_orig(model) + model = convert_transformers_to_orig(model, params) model = filter_and_sort_tensors(model) return model @@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab: return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) -def default_outfile(model_paths: List[Path], params: Params) -> Path: +def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", GGMLFileType.MostlyQ4_0: "q4_0", GGMLFileType.MostlyQ4_1: "q4_1", GGMLFileType.PerLayerIsQ4_1: "q4_1", - }[params.file_type] + }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.bin" if ret in model_paths: sys.stderr.write( @@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None: else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir) + params = Params.load(model_plus) model = model_plus.model - model = do_necessary_conversions(model) + model = do_necessary_conversions(model, params) output_type = pick_output_type(model, args.outtype) model = convert_to_output_type(model, output_type) - params = Params.guessed(model, output_type) - outfile = args.outfile or default_outfile(model_plus.paths, params) - OutputFile.write_all(outfile, params, model, vocab) + outfile = args.outfile or default_outfile(model_plus.paths, output_type) + OutputFile.write_all(outfile, params, output_type, model, vocab) print(f"Wrote {outfile}") From d7b7484f74d486f77feb4c0b7af7e1718ed91651 Mon Sep 17 00:00:00 2001 From: eiery <19350831+eiery@users.noreply.github.com> Date: Fri, 23 Jun 2023 04:38:01 -0400 Subject: [PATCH 09/29] Add OpenLLaMA instructions to the README (#1954) * add openllama to readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index ace588606..b09498be6 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
  • Quantization
  • Interactive mode
  • Instruction mode with Alpaca
  • +
  • Using OpenLLaMA
  • Using GPT4All
  • Using Pygmalion 7B & Metharme 7B
  • Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
  • @@ -543,6 +544,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. > ``` +### Using [OpenLLaMA](https://github.com/openlm-research/open_llama) + +OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights. + +- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face. +- Convert the model to ggml FP16 format using `python convert.py ` + ### Using [GPT4All](https://github.com/nomic-ai/gpt4all) - Obtain the `tokenizer.model` file from LLaMA model and put it to `models` From 527b6fba1d237befb324fd846bda7418c0fa394d Mon Sep 17 00:00:00 2001 From: Didzis Gosko Date: Sat, 24 Jun 2023 11:47:58 +0300 Subject: [PATCH 10/29] llama : make model stateless and context stateful (llama_state) (#1797) * llama : make model stateless and context stateful * llama : minor cleanup * llama : update internal API declaration * Apply suggestions from code review fix style Co-authored-by: Georgi Gerganov * Missing model memory release * Fix style * Add deprecated warning for public API function llama_init_from_file * Update public API use cases: move away from deprecated llama_init_from_file * Deprecate public API function llama_apply_lora_from_file --------- Co-authored-by: Georgi Gerganov --- examples/common.cpp | 24 ++- examples/common.h | 3 +- examples/embedding/embedding.cpp | 6 +- examples/main/main.cpp | 8 +- examples/perplexity/perplexity.cpp | 6 +- examples/quantize-stats/quantize-stats.cpp | 15 +- examples/save-load-state/save-load-state.cpp | 29 ++- examples/server/server.cpp | 9 +- examples/simple/simple.cpp | 8 +- .../train-text-from-scratch.cpp | 5 +- llama.cpp | 172 ++++++++++++------ llama.h | 35 +++- tests/test-tokenizer-0.cpp | 16 +- 13 files changed, 244 insertions(+), 92 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index fed24e027..6ac484555 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -536,7 +536,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s return res; } -struct llama_context * llama_init_from_gpt_params(const gpt_params & params) { +std::tuple llama_init_from_gpt_params(const gpt_params & params) { auto lparams = llama_context_default_params(); lparams.n_ctx = params.n_ctx; @@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) { lparams.logits_all = params.perplexity; lparams.embedding = params.embedding; - llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams); - - if (lctx == NULL) { + llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams); + if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return NULL; + return std::make_tuple(nullptr, nullptr); + } + + llama_context * lctx = llama_new_context_with_model(model, lparams); + if (lctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } if (!params.lora_adapter.empty()) { - int err = llama_apply_lora_from_file(lctx, + int err = llama_model_apply_lora_from_file(model, params.lora_adapter.c_str(), params.lora_base.empty() ? NULL : params.lora_base.c_str(), params.n_threads); if (err != 0) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); - return NULL; + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } } - return lctx; + return std::make_tuple(model, lctx); } void console_init(console_state & con_st) { diff --git a/examples/common.h b/examples/common.h index 6c2953cb2..713320179 100644 --- a/examples/common.h +++ b/examples/common.h @@ -9,6 +9,7 @@ #include #include #include +#include #if !defined (_WIN32) #include @@ -95,7 +96,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s // Model utils // -struct llama_context * llama_init_from_gpt_params(const gpt_params & params); +std::tuple llama_init_from_gpt_params(const gpt_params & params); // // Console utils diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 860f99f67..369eac1d1 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -37,11 +37,12 @@ int main(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; // load the model - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } @@ -90,6 +91,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 941312f9c..c1e6bf126 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -107,12 +107,13 @@ int main(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; g_ctx = &ctx; // load the model and apply lora adapter, if any - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } @@ -139,6 +140,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } @@ -147,6 +149,7 @@ int main(int argc, char ** argv) { if (params.export_cgraph) { llama_eval_export(ctx, "llama.ggml"); llama_free(ctx); + llama_free_model(model); return 0; } @@ -666,6 +669,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index ae8cfe0af..b59f5971e 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -149,11 +149,12 @@ int main(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; // load the model and apply lora adapter, if any - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } @@ -169,6 +170,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 6b8018ee2..9cea472de 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -320,6 +320,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "Loading model\n"); const int64_t t_main_start_us = ggml_time_us(); + llama_model * model; llama_context * ctx; { @@ -330,10 +331,18 @@ int main(int argc, char ** argv) { lparams.f16_kv = false; lparams.use_mlock = false; - ctx = llama_init_from_file(params.model.c_str(), lparams); + model = llama_load_model_from_file(params.model.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_free_model(model); return 1; } } @@ -357,6 +366,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: error: Quantization should be tested with a float model, " "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type); llama_free(ctx); + llama_free_model(model); return 1; } included_layers++; @@ -415,6 +425,7 @@ int main(int argc, char ** argv) { llama_free(ctx); + llama_free_model(model); // report timing { const int64_t t_main_end_us = ggml_time_us(); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index da4d37ad0..4c8688503 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -35,12 +35,22 @@ int main(int argc, char ** argv) { auto last_n_tokens_data = std::vector(params.repeat_last_n, 0); // init - auto ctx = llama_init_from_file(params.model.c_str(), lparams); + auto model = llama_load_model_from_file(params.model.c_str(), lparams); + if (model == nullptr) { + return 1; + } + auto ctx = llama_new_context_with_model(model, lparams); + if (ctx == nullptr) { + llama_free_model(model); + return 1; + } auto tokens = std::vector(params.n_ctx); auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true); if (n_prompt_tokens < 1) { fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); + llama_free(ctx); + llama_free_model(model); return 1; } @@ -84,6 +94,8 @@ int main(int argc, char ** argv) { printf("%s", next_token_str); if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); + llama_free(ctx); + llama_free_model(model); return 1; } n_past += 1; @@ -91,23 +103,27 @@ int main(int argc, char ** argv) { printf("\n\n"); - // free old model + // free old context llama_free(ctx); - // load new model - auto ctx2 = llama_init_from_file(params.model.c_str(), lparams); + // make new context + auto ctx2 = llama_new_context_with_model(model, lparams); // Load state (rng, logits, embedding and kv_cache) from file { FILE *fp_read = fopen("dump_state.bin", "rb"); if (state_size != llama_get_state_size(ctx2)) { fprintf(stderr, "\n%s : failed to validate state size\n", __func__); + llama_free(ctx2); + llama_free_model(model); return 1; } const size_t ret = fread(state_mem, 1, state_size, fp_read); if (ret != state_size) { fprintf(stderr, "\n%s : failed to read state\n", __func__); + llama_free(ctx2); + llama_free_model(model); return 1; } @@ -138,6 +154,8 @@ int main(int argc, char ** argv) { printf("%s", next_token_str); if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); + llama_free(ctx2); + llama_free_model(model); return 1; } n_past += 1; @@ -145,5 +163,8 @@ int main(int argc, char ** argv) { printf("\n\n"); + llama_free(ctx2); + llama_free_model(model); + return 0; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c0984aadb..de22d3013 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -115,6 +115,7 @@ struct llama_server_context { std::vector embd; std::vector last_n_tokens; + llama_model * model = nullptr; llama_context * ctx = nullptr; gpt_params params; @@ -130,6 +131,10 @@ struct llama_server_context { llama_free(ctx); ctx = nullptr; } + if (model) { + llama_free_model(model); + model = nullptr; + } } void rewind() { @@ -150,8 +155,8 @@ struct llama_server_context { bool loadModel(const gpt_params & params_) { params = params_; - ctx = llama_init_from_gpt_params(params); - if (ctx == nullptr) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr) { LOG_ERROR("unable to load model", { { "model", params_.model } }); return false; } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 76f991cdc..fc45c9340 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -68,11 +68,12 @@ int main(int argc, char ** argv) llama_init_backend(); - llama_context * ctx ; + llama_model * model; + llama_context * ctx; - ctx = llama_init_from_gpt_params( params ); + std::tie(model, ctx) = llama_init_from_gpt_params( params ); - if ( ctx == NULL ) + if ( model == NULL ) { fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); return 1; @@ -170,6 +171,7 @@ int main(int argc, char ** argv) } // wend of main loop llama_free( ctx ); + llama_free_model( model ); return 0; } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7ec85951a..61c829e5c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) { struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; - struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params); + struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); + struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); struct llama_vocab vocab; { @@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; + llama_free(lctx); + llama_free_model(lmodel); ggml_free(model.ctx); return 0; diff --git a/llama.cpp b/llama.cpp index e597f5048..a528eef4a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -182,6 +182,19 @@ struct llama_kv_cache { } }; +struct llama_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + + std::unordered_map token_to_id; + std::vector id_to_token; +}; + struct llama_model { e_model type = MODEL_UNKNOWN; @@ -198,10 +211,6 @@ struct llama_model { // context struct ggml_context * ctx = NULL; - // key + value cache for the self attention - // TODO: move to llama_state - struct llama_kv_cache kv_self; - // the model memory buffer llama_ctx_buffer buf; @@ -215,6 +224,11 @@ struct llama_model { // for quantize-stats only std::vector> tensors_by_name; + int64_t t_load_us = 0; + int64_t t_start_us = 0; + + llama_vocab vocab; + ~llama_model() { if (ctx) { ggml_free(ctx); @@ -233,24 +247,11 @@ struct llama_model { } }; -struct llama_vocab { - using id = int32_t; - using token = std::string; - - struct token_score { - token tok; - float score; - }; - - std::unordered_map token_to_id; - std::vector id_to_token; -}; - struct llama_context { + llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} + std::mt19937 rng; - int64_t t_load_us = 0; - int64_t t_start_us = 0; bool has_evaluated_once = false; int64_t t_sample_us = 0; @@ -261,8 +262,16 @@ struct llama_context { int32_t n_eval = 0; // number of eval calls int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - llama_model model; - llama_vocab vocab; + const llama_model & model; + const llama_vocab & vocab; + + bool model_owner = false; + + int64_t t_load_us; + int64_t t_start_us; + + // key + value cache for the self attention + struct llama_kv_cache kv_self; size_t mem_per_token = 0; @@ -1033,7 +1042,8 @@ static const char *llama_model_type_name(e_model type) { static void llama_model_load_internal( const std::string & fname, - llama_context & lctx, + llama_model & model, + llama_vocab & vocab, int n_ctx, int n_batch, int n_gpu_layers, @@ -1047,12 +1057,11 @@ static void llama_model_load_internal( llama_progress_callback progress_callback, void * progress_callback_user_data) { - lctx.t_start_us = ggml_time_us(); + model.t_start_us = ggml_time_us(); std::unique_ptr ml(new llama_model_loader(fname, use_mmap, vocab_only)); - lctx.vocab = std::move(ml->file_loaders.at(0)->vocab); - auto & model = lctx.model; + vocab = std::move(ml->file_loaders.at(0)->vocab); model.hparams = ml->file_loaders.at(0)->hparams; model.n_gpu_layers = n_gpu_layers; llama_file_version file_version = ml->file_loaders.at(0)->file_version; @@ -1122,15 +1131,15 @@ static void llama_model_load_internal( // create the ggml context { - lctx.model.buf.resize(ctx_size); + model.buf.resize(ctx_size); if (use_mlock) { - lctx.model.mlock_buf.init(lctx.model.buf.addr); - lctx.model.mlock_buf.grow_to(lctx.model.buf.size); + model.mlock_buf.init(model.buf.addr); + model.mlock_buf.grow_to(model.buf.size); } struct ggml_init_params params = { - /*.mem_size =*/ lctx.model.buf.size, - /*.mem_buffer =*/ lctx.model.buf.addr, + /*.mem_size =*/ model.buf.size, + /*.mem_buffer =*/ model.buf.addr, /*.no_alloc =*/ ml->use_mmap, }; @@ -1311,7 +1320,7 @@ static void llama_model_load_internal( } #endif - ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); @@ -1321,12 +1330,13 @@ static void llama_model_load_internal( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration - lctx.t_load_us = ggml_time_us() - lctx.t_start_us; + model.t_load_us = ggml_time_us() - model.t_start_us; } static bool llama_model_load( const std::string & fname, - llama_context & lctx, + llama_model & model, + llama_vocab & vocab, int n_ctx, int n_batch, int n_gpu_layers, @@ -1340,7 +1350,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -1378,7 +1388,7 @@ static bool llama_eval_internal( const auto & model = lctx.model; const auto & hparams = model.hparams; - const auto & kv_self = model.kv_self; + const auto & kv_self = lctx.kv_self; LLAMA_ASSERT(!!kv_self.ctx); @@ -1726,7 +1736,7 @@ static bool llama_eval_internal( //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N); // update kv token count - lctx.model.kv_self.n = n_past + N; + lctx.kv_self.n = n_past + N; // extract logits { @@ -2634,12 +2644,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // interface implementation // -struct llama_context * llama_init_from_file( +struct llama_model * llama_load_model_from_file( const char * path_model, struct llama_context_params params) { ggml_time_init(); - llama_context * ctx = new llama_context; + llama_model * model = new llama_model; + + ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + + if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, + params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, + params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { + delete model; + fprintf(stderr, "%s: failed to load model\n", __func__); + return nullptr; + } + + return model; +} + +void llama_free_model(struct llama_model * model) { + delete model; +} + +struct llama_context * llama_new_context_with_model( + struct llama_model * model, + struct llama_context_params params) { + + if (!model) { + return nullptr; + } + + llama_context * ctx = new llama_context(*model, model->vocab); if (params.seed < 0) { params.seed = time(NULL); @@ -2667,24 +2704,16 @@ struct llama_context * llama_init_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu, - params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, - params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { - fprintf(stderr, "%s: failed to load model\n", __func__); - llama_free(ctx); - return nullptr; - } - // reserve memory for context buffers if (!params.vocab_only) { - if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { + if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; } { - const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); + const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } @@ -2736,8 +2765,8 @@ struct llama_context * llama_init_from_file( LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); @@ -2748,7 +2777,23 @@ struct llama_context * llama_init_from_file( return ctx; } +struct llama_context * llama_init_from_file( + const char * path_model, + struct llama_context_params params) { + + struct llama_model * model = llama_load_model_from_file(path_model, params); + if (!model) { + return nullptr; + } + struct llama_context * ctx = llama_new_context_with_model(model, params); + ctx->model_owner = true; + return ctx; +} + void llama_free(struct llama_context * ctx) { + if (ctx->model_owner) { + delete &ctx->model; + } delete ctx; } @@ -2765,11 +2810,9 @@ int llama_model_quantize( } } -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - auto & model = ctx->model; - const int64_t t_start_lora_us = ggml_time_us(); auto fin = std::ifstream(path_lora, std::ios::binary); @@ -3012,7 +3055,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads); + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads); } catch (const std::exception & err) { fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; @@ -3020,7 +3072,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor } int llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return ctx->model.kv_self.n; + return ctx->kv_self.n; } #define LLAMA_MAX_RNG_STATE (64*1024) @@ -3045,7 +3097,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ctx->model.kv_self.buf.size; + const size_t s_kv = ctx->kv_self.buf.size; const size_t s_total = ( + s_rng_size @@ -3111,7 +3163,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { // copy kv cache { - const auto & kv_self = ctx->model.kv_self; + const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; const int n_layer = hparams.n_layer; const int n_embd = hparams.n_embd; @@ -3215,7 +3267,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { // set kv cache { - const auto & kv_self = ctx->model.kv_self; + const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; const int n_layer = hparams.n_layer; const int n_embd = hparams.n_embd; @@ -3259,7 +3311,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_free(cpy_ctx); } - ctx->model.kv_self.n = kv_ntok; + ctx->kv_self.n = kv_ntok; } const size_t nread = inp - src; @@ -3506,6 +3558,6 @@ const char * llama_print_system_info(void) { } // For internal test use -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { +const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { return ctx->model.tensors_by_name; } diff --git a/llama.h b/llama.h index 0de530d45..a833a7f4d 100644 --- a/llama.h +++ b/llama.h @@ -26,6 +26,14 @@ # define LLAMA_API #endif +#ifdef __GNUC__ +# define DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define DEPRECATED(func, hint) func +#endif + #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' @@ -53,6 +61,7 @@ extern "C" { // TODO: show sample usage // + struct llama_model; struct llama_context; typedef int llama_token; @@ -136,12 +145,23 @@ extern "C" { LLAMA_API int64_t llama_time_us(); + LLAMA_API struct llama_model * llama_load_model_from_file( + const char * path_model, + struct llama_context_params params); + + LLAMA_API void llama_free_model(struct llama_model * model); + + LLAMA_API struct llama_context * llama_new_context_with_model( + struct llama_model * model, + struct llama_context_params params); + // Various functions for loading a ggml llama model. // Allocate (almost) all memory needed for the model. // Return NULL on failure - LLAMA_API struct llama_context * llama_init_from_file( + LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file( const char * path_model, - struct llama_context_params params); + struct llama_context_params params), + "please use llama_load_model_from_file combined with llama_new_context_with_model instead"); // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); @@ -158,8 +178,15 @@ extern "C" { // The model needs to be reloaded before applying a new adapter, otherwise the adapter // will be applied on top of the previous one // Returns 0 on success - LLAMA_API int llama_apply_lora_from_file( + LLAMA_API DEPRECATED(int llama_apply_lora_from_file( struct llama_context * ctx, + const char * path_lora, + const char * path_base_model, + int n_threads), + "please use llama_model_apply_lora_from_file instead"); + + LLAMA_API int llama_model_apply_lora_from_file( + const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads); @@ -310,7 +337,7 @@ extern "C" { #include struct ggml_tensor; -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); +const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); #endif diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index ab1538a0c..20abe7100 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -28,6 +28,7 @@ int main(int argc, char **argv) { fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + llama_model * model; llama_context * ctx; // load the vocab @@ -36,10 +37,18 @@ int main(int argc, char **argv) { lparams.vocab_only = true; - ctx = llama_init_from_file(fname.c_str(), lparams); + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); return 1; } } @@ -48,6 +57,8 @@ int main(int argc, char **argv) { if (n_vocab != 32000) { fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab); + llama_free_model(model); + llama_free(ctx); return 2; } @@ -77,10 +88,13 @@ int main(int argc, char **argv) { } fprintf(stderr, "\n"); + llama_free_model(model); + llama_free(ctx); return 3; } } + llama_free_model(model); llama_free(ctx); return 0; From b061ba9e2a7a2c335a200df8c11aed5e31e4ccbb Mon Sep 17 00:00:00 2001 From: Alex Renda Date: Sat, 24 Jun 2023 03:15:01 -0700 Subject: [PATCH 11/29] llama : fix top-p sampling to match the canonical definition (#1953) * Fix top-p sampling to match the standard definition (smallest set that has probability mass at least p, not largest set with probability mass less than p) * top-p: correct gt to gte * add test for correct top-p behavior --- llama.cpp | 7 ++++--- tests/test-sampling.cpp | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index a528eef4a..ac22a48f8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2015,9 +2015,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can for (size_t i = 0; i < candidates->size; ++i) { cum_sum += candidates->data[i].p; - // Check if the running sum is greater than p or if we have kept at least min_keep tokens - if (cum_sum > p && i >= min_keep) { - last_idx = i; + // Check if the running sum is at least p or if we have kept at least min_keep tokens + // we set the last index to i+1 to indicate that the current iterate should be included in the set + if (cum_sum >= p && i + 1 >= min_keep) { + last_idx = i + 1; break; } } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 5d693f7b5..64f9455d7 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -181,6 +181,7 @@ int main(void) { test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); From 235b610d650cbfed6dbd5d671f750d35fc18cd7d Mon Sep 17 00:00:00 2001 From: Alberto <57916483+albbus-stack@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:32:13 +0200 Subject: [PATCH 12/29] readme : fixed termux instructions (#1973) --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b09498be6..10462c6b0 100644 --- a/README.md +++ b/README.md @@ -680,12 +680,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled ``` GGML_OPENCL_PLATFORM=0 GGML_OPENCL_DEVICE=0 -export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH -./main (...) +export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH ``` For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. +Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. + ### Docker #### Prerequisites From 11da1a85cd69af84b5861134738c7e9e20907470 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Jun 2023 13:38:18 +0300 Subject: [PATCH 13/29] readme : fix whitespaces --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 10462c6b0..6aa6ce319 100644 --- a/README.md +++ b/README.md @@ -685,7 +685,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. -Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. +Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. ### Docker From f2c754e1c38936fdde74e4848ac468a696eb73c6 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 24 Jun 2023 12:57:18 +0200 Subject: [PATCH 14/29] ggml : improve ggml_graph_dump_dot, add ggml_format_name (#1978) * Improve ggml_graph_dump_dot, add ggml_format_name * add more automatic names to view ops * fix name of copies --- ggml.c | 135 ++++++++++++++++++++++++++++++++++++++++----------------- ggml.h | 1 + 2 files changed, 97 insertions(+), 39 deletions(-) diff --git a/ggml.c b/ggml.c index 4319683f5..ef9e8585d 100644 --- a/ggml.c +++ b/ggml.c @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef GGML_USE_METAL #include @@ -4734,10 +4735,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam return tensor; } +struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) { + va_list args; + va_start(args, fmt); + vsnprintf(tensor->name, sizeof(tensor->name), fmt, args); + va_end(args); + return tensor; +} + struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, const struct ggml_tensor * src) { struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + ggml_format_name(result, "%s (view)", src->name); result->nb[0] = src->nb[0]; result->nb[1] = src->nb[1]; @@ -5899,6 +5909,11 @@ struct ggml_tensor * ggml_cpy_impl( // make a view of the destination struct ggml_tensor * result = ggml_view_tensor(ctx, b); + if (strlen(b->name) > 0) { + ggml_format_name(result, "%s (copy of %s)", b->name, a->name); + } else { + ggml_format_name(result, "%s (copy)", a->name); + } result->op = GGML_OP_CPY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5935,6 +5950,7 @@ struct ggml_tensor * ggml_cont_impl( } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_format_name(result, "%s (cont)", a->name); result->op = GGML_OP_CONT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5978,6 +5994,7 @@ struct ggml_tensor * ggml_reshape( } struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6002,6 +6019,7 @@ struct ggml_tensor * ggml_reshape_1d( const int64_t ne[1] = { ne0 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6027,6 +6045,7 @@ struct ggml_tensor * ggml_reshape_2d( const int64_t ne[2] = { ne0, ne1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6053,6 +6072,7 @@ struct ggml_tensor * ggml_reshape_3d( const int64_t ne[3] = { ne0, ne1, ne2 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6081,6 +6101,7 @@ struct ggml_tensor * ggml_reshape_4d( const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6105,10 +6126,12 @@ struct ggml_tensor * ggml_view_1d( } struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6141,10 +6164,12 @@ struct ggml_tensor * ggml_view_2d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6183,10 +6208,12 @@ struct ggml_tensor * ggml_view_3d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6227,10 +6254,12 @@ struct ggml_tensor * ggml_view_4d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6276,6 +6305,7 @@ struct ggml_tensor * ggml_permute( } struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (permuted)", a->name); int ne[GGML_MAX_DIMS]; int nb[GGML_MAX_DIMS]; @@ -6335,6 +6365,7 @@ struct ggml_tensor * ggml_transpose( } struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (transposed)", a->name); result->ne[0] = a->ne[1]; result->ne[1] = a->ne[0]; @@ -16004,7 +16035,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs); + ggml_format_name(node, "leaf_%d", cgraph->n_leafs); } cgraph->leafs[cgraph->n_leafs] = node; @@ -16013,7 +16044,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes); + ggml_format_name(node, "node_%d", cgraph->n_nodes); } cgraph->nodes[cgraph->n_nodes] = node; @@ -17397,6 +17428,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr return NULL; } +static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node); + struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent); + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n", + gparent0 ? (void *) gparent0 : (void *) parent, + gparent0 ? "g" : "x", + gparent ? (void *) gparent : (void *) node, + gparent ? "g" : "x", + gparent ? "empty" : "vee", + gparent ? "dashed" : "solid", + label); +} + +static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n", + (void *) parent, "x", + (void *) node, "x", + label); +} + void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { char color[16]; @@ -17432,7 +17483,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph (void *) node, color); if (strlen(node->name) > 0) { - fprintf(fp, "%s |", node->name); + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_type_name(node->type)); } if (node->n_dims == 2) { @@ -17441,7 +17494,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]); } - if (node->grad) { fprintf(fp, " | %s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]); } else { @@ -17460,18 +17512,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph (void *) node, color); if (strlen(node->name) > 0) { - fprintf(fp, "%s | ", node->name); + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_type_name(node->type)); } - if (ggml_nelements(node) == 1) { - if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { - fprintf(fp, "%d", ggml_get_i32_1d(node, 0)); + + fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); + if (ggml_nelements(node) < 5) { + fprintf(fp, " | ("); + for (int j = 0; j < ggml_nelements(node); j++) { + if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { + fprintf(fp, "%d", ggml_get_i32_1d(node, j)); + } + else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) { + fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j)); + } + else { + fprintf(fp, "#"); + } + if (j < ggml_nelements(node) - 1) { + fprintf(fp, ", "); + } } - else { - fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0)); - } - } - else { - fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); + fprintf(fp, ")"); } fprintf(fp, "\"; ]\n"); } @@ -17479,30 +17542,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; - struct ggml_tensor * parent = ggml_graph_get_parent(gb, node); - if (node->src0) { - struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0); - - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n", - parent0 ? (void *) parent0 : (void *) node->src0, - parent0 ? "g" : "x", - parent ? (void *) parent : (void *) node, - parent ? "g" : "x", - parent ? "empty" : "vee", - parent ? "dashed" : "solid"); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x"); } if (node->src1) { - struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y"); + } - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n", - parent1 ? (void *) parent1 : (void *) node->src1, - parent1 ? "g" : "x", - parent ? (void *) parent : (void *) node, - parent ? "g" : "x", - parent ? "empty" : "vee", - parent ? "dashed" : "solid"); + for (int j = 0; j < GGML_MAX_OPT; j++) { + if (node->opt[j]) { + char label[16]; + snprintf(label, sizeof(label), "opt %d", j); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label); + } } } @@ -17510,15 +17563,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph struct ggml_tensor * node = gb->leafs[i]; if (node->src0) { - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n", - (void *) node->src0, "x", - (void *) node, "x"); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x"); } if (node->src1) { - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n", - (void *) node->src1, "x", - (void *) node, "x"); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y"); + } + + for (int j = 0; j < GGML_MAX_OPT; j++) { + if (node->opt[j]) { + char label[16]; + snprintf(label, sizeof(label), "opt %d", j); + ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label); + } } } diff --git a/ggml.h b/ggml.h index 18c78551f..4b6b72845 100644 --- a/ggml.h +++ b/ggml.h @@ -563,6 +563,7 @@ extern "C" { GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name); + GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...); // // operations on tensors with backpropagation From c943d823c14cef33092205ca3944de6fdf7abf99 Mon Sep 17 00:00:00 2001 From: AN Long Date: Sat, 24 Jun 2023 19:02:06 +0800 Subject: [PATCH 15/29] convert : fix invalid params in write_vocab_only (#1975) --- convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index de6c39c67..e340d2273 100644 --- a/convert.py +++ b/convert.py @@ -998,9 +998,9 @@ class OutputFile: def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of = OutputFile(fname_out) params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, - n_head=1, n_layer=0, file_type=GGMLFileType.AllF32) + n_head=1, n_layer=0) of = OutputFile(fname_out) - of.write_file_header(params) + of.write_file_header(params, file_type=GGMLFileType.AllF32) of.write_vocab(vocab) of.fout.close() From fdd18609113862dc6eb34dfc44a093d54c59ff1f Mon Sep 17 00:00:00 2001 From: Rowan Hart Date: Sat, 24 Jun 2023 04:07:08 -0700 Subject: [PATCH 16/29] flake : fix ggml-metal.metal path and run nixfmt (#1974) --- flake.nix | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/flake.nix b/flake.nix index bba3d71f7..cebb47b94 100644 --- a/flake.nix +++ b/flake.nix @@ -9,27 +9,33 @@ inherit (pkgs.stdenv) isAarch64 isDarwin; inherit (pkgs.lib) optionals; isM1 = isAarch64 && isDarwin; - osSpecific = - if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ] - else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ] - else [ ]; - pkgs = import nixpkgs { - inherit system; - }; - llama-python = pkgs.python310.withPackages (ps: with ps; [ - numpy - sentencepiece - ]); - in - { + osSpecific = if isM1 then + with pkgs.darwin.apple_sdk_11_0.frameworks; [ + Accelerate + MetalKit + MetalPerformanceShaders + MetalPerformanceShadersGraph + ] + else if isDarwin then + with pkgs.darwin.apple_sdk.frameworks; [ + Accelerate + CoreGraphics + CoreVideo + ] + else + [ ]; + pkgs = import nixpkgs { inherit system; }; + llama-python = + pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]); + in { packages.default = pkgs.stdenv.mkDerivation { name = "llama.cpp"; src = ./.; - postPatch = - if isM1 then '' - substituteInPlace ./ggml-metal.m \ - --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";" - '' else ""; + postPatch = if isM1 then '' + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" + '' else + ""; nativeBuildInputs = with pkgs; [ cmake ]; buildInputs = osSpecific; cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [ @@ -62,11 +68,7 @@ }; apps.default = self.apps.${system}.llama; devShells.default = pkgs.mkShell { - packages = with pkgs; [ - cmake - llama-python - ] ++ osSpecific; + packages = with pkgs; [ cmake llama-python ] ++ osSpecific; }; - } - ); + }); } From 65bdd52a867539691007f85c5508146d507f72c1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Jun 2023 19:40:18 +0300 Subject: [PATCH 17/29] tests : sync test-grad0 from ggml --- tests/test-grad0.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index c8c2c0f71..b5a499c1d 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1,3 +1,4 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows #include "ggml.h" #include @@ -5,6 +6,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + #define MAX_NARGS 3 #undef MIN @@ -197,8 +202,23 @@ bool check_gradient( float max_error_abs, float max_error_rel) { + static int n_threads = -1; + if (n_threads < 0) { + n_threads = GGML_DEFAULT_N_THREADS; + + const char *env = getenv("GGML_N_THREADS"); + if (env) { + n_threads = atoi(env); + } + + printf("GGML_N_THREADS = %d\n", n_threads); + } + struct ggml_cgraph gf = ggml_build_forward (f); + gf.n_threads = n_threads; + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + gb.n_threads = n_threads; ggml_graph_compute(ctx0, &gf); ggml_graph_reset (&gf); From 5ec8dd5a3c6a9a109351d2257bb9d53869bd0a94 Mon Sep 17 00:00:00 2001 From: Robyn Date: Sun, 25 Jun 2023 04:10:29 +1000 Subject: [PATCH 18/29] #1869 Fix null reference errors when training from scratch with CUDA (#1907) * #1869 Fix null reference errors when training from scratch with CUDA build Calling ggml_compute_forward when node->src0 was null was causing train-text-from-scratch.exe to terminate unexpectedly. * ggml : do not dereference src0 if NULL --------- Co-authored-by: Georgi Gerganov --- ggml-cuda.cu | 2 +- ggml.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 36a251ecc..010682edb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2635,7 +2635,7 @@ void ggml_cuda_free_scratch() { bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT + || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); switch (tensor->op) { diff --git a/ggml.c b/ggml.c index ef9e8585d..7104be01b 100644 --- a/ggml.c +++ b/ggml.c @@ -14911,7 +14911,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU); GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); #endif // GGML_USE_CUBLAS From e65ca7e14ac76c4046091da39d41a9017abaa9b3 Mon Sep 17 00:00:00 2001 From: sjinzh Date: Sun, 25 Jun 2023 13:45:44 +0800 Subject: [PATCH 19/29] zig : upgrade build system support (#1981) * upgrade zig build system support * zig : add new line at the end of the file --------- Co-authored-by: Georgi Gerganov --- build.zig | 87 +++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/build.zig b/build.zig index 306127ffe..49c159ebf 100644 --- a/build.zig +++ b/build.zig @@ -1,61 +1,58 @@ const std = @import("std"); +// Zig Version: 0.11.0-dev.3379+629f0d23b pub fn build(b: *std.build.Builder) void { const target = b.standardTargetOptions(.{}); - const optimize = b.standardReleaseOptions(); - const want_lto = b.option(bool, "lto", "Want -fLTO"); - - const lib = b.addStaticLibrary("llama", null); - lib.want_lto = want_lto; - lib.setTarget(target); - lib.setBuildMode(optimize); + const optimize = b.standardOptimizeOption(.{}); + const lib = b.addStaticLibrary(.{ + .name = "llama", + .target = target, + .optimize = optimize, + }); + lib.linkLibC(); lib.linkLibCpp(); lib.addIncludePath("."); - lib.addIncludePath("examples"); + lib.addIncludePath("./examples"); lib.addCSourceFiles(&.{ "ggml.c", }, &.{"-std=c11"}); lib.addCSourceFiles(&.{ "llama.cpp", }, &.{"-std=c++11"}); - lib.install(); + b.installArtifact(lib); - const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto }; + const examples = .{ + "main", + "baby-llama", + "embedding", + // "metal", + "perplexity", + "quantize", + "quantize-stats", + "save-load-state", + // "server", + "simple", + "train-text-from-scratch", + }; - const exe = build_example("main", build_args); - _ = build_example("quantize", build_args); - _ = build_example("perplexity", build_args); - _ = build_example("embedding", build_args); - - // create "zig build run" command for ./main - - const run_cmd = exe.run(); - run_cmd.step.dependOn(b.getInstallStep()); - if (b.args) |args| { - run_cmd.addArgs(args); + inline for (examples) |example_name| { + const exe = b.addExecutable(.{ + .name = example_name, + .target = target, + .optimize = optimize, + }); + exe.addIncludePath("."); + exe.addIncludePath("./examples"); + exe.addCSourceFiles(&.{ + std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}), + "examples/common.cpp", + }, &.{"-std=c++11"}); + exe.linkLibrary(lib); + b.installArtifact(exe); + const run_cmd = b.addRunArtifact(exe); + run_cmd.step.dependOn(b.getInstallStep()); + if (b.args) |args| run_cmd.addArgs(args); + const run_step = b.step("run_" ++ example_name, "Run the app"); + run_step.dependOn(&run_cmd.step); } - - const run_step = b.step("run", "Run the app"); - run_step.dependOn(&run_cmd.step); -} - -fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep { - const b = args.b; - const lib = args.lib; - const want_lto = args.want_lto; - - const exe = b.addExecutable(name, null); - exe.want_lto = want_lto; - lib.setTarget(args.target); - lib.setBuildMode(args.optimize); - exe.addIncludePath("."); - exe.addIncludePath("examples"); - exe.addCSourceFiles(&.{ - std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}), - "examples/common.cpp", - }, &.{"-std=c++11"}); - exe.linkLibrary(lib); - exe.install(); - - return exe; } From 66a2555ba6cab954c56d653b29c27bfbbacfbfb1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Jun 2023 09:07:03 +0300 Subject: [PATCH 20/29] readme : add Azure CI discussion link --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6aa6ce319..3a71e16db 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985 - p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1 - Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729 From c2a08f87b8d180115d04b8688f383d1b2761b16d Mon Sep 17 00:00:00 2001 From: anon998 <131767832+anon998@users.noreply.github.com> Date: Sun, 25 Jun 2023 08:48:36 +0000 Subject: [PATCH 21/29] fix server sampling: top k sampler first (#1977) Co-authored-by: anon --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index de22d3013..79df5e847 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -325,10 +325,10 @@ struct llama_server_context { id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else { // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k, 1); llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); llama_sample_typical(ctx, &candidates_p, typical_p, 1); llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_top_k(ctx, &candidates_p, top_k, 1); llama_sample_temperature(ctx, &candidates_p, temp); id = llama_sample_token(ctx, &candidates_p); } From bd34cdde38f8fd661890ddd5f57ca30bf279877b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Jun 2023 14:25:08 +0300 Subject: [PATCH 22/29] ggml : sync latest ggml (custom operators) --- ggml.c | 369 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- ggml.h | 60 +++++++++- 2 files changed, 397 insertions(+), 32 deletions(-) diff --git a/ggml.c b/ggml.c index 7104be01b..e3f0c939c 100644 --- a/ggml.c +++ b/ggml.c @@ -1,5 +1,5 @@ -// Defines CLOCK_MONOTONIC on Linux -#define _GNU_SOURCE +#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows #include "ggml.h" @@ -131,6 +131,34 @@ typedef void* thread_ret_t; #define GGML_MEM_ALIGN 16 #endif +// +// logging +// + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) printf(__VA_ARGS__) + +// +// end of logging block +// + #if defined(_MSC_VER) || defined(__MINGW32__) #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) @@ -144,6 +172,17 @@ inline static void* ggml_aligned_malloc(size_t size) { #endif if (result != 0) { // Handle allocation failure + const char *error_desc = "unknown allocation error"; + switch (result) { + case EINVAL: + error_desc = "invalid alignment value"; + break; + case ENOMEM: + error_desc = "insufficient memory"; + break; + } + GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", + __func__, error_desc, size/(1024.0*1024.0)); return NULL; } return aligned_memory; @@ -3530,30 +3569,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x *s = 1.f/(*s); } -// -// logging -// - -#if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG(...) -#endif - -#if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_5(...) -#endif - -#if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_10(...) -#endif - -#define GGML_PRINT(...) printf(__VA_ARGS__) - // // data types // @@ -3713,11 +3728,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "MAP_UNARY", "MAP_BINARY", + "MAP_CUSTOM1", + "MAP_CUSTOM2", + "MAP_CUSTOM3", + "CROSS_ENTROPY_LOSS", "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61"); +static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3785,11 +3804,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x)", "f(x,y)", + "custom(x)", + "custom(x,y)", + "custom(x,y,z)", + "cross_entropy_loss(x,y)", "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61"); +static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -7094,9 +7117,14 @@ struct ggml_tensor * ggml_map_unary_impl_f32( is_node = true; } + struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_scratch_save(ctx); + struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_scratch_load(ctx); result->op = GGML_OP_MAP_UNARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7136,9 +7164,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32( is_node = true; } + struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_scratch_save(ctx); + struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_scratch_load(ctx); result->op = GGML_OP_MAP_BINARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7165,6 +7198,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32( return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } +// ggml_map_custom1 + +struct ggml_tensor * ggml_map_custom1_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_scratch_save(ctx); + + struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); + *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; + + ggml_scratch_load(ctx); + + result->op = GGML_OP_MAP_CUSTOM1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->opt[0] = addr_tensor; + + return result; +} + +struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun) { + return ggml_map_custom1_impl_f32(ctx, a, fun, false); +} + +struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun) { + return ggml_map_custom1_impl_f32(ctx, a, fun, true); +} + +// ggml_map_custom2 + +struct ggml_tensor * ggml_map_custom2_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_scratch_save(ctx); + + struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); + *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; + + ggml_scratch_load(ctx); + + result->op = GGML_OP_MAP_CUSTOM2; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + result->opt[0] = addr_tensor; + + return result; +} + +struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun) { + return ggml_map_custom2_impl_f32(ctx, a, b, fun, false); +} + +struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun) { + return ggml_map_custom2_impl_f32(ctx, a, b, fun, true); +} + +// ggml_map_custom3 + +struct ggml_tensor * ggml_map_custom3_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_scratch_save(ctx); + + struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); + *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; + + ggml_scratch_load(ctx); + + result->op = GGML_OP_MAP_CUSTOM3; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + result->opt[0] = addr_tensor; + result->opt[1] = c; + + return result; +} + +struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun) { + return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false); +} + +struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun) { + return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true); +} + // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( @@ -14621,6 +14798,114 @@ static void ggml_compute_forward_map_binary( } } +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst, + const ggml_custom1_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + fun(dst, a); +} + + +static void ggml_compute_forward_map_custom1( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst, + const ggml_custom1_op_f32_t fun) { + switch (a->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_custom1_f32(params, a, dst, fun); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + struct ggml_tensor * dst, + const ggml_custom2_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + fun(dst, a, b); +} + + +static void ggml_compute_forward_map_custom2( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + struct ggml_tensor * dst, + const ggml_custom2_op_f32_t fun) { + switch (a->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_map_custom3 + +static void ggml_compute_forward_map_custom3_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + const struct ggml_tensor * c, + struct ggml_tensor * dst, + const ggml_custom3_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + fun(dst, a, b, c); +} + + +static void ggml_compute_forward_map_custom3( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + const struct ggml_tensor * c, + struct ggml_tensor * dst, + const ggml_custom3_op_f32_t fun) { + switch (a->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_cross_entropy_loss static void ggml_compute_forward_cross_entropy_loss_f32( @@ -15158,6 +15443,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); } break; + case GGML_OP_MAP_CUSTOM1: + { + const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data); + ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM2: + { + const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data); + ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM3: + { + const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data); + ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun); + } + break; case GGML_OP_CROSS_ENTROPY_LOSS: { ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor); @@ -15964,6 +16267,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_WIN_UNPART: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: { GGML_ASSERT(false); // not supported } break; @@ -16605,6 +16911,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_WIN_UNPART: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: { node->n_tasks = 1; } break; diff --git a/ggml.h b/ggml.h index 4b6b72845..5ebd9c46c 100644 --- a/ggml.h +++ b/ggml.h @@ -345,6 +345,10 @@ extern "C" { GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, + GGML_OP_MAP_CUSTOM1, + GGML_OP_MAP_CUSTOM2, + GGML_OP_MAP_CUSTOM3, + GGML_OP_CROSS_ENTROPY_LOSS, GGML_OP_CROSS_ENTROPY_LOSS_BACK, @@ -1167,21 +1171,73 @@ extern "C" { int h0, int w); - // Mapping operations - typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); + // custom operators + + typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); + typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + GGML_API struct ggml_tensor * ggml_map_unary_f32( struct ggml_context * ctx, struct ggml_tensor * a, ggml_unary_op_f32_t fun); + GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun); + GGML_API struct ggml_tensor * ggml_map_binary_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, ggml_binary_op_f32_t fun); + GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun); + + GGML_API struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun); + + GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun); + + GGML_API struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun); + + GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun); + + GGML_API struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun); + + GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun); + // loss function GGML_API struct ggml_tensor * ggml_cross_entropy_loss( From 447ccbe8c39332fcdd0d98a041b6e2ff6f06219d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Jun 2023 16:08:12 +0300 Subject: [PATCH 23/29] readme : add new roadmap + manifesto --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3a71e16db..ad1a5cfc0 100644 --- a/README.md +++ b/README.md @@ -5,13 +5,15 @@ [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) +[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) + Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- New roadmap: https://github.com/users/ggerganov/projects/7 - Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985 - p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1 -- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
    Table of Contents From cbebf61ca7584e9709265395f0127ae7fc0f1882 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Mon, 26 Jun 2023 23:15:47 +0800 Subject: [PATCH 24/29] Fix assert when free invalid cuda pointer (#2005) Fix assert via initializing extra structure always. CUDA error 1 at C:\GPT\llama.cpp\ggml-cuda.cu:2536: invalid argument --- ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 010682edb..5e2fbc724 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2553,6 +2553,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) { tensor->backend = GGML_BACKEND_GPU; struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) || tensor->op == GGML_OP_VIEW; From 6769e944c727c63612dcafbef52009d21ae00fff Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Mon, 26 Jun 2023 19:43:07 +0300 Subject: [PATCH 25/29] k-quants : support for super-block size of 64 (#2001) * k_quants: WIP super-blocks with 64 weights * k_quants: WIP super-blocks with 64 weights Q6_K scalar and AVX2 works * k_quants: WIP super-blocks with 64 weights Q4_K scalar and AVX2 works * k_quants: WIP super-blocks with 64 weights Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower than the scalar implementation) * k_quants: WIP super-blocks with 64 weights Q3_K scalar and AVX2 works. * k_quants: WIP super-blocks with 64 weights Q5_K scalar and AVX2 works, and with that all k_quants are done on AVX2 and scalar * k_quants: WIP super-blocks with 64 weights Q6_K working on CUDA. Cannot make it run quite as gast as with super-blocks with 256 weigths: 8% slower on 4080, 20% slower on the 1660 (but there we fit 1 less layer on the GPU because pf the larger model size), so some fraction of these 20% is due to that, * k_quants: WIP super-blocks with 64 weights Q4_K working on CUDA. ~10% slower on GTX-1660, 16% slower on 4080. * k_quants: WIP super-blocks with 64 weights Q2_K working on CUDA. ~3% slower on GTX-1660, 10% slower on 4080. * k_quants: WIP super-blocks with 64 weights Q3_K working on CUDA. * k_quants: WIP super-blocks with 64 weights Q5_K working on CUDA, and with this CUDA is done. * k_quants: WIP super-blocks with 64 weights Q6_K working on ARM_NEON * k_quants: WIP super-blocks with 64 weights Q4_K working on ARM_NEON, but quite a bit slower than 256 weights * k_quants: WIP super-blocks with 64 weights Q2_K working on ARM_NEON, but quite a bit slower than 256 weights * k_quants: WIP super-blocks with 64 weights Q3_K working on ARM_NEON, but quite a bit slower than 256 weights. * k_quants: WIP super-blocks with 64 weights Q5_K working on ARM_NEON, but quite a bit slower than 256 weights. With that, we have full support for ARM_NEON, although performance is not quite there. * k_quants: WIP super-blocks with 64 weights Slightly more efficient Q3_K and Q5_K * k_quants: WIP super-blocks with 64 weights Another small improvement for Q3_K and Q5_K on ARM_NEON * k_quants: WIP super-blocks with 64 weights Yet another speedup for Q5_K on ARM_NEON. We are now within 10% of the QK_K = 256 version. * k_quants: WIP super-blocks with 64 weights * We are able to pass preprocessor macros to the Metal compiler * Q6_K works and is actually slightly more efficient than the QK_K = 256 version (25.2 ms vs 25.8 ms) * k_quants: WIP super-blocks with 64 weights Q4_K works on Metal and is actually slightly faster than QK_K = 256 (21.95 ms vs 24.0 ms). * k_quants: WIP super-blocks with 64 weights Q2_K works on Metal and is very slightly faster than QK_K = 256 (23.8 ms vs 24.2 ms). * k_quants: WIP super-blocks with 64 weights Q3_K works on Metal and is slightly faster than QK_K = 256 (26.6 ms vs 28.3 ms). * k_quants: WIP super-blocks with 64 weights Q5_K works on Metal and is slightly faster than QK_K = 256 (23.7 ms vs 26.3 ms). * k_quants: call them _K, not _k, also on Metal * k_quants: correctly define QK_K in llama.cpp * Fixed bug in q4_K quantization added with the 64-block addition * Simplify via lambda * k_quants: swicth Q3_K to 4-bit scales when QK_K = 64 Otherwise there isn't much benefit from this quantization type. There is some very slight loss in accuracy, but we reduce size by ~7%. E.g., for OpenLLaMA-3B, Q3_K_S perplexity is 8.6131 with 8-bit scales and 8.6352 with 4-bit, while file size decreases from 1.53G to 1.44G. * k_quants: switch Q4_K to 4-bit scales when QK_K = 64 Here the loss in accuracy is greater than for Q3_K, but the Q4_K points still move further to the left on the perplexity vs size curve. * k_quants: forgot to add the Metal changes in last commit * k_quants: change Q5_K to be type 0 when QK_K = 64 Still needs AVX2 implementation * k_quants: AVX2 implementation for new 64-weight Q5_K * k_quants: 10% faster ARM_NEON Q5_K dot product * k_quants: fixed issue caused by merging with master --------- Co-authored-by: Iwan Kawrakow --- CMakeLists.txt | 14 +- Makefile | 9 +- ggml-cuda.cu | 370 ++++++++++++--- ggml-metal.m | 66 +-- ggml-metal.metal | 414 +++++++++++++---- k_quants.c | 1140 +++++++++++++++++++++++++++++++++++++++++++++- k_quants.h | 51 ++- llama.cpp | 17 +- 8 files changed, 1880 insertions(+), 201 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc7560a7a..ffda74a70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) +option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) @@ -225,6 +226,14 @@ if (LLAMA_BLAS) endif() endif() +if (LLAMA_K_QUANTS) + set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) + add_compile_definitions(GGML_USE_K_QUANTS) + if (LLAMA_QKK_64) + add_compile_definitions(GGML_QKK_64) + endif() +endif() + if (LLAMA_CUBLAS) cmake_minimum_required(VERSION 3.17) @@ -289,11 +298,6 @@ if (LLAMA_METAL) ) endif() -if (LLAMA_K_QUANTS) - set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) - add_compile_definitions(GGML_USE_K_QUANTS) -endif() - if (LLAMA_CLBLAST) find_package(CLBlast) if (CLBlast_FOUND) diff --git a/Makefile b/Makefile index 5dd676fad..bda11791d 100644 --- a/Makefile +++ b/Makefile @@ -43,8 +43,11 @@ endif # keep standard at C11 and C++11 # -Ofast tends to produce faster code, but may not be available for some compilers. -#OPT = -Ofast +ifdef LLAMA_FAST +OPT = -Ofast +else OPT = -O3 +endif CFLAGS = -I. $(OPT) -std=c11 -fPIC CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC LDFLAGS = @@ -131,6 +134,10 @@ ifndef LLAMA_NO_K_QUANTS CFLAGS += -DGGML_USE_K_QUANTS CXXFLAGS += -DGGML_USE_K_QUANTS OBJS += k_quants.o +ifdef LLAMA_QKK_64 + CFLAGS += -DGGML_QKK_64 + CXXFLAGS += -DGGML_QKK_64 +endif endif ifndef LLAMA_NO_ACCELERATE diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5e2fbc724..c34e96abf 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -117,7 +117,13 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo //================================= k-quants +#ifdef GGML_QKK_64 +#define QK_K 64 +#define K_SCALE_SIZE 4 +#else #define QK_K 256 +#define K_SCALE_SIZE 12 +#endif typedef struct { uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits @@ -128,13 +134,25 @@ typedef struct { static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); typedef struct { - uint8_t hmask[QK_K/8]; - uint8_t qs[QK_K/4]; // nibbles / quants - uint8_t scales[3*QK_K/64]; - half d; + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#ifdef GGML_QKK_64 + uint8_t scales[2]; // scales, quantized with 8 bits +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + half d; // super-block scale } block_q3_K; -static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding"); +//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); +#ifdef GGML_QKK_64 +typedef struct { + half d[2]; // super-block scales/mins + uint8_t scales[2]; // 4-bit block scales/mins + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); +#else typedef struct { half d; // super-block scale for quantized scales half dmin; // super-block scale for quantized mins @@ -142,15 +160,26 @@ typedef struct { uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); +#endif +#ifdef GGML_QKK_64 typedef struct { - half d; // super-block scale for quantized scales - half dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits + half d; // super-block scale + int8_t scales[QK_K/16]; // block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); +#else +typedef struct { + half d; // super-block scale for quantized scales + half dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits } block_q5_K; -static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +#endif typedef struct { uint8_t ql[QK_K/2]; // quants, lower 4 bits @@ -349,13 +378,14 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { const int i = blockIdx.x; + const block_q2_K * x = (const block_q2_K *) vx; + const int tid = threadIdx.x; +#if QK_K == 256 const int n = tid/32; const int l = tid - 32*n; const int is = 8*n + l/16; - const block_q2_K * x = (const block_q2_K *) vx; - const uint8_t q = x[i].qs[32*n + l]; float * y = yy + i*QK_K + 128*n; @@ -365,21 +395,32 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); +#else + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const uint8_t q = x[i].qs[il] >> (2*is); + float * y = yy + i*QK_K + 16*is + il; + float dall = x[i].d; + float dmin = x[i].dmin; + y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); +#endif } static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { - int r = threadIdx.x/4; - int i = blockIdx.x; - int tid = r/2; - int is0 = r%2; - int l0 = 16*is0 + 4*(threadIdx.x%4); - int n = tid / 4; - int j = tid - 4*n; - + const int i = blockIdx.x; const block_q3_K * x = (const block_q3_K *) vx; +#if QK_K == 256 + const int r = threadIdx.x/4; + const int tid = r/2; + const int is0 = r%2; + const int l0 = 16*is0 + 4*(threadIdx.x%4); + const int n = tid / 4; + const int j = tid - 4*n; + uint8_t m = 1 << (4*n + j); int is = 8*n + 2*j + is0; int shift = 2*j; @@ -396,9 +437,31 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { const uint8_t * hm = x[i].hmask; for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); +#else + const int tid = threadIdx.x; + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const int im = il/8; // 0...1 + const int in = il%8; // 0...7 + + float * y = yy + i*QK_K + 16*is + il; + + const uint8_t q = x[i].qs[il] >> (2*is); + const uint8_t h = x[i].hmask[in] >> (2*is + im); + const float d = (float)x[i].d; + + if (is == 0) { + y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } else { + y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } +#endif } +#if QK_K == 256 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { if (j < 4) { d = q[j] & 63; m = q[j + 4] & 63; @@ -407,19 +470,14 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); } } +#endif static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { const block_q4_K * x = (const block_q4_K *) vx; const int i = blockIdx.x; - //// assume 64 threads - this is very slightly better than the one below - //const int tid = threadIdx.x; - //const int il = tid/16; - //const int ir = tid%16; - //const int is = 2*il; - //const int n = 2; - +#if QK_K == 256 // assume 32 threads const int tid = threadIdx.x; const int il = tid/8; @@ -443,6 +501,15 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { y[l + 0] = d1 * (q[l] & 0xF) - m1; y[l +32] = d2 * (q[l] >> 4) - m2; } +#else + const int tid = threadIdx.x; + const uint8_t * q = x[i].qs; + float * y = yy + i*QK_K; + const float d = (float)x[i].d[0]; + const float m = (float)x[i].d[1]; + y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); + y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); +#endif } static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { @@ -450,6 +517,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { const int i = blockIdx.x; +#if QK_K == 256 // assume 64 threads - this is very slightly better than the one below const int tid = threadIdx.x; const int il = tid/16; // il is in 0...3 @@ -476,12 +544,25 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { hm <<= 1; y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; +#else + const int tid = threadIdx.x; + const uint8_t q = x[i].qs[tid]; + const int im = tid/8; // 0...3 + const int in = tid%8; // 0...7 + const int is = tid/16; // 0 or 1 + const uint8_t h = x[i].qh[in] >> im; + const float d = x[i].d; + float * y = yy + i*QK_K + tid; + y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); + y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); +#endif } static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { const block_q6_K * x = (const block_q6_K *) vx; const int i = blockIdx.x; +#if QK_K == 256 // assume 64 threads - this is very slightly better than the one below const int tid = threadIdx.x; @@ -501,6 +582,24 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +#else + + // assume 32 threads + const int tid = threadIdx.x; + const int ip = tid/16; // 0 or 1 + const int il = tid - 16*ip; // 0...15 + + float * y = yy + i*QK_K + 16*ip + il; + + const float d = x[i].d; + + const uint8_t ql = x[i].ql[16*ip + il]; + const uint8_t qh = x[i].qh[il] >> (2*ip); + const int8_t * sc = x[i].scales; + + y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); +#endif } static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { @@ -515,6 +614,9 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float const block_q2_K * x = (const block_q2_K *)vx + ib0; + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 @@ -528,8 +630,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float const int s_offset = 8*im; const int y_offset = 128*im + l0; - float tmp = 0; // partial sum for thread in warp - uint32_t aux[4]; const uint8_t * d = (const uint8_t *)aux; const uint8_t * m = (const uint8_t *)(aux + 2); @@ -565,6 +665,39 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float tmp += dall * sum1 - dmin * sum2; } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; + + uint32_t uaux[2]; + const uint8_t * d = (const uint8_t *)uaux; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint32_t * s = (const uint32_t *)x[i].scales; + + uaux[0] = s[0] & 0x0f0f0f0f; + uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; + + const half2 * dh = (const half2 *)&x[i].d; + + const float2 dall = __half22float2(dh[0]); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t ql = q[l]; + sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + + y[l+16] * d[1] * ((ql >> 2) & 3) + + y[l+32] * d[2] * ((ql >> 4) & 3) + + y[l+48] * d[3] * ((ql >> 6) & 3); + sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; + } + tmp += dall.x * sum1 - dall.y * sum2; + } +#endif // sum up partial sums and write back result __syncthreads(); @@ -573,16 +706,13 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); } - if (tid == 0) { + if (threadIdx.x == 0) { dst[row] = tmp; } } static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { - const uint16_t kmask1 = 0x0303; - const uint16_t kmask2 = 0x0f0f; - const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nrows) return; @@ -591,6 +721,13 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float const block_q3_K * x = (const block_q3_K *)vx + ib0; + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 @@ -610,8 +747,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float const uint16_t s_shift = 4*im; - float tmp = 0; // partial sum for thread in warp - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { const float * y = yy + i * QK_K + y_offset; @@ -640,6 +775,34 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float tmp += d * sum; } +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint8_t * s = x[i].scales; + + const float dall = (float)x[i].d; + + float sum = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t hl = x[i].hmask[im+l] >> in; + const uint8_t ql = q[l]; + sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) + + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) + + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) + + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); + } + tmp += sum; + } +#endif // sum up partial sums and write back result __syncthreads(); @@ -648,22 +811,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); } - if (tid == 0) { + if (threadIdx.x == 0) { dst[row] = tmp; } } static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row > nrows) return; const int num_blocks_per_row = ncols / QK_K; const int ib0 = row*num_blocks_per_row; + const block_q4_K * x = (const block_q4_K *)vx + ib0; + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 @@ -683,8 +849,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float uint16_t aux[4]; const uint8_t * sc = (const uint8_t *)aux; - const block_q4_K * x = (const block_q4_K *)vx + ib0; - float tmp = 0; // partial sum for thread in warp for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { @@ -713,6 +877,36 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + + const int step = tid * K_QUANTS_PER_ITERATION; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + float tmp = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const float * y = yy + i*QK_K + step; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + const float d = (float)x[i].d[0]; + const float m = (float)x[i].d[1]; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) + + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); + } + tmp += sum; + } + +#endif // sum up partial sums and write back result __syncthreads(); @@ -728,15 +922,19 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) { - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - //const int row = blockIdx.x*blockDim.y + threadIdx.y; const int row = blockIdx.x; const int num_blocks_per_row = ncols / QK_K; const int ib0 = row*num_blocks_per_row; + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + const int tid = threadIdx.x/2; // 0...15 const int ix = threadIdx.x%2; @@ -757,10 +955,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float uint16_t aux[4]; const uint8_t * sc = (const uint8_t *)aux; - const block_q5_K * x = (const block_q5_K *)vx + ib0; - - float tmp = 0; // partial sum for thread in warp - for (int i = ix; i < num_blocks_per_row; i += 2) { const uint8_t * ql1 = x[i].qs + q_offset; @@ -793,9 +987,32 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; } tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; - } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); + } + tmp += sum; + } +#endif + // sum up partial sums and write back result __syncthreads(); #pragma unroll @@ -803,7 +1020,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); } - if (tid == 0) { + if (threadIdx.x == 0) { dst[row] = tmp; } } @@ -820,6 +1037,8 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float const block_q6_K * x = (const block_q6_K *)vx + ib0; +#if QK_K == 256 + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 @@ -874,6 +1093,37 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float } +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3 + + const int step = tid * K_QUANTS_PER_ITERATION; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + step; + const uint8_t * ql = x[i].ql + step; + const uint8_t * qh = x[i].qh + step; + const int8_t * s = x[i].scales; + + const float d = x[i+0].d; + + float sum = 0; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) + + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); + } + tmp += sum; + + } + +#endif + // sum up partial sums and write back result __syncthreads(); #pragma unroll @@ -1252,12 +1502,20 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; +#if QK_K == 256 dequantize_block_q2_K<<>>(vx, y); +#else + dequantize_block_q2_K<<>>(vx, y); +#endif } static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; +#if QK_K == 256 dequantize_block_q3_K<<>>(vx, y); +#else + dequantize_block_q3_K<<>>(vx, y); +#endif } static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -1267,12 +1525,20 @@ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cu static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; +#if QK_K == 256 dequantize_block_q5_K<<>>(vx, y); +#else + dequantize_block_q5_K<<>>(vx, y); +#endif } static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; +#if QK_K == 256 dequantize_block_q6_K<<>>(vx, y); +#else + dequantize_block_q6_K<<>>(vx, y); +#endif } static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { diff --git a/ggml-metal.m b/ggml-metal.m index a7e104dc7..7551231b9 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -51,21 +51,21 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(get_rows_f16); GGML_METAL_DECL_KERNEL(get_rows_q4_0); GGML_METAL_DECL_KERNEL(get_rows_q4_1); - GGML_METAL_DECL_KERNEL(get_rows_q2_k); - GGML_METAL_DECL_KERNEL(get_rows_q3_k); - GGML_METAL_DECL_KERNEL(get_rows_q4_k); - GGML_METAL_DECL_KERNEL(get_rows_q5_k); - GGML_METAL_DECL_KERNEL(get_rows_q6_k); + GGML_METAL_DECL_KERNEL(get_rows_q2_K); + GGML_METAL_DECL_KERNEL(get_rows_q3_K); + GGML_METAL_DECL_KERNEL(get_rows_q4_K); + GGML_METAL_DECL_KERNEL(get_rows_q5_K); + GGML_METAL_DECL_KERNEL(get_rows_q6_K); GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32); + GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32); GGML_METAL_DECL_KERNEL(rope); GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(cpy_f32_f16); @@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) { exit(1); } +#ifdef GGML_QKK_64 + MTLCompileOptions* options = [MTLCompileOptions new]; + options.preprocessorMacros = @{ @"QK_K" : @(64) }; + ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; +#else ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; +#endif if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); exit(1); @@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) { GGML_METAL_ADD_KERNEL(get_rows_f16); GGML_METAL_ADD_KERNEL(get_rows_q4_0); GGML_METAL_ADD_KERNEL(get_rows_q4_1); - GGML_METAL_ADD_KERNEL(get_rows_q2_k); - GGML_METAL_ADD_KERNEL(get_rows_q3_k); - GGML_METAL_ADD_KERNEL(get_rows_q4_k); - GGML_METAL_ADD_KERNEL(get_rows_q5_k); - GGML_METAL_ADD_KERNEL(get_rows_q6_k); + GGML_METAL_ADD_KERNEL(get_rows_q2_K); + GGML_METAL_ADD_KERNEL(get_rows_q3_K); + GGML_METAL_ADD_KERNEL(get_rows_q4_K); + GGML_METAL_ADD_KERNEL(get_rows_q5_K); + GGML_METAL_ADD_KERNEL(get_rows_q6_K); GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); GGML_METAL_ADD_KERNEL(rope); GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(cpy_f32_f16); @@ -662,7 +668,7 @@ void ggml_metal_graph_compute( nth0 = 4; nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32]; } break; case GGML_TYPE_Q3_K: { @@ -671,7 +677,7 @@ void ggml_metal_graph_compute( nth0 = 4; nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32]; } break; case GGML_TYPE_Q4_K: { @@ -680,7 +686,7 @@ void ggml_metal_graph_compute( nth0 = 4; nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; } break; case GGML_TYPE_Q5_K: { @@ -689,7 +695,7 @@ void ggml_metal_graph_compute( nth0 = 4; nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32]; } break; case GGML_TYPE_Q6_K: { @@ -698,7 +704,7 @@ void ggml_metal_graph_compute( nth0 = 4; nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32]; } break; default: { @@ -750,11 +756,11 @@ void ggml_metal_graph_compute( case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; - case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break; - case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break; - case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break; - case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break; - case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break; + case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break; + case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break; + case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break; + case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break; + case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break; default: GGML_ASSERT(false && "not implemented"); } diff --git a/ggml-metal.metal b/ggml-metal.metal index d1e49222d..e62fe6842 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -428,7 +428,7 @@ kernel void kernel_mul_mat_q4_0_f32( } threadgroup_barrier(mem_flags::mem_threadgroup); if (ith == 0) { - for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; + for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; dst[r1*ne0 + r0] = sum[0]; } } @@ -497,7 +497,7 @@ kernel void kernel_mul_mat_q4_1_f32( } threadgroup_barrier(mem_flags::mem_threadgroup); if (ith == 0) { - for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; + for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; dst[r1*ne0 + r0] = sum[0]; } } @@ -775,47 +775,76 @@ kernel void kernel_cpy_f32_f32( //============================================ k-quants ====================================================== +#ifndef QK_K #define QK_K 256 +#else +static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64"); +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif typedef struct { uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits uint8_t qs[QK_K/4]; // quants half d; // super-block scale for quantized scales half dmin; // super-block scale for quantized mins -} block_q2_k; +} block_q2_K; // 84 bytes / block typedef struct { uint8_t hmask[QK_K/8]; // quants - high bit uint8_t qs[QK_K/4]; // quants - low 2 bits - uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits - half d; // super-block scale -} block_q3_k; -// 110 bytes / block +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + half d; // super-block scale +} block_q3_K; +#if QK_K == 64 +typedef struct { + half d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else typedef struct { half d; // super-block scale for quantized scales half dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_k; -// 144 bytes / block +} block_q4_K; +#endif +#if QK_K == 64 +typedef struct { + half d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +#else typedef struct { half d; // super-block scale for quantized scales half dmin; // super-block scale for quantized mins uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_k; +} block_q5_K; // 176 bytes / block +#endif typedef struct { uint8_t ql[QK_K/2]; // quants, lower 4 bits uint8_t qh[QK_K/4]; // quants, upper 2 bits int8_t scales[QK_K/16]; // scales, quantized with 8 bits half d; // super-block scale -} block_q6_k; +} block_q6_K; // 210 bytes / block static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { @@ -836,7 +865,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { //========================================== dequantization ============================= -static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, int k) { +static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -847,6 +876,7 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i device const uint8_t * q = x[i].qs; +#if QK_K == 256 int is = 0; float dl, ml; for (int n = 0; n < QK_K; n += 128) { @@ -865,14 +895,29 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i } q += 32; } +#else + float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4); + float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4); + float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4); + float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4); + for (int l = 0; l < 16; ++l) { + y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1; + y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2; + y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3; + y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4; + } + y += QK_K; +#endif } } -static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, int k) { +static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; +#if QK_K == 256 + const uint16_t kmask1 = 0x0303; const uint16_t kmask2 = 0x0f0f; @@ -918,22 +963,49 @@ static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, i } q += 32; } - } +#else + for (int i = 0; i < nb; i++) { + + const float d_all = (float)(x[i].d); + + device const uint8_t * q = x[i].qs; + device const uint8_t * hm = x[i].hmask; + + const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); + const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); + const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); + const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); + + for (int l = 0; l < 8; ++l) { + uint8_t h = hm[l]; + y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4)); + y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4)); + y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4)); + y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4)); + y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4)); + y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4)); + y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4)); + y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4)); + } + y += QK_K; + } +#endif } -static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) { +static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; - for (int i = 0; i < nb; i++) { + device const uint8_t * q = x[i].qs; + +#if QK_K == 256 const float d = x[i].d; const float min = x[i].dmin; - device const uint8_t * q = x[i].qs; device const uint8_t * scales = x[i].scales; int is = 0; @@ -945,14 +1017,29 @@ static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, i for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; q += 32; is += 2; } +#else + device const uint8_t * s = x[i].scales; + device const half2 * dh = (device const half2 *)x[i].d; + const float2 d = (float2)dh[0]; + const float d1 = d[0] * (s[0] & 0xF); + const float d2 = d[0] * (s[1] & 0xF); + const float m1 = d[1] * (s[0] >> 4); + const float m2 = d[1] * (s[1] >> 4); + for (int l = 0; l < 32; ++l) { + y[l+ 0] = d1 * (q[l] & 0xF) - m1; + y[l+32] = d2 * (q[l] >> 4) - m2; + } + y += QK_K; +#endif } } -static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, int k) { +static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; +#if QK_K == 256 for (int i = 0; i < nb; i++) { const float d = (float)(x[i].d); @@ -973,10 +1060,32 @@ static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, i u1 <<= 2; u2 <<= 2; } } +#else + for (int i = 0; i < nb; i++) { + + const float d = (float)x[i].d; + + device const uint8_t * ql = x[i].qs; + device const uint8_t * qh = x[i].qh; + device const int8_t * sc = x[i].scales; + + for (int l = 0; l < 8; ++l) { + y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); + y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); + y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16)); + y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16)); + y[l+32] = d * sc[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16)); + y[l+40] = d * sc[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16)); + y[l+48] = d * sc[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16)); + y[l+56] = d * sc[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16)); + } + y += QK_K; + } +#endif } -static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, int k) { +static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -988,6 +1097,7 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i const float d = x[i].d; +#if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { for (int l = 0; l < 32; ++l) { int is = l/16; @@ -1005,10 +1115,23 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i qh += 32; sc += 8; } +#else + for (int l = 0; l < 16; ++l) { + const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + y[l+ 0] = d * sc[0] * q1; + y[l+16] = d * sc[1] * q2; + y[l+32] = d * sc[2] * q3; + y[l+48] = d * sc[3] * q4; + } + y += 64; +#endif } } -kernel void kernel_get_rows_q2_k( +kernel void kernel_get_rows_q2_K( device const void * src0, device const int * src1, device float * dst, @@ -1019,12 +1142,12 @@ kernel void kernel_get_rows_q2_k( const int i = tpig; const int r = ((device int32_t *) src1)[i]; - dequantize_row_q2_k( - (device const block_q2_k *) ((device char *) src0 + r*nb01), + dequantize_row_q2_K( + (device const block_q2_K *) ((device char *) src0 + r*nb01), (device float *) ((device char *) dst + i*nb1), ne00); } -kernel void kernel_get_rows_q3_k( +kernel void kernel_get_rows_q3_K( device const void * src0, device const int * src1, device float * dst, @@ -1035,12 +1158,12 @@ kernel void kernel_get_rows_q3_k( const int i = tpig; const int r = ((device int32_t *) src1)[i]; - dequantize_row_q3_k( - (device const block_q3_k *) ((device char *) src0 + r*nb01), + dequantize_row_q3_K( + (device const block_q3_K *) ((device char *) src0 + r*nb01), (device float *) ((device char *) dst + i*nb1), ne00); } -kernel void kernel_get_rows_q4_k( +kernel void kernel_get_rows_q4_K( device const void * src0, device const int * src1, device float * dst, @@ -1051,12 +1174,12 @@ kernel void kernel_get_rows_q4_k( const int i = tpig; const int r = ((device int32_t *) src1)[i]; - dequantize_row_q4_k( - (device const block_q4_k *) ((device char *) src0 + r*nb01), + dequantize_row_q4_K( + (device const block_q4_K *) ((device char *) src0 + r*nb01), (device float *) ((device char *) dst + i*nb1), ne00); } -kernel void kernel_get_rows_q5_k( +kernel void kernel_get_rows_q5_K( device const void * src0, device const int * src1, device float * dst, @@ -1067,12 +1190,12 @@ kernel void kernel_get_rows_q5_k( const int i = tpig; const int r = ((device int32_t *) src1)[i]; - dequantize_row_q5_k( - (device const block_q5_k *) ((device char *) src0 + r*nb01), + dequantize_row_q5_K( + (device const block_q5_K *) ((device char *) src0 + r*nb01), (device float *) ((device char *) dst + i*nb1), ne00); } -kernel void kernel_get_rows_q6_k( +kernel void kernel_get_rows_q6_K( device const void * src0, device const int * src1, device float * dst, @@ -1083,14 +1206,14 @@ kernel void kernel_get_rows_q6_k( const int i = tpig; const int r = ((device int32_t *) src1)[i]; - dequantize_row_q6_k( - (device const block_q6_k *) ((device char *) src0 + r*nb01), + dequantize_row_q6_K( + (device const block_q6_K *) ((device char *) src0 + r*nb01), (device float *) ((device char *) dst + i*nb1), ne00); } //====================================== dot products ========================= -kernel void kernel_mul_mat_q2_k_f32( +kernel void kernel_mul_mat_q2_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1107,12 +1230,15 @@ kernel void kernel_mul_mat_q2_k_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - device const block_q2_k * x = (device const block_q2_k *) src0 + r0*nb; + device const block_q2_K * x = (device const block_q2_K *) src0 + r0*nb; device const float * yy = (device const float *) src1 + r1*ne10; const int nth = tptg.x*tptg.y; const int ith = tptg.y*tpitg.x + tpitg.y; + float sumf = 0; + +#if QK_K == 256 const int tid = tpitg.y; // 0...16 const int il = tid/4; // 0...3 const int ir = tid%4; // 0...3 @@ -1125,9 +1251,6 @@ kernel void kernel_mul_mat_q2_k_f32( const int y_offset = 64*il + n*ir; const int q_offset = 32*ip + n*ir; - sum[ith] = 0.0f; - - float sumf = 0; for (int i = tpitg.x; i < nb; i += tptg.x) { device const uint8_t * q = x[i].qs + q_offset; @@ -1140,7 +1263,6 @@ kernel void kernel_mul_mat_q2_k_f32( device const float * y = yy + i*QK_K + y_offset; - //float4 s = {0.f, 0.f, 0.f, 0.f}; float2 s = {0.f, 0.f}; float smin = 0; for (int l = 0; l < n; ++l) { @@ -1155,25 +1277,38 @@ kernel void kernel_mul_mat_q2_k_f32( sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin; } +#else + const int il = 4 * tpitg.x; + + uint32_t aux[2]; + thread const uint8_t * d = (thread const uint8_t *)aux; + thread const uint8_t * m = (thread const uint8_t *)aux + 4; + + for (int i = tpitg.y; i < nb; i += tptg.y) { + + device const uint8_t * q = x[i].qs + il; + device const float * y = yy + i*QK_K + il; + + const float dall = (float)x[i].d; + const float dmin = (float)x[i].dmin; + + device const uint32_t * a = (device const uint32_t *)x[i].scales; + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = (a[0] >> 4) & 0x0f0f0f0f; + + for (int l = 0; l < 4; ++l) { + sumf += y[l+ 0] * (dall * d[0] * ((q[l] >> 0) & 3) - dmin * m[0]) + + y[l+16] * (dall * d[1] * ((q[l] >> 2) & 3) - dmin * m[1]) + + y[l+32] * (dall * d[2] * ((q[l] >> 4) & 3) - dmin * m[2]) + + y[l+48] * (dall * d[3] * ((q[l] >> 6) & 3) - dmin * m[3]); + } + } +#endif + sum[ith] = sumf; - //int mask1 = (ith%4 == 0); - //int mask2 = (ith%16 == 0); - - //threadgroup_barrier(mem_flags::mem_threadgroup); - //for (int i = 1; i < 4; ++i) sum[ith] += mask1 * sum[ith + i]; - //threadgroup_barrier(mem_flags::mem_threadgroup); - //for (int i = 4; i < 16; i += 4) sum[ith] += mask2 * sum[ith + i]; - //threadgroup_barrier(mem_flags::mem_threadgroup); - //if (ith == 0) { - // for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; - // dst[r1*ne0 + r0] = sum[0]; - //} - // // Accumulate the sum from all threads in the threadgroup - // This version is slightly faster than the commented out one below, - // which I copy-pasted from ggerganov's q4_0 dot product for metal. // threadgroup_barrier(mem_flags::mem_threadgroup); if (ith%4 == 0) { @@ -1190,7 +1325,7 @@ kernel void kernel_mul_mat_q2_k_f32( } } -kernel void kernel_mul_mat_q3_k_f32( +kernel void kernel_mul_mat_q3_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1203,23 +1338,25 @@ kernel void kernel_mul_mat_q3_k_f32( uint2 tpitg[[thread_position_in_threadgroup]], uint2 tptg[[threads_per_threadgroup]]) { - const uint16_t kmask1 = 0x0303; - const uint16_t kmask2 = 0x0f0f; - - const uint8_t m3 = 3; - const int8_t m4 = 4; - const int nb = ne00/QK_K; const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - device const block_q3_k * x = (device const block_q3_k *) src0 + r0*nb; + device const block_q3_K * x = (device const block_q3_K *) src0 + r0*nb; device const float * yy = (device const float *) src1 + r1*ne10; const int nth = tptg.x*tptg.y; const int ith = tptg.y*tpitg.x + tpitg.y; +#if QK_K == 256 + + const uint8_t m3 = 3; + const int8_t m4 = 4; + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + const int tid = tpitg.y; // expecting 16 const int ip = tid/8; // 0 or 1 const int il = tid/2 - 4*ip; // 0...3 @@ -1273,6 +1410,39 @@ kernel void kernel_mul_mat_q3_k_f32( //sum[ith] = sumf; sum[ith] = sumf1 - 32.f*sumf2; +#else + const int il = 4 * tpitg.x; // 0, 4, 8, 12 + const int im = il/8; // 0, 0, 1, 1 + const int in = il%8; // 0, 4, 0, 4 + + float sumf = 0; + + for (int i = tpitg.y; i < nb; i += tptg.y) { + + const float d_all = (float)(x[i].d); + + device const uint8_t * q = x[i].qs + il; + device const uint8_t * h = x[i].hmask + in; + device const float * y = yy + i * QK_K + il; + + const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); + const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); + const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); + const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); + + for (int l = 0; l < 4; ++l) { + const uint8_t hm = h[l] >> im; + sumf += y[l+ 0] * d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((hm & 0x01) ? 0 : 4)) + + y[l+16] * d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((hm & 0x04) ? 0 : 4)) + + y[l+32] * d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((hm & 0x10) ? 0 : 4)) + + y[l+48] * d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((hm & 0x40) ? 0 : 4)); + } + + } + + sum[ith] = sumf; + +#endif // // Accumulate the sum from all threads in the threadgroup @@ -1293,7 +1463,7 @@ kernel void kernel_mul_mat_q3_k_f32( } -kernel void kernel_mul_mat_q4_k_f32( +kernel void kernel_mul_mat_q4_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1305,21 +1475,25 @@ kernel void kernel_mul_mat_q4_k_f32( uint2 tpitg[[thread_position_in_threadgroup]], uint2 tptg[[threads_per_threadgroup]]) { - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - const int nb = ne00/QK_K; const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb; - device const float * yy = (device const float *) src1 + r1*ne10; - const int nth = tptg.x*tptg.y; const int ith = tptg.y*tpitg.x + tpitg.y; + device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb; + device const float * yy = (device const float *) src1 + r1*ne10; + + float sumf = 0; + +#if QK_K == 256 + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + const int tid = tpitg.y; // 0...16 const int il = tid/4; // 0...3 const int ir = tid - 4*il;// 0...3 @@ -1332,11 +1506,8 @@ kernel void kernel_mul_mat_q4_k_f32( const int q_offset = 32*im + l0; const int y_offset = 64*im + l0; - sum[ith] = 0.0f; - uchar2 sc1, sc2, sc3, sc4; - float sumf = 0; for (int i = tpitg.x; i < nb; i += tptg.x) { device const uint8_t * q1 = (x + i)->qs + q_offset; @@ -1365,6 +1536,30 @@ kernel void kernel_mul_mat_q4_k_f32( sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin; } +#else + uint16_t aux16[2]; + thread const uint8_t * scales = (thread const uint8_t *)aux16; + + const int il = 4*tpitg.x; + + for (int i = tpitg.y; i < nb; i += tptg.y) { + + device const uint8_t * q = x[i].qs + il; + device const float * y = yy + i * QK_K + il; + + const float d = (float)x[i].d[0]; + const float m = (float)x[i].d[1]; + + device const uint16_t * a = (device const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + for (int l = 0; l < 4; ++l) { + sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16]) + + d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]); + } + } +#endif sum[ith] = sumf; @@ -1401,7 +1596,7 @@ kernel void kernel_mul_mat_q4_k_f32( //} } -kernel void kernel_mul_mat_q5_k_f32( +kernel void kernel_mul_mat_q5_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1413,21 +1608,25 @@ kernel void kernel_mul_mat_q5_k_f32( uint2 tpitg[[thread_position_in_threadgroup]], uint2 tptg[[threads_per_threadgroup]]) { - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - const int nb = ne00/QK_K; const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - device const block_q5_k * x = (device const block_q5_k *) src0 + r0*nb; + device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb; device const float * yy = (device const float *) src1 + r1*ne10; const int nth = tptg.x*tptg.y; const int ith = tptg.y*tpitg.x + tpitg.y; + float sumf = 0; + +#if QK_K == 256 + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + const int tid = tpitg.y; // 0...16 const int il = tid/4; // 0...3 const int ir = tid - 4*il;// 0...3 @@ -1447,7 +1646,6 @@ kernel void kernel_mul_mat_q5_k_f32( uchar2 sc1, sc2, sc3, sc4; - float sumf = 0; for (int i = tpitg.x; i < nb; i += tptg.x) { device const uint8_t * q1 = (x + i)->qs + q_offset; @@ -1479,6 +1677,28 @@ kernel void kernel_mul_mat_q5_k_f32( sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin; } +#else + const int il = 4 * tpitg.x; // 0, 4, 8, 12 + const int im = il/8; // 0, 0, 1, 1 + const int in = il%8; // 0, 4, 0, 4 + + for (int i = tpitg.y; i < nb; i += tptg.y) { + + const float d = (float)x[i].d; + device const uint8_t * q = x[i].qs + il; + device const uint8_t * h = x[i].qh + in; + device const int8_t * s = x[i].scales; + device const float * y = yy + i*QK_K + il; + + for (int l = 0; l < 4; ++l) { + const uint8_t hl = h[l] >> im; + sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16)) + + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16)) + + y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16)) + + y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16)); + } + } +#endif sum[ith] = sumf; // @@ -1500,7 +1720,7 @@ kernel void kernel_mul_mat_q5_k_f32( } -kernel void kernel_mul_mat_q6_k_f32( +kernel void kernel_mul_mat_q6_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1522,12 +1742,15 @@ kernel void kernel_mul_mat_q6_k_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; - device const block_q6_k * x = (device const block_q6_k *) src0 + r0*nb; + device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb; device const float * yy = (device const float *) src1 + r1*ne10; const int nth = tptg.x*tptg.y; const int ith = tptg.y*tpitg.x + tpitg.y; + float sumf = 0; + +#if QK_K == 256 // Note: we absolutely assume that tptg.y = 16 and QK_K = 256! const int iqs = 16 * tpitg.y; const int ip = iqs / 128; // 0 or 1 @@ -1540,7 +1763,6 @@ kernel void kernel_mul_mat_q6_k_f32( const int q_offset_l = 64*ip + l0; const int q_offset_h = 32*ip + l0; - float sumf = 0; for (int i = tpitg.x; i < nb; i += tptg.x) { device const uint8_t * ql = x[i].ql + q_offset_l; @@ -1562,6 +1784,28 @@ kernel void kernel_mul_mat_q6_k_f32( sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]); } +#else + const int il = 4*tpitg.x; // 0, 4, 8, 12 + + for (int i = tpitg.y; i < nb; i += tptg.y) { + device const float * y = yy + i * QK_K + il; + device const uint8_t * ql = x[i].ql + il; + device const uint8_t * qh = x[i].qh + il; + device const int8_t * s = x[i].scales; + + const float d = x[i].d; + + float4 sums = {0.f, 0.f, 0.f, 0.f}; + for (int l = 0; l < 4; ++l) { + sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); + sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); + sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32); + sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32); + } + sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]); + } + +#endif sum[ith] = sumf; diff --git a/k_quants.c b/k_quants.c index a48c82171..46dd884b0 100644 --- a/k_quants.c +++ b/k_quants.c @@ -261,6 +261,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } +#if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; @@ -269,6 +270,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * *m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); } } +#endif //========================- 2-bit (de)-quantization @@ -330,11 +332,17 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } +#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); } } +#else + for (int l = 0; l < 16; ++l) { + y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); + } +#endif x += QK_K; @@ -352,6 +360,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int const uint8_t * q = x[i].qs; +#if QK_K == 256 int is = 0; float dl, ml; for (int n = 0; n < QK_K; n += 128) { @@ -370,7 +379,19 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int } q += 32; } - +#else + float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4); + float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4); + float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4); + float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4); + for (int l = 0; l < 16; ++l) { + y[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1; + y[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2; + y[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3; + y[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4; + } + y += QK_K; +#endif } } @@ -412,6 +433,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } } +#if QK_K == 256 memset(y[i].scales, 0, 12); if (max_scale) { float iscale = -32.f/max_scale; @@ -445,9 +467,39 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict L[16*j + ii] = l + 4; } } +#else + if (max_scale) { + float iscale = -8.f/max_scale; + for (int j = 0; j < QK_K/16; j+=2) { + int l1 = nearest_int(iscale*scales[j]); + l1 = 8 + MAX(-8, MIN(7, l1)); + int l2 = nearest_int(iscale*scales[j+1]); + l2 = 8 + MAX(-8, MIN(7, l2)); + y[i].scales[j/2] = l1 | (l2 << 4); + } + y[i].d = ggml_fp32_to_fp16(1/iscale); + } else { + for (int j = 0; j < QK_K/16; j+=2) { + y[i].scales[j/2] = 0; + } + y[i].d = ggml_fp32_to_fp16(0.f); + } + for (int j = 0; j < QK_K/16; ++j) { + int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; + float d = ggml_fp16_to_fp32(y[i].d) * (s - 8); + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); + L[16*j + ii] = l + 4; + } + } +#endif memset(y[i].hmask, 0, QK_K/8); - // We put the high-bit for the 1st 32 quants into bit 0, the next 32 into bit 1, etc. + // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. int m = 0; uint8_t hm = 1; for (int j = 0; j < QK_K; ++j) { @@ -459,19 +511,25 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict m = 0; hm <<= 1; } } +#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); } } +#else + for (int l = 0; l < 16; ++l) { + y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); + } +#endif x += QK_K; } } +#if QK_K == 256 void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); - assert(QK_K == 256); const int nb = k / QK_K; const uint32_t kmask1 = 0x03030303; @@ -519,6 +577,39 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } } +#else +void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + assert(QK_K == 64); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d_all = ggml_fp16_to_fp32(x[i].d); + + const uint8_t * restrict q = x[i].qs; + const uint8_t * restrict hm = x[i].hmask; + + const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); + const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); + const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); + const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); + + for (int l=0; l<8; ++l) { + uint8_t h = hm[l]; + y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4)); + y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4)); + y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4)); + y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4)); + y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4)); + y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4)); + y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4)); + y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4)); + } + y += QK_K; + } +} +#endif void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q3_K_reference(x, vy, k); @@ -563,6 +654,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } +#if QK_K == 256 float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; float inv_min = max_min > 0 ? 63.f/max_min : 0.f; for (int j = 0; j < QK_K/32; ++j) { @@ -594,9 +686,43 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict L[32*j + ii] = l; } } +#else + const float s_factor = 15.f; + float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; + float inv_min = max_min > 0 ? s_factor/max_min : 0.f; + int d1 = nearest_int(inv_scale*scales[0]); + int m1 = nearest_int(inv_min*mins[0]); + int d2 = nearest_int(inv_scale*scales[1]); + int m2 = nearest_int(inv_min*mins[1]); + y[i].scales[0] = d1 | (m1 << 4); + y[i].scales[1] = d2 | (m2 << 4); + y[i].d[0] = ggml_fp32_to_fp16(max_scale/s_factor); + y[i].d[1] = ggml_fp32_to_fp16(max_min/s_factor); + + float sumlx = 0; + int suml2 = 0; + for (int j = 0; j < QK_K/32; ++j) { + const uint8_t sd = y[i].scales[j] & 0xF; + const uint8_t sm = y[i].scales[j] >> 4; + const float d = ggml_fp16_to_fp32(y[i].d[0]) * sd; + if (!d) continue; + const float m = ggml_fp16_to_fp32(y[i].d[1]) * sm; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + m)/d); + l = MAX(0, MIN(15, l)); + L[32*j + ii] = l; + sumlx += (x[32*j + ii] + m)*l*sd; + suml2 += l*l*sd*sd; + } + } + if (suml2) { + y[i].d[0] = ggml_fp32_to_fp16(sumlx/suml2); + } +#endif uint8_t * q = y[i].qs; for (int j = 0; j < QK_K; j += 64) { - for (int l = 0; l < 32; ++l) *q++ = L[j + l] | (L[j + l + 32] << 4); + for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4); + q += 32; } x += QK_K; @@ -610,11 +736,13 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); - const float min = ggml_fp16_to_fp32(x[i].dmin); - const uint8_t * q = x[i].qs; +#if QK_K == 256 + + const float d = ggml_fp16_to_fp32(x[i].d); + const float min = ggml_fp16_to_fp32(x[i].dmin); + int is = 0; uint8_t sc, m; for (int j = 0; j < QK_K; j += 64) { @@ -626,6 +754,17 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; q += 32; is += 2; } +#else + const float dall = ggml_fp16_to_fp32(x[i].d[0]); + const float mall = ggml_fp16_to_fp32(x[i].d[1]); + const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4); + const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4); + for (int l = 0; l < 32; ++l) { + y[l+ 0] = d1 * (q[l] & 0xF) - m1; + y[l+32] = d2 * (q[l] >> 4) - m2; + } + y += QK_K; +#endif } } @@ -653,12 +792,19 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict assert(k % QK_K == 0); const int nb = k / QK_K; +#if QK_K == 256 uint8_t L[QK_K]; float mins[QK_K/32]; float scales[QK_K/32]; +#else + int8_t L[QK_K]; + float scales[QK_K/16]; +#endif for (int i = 0; i < nb; i++) { +#if QK_K == 256 + float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { @@ -725,6 +871,52 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict m1 <<= 2; m2 <<= 2; ql += 32; } +#else + float max_scale = 0, amax = 0; + for (int j = 0; j < QK_K/16; ++j) { + scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1); + float abs_scale = fabsf(scales[j]); + if (abs_scale > amax) { + amax = abs_scale; + max_scale = scales[j]; + } + } + + float iscale = -128.f/max_scale; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(iscale*scales[j]); + y[i].scales[j] = MAX(-128, MIN(127, l)); + } + y[i].d = ggml_fp32_to_fp16(1/iscale); + + for (int j = 0; j < QK_K/16; ++j) { + const float d = ggml_fp16_to_fp32(y[i].d) * y[i].scales[j]; + if (!d) continue; + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-16, MIN(15, l)); + L[16*j + ii] = l + 16; + } + } + + uint8_t * restrict qh = y[i].qh; + uint8_t * restrict ql = y[i].qs; + memset(qh, 0, QK_K/8); + + for (int j = 0; j < 32; ++j) { + int jm = j%8; + int is = j/8; + int l1 = L[j]; + if (l1 > 15) { + l1 -= 16; qh[jm] |= (1 << is); + } + int l2 = L[j + 32]; + if (l2 > 15) { + l2 -= 16; qh[jm] |= (1 << (4 + is)); + } + ql[j] = l1 | (l2 << 4); + } +#endif x += QK_K; @@ -737,12 +929,14 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int for (int i = 0; i < nb; i++) { - const float d = ggml_fp16_to_fp32(x[i].d); - const float min = ggml_fp16_to_fp32(x[i].dmin); - const uint8_t * ql = x[i].qs; const uint8_t * qh = x[i].qh; +#if QK_K == 256 + + const float d = ggml_fp16_to_fp32(x[i].d); + const float min = ggml_fp16_to_fp32(x[i].dmin); + int is = 0; uint8_t sc, m; uint8_t u1 = 1, u2 = 2; @@ -756,6 +950,21 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int ql += 32; is += 2; u1 <<= 2; u2 <<= 2; } +#else + float d = ggml_fp16_to_fp32(x[i].d); + const int8_t * restrict s = x[i].scales; + for (int l = 0; l < 8; ++l) { + y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); + y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); + y[l+16] = d * s[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16)); + y[l+24] = d * s[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16)); + y[l+32] = d * s[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16)); + y[l+40] = d * s[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16)); + y[l+48] = d * s[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16)); + y[l+56] = d * s[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16)); + } + y += QK_K; +#endif } } @@ -823,6 +1032,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict uint8_t * restrict ql = y[i].ql; uint8_t * restrict qh = y[i].qh; +#if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { const uint8_t q1 = L[j + l + 0] & 0xF; @@ -836,6 +1046,16 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict ql += 64; qh += 32; } +#else + for (int l = 0; l < 32; ++l) { + const uint8_t q1 = L[l + 0] & 0xF; + const uint8_t q2 = L[l + 32] & 0xF; + ql[l] = q1 | (q2 << 4); + } + for (int l = 0; l < 16; ++l) { + qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6); + } +#endif x += QK_K; @@ -854,6 +1074,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int const uint8_t * restrict qh = x[i].qh; const int8_t * restrict sc = x[i].scales; +#if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { for (int l = 0; l < 32; ++l) { int is = l/16; @@ -871,6 +1092,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int qh += 32; sc += 8; } +#else + for (int l = 0; l < 16; ++l) { + const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + y[l+ 0] = d * sc[0] * q1; + y[l+16] = d * sc[1] * q2; + y[l+32] = d * sc[2] * q3; + y[l+48] = d * sc[3] * q4; + } + y += 64; +#endif } } @@ -1002,6 +1236,7 @@ static inline __m128i get_scale_shuffle(int i) { } #endif +#if QK_K == 256 void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const block_q2_K * restrict x = vx; @@ -1201,6 +1436,168 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #endif } +#else + +void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + + const block_q2_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + + const uint8x16_t m3 = vdupq_n_u8(0x3); + const int32x4_t vzero = vdupq_n_s32(0); + + int8x16x4_t q2bytes; + + uint32_t aux32[2]; + const uint8_t * scales = (const uint8_t *)aux32; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * (float)x[i].d; + const float dmin = -y[i].d * (float)x[i].dmin; + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + + aux32[0] = sc[0] & 0x0f0f0f0f; + aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; + + sum += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]); + + int isum1 = 0, isum2 = 0; + + const uint8x16_t q2bits = vld1q_u8(q2); + + const int8x16x4_t q8bytes = vld1q_s8_x4(q8); + + q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3)); + q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3)); + q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3)); + q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3)); + +#if defined(__ARM_FEATURE_DOTPROD) + isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0]; + isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1]; + isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2]; + isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3]; +#else + const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])), + vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0]))); + const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])), + vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1]))); + isum1 += vaddvq_s16(p1) * scales[0]; + isum2 += vaddvq_s16(p2) * scales[1]; + + const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[2]), vget_low_s8 (q8bytes.val[2])), + vmull_s8(vget_high_s8(q2bytes.val[2]), vget_high_s8(q8bytes.val[2]))); + const int16x8_t p4 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[3]), vget_low_s8 (q8bytes.val[3])), + vmull_s8(vget_high_s8(q2bytes.val[3]), vget_high_s8(q8bytes.val[3]))); + isum1 += vaddvq_s16(p3) * scales[2]; + isum2 += vaddvq_s16(p4) * scales[3]; +#endif + sum += d * (isum1 + isum2); + + } + + *s = sum; + +#elif defined __AVX2__ + + const __m256i m3 = _mm256_set1_epi8(3); + + __m256 acc = _mm256_setzero_ps(); + + uint32_t ud, um; + const uint8_t * restrict db = (const uint8_t *)&ud; + const uint8_t * restrict mb = (const uint8_t *)&um; + + float summs = 0; + + // TODO: optimize this + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + ud = (sc[0] >> 0) & 0x0f0f0f0f; + um = (sc[0] >> 4) & 0x0f0f0f0f; + + int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3]; + summs += dmin * smin; + + const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); + const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3); + const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + const __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); + const __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); + + const __m256i p_0 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 0)); + const __m256i p_1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 1)); + const __m256i p_2 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 0)); + const __m256i p_3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 1)); + + acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0), acc); + acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1), acc); + acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2), acc); + acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3), acc); + } + + *s = hsum_float_8(acc) + summs; + +#else + + float sumf = 0; + + int isum[4]; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + + int summs = 0; + for (int j = 0; j < QK_K/16; ++j) { + summs += y[i].bsums[j] * (sc[j] >> 4); + } + + const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin); + + isum[0] = isum[1] = isum[2] = isum[3] = 0; + for (int l = 0; l < 16; ++l) { + isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3); + isum[1] += q8[l+16] * ((q2[l] >> 2) & 3); + isum[2] += q8[l+32] * ((q2[l] >> 4) & 3); + isum[3] += q8[l+48] * ((q2[l] >> 6) & 3); + } + for (int l = 0; l < 4; ++l) { + isum[l] *= (sc[l] & 0xF); + } + sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs; + } + *s = sumf; +#endif +} +#endif + +#if QK_K == 256 void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); @@ -1501,6 +1898,206 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri } +#else + +void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_q3_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + +#ifdef __ARM_FEATURE_DOTPROD + const int32x4_t vzero = vdupq_n_s32(0); +#endif + + const uint8x16_t m3b = vdupq_n_u8(0x3); + const uint8x16_t mh = vdupq_n_u8(4); + + int8x16x4_t q3bytes; + + uint16_t aux16[2]; + int8_t * scales = (int8_t *)aux16; + + float sum = 0; + + for (int i = 0; i < nb; ++i) { + + uint8x16x4_t q3h; + + const uint8x8_t hbits = vld1_u8(x[i].hmask); + const uint8x16_t q3bits = vld1q_u8(x[i].qs); + const int8x16x4_t q8bytes = vld1q_s8_x4(y[i].qs); + + const uint16_t a = *(const uint16_t *)x[i].scales; + aux16[0] = a & 0x0f0f; + aux16[1] = (a >> 4) & 0x0f0f; + + for (int j = 0; j < 4; ++j) scales[j] -= 8; + + int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); + + const float d = y[i].d * (float)x[i].d; + + const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1)); + q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2)); + q3h.val[1] = vandq_u8(mh, htmp); + q3h.val[2] = vandq_u8(mh, vshrq_n_u8(htmp, 2)); + q3h.val[3] = vandq_u8(mh, vshrq_n_u8(htmp, 4)); + + q3bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q3bits, m3b), q3h.val[0])); + q3bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 2), m3b), q3h.val[1])); + q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2])); + q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3])); + +#if defined(__ARM_FEATURE_DOTPROD) + isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0]; + isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2]; + isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1]; + isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3]; +#else + const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes.val[0])), + vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes.val[0]))); + const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes.val[1])), + vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes.val[1]))); + const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes.val[2])), + vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes.val[2]))); + const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes.val[3])), + vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes.val[3]))); + isum += vaddvq_s16(p0) * scales[0] + vaddvq_s16(p1) * scales[2] + vaddvq_s16(p2) * scales[1] + vaddvq_s16(p3) * scales[3]; +#endif + + sum += d * isum; + + } + + *s = sum; + +#elif defined __AVX2__ + + const __m256i m3 = _mm256_set1_epi8(3); + const __m256i m1 = _mm256_set1_epi8(1); + + __m256 acc = _mm256_setzero_ps(); + + uint64_t aux64; + + uint16_t aux16[2]; + const int8_t * aux8 = (const int8_t *)aux16; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + + const uint8_t * restrict q3 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const uint16_t a = *(const uint16_t *)x[i].scales; + aux16[0] = a & 0x0f0f; + aux16[1] = (a >> 4) & 0x0f0f; + + const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8)); + const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8)); + + memcpy(&aux64, x[i].hmask, 8); + + const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0); + __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux); + __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4); + q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2); + q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2); + + // load low 2 bits + const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3); + + // prepare low and high bits + const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits); + const __m256i q3l_0 = _mm256_and_si256(q3aux, m3); + const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3); + + // load Q8 quants + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, + // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + const __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); + const __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); + + __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); + + p16_0 = _mm256_sub_epi16(p16_0, q8s_0); + p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + + // multiply with scales + p16_0 = _mm256_madd_epi16(scale_0, p16_0); + p16_1 = _mm256_madd_epi16(scale_1, p16_1); + + p16_0 = _mm256_add_epi32(p16_0, p16_1); + + // multiply with block scale and accumulate + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16_0), acc); + + } + + *s = hsum_float_8(acc); + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + int32_t scales[4]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict hm = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + int8_t * restrict a = aux8; + for (int l = 0; l < 8; ++l) { + a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4); + a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 0 : 4); + a[l+16] = (int8_t)((q3[l+0] >> 2) & 3) - (hm[l] & 0x04 ? 0 : 4); + a[l+24] = (int8_t)((q3[l+8] >> 2) & 3) - (hm[l] & 0x08 ? 0 : 4); + a[l+32] = (int8_t)((q3[l+0] >> 4) & 3) - (hm[l] & 0x10 ? 0 : 4); + a[l+40] = (int8_t)((q3[l+8] >> 4) & 3) - (hm[l] & 0x20 ? 0 : 4); + a[l+48] = (int8_t)((q3[l+0] >> 6) & 3) - (hm[l] & 0x40 ? 0 : 4); + a[l+56] = (int8_t)((q3[l+8] >> 6) & 3) - (hm[l] & 0x80 ? 0 : 4); + } + + scales[0] = (x[i].scales[0] & 0xF) - 8; + scales[1] = (x[i].scales[0] >> 4) - 8; + scales[2] = (x[i].scales[1] & 0xF) - 8; + scales[3] = (x[i].scales[1] >> 4) - 8; + + memset(aux32, 0, 8*sizeof(int32_t)); + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] += q8[l] * a[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l]; + } + const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +#endif + +} +#endif + +#if QK_K == 256 void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); @@ -1614,9 +2211,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); const uint32_t uaux = utmp[1] & kmask1; @@ -1624,6 +2218,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri utmp[2] = uaux; utmp[0] &= kmask1; + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums); @@ -1726,7 +2323,176 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri *s = sumf; #endif } +#else +void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + const block_q4_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + + const uint8x16_t m4b = vdupq_n_u8(0xf); + +#ifdef __ARM_FEATURE_DOTPROD + const int32x4_t mzero = vdupq_n_s32(0); +#endif + + float sumf = 0; + + int8x16x2_t q4bytes; + int8x16x4_t q8bytes; + + float sum_mins = 0.f; + + uint16_t aux16[2]; + const uint8_t * restrict scales = (const uint8_t *)aux16; + + for (int i = 0; i < nb; ++i) { + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const uint16_t * restrict a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]); + sum_mins += y[i].d * (float)x[i].d[1] * summi; + + const float d = y[i].d * (float)x[i].d[0]; + + const uint8x16x2_t q4bits = vld1q_u8_x2(q4); + +#ifdef __ARM_FEATURE_DOTPROD + q8bytes = vld1q_s8_x4(q8); + q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); + q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); + + const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + const int32_t sumi1 = vaddvq_s32(p1) * scales[0]; + + q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); + q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); + + const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]); + const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; + +#else + q8bytes = vld1q_s8_x4(q8); + q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); + q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); + const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), + vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0]))); + const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])), + vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1]))); + int32_t sumi1 = vaddvq_s16(vaddq_s16(p0, p1)) * scales[0]; + + q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); + q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); + const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[2])), + vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[2]))); + const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[3])), + vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[3]))); + int32_t sumi2 = vaddvq_s16(vaddq_s16(p2, p3)) * scales[1]; + +#endif + sumf += d * (sumi1 + sumi2); + + } + + *s = sumf - sum_mins; + +#elif defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + uint16_t aux16[2]; + const uint8_t * scales = (const uint8_t *)aux16; + + for (int i = 0; i < nb; ++i) { + + const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d; + const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d; + const __m256 vd = _mm256_set1_ps(d); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); + const __m256i q4l = _mm256_and_si256(q4bits, m4); + const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4); + + const __m256i q8l = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8h = _mm256_loadu_si256((const __m256i*)(q8+32)); + + const __m256i p16l = _mm256_maddubs_epi16(q4l, q8l); + const __m256i p16h = _mm256_maddubs_epi16(q4h, q8h); + + const __m256i p32l = _mm256_madd_epi16(_mm256_set1_epi16(scales[0]), p16l); + acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32l), acc); + + const __m256i p32h = _mm256_madd_epi16(_mm256_set1_epi16(scales[1]), p16h); + acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32h), acc); + + } + + *s = hsum_float_8(acc) - summs; + +#else + + uint8_t aux8[QK_K]; + int16_t aux16[16]; + float sums [8]; + memset(sums, 0, 8*sizeof(float)); + + uint16_t s16[2]; + const uint8_t * restrict scales = (const uint8_t *)s16; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + uint8_t * restrict a = aux8; + for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF; + for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4; + + const uint16_t * restrict b = (const uint16_t *)x[i].scales; + s16[0] = b[0] & 0x0f0f; + s16[1] = (b[0] >> 4) & 0x0f0f; + + sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); + + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]); + + for (int j = 0; j < QK_K/32; ++j) { + for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; + q8 += 16; a += 16; + for (int l = 0; l < 16; ++l) aux16[l] += q8[l] * a[l]; + q8 += 16; a += 16; + const float dl = d * scales[j]; + for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[l+8]); + } + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} +#endif + +#if QK_K == 256 void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); @@ -1840,18 +2606,23 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); - const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); - const uint8_t * restrict q5 = x[i].qs; const int8_t * restrict q8 = y[i].qs; +#if QK_K == 256 + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); const uint32_t uaux = utmp[1] & kmask1; utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[2] = uaux; utmp[0] &= kmask1; +#else + // TODO + const float d = 0, dmin = 0; +#endif const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -1972,8 +2743,169 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #endif } +#else + +void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_q5_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + + const uint8x16_t m4b = vdupq_n_u8(0xf); + const int32x4_t mzero = vdupq_n_s32(0); + const uint8x16_t mh = vdupq_n_u8(16); + + int8x16x4_t q5bytes; + uint8x16x4_t q5h; + + float sumf = 0; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * (float)x[i].d; + const int8_t * sc = x[i].scales; + + const uint8_t * restrict q5 = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const uint8x8_t qhbits = vld1_u8(qh); + + const uint8x16x2_t q5bits = vld1q_u8_x2(q5); + const int8x16x4_t q8bytes = vld1q_s8_x4(q8); + + const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1)); + q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4)); + q5h.val[1] = vbicq_u8(mh, vshlq_n_u8(htmp, 2)); + q5h.val[2] = vbicq_u8(mh, htmp); + q5h.val[3] = vbicq_u8(mh, vshrq_n_u8(htmp, 2)); + + q5bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[0], m4b)), vreinterpretq_s8_u8(q5h.val[0])); + q5bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[1], m4b)), vreinterpretq_s8_u8(q5h.val[1])); + q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2])); + q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3])); + +#if defined(__ARM_FEATURE_DOTPROD) + + int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0])); + int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1])); + int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2])); + int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3])); + + sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); + +#else + + const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])), + vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0]))); + const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])), + vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1]))); + int32_t sumi = sc[0] * vaddvq_s16(p0) + sc[1] * vaddvq_s16(p1); + + const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])), + vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2]))); + const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])), + vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3]))); + sumi += sc[2] * vaddvq_s16(p2) + sc[3] * vaddvq_s16(p3); + + sumf += d*sumi; +#endif + + } + + *s = sumf; + +#elif defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m256i mone = _mm256_set1_epi8(1); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const uint8_t * restrict q5 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + + const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); + + const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0])); + const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2])); + + int64_t aux64; + memcpy(&aux64, x[i].qh, 8); + const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64); + const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128); + + const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4); + const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4); + + const __m256i q5l_0 = _mm256_and_si256(q5bits, m4); + const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + const __m256i p16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5l_0, q8_0)); + const __m256i p16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5l_1, q8_1)); + const __m256i s16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5h_0, q8_0)); + const __m256i s16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5h_1, q8_1)); + + const __m256i dot = _mm256_sub_epi32(_mm256_add_epi32(p16_0, p16_1), _mm256_add_epi32(s16_0, s16_1)); + + acc = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(dot), acc); + + } + + *s = hsum_float_8(acc); + +#else + uint8_t aux8[QK_K]; + int16_t aux16[16]; + float sums [8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].qs; + const uint8_t * restrict hm = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + uint8_t * restrict a = aux8; + for (int l = 0; l < 32; ++l) { + a[l+ 0] = q4[l] & 0xF; + a[l+32] = q4[l] >> 4; + } + for (int is = 0; is < 8; ++is) { + uint8_t m = 1 << is; + for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); + } + + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const int8_t * restrict sc = x[i].scales; + + for (int j = 0; j < QK_K/16; ++j) { + const float dl = d * sc[j]; + for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]); + q8 += 16; a += 16; + } + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} +#endif + + +#if QK_K == 256 void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK_K == 0); @@ -2242,3 +3174,179 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri *s = sumf; #endif } + +#else + +void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_q6_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#ifdef __ARM_NEON + + float sum = 0; + + const uint8x16_t m4b = vdupq_n_u8(0xF); + const int32x4_t vzero = vdupq_n_s32(0); + const int8x16_t m32s = vdupq_n_s8(32); + + const uint8x16_t mone = vdupq_n_u8(3); + + int8x16x4_t q6bytes; + uint8x16x4_t q6h; + + for (int i = 0; i < nb; ++i) { + + const float d_all = (float)x[i].d; + + const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const int8_t * restrict scale = x[i].scales; + + int32_t isum = 0; + + uint8x16_t qhbits = vld1q_u8(qh); + uint8x16x2_t q6bits = vld1q_u8_x2(q6); + int8x16x4_t q8bytes = vld1q_s8_x4(q8); + + q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4); + uint8x16_t shifted = vshrq_n_u8(qhbits, 2); + q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits, 4); + q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + shifted = vshrq_n_u8(qhbits, 6); + q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); + + q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s); + q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s); + q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s); + q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s); + +#if defined(__ARM_FEATURE_DOTPROD) + + isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; +#else + + int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])), + vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0]))); + int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])), + vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1]))); + isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1]; + + int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])), + vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2]))); + int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])), + vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3]))); + isum += vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3]; +#endif + + sum += isum * d_all * y[i].d; + + } + *s = sum; + +#elif defined __AVX2__ + + const __m256i m4 = _mm256_set1_epi8(0xF); + const __m256i m2 = _mm256_set1_epi8(3); + const __m256i m32s = _mm256_set1_epi8(32); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + + const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); + const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); + const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]); + const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]); + + __m256i sumi = _mm256_setzero_si256(); + + const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1); + const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3); + + const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); + const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh); + + const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4); + const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4); + + const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); + const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1); + + const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0)); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32)); + + __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); + __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); + + __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); + __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); + + p16_0 = _mm256_sub_epi16(p16_0, q8s_0); + p16_1 = _mm256_sub_epi16(p16_1, q8s_1); + + p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); + p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); + + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); + + acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); + } + + *s = hsum_float_8(acc); + +#else + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * restrict a = aux8; + for (int l = 0; l < 16; ++l) { + a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l+32] = (int8_t)((q4[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l+48] = (int8_t)((q4[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +#endif +} + +#endif diff --git a/k_quants.h b/k_quants.h index 10a0baac7..6abe3d7b8 100644 --- a/k_quants.h +++ b/k_quants.h @@ -7,7 +7,13 @@ #include // Super-block size +#ifdef GGML_QKK_64 +#define QK_K 64 +#define K_SCALE_SIZE 4 +#else #define QK_K 256 +#define K_SCALE_SIZE 12 +#endif // // Super-block quantization structures @@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w // weight is represented as x = a * q // 16 blocks of 16 elemenets each // Effectively 3.4375 bits per weight +#ifdef GGML_QKK_64 typedef struct { uint8_t hmask[QK_K/8]; // quants - high bit uint8_t qs[QK_K/4]; // quants - low 2 bits - uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits + uint8_t scales[2]; ggml_fp16_t d; // super-block scale } block_q3_K; -static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding"); +static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding"); +#else +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits + uint8_t scales[12]; // scales, quantized with 6 bits + ggml_fp16_t d; // super-block scale +} block_q3_K; +static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding"); +#endif // 4-bit quantization // 16 blocks of 32 elements each // weight is represented as x = a * q + b // Effectively 4.5 bits per weight +#ifdef GGML_QKK_64 +typedef struct { + ggml_fp16_t d[2]; // super-block scales/mins + uint8_t scales[2]; // 4-bit block scales/mins + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); +#else typedef struct { ggml_fp16_t d; // super-block scale for quantized scales ggml_fp16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding"); +#endif // 5-bit quantization // 16 blocks of 32 elements each // weight is represented as x = a * q + b // Effectively 5.5 bits per weight +#ifdef GGML_QKK_64 typedef struct { - ggml_fp16_t d; // super-block scale for quantized scales - ggml_fp16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + ggml_fp16_t d; // super-block scale + int8_t scales[QK_K/16]; // 8-bit block scales uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits } block_q5_K; -static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); +#else +typedef struct { + ggml_fp16_t d; // super-block scale for quantized scales + ggml_fp16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +#endif // 6-bit quantization // weight is represented as x = a * q diff --git a/llama.cpp b/llama.cpp index ac22a48f8..c41c2a8a3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,9 +21,13 @@ #endif #ifdef GGML_USE_K_QUANTS #ifndef QK_K +#ifdef GGML_QKK_64 +#define QK_K 64 +#else #define QK_K 256 #endif #endif +#endif #include #include @@ -2470,6 +2474,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector workers; std::mutex mutex; + auto use_more_bits = [] (int i_layer, int num_layers) -> bool { + return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; + }; + size_t idx = 0; for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { llama_buffer read_data; @@ -2524,15 +2532,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 || - (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K; + use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; + else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && + (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; ++i_attention_wv; } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 || - (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K; + use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K; ++i_feed_forward_w2; } else if (tensor.name.find("attention.wo.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; From 412c60e4739367144e51e59add5dc7749d084115 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 26 Jun 2023 19:45:09 +0300 Subject: [PATCH 26/29] readme : add link to new k-quants for visibility --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ad1a5cfc0..670f35eca 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001 - New roadmap: https://github.com/users/ggerganov/projects/7 - Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985 - p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1 From 5743ca80928d8410754ec64a5673d5c2dd6cfbb7 Mon Sep 17 00:00:00 2001 From: katsu560 <118887472+katsu560@users.noreply.github.com> Date: Tue, 27 Jun 2023 01:46:07 +0900 Subject: [PATCH 27/29] k-quants : add AVX support to dot functions (#1916) * k_quants : add AVX support * k_quants : apply review comments --- k_quants.c | 547 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 547 insertions(+) diff --git a/k_quants.c b/k_quants.c index 46dd884b0..923467d7c 100644 --- a/k_quants.c +++ b/k_quants.c @@ -1393,6 +1393,112 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri *s = hsum_float_8(acc); +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(0x3); + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m2 = _mm_set1_epi8(0x2); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + + const uint8_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + // load mins and scales from block_q2_K.scales[QK_K/16] + const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); + const __m128i scales16 = _mm_and_si128(mins_and_scales, m4); + const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); + const __m128i mins_0 = _mm_cvtepi8_epi16(mins16); + const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16)); + + // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2 + const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0])); + const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8])); + + // sumf += -dmin * summs in 32bits*8 + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc); + + const __m128i scales_0 = _mm_cvtepi8_epi16(scales16); + const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16)); + const __m128i scales[2] = { scales_0, scales_1 }; + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + for (int j = 0; j < QK_K/128; ++j) { + + // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K] + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + // load 2bits*16*8 from block_q2_K.qs[QK_K/4] + __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; + const __m128i q2_0 = _mm_and_si128(q2bits, m3); + const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); + const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); + const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); + q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; + const __m128i q2_1 = _mm_and_si128(q2bits, m3); + const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); + const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); + const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); + + // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8 + __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0); + __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1); + __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2); + __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3); + __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4); + __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5); + __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6); + __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7); + + // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8 + __m128i shuffle = _mm_set1_epi16(0x0100); + p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0); + shuffle = _mm_add_epi16(shuffle, m2); + p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1); + shuffle = _mm_add_epi16(shuffle, m2); + p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2); + shuffle = _mm_add_epi16(shuffle, m2); + p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3); + shuffle = _mm_add_epi16(shuffle, m2); + p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4); + shuffle = _mm_add_epi16(shuffle, m2); + p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5); + shuffle = _mm_add_epi16(shuffle, m2); + p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6); + shuffle = _mm_add_epi16(shuffle, m2); + p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7); + + p0 = _mm_add_epi32(p0, p1); + p2 = _mm_add_epi32(p2, p3); + p4 = _mm_add_epi32(p4, p5); + p6 = _mm_add_epi32(p6, p7); + + // isum in 32bits*4*2 + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6)); + } + + // sumf += dall * isum - dmin * summs in 32bits + __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc); + } + + *s = hsum_float_8(acc); + #else float sumf = 0; @@ -1831,6 +1937,147 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri *s = hsum_float_8(acc); +#elif defined __AVX__ + + const __m128i m3 = _mm_set1_epi8(3); + const __m128i mone = _mm_set1_epi8(1); + const __m128i m32 = _mm_set1_epi8(32); + const __m128i m2 = _mm_set1_epi8(2); + + __m256 acc = _mm256_setzero_ps(); + + uint32_t *aux; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + + const uint8_t * restrict q3 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + // Set up scales + aux = (uint32_t *)x[i].scales; + __m128i scales128 = _mm_set_epi32( + ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), + ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), + (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), + (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); + scales128 = _mm_sub_epi8(scales128, m32); + const __m128i scales_0 = _mm_cvtepi8_epi16(scales128); + const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128)); + const __m128i scales[2] = { scales_0, scales_1 }; + + // high bit *128*2 from block_q3_K.hmask[QK_K/8] + const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]); + const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]); + + // integer accumulator + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + for (int j = 0; j < QK_K/128; ++j) { + // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4] + const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; + const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; + + // prepare low and high bits + const int bit = j << 2; + const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3); + const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3); + const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2); + const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2); + + const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3); + const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3); + const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2); + const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2); + + const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3); + const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3); + const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2); + const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2); + + const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3); + const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3); + const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2); + const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2); + + // load Q8 quants from block_q8_K.qs[QK_K] + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, + // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, + // and 2 if the high bit was set) + __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0); + __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1); + __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2); + __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3); + __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4); + __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5); + __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6); + __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7); + + __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1); + __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2); + __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3); + __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4); + __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5); + __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6); + __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7); + + p16_0 = _mm_sub_epi16(p16_0, q8s_0); + p16_1 = _mm_sub_epi16(p16_1, q8s_1); + p16_2 = _mm_sub_epi16(p16_2, q8s_2); + p16_3 = _mm_sub_epi16(p16_3, q8s_3); + p16_4 = _mm_sub_epi16(p16_4, q8s_4); + p16_5 = _mm_sub_epi16(p16_5, q8s_5); + p16_6 = _mm_sub_epi16(p16_6, q8s_6); + p16_7 = _mm_sub_epi16(p16_7, q8s_7); + + // multiply with scales + __m128i shuffle = _mm_set1_epi16(0x0100); + p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0); + shuffle = _mm_add_epi16(shuffle, m2); + p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1); + shuffle = _mm_add_epi16(shuffle, m2); + p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2); + shuffle = _mm_add_epi16(shuffle, m2); + p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3); + shuffle = _mm_add_epi16(shuffle, m2); + p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4); + shuffle = _mm_add_epi16(shuffle, m2); + p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5); + shuffle = _mm_add_epi16(shuffle, m2); + p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6); + shuffle = _mm_add_epi16(shuffle, m2); + p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7); + + // accumulate + p16_0 = _mm_add_epi32(p16_0, p16_1); + p16_2 = _mm_add_epi32(p16_2, p16_3); + p16_4 = _mm_add_epi32(p16_4, p16_5); + p16_6 = _mm_add_epi32(p16_6, p16_7); + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6)); + + } + + // multiply with block scale and accumulate + __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); + + } + + *s = hsum_float_8(acc); + #else // scalar version // This function is written like this so the compiler can manage to vectorize most of it @@ -2264,6 +2511,88 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m2 = _mm_set1_epi8(0x2); + + __m256 acc = _mm256_setzero_ps(); + __m128 acc_m = _mm_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i scales = _mm_cvtepu8_epi16(utmps); + const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); + + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); + const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); + const __m128i prod = _mm_madd_epi16(mins, q8s); + acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m); + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + __m128i shuffle = _mm_set1_epi16(0x0100); + for (int j = 0; j < QK_K/64; ++j) { + + const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + + __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4l_0 = _mm_and_si128(q4bits, m4); + const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); + q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4l_1 = _mm_and_si128(q4bits, m4); + const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4); + + const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0); + p16l = _mm_madd_epi16(scale_l, p16l); + sumi_0 = _mm_add_epi32(sumi_0, p16l); + const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + p16l = _mm_maddubs_epi16(q4l_1, q8l_1); + p16l = _mm_madd_epi16(scale_l, p16l); + sumi_1 = _mm_add_epi32(sumi_1, p16l); + + const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0); + p16h = _mm_madd_epi16(scale_h, p16h); + sumi_0 = _mm_add_epi32(sumi_0, p16h); + const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + p16h = _mm_maddubs_epi16(q4h_1, q8h_1); + p16h = _mm_madd_epi16(scale_h, p16h); + sumi_1 = _mm_add_epi32(sumi_1, p16h); + + } + + __m256 vd = _mm256_set1_ps(d); + __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); + + } + + acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m)); + acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m)); + + *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); + #else @@ -2679,6 +3008,106 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri *s = hsum_float_8(acc) + summs; +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i mzero = _mm_setzero_si128(); + const __m128i mone = _mm_set1_epi8(1); + const __m128i m2 = _mm_set1_epi8(2); + + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.f; + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin); + + const uint8_t * restrict q5 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]); + const __m128i scales = _mm_cvtepu8_epi16(utmps); + const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps)); + + const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]); + const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]); + const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1); + const __m128i prod = _mm_madd_epi16(mins, q8s); + const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero); + summs += dmin * _mm_extract_epi32(hsum, 0); + + const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]); + const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]); + __m128i hmask = mone; + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + int bit = 0; + + __m128i shuffle = _mm_set1_epi16(0x0100); + for (int j = 0; j < QK_K/64; ++j) { + + const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi16(shuffle, m2); + + const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16; + const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16; + + __m128i q5l_0 = _mm_and_si128(q5bits_0, m4); + __m128i q5l_1 = _mm_and_si128(q5bits_1, m4); + __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4); + __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4); + __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0); + __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1); + hmask = _mm_slli_epi16(hmask, 1); + + __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1); + p16_0 = _mm_madd_epi16(scale_0, p16_0); + p16_1 = _mm_madd_epi16(scale_0, p16_1); + + q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4); + q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4); + q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4); + q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4); + q5_0 = _mm_add_epi8(q5l_0, q5h_0); + q5_1 = _mm_add_epi8(q5l_1, q5h_1); + hmask = _mm_slli_epi16(hmask, 1); + + q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0); + __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1); + p16_2 = _mm_madd_epi16(scale_1, p16_2); + p16_3 = _mm_madd_epi16(scale_1, p16_3); + + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); + + } + + __m256 vd = _mm256_set1_ps(d); + __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc); + + } + + *s = hsum_float_8(acc) + summs; + #else const uint8_t * scales = (const uint8_t*)&utmp[0]; @@ -3130,6 +3559,124 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri *s = hsum_float_8(acc); +#elif defined __AVX__ + + const __m128i m4 = _mm_set1_epi8(0xF); + const __m128i m3 = _mm_set1_epi8(3); + const __m128i m32s = _mm_set1_epi8(32); + const __m128i m2 = _mm_set1_epi8(2); + + __m256 acc = _mm256_setzero_ps(); + + for (int i = 0; i < nb; ++i) { + + const float d = y[i].d * ggml_fp16_to_fp32(x[i].d); + + const uint8_t * restrict q4 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); + + __m128i sumi_0 = _mm_setzero_si128(); + __m128i sumi_1 = _mm_setzero_si128(); + + __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + for (int j = 0; j < QK_K/128; ++j) { + + const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; + const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16; + + const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); + const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); + const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4); + const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4); + const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4); + const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4); + const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4); + const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4); + + const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; + + const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0); + const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1); + const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2); + const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3); + const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4); + const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5); + const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6); + const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7); + + const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; + + __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0); + __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1); + __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2); + __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3); + __m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4); + __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5); + __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6); + __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7); + + __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); + __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); + __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); + __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3); + __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4); + __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5); + __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); + __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); + + p16_0 = _mm_sub_epi16(p16_0, q8s_0); + p16_1 = _mm_sub_epi16(p16_1, q8s_1); + p16_2 = _mm_sub_epi16(p16_2, q8s_2); + p16_3 = _mm_sub_epi16(p16_3, q8s_3); + p16_4 = _mm_sub_epi16(p16_4, q8s_4); + p16_5 = _mm_sub_epi16(p16_5, q8s_5); + p16_6 = _mm_sub_epi16(p16_6, q8s_6); + p16_7 = _mm_sub_epi16(p16_7, q8s_7); + + const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi8(shuffle, m2); + const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi8(shuffle, m2); + const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi8(shuffle, m2); + const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle); + shuffle = _mm_add_epi8(shuffle, m2); + + p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); + p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1); + p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); + p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3); + p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); + p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5); + p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); + p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7); + + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); + sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6)); + sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7)); + + } + + __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0); + acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); + } + + *s = hsum_float_8(acc); + #else int8_t aux8[QK_K]; From a84ab1da8dc6a59a5b67420ae1322f09503ffc72 Mon Sep 17 00:00:00 2001 From: katsu560 <118887472+katsu560@users.noreply.github.com> Date: Tue, 27 Jun 2023 01:47:02 +0900 Subject: [PATCH 28/29] tests : fix quantize perf (#1990) * fix test quantize perf * avoid the global state --- tests/test-quantize-perf.cpp | 71 ++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 600375771..c0e361e92 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -21,6 +21,7 @@ #define QK 32 #define WARMUP 5 #define ITERATIONS 10 +#define MAX_ITERATIONS 100000000 #define L1_SIZE 32*128 #define L2_SIZE 32*2048 @@ -36,9 +37,9 @@ struct quantize_perf_params { bool op_dequantize_row_q = false; bool op_quantize_row_q_dot = false; bool op_vec_dot_q = false; + int64_t iterations = ITERATIONS; }; - #if defined(__x86_64__) || defined(__i386__) #include @@ -75,7 +76,7 @@ void * align_with_offset(void * ptr, int offset) { return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; } -void benchmark_function(size_t size, size_t q_size, std::function function) { +void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function function) { int64_t min_time_us = INT64_MAX; int64_t total_time_us = 0; int64_t min_time_cycles = INT64_MAX; @@ -86,7 +87,7 @@ void benchmark_function(size_t size, size_t q_size, std::function } - for (int i = 0; i < ITERATIONS; i++) { + for (int i = 0; i < iterations; i++) { const int64_t start_time = ggml_time_us(); const int64_t start_cycles = cpu_cycles(); @@ -102,9 +103,38 @@ void benchmark_function(size_t size, size_t q_size, std::function } printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size); - printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * ITERATIONS)); - printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * ITERATIONS, total_time_us)); - printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * ITERATIONS, total_time_us)); + printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations)); + printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * iterations, total_time_us)); + printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us)); +} + +void usage(char * argv[]) { + printf("Benchmark quantization specific functions on synthetic data\n"); + printf("\n"); + printf("usage: %s [options]\n", argv[0]); + printf("\n"); + printf("options: (default)\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE); + printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE); + printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE); + printf(" --op OP set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n"); + printf(" quantize_row_q_dot, vec_dot_q (all)\n"); + printf(" --type TYPE set test type as"); + for (int i = 0; i < GGML_TYPE_COUNT; i++) { + ggml_type type = (ggml_type) i; + quantize_fns_t qfns = ggml_internal_get_quantize_fn(type); + if (ggml_type_name(type) != NULL) { + if (qfns.quantize_row_q && qfns.dequantize_row_q) { + printf(" %s", ggml_type_name(type)); + } + } + } + printf(" (all)\n"); + printf(" --alignment-offset OFFSET\n"); + printf(" set alignment offset as OFFSET (0)\n"); + printf(" -i NUM, --iterations NUM\n"); + printf(" set test iteration number (%d)\n", ITERATIONS); } int main(int argc, char * argv[]) { @@ -178,6 +208,21 @@ int main(int argc, char * argv[]) { break; } params.alignment_offset = alignment; + } else if ((arg == "-i") || (arg == "--iterations")) { + if (++i >= argc) { + invalid_param = true; + break; + } + int number = std::stoi(argv[i]); + if (number < 0 || number > MAX_ITERATIONS) { + fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS); + invalid_param = true; + break; + } + params.iterations = number; + } else if ((arg == "-h") || (arg == "--help")) { + usage(argv); + return 1; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); return 1; @@ -213,6 +258,8 @@ int main(int argc, char * argv[]) { generate_data(0, largest, test_data1); generate_data(1, largest, test_data2); + int64_t iterations = params.iterations; + // Initialize GGML, ensures float conversion tables are initialized struct ggml_init_params ggml_params = { @@ -225,7 +272,7 @@ int main(int argc, char * argv[]) { for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); - if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) { + if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) { continue; } @@ -241,7 +288,7 @@ int main(int argc, char * argv[]) { return test_q1[0]; }; size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); - benchmark_function(size, quantized_size, quantize_fn); + benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); } @@ -255,7 +302,7 @@ int main(int argc, char * argv[]) { return test_q1[0]; }; size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); - benchmark_function(size, quantized_size, quantize_fn); + benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); } @@ -270,7 +317,7 @@ int main(int argc, char * argv[]) { return test_out[0]; }; size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); - benchmark_function(size, quantized_size, quantize_fn); + benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); } @@ -284,7 +331,7 @@ int main(int argc, char * argv[]) { return test_q1[0]; }; size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); - benchmark_function(size, quantized_size, quantize_fn); + benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); } @@ -301,7 +348,7 @@ int main(int argc, char * argv[]) { return result; }; size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); - benchmark_function(size, quantized_size, quantize_fn); + benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); } From 9225baef71407d799a6f7f563b77fd7f82791416 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 26 Jun 2023 20:10:52 +0300 Subject: [PATCH 29/29] k-quants : fix indentation --- k_quants.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/k_quants.c b/k_quants.c index 923467d7c..c576fd7a7 100644 --- a/k_quants.c +++ b/k_quants.c @@ -1981,7 +1981,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; // prepare low and high bits - const int bit = j << 2; + const int bit = j << 2; + const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3); const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3); const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);